In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, precision_score
from sklearn.preprocessing import LabelEncoder

# Read the raw housing listings from the CSV that sits next to the notebook.
# Every later cell in this notebook works from `df`.
df = pd.read_csv(filepath_or_buffer='india_housing_prices.csv')
print("Data loaded successfully. Shape:", df.shape)

# Leave the preview as the last expression so it renders as a rich table.
df.head()

Data loaded successfully. Shape: (250000, 23)


Unnamed: 0,ID,State,City,Locality,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,...,Age_of_Property,Nearby_Schools,Nearby_Hospitals,Public_Transport_Accessibility,Parking_Space,Security,Amenities,Facing,Owner_Type,Availability_Status
0,1,Tamil Nadu,Chennai,Locality_84,Apartment,1,4740,489.76,0.1,1990,...,35,10,3,High,No,No,"Playground, Gym, Garden, Pool, Clubhouse",West,Owner,Ready_to_Move
1,2,Maharashtra,Pune,Locality_490,Independent House,3,2364,195.52,0.08,2008,...,17,8,1,Low,No,Yes,"Playground, Clubhouse, Pool, Gym, Garden",North,Builder,Under_Construction
2,3,Punjab,Ludhiana,Locality_167,Apartment,2,3642,183.79,0.05,1997,...,28,9,8,Low,Yes,No,"Clubhouse, Pool, Playground, Gym",South,Broker,Ready_to_Move
3,4,Rajasthan,Jodhpur,Locality_393,Independent House,2,2741,300.29,0.11,1991,...,34,5,7,High,Yes,Yes,"Playground, Clubhouse, Gym, Pool, Garden",North,Builder,Ready_to_Move
4,5,Rajasthan,Jaipur,Locality_466,Villa,4,4823,182.9,0.04,2002,...,23,4,9,Low,No,Yes,"Playground, Garden, Gym, Pool, Clubhouse",East,Builder,Ready_to_Move


In [3]:
# 1. Regression target: today's price compounded for 5 years at 8% a year.
#    NOTE(review): this is an exact constant multiple of Price_in_Lakhs, so any
#    model that also receives Price_in_Lakhs as a feature can reproduce it trivially.
df['Future_Price_5Y'] = df['Price_in_Lakhs'] * (1.08 ** 5)

# 2. Classification target: "Good Investment"
#    Three yes/no signals are computed per listing:
#      - priced at or below the median price-per-sqft of its own city
#      - public transport accessibility rated 'High'
#      - security marked 'Yes'
#    `transform('median')` broadcasts each city's median back onto its rows,
#    so the comparison below is row-aligned with `df`.
city_median_pps = df.groupby('City')['Price_per_SqFt'].transform('median')

flag_rules = {
    'Is_Cheap': df['Price_per_SqFt'].le(city_median_pps),
    'Good_Transport': df['Public_Transport_Accessibility'].eq('High'),
    'Has_Security': df['Security'].eq('Yes'),
}
for flag_name, flag_mask in flag_rules.items():
    # Store each signal as 0/1 so the signals can be added into a score.
    df[flag_name] = flag_mask.astype(int)

# A listing is labelled a good investment when at least two of the three signals hold.
df['Investment_Score'] = df['Is_Cheap'] + df['Good_Transport'] + df['Has_Security']
df['Good_Investment'] = df['Investment_Score'].ge(2).astype(int)

print(f"Target distribution for Investment:\n{df['Good_Investment'].value_counts(normalize=True)}")

Target distribution for Investment:
Good_Investment
0    0.57398
1    0.42602
Name: proportion, dtype: float64


In [4]:
# Categorical columns that are converted to integer codes for the tree models.
le_cols = ['State', 'City', 'Locality', 'Property_Type', 'Furnished_Status',
           'Public_Transport_Accessibility', 'Parking_Space', 'Security',
           'Facing', 'Owner_Type', 'Availability_Status']

# Remove helper columns and IDs.
# `drop` returns a new frame, so everything below modifies `df_ml` only and the
# raw `df` keeps its original string labels.
cols_to_drop = ['ID', 'Is_Cheap', 'Good_Transport', 'Has_Security', 'Investment_Score', 'Amenities']
df_ml = df.drop(columns=cols_to_drop)

# Encode on the modelling copy rather than on `df` itself. Overwriting `df` made
# this notebook order-dependent: re-running the target-building cell afterwards
# compared integer codes against 'High' / 'Yes' (all False), and re-running this
# cell refit the encoders on stringified codes instead of the real labels.
encoders = {}
for col in le_cols:
    le = LabelEncoder()
    # astype(str) gives the encoder a uniform string dtype (missing values become 'nan').
    df_ml[col] = le.fit_transform(df_ml[col].astype(str))
    # Keep the fitted encoder so the same label -> code mapping can be reused at inference time.
    encoders[col] = le

df_ml.head()

Unnamed: 0,State,City,Locality,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,Furnished_Status,...,Nearby_Schools,Nearby_Hospitals,Public_Transport_Accessibility,Parking_Space,Security,Facing,Owner_Type,Availability_Status,Future_Price_5Y,Good_Investment
0,15,6,483,0,1,4740,489.76,0.1,1990,0,...,10,3,0,0,0,3,2,0,719.618119,0
1,11,33,434,1,3,2364,195.52,0.08,2008,2,...,8,1,1,0,1,1,1,1,287.283026,1
2,13,25,75,0,2,3642,183.79,0.05,1997,1,...,9,8,1,1,0,2,0,0,270.047807,0
3,14,21,326,1,2,2741,300.29,0.11,1991,0,...,5,7,0,1,1,1,1,0,441.224528,1
4,14,19,407,2,4,4823,182.9,0.04,2002,1,...,4,9,1,0,1,0,1,0,268.740105,1


In [5]:
# Separate the two targets from the shared feature matrix.
# NOTE(review): X still contains Price_in_Lakhs (Future_Price_5Y is an exact
# multiple of it) and City / Price_per_SqFt / Public_Transport_Accessibility /
# Security (Good_Investment is a deterministic rule over them), so both models
# can recover their targets from the features. Treat the scores as a sanity
# check of the labelling rule, not as evidence of predictive skill.
target_cols = ['Future_Price_5Y', 'Good_Investment']
X = df_ml.drop(columns=target_cols)
y_reg = df_ml['Future_Price_5Y']
y_clf = df_ml['Good_Investment']

# A single call splits features and both targets together, so the same rows
# land in train/test for the regressor and the classifier (80/20, fixed seed).
(X_train, X_test,
 y_reg_train, y_reg_test,
 y_clf_train, y_clf_test) = train_test_split(X, y_reg, y_clf, test_size=0.2, random_state=42)

# Both forests share the same size, depth cap, seed and parallelism settings.
forest_params = dict(n_estimators=50, max_depth=10, random_state=42, n_jobs=-1)

# Regression Model
print("Training Price Prediction Model...")
reg_model = RandomForestRegressor(**forest_params)
reg_model.fit(X_train, y_reg_train)

# Classification Model
print("Training Investment Classifier...")
clf_model = RandomForestClassifier(**forest_params)
# Kept as the last expression so the fitted estimator is displayed as the cell output.
clf_model.fit(X_train, y_clf_train)

Training Price Prediction Model...
Training Investment Classifier...


0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [6]:
# Regression Evaluation: mean absolute error on the held-out rows,
# reported in the target's unit (lakhs).
y_reg_pred = reg_model.predict(X_test)
reg_mae = mean_absolute_error(y_reg_test, y_reg_pred)
print(f"Regression MAE: {reg_mae:.2f} Lakhs")

# Classification Evaluation: share of held-out rows labelled correctly.
y_clf_pred = clf_model.predict(X_test)
clf_accuracy = accuracy_score(y_clf_test, y_clf_pred)
print(f"Classification Accuracy: {clf_accuracy:.2%}")

# Export for Streamlit: both models, the fitted label encoders, and the exact
# feature column order used for training.
artifacts = {
    'reg_model.pkl': reg_model,
    'clf_model.pkl': clf_model,
    'encoders.pkl': encoders,
    'model_features.pkl': X.columns.tolist(),
}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

Regression MAE: 0.03 Lakhs
Classification Accuracy: 99.91%
