In [48]:
import pandas as pd

In [49]:
data = pd.read_csv("india_housing_prices.csv")

Handling Null Values

In [50]:
data.isnull().sum()

ID                                0
State                             0
City                              0
Locality                          0
Property_Type                     0
BHK                               0
Size_in_SqFt                      0
Price_in_Lakhs                    0
Price_per_SqFt                    0
Year_Built                        0
Furnished_Status                  0
Floor_No                          0
Total_Floors                      0
Age_of_Property                   0
Nearby_Schools                    0
Nearby_Hospitals                  0
Public_Transport_Accessibility    0
Parking_Space                     0
Security                          0
Amenities                         0
Facing                            0
Owner_Type                        0
Availability_Status               0
dtype: int64

Adding the Target Features

In [51]:
data['r'] = 0.05
data.loc[data['Public_Transport_Accessibility'] == 'High', 'r'] += 0.015
data.loc[data['Age_of_Property'] < 5, 'r'] += 0.01

data['Future_Investment_Value'] = data['Price_in_Lakhs'] * (1 + data['r'])**5

def get_class(row):
    if row['r'] >= 0.08 and row['Age_of_Property'] < 10:
        return 'Best Investment'
    elif row['r'] >= 0.07:
        return 'Better Investment, Can be invested even better'
    else:
        return 'Worst Investment'

data['Investment_Class'] = data.apply(get_class, axis=1)

In [52]:
data = data.drop('r', axis=1)

In [53]:
data = data.drop('ID', axis=1)

Splitting the Features and Target columns

In [54]:
X = data.drop(["Future_Investment_Value", "Investment_Class", "Price_in_Lakhs"], axis=1)
y = data['Future_Investment_Value']
y1 = data['Investment_Class']

In [55]:
X.columns

Index(['State', 'City', 'Locality', 'Property_Type', 'BHK', 'Size_in_SqFt',
       'Price_per_SqFt', 'Year_Built', 'Furnished_Status', 'Floor_No',
       'Total_Floors', 'Age_of_Property', 'Nearby_Schools', 'Nearby_Hospitals',
       'Public_Transport_Accessibility', 'Parking_Space', 'Security',
       'Amenities', 'Facing', 'Owner_Type', 'Availability_Status'],
      dtype='object')

Defining columns Encoder

In [56]:
target_cols = ['State', 'City', 'Locality']
ohe_cols = ['Property_Type', 'Facing' , 'Owner_Type']
label_cols = ['Security', 'Parking_Space', 'Public_Transport_Accessibility', 'Furnished_Status', 'Availability_Status']
multilabel_cols = ['Amenities']
# ordinal_cols = ['Furnished_Status', 'Availability_Status']
num_cols = ['BHK', 'Size_in_SqFt','Price_per_SqFt', 'Year_Built', 'Floor_No', 'Total_Floors', 'Age_of_Property','Nearby_Schools', 'Nearby_Hospitals' ]

One Hot Encoder

In [57]:
from sklearn.preprocessing import OneHotEncoder 

In [58]:
ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False
)

X_ohe = ohe.fit_transform(X[ohe_cols])
X_ohe = pd.DataFrame(
    X_ohe,
    columns=ohe.get_feature_names_out(ohe_cols)
)

Target Encoder

In [59]:
import category_encoders as ce

In [60]:
target_encoder = ce.TargetEncoder(cols=target_cols)
X_target = target_encoder.fit_transform(X[target_cols], y)
X_target = pd.DataFrame(X_target)

Label Encoder

In [61]:
from sklearn.preprocessing import LabelEncoder

In [62]:
label_encoders = {}
X_label = pd.DataFrame()

for col in label_cols:
    le = LabelEncoder()
    X_label[col] = le.fit_transform(X[col])
    label_encoders[col] = le


Multi Label Encoder

In [63]:
from sklearn.preprocessing import MultiLabelBinarizer

In [64]:
mlb_encoders = {}
X_multi = pd.DataFrame()

for col in multilabel_cols:
    mlb = MultiLabelBinarizer()
    transformed = mlb.fit_transform(X[col])
    df = pd.DataFrame(
        transformed,
        columns=[f"{col}_{c}" for c in mlb.classes_]
    )
    X_multi = pd.concat([X_multi, df], axis=1)
    mlb_encoders[col] = mlb

In [65]:
from sklearn.preprocessing import StandardScaler

In [66]:
scaler = StandardScaler()
X_num = scaler.fit_transform(X[num_cols])
X_num = pd.DataFrame(X_num, columns=num_cols)

In [67]:
X_final = pd.concat([
    X_target.reset_index(drop=True),
    X_ohe.reset_index(drop=True),
    X_label.reset_index(drop=True),
    X_multi.reset_index(drop=True),
    X_num.reset_index(drop=True)
], axis=1)

X_final.columns = X_final.columns.astype(str)

In [68]:
X_final

Unnamed: 0,State,City,Locality,Property_Type_Apartment,Property_Type_Independent House,Property_Type_Villa,Facing_East,Facing_North,Facing_South,Facing_West,...,Amenities_y,BHK,Size_in_SqFt,Price_per_SqFt,Year_Built,Floor_No,Total_Floors,Age_of_Property,Nearby_Schools,Nearby_Hospitals
0,337.051601,337.252886,328.236649,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1,-1.412483,1.530202,-0.234015,-1.684245,0.786006,-1.672471,1.684245,1.563291,-0.869827
1,334.524840,336.985253,335.465786,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1,0.000427,-0.296641,-0.386983,0.150887,0.674249,0.518589,-0.150887,0.868516,-1.566241
2,332.284246,332.679420,343.801354,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1,-0.706028,0.685979,-0.616433,-0.970582,0.450736,1.325822,0.970582,1.215904,0.871208
3,334.445292,333.636525,315.938188,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1,-0.706028,-0.006776,-0.157532,-1.582293,0.674249,1.210503,1.582293,-0.173645,0.523001
4,334.445292,335.246271,325.328122,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1,0.706882,1.594018,-0.692917,-0.460823,-1.337367,-1.557152,0.460823,-0.521032,1.219416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,335.759180,333.120914,329.859139,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1,1.413337,-0.580356,0.071919,-1.174486,0.450736,-0.980558,1.174486,0.868516,0.174794
249996,337.051601,337.252886,328.204082,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1,-1.412483,-0.351231,-0.922368,0.660646,0.003710,1.095184,-0.660646,0.868516,1.567623
249997,335.593270,334.209452,324.826275,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1,0.706882,0.123163,-0.386983,-0.358872,0.674249,1.671779,0.358872,1.563291,-0.521620
249998,332.031767,329.306372,339.504708,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1,-1.412483,-1.230823,-0.081048,0.048936,-0.108046,-0.519282,-0.048936,-0.868419,0.523001


In [69]:
from scipy.stats import skew


num_cols = ['Size_in_SqFt', 'Price_in_Lakhs', 'Price_per_SqFt', 'Age_of_Property']

for col in num_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bridge = Q1 - (1.5 * IQR)
    upper_bridge = Q3 + (1.5 * IQR)

    data.loc[data[col] >= upper_bridge, col] = upper_bridge
    data.loc[data[col] <= lower_bridge, col] = lower_bridge

  data.loc[data[col] >= upper_bridge, col] = upper_bridge
  data.loc[data[col] >= upper_bridge, col] = upper_bridge


In [70]:
import numpy as np
for col in num_cols:
    s = skew(data[col])
    if s > 0.75:
       
        data[col] = np.log1p(data[col])
        print(f"Applied Log Transform to {col} (Skew: {s:.2f})")

Applied Log Transform to Price_per_SqFt (Skew: 1.00)


Train model

Decision Tree

In [71]:
from sklearn.model_selection import train_test_split

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size = 0.3, random_state = 0)

In [73]:
from sklearn.tree import DecisionTreeRegressor

In [74]:
x = [i for i in range(1, 20)]
train_score = []
test_score = []

for depth in x:
    DTR = DecisionTreeRegressor(max_depth=depth, random_state=0)
    DTR.fit(X_train, y_train)
    
    train_score.append(DTR.score(X_train, y_train))
    test_score.append(DTR.score(X_test, y_test))

In [75]:
train_score

[0.5070582621696267,
 0.5934110483792397,
 0.7591891976249963,
 0.8226606910417211,
 0.8980423624482825,
 0.9314657388948778,
 0.9618454184495779,
 0.9740418124626333,
 0.9845617859689122,
 0.9895648005928441,
 0.9927305729673955,
 0.994630717347083,
 0.9959190651572203,
 0.9968827794860876,
 0.9976454907152161,
 0.9983000074057725,
 0.9988379476403587,
 0.9992531292323998,
 0.9995472100980723]

In [76]:
test_score

[0.5042135581171547,
 0.5900597910350667,
 0.7572093737220499,
 0.8217328467442322,
 0.8967415537176188,
 0.9299739347906094,
 0.9610315193287521,
 0.97344361162369,
 0.9841021553732474,
 0.9890534263432311,
 0.9920424834192899,
 0.9934713860135862,
 0.9941086869626838,
 0.994243541923976,
 0.9939860393233865,
 0.9935544885007471,
 0.9931049200440999,
 0.9927103799491451,
 0.9923984541575798]

In [77]:
from sklearn.tree import DecisionTreeClassifier

In [78]:
X_train, X_test, y1_train, y1_test = train_test_split(X_final, y1, test_size = 0.3, random_state = 0)

In [79]:
x = [i for i in range(1, 20)]
train_score = []
test_score = []

for depth in x:
    DTR = DecisionTreeClassifier(max_depth=depth, random_state=0)
    DTR.fit(X_train, y1_train)
    
    train_score.append(DTR.score(X_train, y1_train))
    test_score.append(DTR.score(X_test, y1_test))

In [80]:
train_score

[0.9703542857142857,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]

In [81]:
test_score

[0.9704533333333333,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]

In [82]:
model = DecisionTreeClassifier(max_depth=2, random_state=0)

In [83]:
model.fit(X_train, y1_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [84]:
model_score = model.score(X_test, y1_test)
print("The model score is :", model_score)

The model score is : 1.0


In [85]:
import pickle

In [86]:
artifacts = {
    "model": model,
    "target_encoder": target_encoder,
    "ohe": ohe,
    "label_encoder": label_encoders,
    "mlb_encoder": mlb_encoders,
    "scaler": scaler,
    "final_columns": X_final.columns.tolist()
}

with open("model_artifacts.pkl", "wb") as f:
    pickle.dump(artifacts, f)


In [87]:
with open("model_artifacts.pkl", "rb") as f:
    model = pickle.load(f)

In [88]:
model['model'].feature_names_in_

array(['State', 'City', 'Locality', 'Property_Type_Apartment',
       'Property_Type_Independent House', 'Property_Type_Villa',
       'Facing_East', 'Facing_North', 'Facing_South', 'Facing_West',
       'Owner_Type_Broker', 'Owner_Type_Builder', 'Owner_Type_Owner',
       'Security', 'Parking_Space', 'Public_Transport_Accessibility',
       'Furnished_Status', 'Availability_Status', 'Amenities_ ',
       'Amenities_,', 'Amenities_C', 'Amenities_G', 'Amenities_P',
       'Amenities_a', 'Amenities_b', 'Amenities_d', 'Amenities_e',
       'Amenities_g', 'Amenities_h', 'Amenities_l', 'Amenities_m',
       'Amenities_n', 'Amenities_o', 'Amenities_r', 'Amenities_s',
       'Amenities_u', 'Amenities_y', 'BHK', 'Size_in_SqFt',
       'Price_per_SqFt', 'Year_Built', 'Floor_No', 'Total_Floors',
       'Age_of_Property', 'Nearby_Schools', 'Nearby_Hospitals'],
      dtype=object)

KNN Classification

In [89]:
from sklearn.neighbors import KNeighborsClassifier

In [91]:
X_train, X_test, y1_train, y1_test = train_test_split(X_final, y1, test_size = 0.3, random_state = 0)

In [90]:
KNNC = KNeighborsClassifier(n_neighbors= 5)

In [93]:
KNNC.fit(X_train, y1_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [94]:
KNNC_score = KNNC.score(X_test, y1_test)
KNNC_score

0.9710266666666667

Random Forest

In [95]:
from sklearn.ensemble import RandomForestClassifier

In [96]:
RC = RandomForestClassifier(n_estimators=100,
    random_state=42)

In [97]:
RC.fit(X_train, y1_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [100]:
RC_score = RC.score(X_test, y1_test)
RC_score

1.0