In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [4]:
pip show numpy

Name: numpy
Version: 1.23.5
Summary: NumPy is the fundamental package for array computing with Python.
Home-page: https://www.numpy.org
Author: Travis E. Oliphant et al.
Author-email: 
License: BSD
Location: /Users/hrishityelchuri/anaconda3/lib/python3.10/site-packages
Requires: 
Required-by: astropy, bokeh, Bottleneck, category-encoders, contourpy, datashader, datashape, gensim, h5py, holoviews, hvplot, imagecodecs, ImageHash, imageio, imbalanced-learn, matplotlib, mlxtend, numba, numexpr, opencv-python, opt-einsum, pandas, patsy, phik, pyerfa, PyWavelets, scikit-image, scikit-learn, scipy, seaborn, shap, statsmodels, tables, tensorboard, tensorflow-macos, tifffile, transformers, visions, wordcloud, xarray, xgboost, ydata-profiling
Note: you may need to restart the kernel to use updated packages.


In [3]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_2.csv')

In [4]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 63,7.9,4.0,4.0,3+,Under Construction,3950.0,0.0,0.0,1.0,Medium,High Floor
1,flat,sector 63a,3.7,3.0,3.0,3+,Under Construction,2667.0,0.0,0.0,1.0,Medium,Low Floor
2,flat,manesar,1.2,3.0,3.0,3+,Moderately Old,2944.0,1.0,0.0,1.0,Low,Mid Floor
3,house,sector 33,11.5,5.0,6.0,3+,Relatively New,4680.0,1.0,0.0,1.0,Medium,Mid Floor
4,flat,sector 67a,1.85,3.0,4.0,3,New Property,1941.0,0.0,1.0,0.0,High,Mid Floor


In [5]:
df.shape

(3554, 13)

In [6]:
df['furnishing_type'].value_counts()

1.0    2354
0.0    1015
2.0     185
Name: furnishing_type, dtype: int64

In [7]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [8]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 63,7.9,4.0,4.0,3+,Under Construction,3950.0,0.0,0.0,semifurnished,Medium,High Floor
1,flat,sector 63a,3.7,3.0,3.0,3+,Under Construction,2667.0,0.0,0.0,semifurnished,Medium,Low Floor
2,flat,manesar,1.2,3.0,3.0,3+,Moderately Old,2944.0,1.0,0.0,semifurnished,Low,Mid Floor
3,house,sector 33,11.5,5.0,6.0,3+,Relatively New,4680.0,1.0,0.0,semifurnished,Medium,Mid Floor
4,flat,sector 67a,1.85,3.0,4.0,3,New Property,1941.0,0.0,1.0,unfurnished,High,Mid Floor


In [9]:
X = df.drop(columns=['price'])
y = df['price']

In [10]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [11]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [13]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [14]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [15]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [16]:
scores.mean(),scores.std()

(0.7198670562402387, 0.04206605218811255)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [18]:
pipeline.fit(X_train,y_train)

In [19]:
y_pred = pipeline.predict(X_test)

In [20]:
y_pred = np.expm1(y_pred)

np.expm1 is used when models have been trained on log-transformed target variables

In [21]:
y_pred = pipeline.predict(X_test)

In [22]:
mean_absolute_error(np.expm1(y_test),y_pred)

1.4678206264894795

In [23]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [24]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [25]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [26]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [27]:
model_output

[['linear_reg', 0.7198670562402387, 0.9645315534589922],
 ['svr', 0.7555934841035177, 0.8883675551661968],
 ['ridge', 0.7198709241512696, 0.9645938075585608],
 ['LASSO', 0.05710877167975701, 1.5281147599599396],
 ['decision tree', 0.7869608021288085, 0.7280662210340364],
 ['random forest', 0.8862221631484897, 0.5282819297088933],
 ['extra trees', 0.8707839827711125, 0.5840376492419613],
 ['gradient boosting', 0.8766178503102126, 0.5477488897254736],
 ['adaboost', 0.7554752470196019, 0.8500610772274468],
 ['mlp', 0.8027213149185425, 0.732029743289345],
 ['xgboost', 0.8922606189954652, 0.4776694657287853]]

In [28]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [29]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.892261,0.477669
5,random forest,0.886222,0.528282
7,gradient boosting,0.876618,0.547749
6,extra trees,0.870784,0.584038
4,decision tree,0.786961,0.728066
9,mlp,0.802721,0.73203
8,adaboost,0.755475,0.850061
1,svr,0.755593,0.888368
0,linear_reg,0.719867,0.964532
2,ridge,0.719871,0.964594


### OneHotEncoding

In [30]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [31]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [32]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [33]:
scores.mean(),scores.std()

(0.8487769415264432, 0.023755948813277097)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [35]:
pipeline.fit(X_train,y_train)

In [36]:
y_pred = pipeline.predict(X_test)

In [37]:
y_pred = np.expm1(y_pred)

In [38]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6826246664623207

In [39]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [40]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [41]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [42]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [43]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.899629,0.472707
5,random forest,0.89414,0.476531
6,extra trees,0.895871,0.494876
7,gradient boosting,0.878599,0.53816
9,mlp,0.869488,0.573779
4,decision tree,0.803055,0.660062
0,linear_reg,0.848777,0.682625
2,ridge,0.848774,0.684281
8,adaboost,0.754393,0.828849
1,svr,0.760422,0.873663


### Target Encoder

In [44]:
!pip install category_encoders



In [44]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [45]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [46]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [47]:
scores.mean(),scores.std()

(0.8199622664890187, 0.02506445202641817)

In [48]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [49]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [50]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [51]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [52]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.903227,0.459209
10,xgboost,0.902188,0.460126
6,extra trees,0.901631,0.472939
7,gradient boosting,0.890931,0.516251
9,mlp,0.846075,0.614066
8,adaboost,0.821527,0.714282
4,decision tree,0.826605,0.721176
0,linear_reg,0.819962,0.756369
2,ridge,0.819981,0.756486
1,svr,0.774124,0.860835


### Hyperparameter Tuning

In [53]:
from sklearn.model_selection import GridSearchCV

In [63]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['sqrt']
}

In [64]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [65]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [66]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [67]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [68]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 64 candidates, totalling 640 fits


In [69]:
final_pipe = search.best_estimator_

In [70]:
search.best_params_

{'regressor__max_depth': 30,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [71]:
search.best_score_

0.9021515698640797

In [72]:
final_pipe.fit(X,y_transformed)

### Exporting the model

In [73]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [74]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [75]:
pipeline.fit(X,y_transformed)

In [76]:
import pickle

with open('pipeline2.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [77]:
with open('df2.pkl', 'wb') as file:
    pickle.dump(X, file)

In [78]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 63,4.0,4.0,3+,Under Construction,3950.0,0.0,0.0,semifurnished,Medium,High Floor
1,flat,sector 63a,3.0,3.0,3+,Under Construction,2667.0,0.0,0.0,semifurnished,Medium,Low Floor
2,flat,manesar,3.0,3.0,3+,Moderately Old,2944.0,1.0,0.0,semifurnished,Low,Mid Floor
3,house,sector 33,5.0,6.0,3+,Relatively New,4680.0,1.0,0.0,semifurnished,Medium,Mid Floor
4,flat,sector 67a,3.0,4.0,3,New Property,1941.0,0.0,1.0,unfurnished,High,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,house,sector 72,4.0,5.0,3+,Relatively New,7000.0,0.0,0.0,unfurnished,Low,Mid Floor
3550,flat,sector 28,2.0,2.0,2,Moderately Old,1109.0,0.0,0.0,semifurnished,Low,Low Floor
3551,flat,sector 28,3.0,4.0,3,Old Property,3133.0,1.0,1.0,semifurnished,Low,Mid Floor
3552,flat,sector 79,2.0,2.0,3+,New Property,1223.0,0.0,0.0,unfurnished,Low,Mid Floor


### Trying out the predictions

In [79]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [80]:
X.iloc[0].values

array(['flat', 'sector 63', 4.0, 4.0, '3+', 'Under Construction', 3950.0,
       0.0, 0.0, 'semifurnished', 'Medium', 'High Floor'], dtype=object)

In [81]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [82]:
np.expm1(pipeline.predict(one_df))

array([2.69086743])

In [83]:
X.dtypes

property_type       object
sector              object
bedRoom            float64
bathroom           float64
balcony             object
agePossession       object
built_up_area      float64
servant room       float64
store room         float64
furnishing_type     object
luxury_category     object
floor_category      object
dtype: object

In [85]:
X.isnull().sum()

property_type      0
sector             0
bedRoom            0
bathroom           0
balcony            0
agePossession      0
built_up_area      0
servant room       0
store room         0
furnishing_type    0
luxury_category    0
floor_category     0
dtype: int64

[CV 4/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.0s
[CV 9/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.0s
[CV 3/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=100;, score=nan total time=   0.0s
[CV 8/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=100;, score=nan total time=   0.0s
[CV 7/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=200;, score=nan total time=   0.0s
[CV 7/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=300;, score=nan total time=   0.0s
[CV 5/10] END regressor__max_depth=None, regressor__ma

In [77]:
pip show scikit-learn

Name: scikit-learn
Version: 1.3.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /Users/hrishityelchuri/anaconda3/lib/python3.10/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: category-encoders, imbalanced-learn, mlxtend, shap
Note: you may need to restart the kernel to use updated packages.


In [79]:
!pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.4.0-1-cp310-cp310-macosx_12_0_arm64.whl (10.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.6/10.6 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib>=1.2.0
  Using cached joblib-1.3.2-py3-none-any.whl (302 kB)
Installing collected packages: joblib, scikit-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.1
    Uninstalling joblib-1.1.1:
      Successfully uninstalled joblib-1.1.1
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.2
    Uninstalling scikit-learn-1.3.2:
      Successfully uninstalled scikit-learn-1.3.2
Successfully installed joblib-1.3.2 scikit-learn-1.4.0


In [12]:
pip show scikit-learn

Name: scikit-learn
Version: 1.4.0
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /Users/hrishityelchuri/anaconda3/lib/python3.10/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: category-encoders, imbalanced-learn, mlxtend, shap
Note: you may need to restart the kernel to use updated packages.
