In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("../data/gurgaon_properties_post_feature_selection_top_8.csv")
data.head(5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,built_up_area,study room,servant room,store room
0,flat,sector 36,0.82,3,2,850,0,0,0
1,flat,sector 89,0.95,2,2,1226,1,1,0
2,flat,sohna road,0.32,2,2,1000,0,0,0
3,flat,sector 92,1.6,3,4,1615,0,1,0
4,flat,sector 102,0.48,2,2,582,0,0,1


In [3]:
data.columns

Index(['property_type', 'sector', 'price', 'bedRoom', 'bathroom',
       'built_up_area', 'study room', 'servant room', 'store room'],
      dtype='object')

- For categorical columns, we will try ordinal encoding, one hot encoding and label encoding to see which performs better.
- Numerical columns will be Standard scaled, to avoid scaling based issues while using models which are distance based.

In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression

X = data.drop(columns=['price'])
y = np.log1p(data['price'])

numerical_cols = ['bedRoom', 'bathroom', 'built_up_area', 'study room', 'servant room', 'store room']
categorical_cols = ['property_type', 'sector']

transform_oe = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols)
], remainder='passthrough')

pipeline_oe = Pipeline([
    ('preprocess', transform_oe),
    ('regression', LinearRegression())
])

transform_ohe = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')
pipeline_ohe = Pipeline([
    ('preprocess', transform_ohe),
    ('regression', LinearRegression())
])


In [5]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
score_oe = cross_val_score(pipeline_oe, X ,y, cv=kfold, scoring='r2' )

In [6]:
score_oe.mean(), score_oe.std()

(0.7281933649288062, 0.03502794974411478)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
pipeline_oe.fit(X_train, y_train)

In [9]:
from sklearn.metrics import mean_absolute_error

preds = pipeline_oe.predict(X_test)
preds = np.expm1(preds)
mean_absolute_error(np.expm1(y_test), preds)

0.9711457909147353

- Create a function which takes in the parameters : Transformer object, Model object


In [16]:
def compute_score(preprocessor, model, X, y, X_train, y_train, X_test, y_test):

    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('regression', model)
    ])

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    score = cross_val_score(pipeline, X ,y, cv=kfold, scoring='r2')

    pipeline.fit(X_train, y_train)

    preds = pipeline.predict(X_test)
    preds = np.expm1(preds)
    mae = mean_absolute_error(np.expm1(y_test), preds)

    return score, mae

In [11]:
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

model_list = {'Lasso': Lasso(), 
              'Ridge':Ridge(), 
              'LinearRegression' : LinearRegression(), 
              'SVR' : SVR(), 
              'DecisionTreeRegressor' : DecisionTreeRegressor(), 
              'RandomForestRegressor' : RandomForestRegressor(), 
              'GradientBoostingRegressor':GradientBoostingRegressor(), 
              'ExtraTreesRegressor' : ExtraTreesRegressor(), 
              'AdaBoostRegressor': AdaBoostRegressor(), 
              'XGBRegressor' : XGBRegressor()}

In [21]:
model_names = []
r2_scores = []
mae_scores = []

#Checking ordinal encoded scores for the models
for model in model_list.keys():
    model_names.append(model)
    #print(type(model_list[model]))
    r2,mae = compute_score(transform_oe, model_list[model], X, y, X_train, y_train, X_test, y_test)
    r2_scores.append(r2)
    mae_scores.append(mae)

oe_results = pd.DataFrame(data={'model':model_names, 'r2_score':r2_scores, 'mae_score':mae_scores})


In [22]:
oe_results

Unnamed: 0,model,r2_score,mae_score
0,Lasso,"[0.04467450416292362, 0.07152362021857284, 0.0...",1.532484
1,Ridge,"[0.7008763656296529, 0.7493917451962038, 0.746...",0.97104
2,LinearRegression,"[0.7008720523381627, 0.7493470943573584, 0.746...",0.971146
3,SVR,"[0.7503771176206572, 0.8015318748557375, 0.772...",0.85291
4,DecisionTreeRegressor,"[0.755762256177297, 0.8288595873690083, 0.8486...",0.694922
5,RandomForestRegressor,"[0.8972638940354232, 0.9035556716533266, 0.912...",0.513914
6,GradientBoostingRegressor,"[0.8799888373394564, 0.8857970646785437, 0.891...",0.582285
7,ExtraTreesRegressor,"[0.8898689160607062, 0.8776247341951403, 0.887...",0.555886
8,AdaBoostRegressor,"[0.7558146626073751, 0.7828471073376868, 0.791...",0.81516
9,XGBRegressor,"[0.8995885416092424, 0.9005375862105067, 0.913...",0.52076
