In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,ExtraTreesRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
df=pd.read_csv('post_feature_selection2.csv')

In [3]:
df.head(
)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [4]:
df['furnishing_type']=df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [5]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [6]:
X = df.drop(columns=['price'])
y = df['price']

In [7]:
y_transformed = np.log1p(y)

#  using Ordinal encoding to find the best model

In [8]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category','floor_category']

In [9]:
preprocessor=ColumnTransformer(transformers=[('num',StandardScaler(),
                                              ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
                                            ('cat',OrdinalEncoder(),columns_to_encode)],remainder='passthrough')

In [10]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [11]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [12]:
model_output=[]

for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [13]:
model_output

[['linear_reg', 0.7363096633436828, 0.9463822160089356],
 ['svr', 0.7642012011196353, 0.8472636473483922],
 ['ridge', 0.7363125343993554, 0.946338774185337],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.7735763778397409, 0.7372949651509049],
 ['random forest', 0.880801719806772, 0.5253536433629981],
 ['extra trees', 0.8693048974504423, 0.5525101248623453],
 ['gradient boosting', 0.8726426764128883, 0.5757310367406021],
 ['adaboost', 0.7612883287483123, 0.8278685344712808],
 ['mlp', 0.8119340169858702, 0.6803817259756759],
 ['xgboost', 0.8917010012719994, 0.5113240614244203]]

In [14]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])


In [15]:
model_df.sort_values(['mae'])


Unnamed: 0,name,r2,mae
10,xgboost,0.891701,0.511324
5,random forest,0.880802,0.525354
6,extra trees,0.869305,0.55251
7,gradient boosting,0.872643,0.575731
9,mlp,0.811934,0.680382
4,decision tree,0.773576,0.737295
8,adaboost,0.761288,0.827869
1,svr,0.764201,0.847264
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


# using ohe


In [16]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [17]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [18]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [19]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [20]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])
# as expected we can see that the linear models perform better with ohe

Unnamed: 0,name,r2,mae
6,extra trees,0.895127,0.473872
10,xgboost,0.896218,0.488796
5,random forest,0.891779,0.506373
9,mlp,0.865833,0.539062
7,gradient boosting,0.876532,0.570002
0,linear_reg,0.854609,0.649751
2,ridge,0.854678,0.652894
4,decision tree,0.804782,0.708205
8,adaboost,0.756447,0.829998
1,svr,0.769741,0.834124


# using target encoder

In [21]:
!pip install category_encoders --user



DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.1 -> 23.3.2




[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [23]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [24]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [25]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [26]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.901667,0.456759
6,extra trees,0.9017,0.462859
10,xgboost,0.900643,0.483409
7,gradient boosting,0.889187,0.511005
4,decision tree,0.828705,0.516856
9,mlp,0.850216,0.609622
8,adaboost,0.817679,0.712815
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


# hyperparameter tuning

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300,400,500],
    'regressor__max_depth': [None, 10, 20, 30,40],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [29]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category','floor_category']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [30]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [31]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [32]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [33]:
search.fit(X, y_transformed)


Fitting 10 folds for each of 240 candidates, totalling 2400 fits


1200 fits failed out of a total of 2400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
677 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Arun\AppData\Roaming\Python\Python38\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Arun\AppData\Roaming\Python\Python38\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Arun\AppData\Roaming\Python\Python38\site-packages\sklearn\pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Arun\AppData\Roaming\Python\Python38\site-packages\

In [34]:
final_pipe = search.best_estimator_


In [35]:
search.best_params_


{'regressor__max_depth': 20,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 500}

In [36]:
search.best_score_


0.9029415897960789

In [37]:
final_pipe.fit(X,y_transformed)


In [43]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [44]:

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [45]:
pipeline.fit(X,y_transformed)


# Importng the data and pipeline

In [46]:

import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [47]:

with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [38]:
#from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
#from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

In [39]:
#columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category','floor_category']
#preprocessor = ColumnTransformer(
    #transformers=[
        #('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        #('cat', OrdinalEncoder(), columns_to_encode),
        #('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        #('target_enc', ce.TargetEncoder(), ['sector'])
   # ], 
   # remainder='passthrough'
#)

In [40]:
# space = {
#         'max_depth':hp.choice('max_depth', np.arange(50, 500, 1, dtype=int)),
#         'n_estimators':hp.choice('n_estimators', np.arange(1000, 10000, 10, dtype=int)),
#         'colsample_bytree':hp.quniform('colsample_bytree', 0.5, 1.0, 0.1),
#         'min_child_weight':hp.choice('min_child_weight', np.arange(250, 350, 10, dtype=int)),
#         'subsample':hp.quniform('subsample', 0.7, 0.9, 0.1),
#         'eta':hp.quniform('eta', 0.1, 0.3, 0.1),
        
#         'objective':'reg:absoluteerror',
        
        
#         'eval_metric': 'mae',
#     }

In [41]:
# def score(params):
#     model = XGBRegressor(**params)
    
#     score=mean_absolute_error(np.expm1(y_test),y_pred)
#     print(score)
#     return {'loss': score, 'status': STATUS_OK} 

# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('regressor',XGBRegressor())
#     ])
    
# X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
# early_stopping_rounds=10
# eval_set=[(X_train, y_train), (X_test, y_test)]

    
# pipeline.fit(X_train, y_train,regressor__eval_set=eval_set, regressor__early_stopping_rounds=early_stopping_rounds)
# y_pred = pipeline.predict(X_test).clip(0, 20)
# y_pred = np.expm1(y_pred)

In [42]:

    
#best = fmin(score, space, algo=tpe.suggest, max_evals=1000)

#trials = Trials()


# Return the best parameters
#space_eval(space, best_params)