In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('concatenated_properties.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0.0,36.0,3,2,2.0,1.0,850.0,0,0,0.0,1.0,1.0,0.82
1,0.0,95.0,2,2,2.0,1.0,1226.0,1,0,0.0,1.0,2.0,0.95
2,0.0,103.0,2,2,1.0,1.0,1000.0,0,0,0.0,1.0,0.0,0.32
3,0.0,99.0,3,4,4.0,3.0,1615.0,1,0,1.0,0.0,2.0,1.6
4,0.0,5.0,2,2,1.0,3.0,582.0,0,1,0.0,0.0,2.0,0.48


In [4]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    3313
1.0    2449
2.0     233
Name: count, dtype: int64

In [5]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [6]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0.0,36.0,3,2,2.0,1.0,850.0,0,0,unfurnished,1.0,1.0,0.82
1,0.0,95.0,2,2,2.0,1.0,1226.0,1,0,unfurnished,1.0,2.0,0.95
2,0.0,103.0,2,2,1.0,1.0,1000.0,0,0,unfurnished,1.0,0.0,0.32
3,0.0,99.0,3,4,4.0,3.0,1615.0,1,0,semifurnished,0.0,2.0,1.6
4,0.0,5.0,2,2,1.0,3.0,582.0,0,1,unfurnished,0.0,2.0,0.48


In [7]:
X = df.drop(columns=['price'])
y = df['price']

In [8]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [104]:
columns_to_encode = ['property_type', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [10]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [11]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [12]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [13]:
scores.mean(),scores.std()

(0.6100832720576961, 0.0342502732752871)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [15]:
pipeline.fit(X_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [16]:
X_train[X_train['sector']==78]

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
5590,0.0,78.0,2,2,2.0,3.0,1000.0,0,0,unfurnished,1.0,0.0
5033,0.0,78.0,2,2,2.0,3.0,1226.0,0,0,unfurnished,1.0,0.0
1577,0.0,78.0,3,3,3.0,4.0,1622.0,1,0,unfurnished,2.0,2.0
719,0.0,78.0,2,2,2.0,1.0,1444.0,0,0,unfurnished,1.0,0.0
5837,0.0,78.0,3,3,3.0,4.0,980.0,0,0,unfurnished,1.0,2.0


In [17]:
y_pred = pipeline.predict(X_test)

In [18]:
y_pred = np.expm1(y_pred)

In [19]:
mean_absolute_error(np.expm1(y_test),y_pred)

1.2561281731363703

In [20]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [21]:
import lightgbm as lgb
from sklearn.linear_model import Ridge,Lasso

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [22]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor(),
    'lightgmb':lgb.LGBMRegressor()
    
}

In [23]:
model_output = []
for model_name,model in model_dict.items():
    print("training ", model_name)
    model_output.append(scorer(model_name, model))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 421
[LightGBM] [Info] Number of data points in the train set: 5395, number of used features: 12
[LightGBM] [Info] Start training from score 1.137423
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 421
[LightGBM] [Info] Number of data points in the train set: 5395, number of used features: 12
[LightGBM] [Info] Start training from score 1.137706
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000586 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y

In [24]:
model_output

[['linear_reg', 0.6100832720576961, 1.2561281731363703],
 ['svr', 0.660080105915036, 1.1363064983005118],
 ['ridge', 0.6100858453791262, 1.2560434518525558],
 ['LASSO', 0.01457304924892382, 1.7794689283831295],
 ['decision tree', 0.6195497344743005, 1.1613510506431528],
 ['random forest', 0.8015090850372797, 0.8258841531414512],
 ['extra trees', 0.7767936420495476, 0.9023157459265818],
 ['gradient boosting', 0.7825743921315793, 0.883140993584859],
 ['adaboost', 0.5983032788865058, 1.2924645331468394],
 ['mlp', 0.7024151490026618, 1.0717160422274876],
 ['xgboost', 0.8077878846662279, 0.8028085732450079],
 ['lightgmb', 0.8108875354760856, 0.8209388534977639]]

In [25]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [26]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.807788,0.802809
11,lightgmb,0.810888,0.820939
5,random forest,0.801509,0.825884
7,gradient boosting,0.782574,0.883141
6,extra trees,0.776794,0.902316
9,mlp,0.702415,1.071716
1,svr,0.66008,1.136306
4,decision tree,0.61955,1.161351
2,ridge,0.610086,1.256043
0,linear_reg,0.610083,1.256128


### OneHotEncoding

In [101]:
df.info()
df['sector'] = df['sector'].astype(str)
df['property_type'] = df['property_type'].astype(str)
df['furnishing_type'] = df['furnishing_type'].astype(str)
df['balcony'] = df['balcony'].astype(str)
df['agePossession'] = df['agePossession'].astype(str)
df['floor_category'] = df['floor_category'].astype(str)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5995 entries, 0 to 5994
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   property_type    5995 non-null   float64
 1   sector           5995 non-null   float64
 2   bedRoom          5995 non-null   int64  
 3   bathroom         5995 non-null   int64  
 4   balcony          5995 non-null   float64
 5   agePossession    5995 non-null   float64
 6   built_up_area    5995 non-null   float64
 7   servant room     5995 non-null   int64  
 8   store room       5995 non-null   int64  
 9   furnishing_type  5995 non-null   object 
 10  luxury_category  5995 non-null   float64
 11  floor_category   5995 non-null   float64
 12  price            5995 non-null   float64
dtypes: float64(8), int64(4), object(1)
memory usage: 609.0+ KB


In [105]:
columns_to_encode

['property_type',
 'balcony',
 'agePossession',
 'furnishing_type',
 'luxury_category',
 'floor_category']

In [106]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore'),['sector'])
    ], 
    remainder='passthrough'
)

In [107]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [108]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [109]:
scores.mean()

0.7261980641338146

In [110]:
scores.std()

0.035892378315302495

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [112]:
pipeline.fit(X_train,y_train)

In [113]:
y_pred = pipeline.predict(X_test)

In [114]:
y_pred = np.expm1(y_pred)

In [115]:
mean_absolute_error(np.expm1(y_test),y_pred)

1.014313716533486

In [116]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [118]:
model_dict = {
   'linear_reg':LinearRegression(),
    
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'svr':SVR(),
    'decision tree': DecisionTreeRegressor(),
   
    
    'gradient boosting': GradientBoostingRegressor(),
    'xgboost':XGBRegressor(),
    'lightgmb':lgb.LGBMRegressor(),
   
     'random forest':RandomForestRegressor()
}

In [119]:
model_output = []
for model_name,model in model_dict.items():
    print("training ", model_name)
    model_output.append(scorer(model_name, model))

training  linear_reg
training  ridge
training  LASSO
training  svr
training  decision tree
training  gradient boosting
training  xgboost
training  lightgmb
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 485
[LightGBM] [Info] Number of data points in the train set: 5395, number of used features: 100
[LightGBM] [Info] Start training from score 1.137423
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000664 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 487
[LightGBM] [Info] Number of data points in the train set: 5395, number of used features: 101
[LightGBM] [Info] Start training from score 1.137706
[LightGBM] [Info] Auto-choosing ro

KeyboardInterrupt: 

In [120]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [121]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,xgboost,0.803891,0.786734
7,lightgmb,0.796555,0.825904
3,svr,0.796088,0.854341
5,gradient boosting,0.757501,0.919696
0,linear_reg,0.726198,1.014314
1,ridge,0.726347,1.016522
4,decision tree,0.636085,1.023628
2,LASSO,-0.000968,1.78686


### OneHotEncoding With PCA

In [122]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector'])
    ], 
    remainder='passthrough'
)

In [123]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [124]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [125]:
scores.mean()

0.6452133981050195

In [126]:
scores.std()

0.031438404610813604

In [127]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [128]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
   
    
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor(),
     
}

In [129]:
model_output = []
for model_name,model in model_dict.items():
     print("training ", model_name)
     model_output.append(scorer(model_name, model))

training  linear_reg
training  svr


KeyboardInterrupt: 

In [67]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [68]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
1,svr,0.748409,0.958863
0,linear_reg,0.643085,1.185201
2,ridge,0.643107,1.185249
4,decision tree,0.461908,1.408686
3,LASSO,-0.000968,1.78686


### Target Encoder

In [130]:
import category_encoders as ce

columns_to_encode = ['property_type', 'balcony',  'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [131]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,0.0,36.0,3,2,2.0,1.0,850.0,0,0,unfurnished,1.0,1.0
1,0.0,95.0,2,2,2.0,1.0,1226.0,1,0,unfurnished,1.0,2.0
2,0.0,103.0,2,2,1.0,1.0,1000.0,0,0,unfurnished,1.0,0.0
3,0.0,99.0,3,4,4.0,3.0,1615.0,1,0,semifurnished,0.0,2.0
4,0.0,5.0,2,2,1.0,3.0,582.0,0,1,unfurnished,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5990,0.0,15.0,4,4,4.0,3.0,3240.0,0,0,unfurnished,2.0,1.0
5991,0.0,113.0,3,3,3.0,3.0,2050.0,0,0,unfurnished,1.0,1.0
5992,0.0,54.0,4,4,4.0,3.0,3000.0,0,0,unfurnished,2.0,1.0
5993,0.0,82.0,2,2,2.0,3.0,726.0,0,0,unfurnished,1.0,1.0


In [132]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [133]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [134]:
scores.mean(),scores.std()

(0.6125183659916784, 0.03586355966293123)

In [135]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [136]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [None]:
model_df.sort_values(['mae'])

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [None]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [None]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [None]:
search.fit(X, y_transformed)

In [None]:
final_pipe = search.best_estimator_

In [None]:
search.best_params_

In [None]:
search.best_score_

In [None]:
final_pipe.fit(X,y_transformed)

### Exporting the model

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [None]:
pipeline.fit(X,y_transformed)

In [None]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [None]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [None]:
X

### Trying out the predictions

In [None]:
X.columns

In [None]:
X.iloc[0].values

In [None]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


In [None]:
np.expm1(pipeline.predict(one_df))

In [None]:
X.dtypes

In [None]:
sorted(X['sector'].unique().tolist())