### Import Librairies

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)

from sklearn.neural_network import MLPRegressor



from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

## Reading Data

In [2]:
df = pd.read_csv(r"D:\campusx python\Project\real estate project\notebook\gurgaon_properties_post_feature_selection_v2.csv")
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [3]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [4]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [5]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [6]:
X = df.drop(columns=['price'])
y = df['price']

In [7]:
# Applying log1p traansformation bcz our price variable is right skewed
y_transformed = np.log1p(y)

## Encoding

### Ordinal Encoding

In [8]:
def data_dtype(df):
    num = []
    cat= []
    for i in df.columns:
        if df[i].dtype == 'int64' or df[i].dtype == 'float64':
            num.append(i)
        else:
            cat.append(i)
    return num,cat
            

In [9]:
col_to_encode = ['property_type','sector','balcony','agePossession','furnishing_type','luxury_category','floor_category']

In [10]:
num,cat = data_dtype(df)

In [11]:
num

['price', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

In [12]:
if 'price' in num:
    num.remove('price')

In [13]:
num

['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

In [14]:
cat

['property_type',
 'sector',
 'balcony',
 'agePossession',
 'furnishing_type',
 'luxury_category',
 'floor_category']

In [15]:
# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num),
        ('cat', OrdinalEncoder(), cat)
    ],
    remainder='passthrough'
)

In [16]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [17]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [18]:
scores.mean(),scores.std()

(0.7363096633436828, 0.03238005754429936)

In [19]:
x_train, x_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [20]:
pipeline.fit(x_train,y_train)

In [21]:
y_pred = pipeline.predict(x_test)

In [22]:
y_pred = np.expm1(y_pred)

In [23]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.9463822160089356

In [24]:
def score(model_name,model):
    output = []
    
    output.append(model_name)
    
    # Creating a pipeline
    pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    # Spliting the data
    
    x_train, x_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    # fitting the data
    pipeline.fit(x_train,y_train)
    
    y_pred = pipeline.predict(x_test)
    y_pred = np.expm1(y_pred)
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [25]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [26]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(score(model_name, model))

In [27]:
model_output

[['linear_reg', 0.7363096633436828, 0.9463822160089356],
 ['svr', 0.7642012011196353, 0.8472636473483922],
 ['ridge', 0.7363125343993554, 0.946338774185337],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.777598199382272, 0.7357860588466782],
 ['random forest', 0.8823600401612378, 0.5399752270168212],
 ['extra trees', 0.8682398929405309, 0.5542006685280579],
 ['gradient boosting', 0.8725298048769522, 0.5761539390688156],
 ['adaboost', 0.753375549993276, 0.8355110857611908],
 ['mlp', 0.8102988272433441, 0.747982243928904],
 ['xgboost', 0.8894876835260124, 0.5040475141482346]]

In [28]:
model_df = pd.DataFrame(model_output,columns=['name','r2','mae'])

In [29]:
model_df

Unnamed: 0,name,r2,mae
0,linear_reg,0.73631,0.946382
1,svr,0.764201,0.847264
2,ridge,0.736313,0.946339
3,LASSO,0.059434,1.528906
4,decision tree,0.777598,0.735786
5,random forest,0.88236,0.539975
6,extra trees,0.86824,0.554201
7,gradient boosting,0.87253,0.576154
8,adaboost,0.753376,0.835511
9,mlp,0.810299,0.747982


### OneHotEncoding

In [30]:
convert_to_ohe = ['sector','agePossession','furnishing_type']

In [31]:
# for  i in convert_to_ohe:
#     if i in cat:
#         cat.remove(i)

In [32]:
# cat

In [33]:
# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num),
        ('cat', OrdinalEncoder(), cat),
        ('cat1',OneHotEncoder(drop='first'),convert_to_ohe)
    ],
    remainder='passthrough'
)

In [34]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [35]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [36]:
scores.mean(),scores.std()

(0.8546094810971422, 0.015997422908695623)

In [37]:
x_train, x_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [38]:
pipeline.fit(x_train,y_train)

In [39]:
y_pred = pipeline.predict(x_test)
y_pred = np.expm1(y_pred)
mean_absolute_error(np.expm1(y_test),y_pred)

0.6497514315131458

In [40]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [41]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [42]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [43]:
model_df = pd.DataFrame(model_output,columns=['name','r2','mae'])
model_df.sort_values('mae')

Unnamed: 0,name,r2,mae
6,extra trees,0.894576,0.473696
10,xgboost,0.89585,0.493456
5,random forest,0.890587,0.505059
9,mlp,0.865276,0.559302
7,gradient boosting,0.876653,0.569176
0,linear_reg,0.854609,0.649751
2,ridge,0.854678,0.652894
4,decision tree,0.806642,0.694677
1,svr,0.769741,0.834124
8,adaboost,0.753787,0.84538


### Target Encoder

In [46]:
# pip install category_encoders

In [47]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [48]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [49]:
scores.mean(),scores.std()

(0.8546094810971422, 0.015997422908695623)

In [50]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [51]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [52]:
model_df = pd.DataFrame(model_output,columns=['name','r2','mae'])
model_df.sort_values('mae')

Unnamed: 0,name,r2,mae
10,xgboost,0.904798,0.447518
5,random forest,0.901446,0.454814
6,extra trees,0.901197,0.4611
7,gradient boosting,0.889281,0.51065
4,decision tree,0.832816,0.555256
9,mlp,0.85272,0.610803
8,adaboost,0.81603,0.695094
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


### Hyperperameter tuning

In [71]:
from sklearn.model_selection import RandomizedSearchCV

In [66]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [75]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBRegressor())
])

In [76]:
param_grid = {
    'xgb__n_estimators': [100, 200, 300],
    'xgb__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'xgb__max_depth': [3, 5, 7, 10],
    'xgb__subsample': [0.6, 0.8, 1.0],
    'xgb__colsample_bytree': [0.6, 0.8, 1.0],
    'xgb__gamma': [0, 0.1, 0.2, 0.3]
}

In [77]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [78]:
random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=50, 
                                   cv=kfold, scoring='r2', random_state=42, n_jobs=-1, verbose=2)

In [79]:
random_search.fit(X, y_transformed)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


In [81]:
# Get the best parameters and the best score
best_params = random_search.best_params_
best_score = random_search.best_score_

In [90]:
best_params

{'xgb__subsample': 0.8,
 'xgb__n_estimators': 300,
 'xgb__max_depth': 5,
 'xgb__learning_rate': 0.05,
 'xgb__gamma': 0,
 'xgb__colsample_bytree': 0.8}

In [83]:
print('Best R^2 score:', best_score)

Best R^2 score: 0.9072079235408491


In [87]:
best_model = random_search.best_estimator_

In [88]:
scores = cross_val_score(best_model, X, y_transformed, cv=kfold, scoring='r2')

In [89]:
scores.mean(),scores.std()

(0.9072079235408491, 0.013642303365022085)

In [91]:
best_model.fit(X,y_transformed)

### Exporting the model

In [92]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [93]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBRegressor())
])

In [94]:
pipeline.fit(X,y_transformed)

In [95]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [96]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

### Trying out the prediction

In [105]:
data = [['house', 'sector 49', 3, 3, '3+', 'New Property', 1750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 49,3,3,3+,New Property,1750,0,0,unfurnished,Low,Low Floor


In [106]:
np.expm1(pipeline.predict(one_df))

array([2.8327115], dtype=float32)