# **ML Model Building**

In [1]:
# importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pickle

In [2]:
# Load the cleaned training data from CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="Cleaned_House_Rent_Train.csv")
# Preview the first five rows as a quick sanity check
df.head(n=5)

Unnamed: 0,type,locality,latitude,longitude,lease_type,negotiable,furnishing,parking,property_size,property_age,...,SC,GP,PARK,RWH,STP,HK,PB,VP,type_encode,locality_encode
0,BHK2,Bellandur,12.929557,77.67228,2,1,2,4,1400,4.0,...,1,0,1,1,1,0,1,1,3,376
1,BHK3,Thiruvanmiyur,12.98287,80.262012,3,0,2,4,1350,6.0,...,1,0,1,0,0,0,1,1,4,1687
2,BHK1,Attiguppe,12.955991,77.531634,3,1,2,2,600,3.0,...,0,0,0,0,0,0,0,0,2,228
3,BHK3,Kodihalli,12.963903,77.649446,3,1,2,4,1500,15.0,...,0,0,1,0,0,0,0,1,4,1058
4,BHK1,"Seetharampalya,Hoodi",12.986196,77.718314,3,1,2,4,1080,0.0,...,0,0,0,0,0,0,1,0,2,1554


Use a random forest to estimate the feature importance of every input variable.

In [3]:
# Inputs (X): the 36 candidate columns that are screened for importance
candidate_features = [
    'type_encode', 'latitude', 'longitude', 'lease_type', 'negotiable',
    'furnishing', 'parking', 'property_size', 'property_age',
    'bathroom', 'facing', 'cup_board', 'floor', 'total_floor',
    'water_supply', 'building_type', 'balconies', 'LIFT', 'GYM',
    'INTERNET', 'AC', 'CLUB', 'INTERCOM', 'POOL', 'CPA', 'FS',
    'SERVANT', 'SECURITY', 'SC', 'GP', 'PARK', 'RWH', 'STP', 'HK',
    'PB', 'VP',
]
X = df[candidate_features]

# Output (Y): the rent column is the regression target
Y = df["rent"]

In [4]:
# Fit a random forest on all candidate features in order to rank them by
# importance. The metrics printed here are computed on the same rows the
# forest was fitted on, so they are training scores only.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fixed seed: without it the forest (and therefore the importance ranking
# used for feature selection) changes on every Restart & Run All.
rf = RandomForestRegressor(random_state=42)
rf.fit(X, Y)

pred_y_train = rf.predict(X)

print("Train MSE:", mean_squared_error(Y, pred_y_train))
print("Train r2_score:", r2_score(Y, pred_y_train))

Train MSE: 0.004568366478148225
Train r2_score: 0.974866326448634


In [5]:
# Tabulate each feature with its importance score, most important first
pd.DataFrame(
    {"features": X.columns, "importance": rf.feature_importances_}
).sort_values(by="importance", ascending=False)

Unnamed: 0,features,importance
7,property_size,0.602244
2,longitude,0.133652
1,latitude,0.049465
0,type_encode,0.04202
13,total_floor,0.019006
9,bathroom,0.018498
8,property_age,0.016893
17,LIFT,0.016596
5,furnishing,0.010495
6,parking,0.009969


Select the top 9 features by importance and use them as inputs for the ML models.

In [6]:
# Inputs (X): the nine columns kept after the feature-importance ranking
selected_features = [
    "type_encode",
    "latitude",
    "longitude",
    "property_size",
    "total_floor",
    "property_age",
    "bathroom",
    "LIFT",
    "furnishing",
]
X = df[selected_features]

# Output (Y): the rent column is the regression target
Y = df["rent"]

In [7]:
# Hold out 30% of the rows for evaluation and train on the remaining 70%;
# the fixed random_state keeps the split identical between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

# Linear regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Standardise the inputs: the scaler is fitted on the training split and
# then applied unchanged to the test split.
ss = StandardScaler()
S_X_train = ss.fit_transform(X_train)
S_X_test = ss.transform(X_test)

# Fit the linear model on the scaled training inputs
lr = LinearRegression()
lr.fit(S_X_train, Y_train)

# Predict on both splits so train and test scores can be compared
pred_y_train = lr.predict(S_X_train)
pred_y_test = lr.predict(S_X_test)

print("Train MSE:", mean_squared_error(Y_train, pred_y_train))
print("Test MSE:", mean_squared_error(Y_test, pred_y_test))
print("\n")

print("Train r2_score:", r2_score(Y_train, pred_y_train))
print("Test r2_score:", r2_score(Y_test, pred_y_test))

Train MSE: 0.05778182626371728
Test MSE: 0.08648056777050336


Train r2_score: 0.6812637982085156
Test r2_score: 0.5271121692954486


XGBRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor

In [9]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# train test split (train - 70% and test - 30%)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3, random_state=42)

# Estimator classes to compare; each is instantiated with default
# hyperparameters apart from the seed.
algorithms = [XGBRegressor, RandomForestRegressor, ExtraTreesRegressor,
              GradientBoostingRegressor, HistGradientBoostingRegressor]

for algorithm in algorithms:
    # Fixed seed so the scores (and therefore the choice of best model)
    # are reproducible from one run to the next.
    model = algorithm(random_state=42)
    model.fit(X_train, Y_train)

    pred_y_train = model.predict(X_train)
    pred_y_test = model.predict(X_test)

    print("Algorithm:", algorithm.__name__)

    # Train and test scores are printed side by side to expose overfitting
    print("Train MSE:", mean_squared_error(Y_train, pred_y_train),"----->", "Test MSE:", mean_squared_error(Y_test, pred_y_test))
    print("Train r2_score:", r2_score(Y_train, pred_y_train),"-->", "Test r2_score:", r2_score(Y_test, pred_y_test))
    print("\n")

Algorithm: XGBRegressor
Train MSE: 0.01657427783980261 -----> Test MSE: 0.0329062976333595
Train r2_score: 0.9085729422606935 --> Test r2_score: 0.8200637657045401


Algorithm: RandomForestRegressor
Train MSE: 0.004936784610449749 -----> Test MSE: 0.034204832394638986
Train r2_score: 0.9727677008923917 --> Test r2_score: 0.8129631961524822


Algorithm: ExtraTreesRegressor
Train MSE: 6.583539342362844e-05 -----> Test MSE: 0.037189719321830954
Train r2_score: 0.9996368386982523 --> Test r2_score: 0.7966414172802161


Algorithm: GradientBoostingRegressor
Train MSE: 0.03436234343655715 -----> Test MSE: 0.036586070449588184
Train r2_score: 0.8104503865690353 --> Test r2_score: 0.7999422536768928


Algorithm: HistGradientBoostingRegressor
Train MSE: 0.026171733170383307 -----> Test MSE: 0.03189032557119396
Train r2_score: 0.8556314439257129 --> Test r2_score: 0.8256192429281501




Selecting HistGradientBoostingRegressor as the best model, because it gives the highest test r2 score of all the models compared above.

In [17]:
# hyperparameter tuning for histgradientboosting model
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Fixed seed so the search results and the final refit are reproducible.
model = HistGradientBoostingRegressor(random_state=42)

# Define the hyperparameter grid to search (plain Python ints so that
# best_params_ prints cleanly regardless of the numpy version).
# NOTE(review): with the default max_leaf_nodes=31 a tree cannot be deeper
# than 30, so these max_depth values probably never constrain the trees --
# consider tuning max_leaf_nodes or smaller depths instead.
param_grid = {
    'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3],
    'max_depth': [20, 30, 40],
    'min_samples_leaf': [20, 30, 40],
}

# Use grid search (5-fold CV, scored by negative MSE) to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv = 5, scoring = 'neg_mean_squared_error', n_jobs = -1)
grid_search.fit(X_train, Y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# GridSearchCV refits the best configuration on the whole training split
# (refit=True is the default), so best_estimator_ is already trained and
# must not be fitted a second time.
best_model = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

print(f"Best Hyperparameters: {best_params}")
print("\n")
print(f"Mean Squared Error on Test Set: {mse}")
print("\n")
print(f"r2 score on Test Set: {r2}")

Best Hyperparameters: {'learning_rate': 0.15, 'max_depth': 40, 'min_samples_leaf': 20}


Mean Squared Error on Test Set: 0.03161312311178931


r2 score on Test Set: 0.8271350247167445


# Saving the best model

In [18]:
# Serialise the tuned estimator to disk so it can be reloaded for inference
with open("house_rent_regression_model.pkl", mode="wb") as file:
    pickle.dump(best_model, file)

# making the predictions on the test data

In [19]:
# Load the cleaned test data that the saved model will score
test_df = pd.read_csv(filepath_or_buffer="Cleaned_House_Rent_Test.csv")

In [20]:
# Reload the estimator from the pickle file saved earlier in this notebook.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# from a trusted source such as this notebook's own output.
with open("house_rent_regression_model.pkl", mode="rb") as file:
    model = pickle.load(file)

In [21]:
# Take the nine model input columns from the test data, in the same order
# as the training feature list.
columns = test_df.loc[
    :,
    [
        "type_encode",
        "latitude",
        "longitude",
        "property_size",
        "total_floor",
        "property_age",
        "bathroom",
        "LIFT",
        "furnishing",
    ],
]

In [22]:
# Raw model output for the test rows
# NOTE(review): presumably the rent target was log-transformed during
# cleaning, which is why the output is exponentiated -- confirm.
raw_model_output = model.predict(columns)
predictions = np.exp(raw_model_output)

In [23]:
# Predicted house rent price: one value per line (nothing is printed when
# there are no predictions)
if len(predictions) > 0:
    print(*predictions, sep="\n")

26480.069305759767
10789.6479405429
10920.451918272925
10090.415136742562
24605.567705140977
16557.13515379098
27301.606138441686
14226.964799057323
25652.424436983074
14738.099940778664
16725.740323747443
23952.17409188581
15082.525233591174
24665.506334489823
10281.74240099193
12809.125165418449
11751.14210325506
11474.559496325186
23393.317791404
21986.146891950506
28308.91805321887
13139.482997394298
12446.156320671218
16270.664899677113
10374.975096512213
14603.797563186972
17370.095599850298
14617.60377276024
10951.091394510873
25615.10918984167
22276.408400604712
13045.734583588679
23863.91481214721
11078.829990689637
17368.595417815624
39170.35178955836
12029.977822796145
11302.488727585136
22791.583363246194
9972.223423382587
16659.9906127431
21877.50798880105
20724.02939845829
11527.105990298269
34157.5968722341
17435.188727632518
33314.29712056961
24326.32902423421
16823.413060610113
17129.139915893527
14803.887200040881
16283.905089117996
10650.869626780994
22789.9973480026