In [1]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [2]:
def evaluation_metrics(y_test,y_pred):
    r2 = r2_score(y_test,y_pred)
    return r2

In [4]:
import pandas as pd

df = pd.read_csv(r'/content/analysis_df.csv')
df.head()

Unnamed: 0,type,latitude,longitude,lease_type,gym,lift,swimming_pool,negotiable,furnishing,parking,...,property_age,bathroom,facing,cup_board,floor,total_floor,water_supply,building_type,balconies,rent
0,BHK2,12.929557,77.67228,ANYONE,0,1,0,1,SEMI_FURNISHED,BOTH,...,4.0,2.0,NE,2.0,3.0,4.0,CORPORATION,AP,2.0,22000.0
1,BHK3,12.98287,80.262012,FAMILY,0,1,0,0,SEMI_FURNISHED,BOTH,...,6.0,3.0,E,3.0,1.0,5.0,CORPORATION,AP,3.0,28000.0
2,BHK1,12.955991,77.531634,FAMILY,0,0,0,1,SEMI_FURNISHED,TWO_WHEELER,...,3.0,1.0,E,1.0,1.0,2.0,CORPORATION,IH,0.0,8000.0
3,BHK3,12.963903,77.649446,FAMILY,0,0,0,1,SEMI_FURNISHED,BOTH,...,15.0,3.0,E,4.0,0.0,0.0,CORPORATION,IH,1.0,45000.0
4,BHK3,12.967144,77.750662,ANYONE,1,1,1,1,FULLY_FURNISHED,BOTH,...,5.0,3.0,NE,5.0,3.0,9.0,CORPORATION,AP,3.0,32000.0


In [5]:
x = df.drop(['latitude','longitude','rent'], axis = 1)
y = df['rent']

print(x.shape)
print(y.shape)

(17012, 18)
(17012,)


In [6]:
x.describe()

Unnamed: 0,gym,lift,swimming_pool,negotiable,property_size,property_age,bathroom,cup_board,floor,total_floor,balconies
count,17012.0,17012.0,17012.0,17012.0,17012.0,17012.0,17012.0,17012.0,17012.0,17012.0,17012.0
mean,0.199095,0.328004,0.147602,0.71573,1057.368857,6.889842,1.845815,2.214966,1.711028,3.469845,1.130966
std,0.399331,0.4695,0.354715,0.451079,613.524439,5.607976,0.713688,1.582874,1.955555,2.767769,0.997033
min,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,700.0,3.0,1.0,1.0,0.0,2.0,0.0
50%,0.0,0.0,0.0,1.0,1005.0,5.0,2.0,2.0,1.0,3.0,1.0
75%,0.0,1.0,0.0,1.0,1270.0,10.0,2.0,3.0,2.0,4.0,2.0
max,1.0,1.0,1.0,1.0,50000.0,100.0,21.0,40.0,25.0,26.0,13.0


In [7]:
import numpy as np

x['property_size'] = np.log(x['property_size'])

In [8]:
x['property_age'] = np.log(x['property_age'])

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=42)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(13609, 18)
(3403, 18)
(13609,)
(3403,)


In [10]:
x_train.sample()

Unnamed: 0,type,lease_type,gym,lift,swimming_pool,negotiable,furnishing,parking,property_size,property_age,bathroom,facing,cup_board,floor,total_floor,water_supply,building_type,balconies
76,BHK2,BACHELOR,0,0,0,1,SEMI_FURNISHED,TWO_WHEELER,6.684612,1.609438,2.0,E,4.0,1.0,2.0,CORPORATION,IF,1.0


In [11]:
# Encoding Categorical Columns

transformer = ColumnTransformer([('trans1', OneHotEncoder(sparse_output = False, handle_unknown = 'error', drop = 'first'), [0,1,2,3,4,5,6,7,11,15,16])], remainder = 'passthrough')

x_train_transformed = transformer.fit_transform(x_train)
x_test_transformed  = transformer.transform(x_test)

In [12]:
# Hyperparamater Tuning

from sklearn.model_selection import GridSearchCV

# Defining a Base model
base_model = RandomForestRegressor(random_state = 42)

# Make a dictionary of hyperparameters values to search
search_space = {"n_estimators":[100,120], "bootstrap" : [True, False]}

# making an GridSearchCV Object
GS = GridSearchCV(estimator = base_model,
                  param_grid = search_space,
                  scoring = 'r2',
                  refit = 'r2', # this will return the model that is tested with the r2 metrices
                  cv = 3,
                  verbose = 4)

GS.fit(x_train_transformed,y_train)

best_params = GS.best_params_ # To get only the best hyperparameter values that we searched for
n_est = best_params.values()
print('n_estimator',n_est)

# Training data metrics

best_score = GS.best_score_ # score according to the metric we passes in refit
print('R2 Score :',best_score)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END ..bootstrap=True, n_estimators=100;, score=0.672 total time=   7.1s
[CV 2/3] END ..bootstrap=True, n_estimators=100;, score=0.667 total time=   4.5s
[CV 3/3] END ..bootstrap=True, n_estimators=100;, score=0.670 total time=   5.2s
[CV 1/3] END ..bootstrap=True, n_estimators=120;, score=0.673 total time=   6.6s
[CV 2/3] END ..bootstrap=True, n_estimators=120;, score=0.667 total time=   6.1s
[CV 3/3] END ..bootstrap=True, n_estimators=120;, score=0.669 total time=   5.3s
[CV 1/3] END .bootstrap=False, n_estimators=100;, score=0.408 total time=   7.9s
[CV 2/3] END .bootstrap=False, n_estimators=100;, score=0.418 total time=   6.7s
[CV 3/3] END .bootstrap=False, n_estimators=100;, score=0.393 total time=   8.1s
[CV 1/3] END .bootstrap=False, n_estimators=120;, score=0.409 total time=  14.2s
[CV 2/3] END .bootstrap=False, n_estimators=120;, score=0.418 total time=   8.9s
[CV 3/3] END .bootstrap=False, n_estimators=120;,

In [13]:
final_model = RandomForestRegressor(n_estimators = 100, bootstrap = True, random_state = 42)
final_model.fit(x_train_transformed, y_train)

In [14]:
# Testing data metrics

y_pred = final_model.predict(x_test_transformed)

score = evaluation_metrics(y_test, y_pred)
print('The r2 score is',round(score, 1) * 100, '%')

The r2 score is 70.0 %


In [16]:
# Saving the transformer

pickle.dump(transformer, open(r'/content/transformer.pkl','wb'))
# User input prediction

In [17]:
# User input prediction

type = 'BHK2'
lease_type = 'BACHELOR'
gym	= 0
lift = 0
swimming_pool = 0
negotiable = 1
furnishing = 'SEMI_FURNISHED'
parking = 'BOTH'
property_size = 1000
log_property_size = np.log(property_size)
property_age = 10
log_property_age = np.log(property_age)
bathroom = 2
facing = 'E'
cup_board = 2
floor = 1
total_floor = 4
water_supply = 'CORPORATION'
building_type = 'IF'
balconies = 3

user_df = pd.DataFrame([[type,lease_type,gym,lift,swimming_pool,negotiable,furnishing,parking,property_size,property_age,bathroom,facing,cup_board,floor,total_floor,water_supply,building_type,balconies]], columns = ['type', 'lease_type', 'gym', 'lift', 'swimming_pool', 'negotiable', 'furnishing', 'parking', 'property_size', 'property_age', 'bathroom', 'facing', 'cup_board', 'floor', 'total_floor', 'water_supply', 'building_type', 'balconies'])
user_df

tranformed_df = transformer.transform(user_df)

y_pred = final_model.predict(tranformed_df)
print('The rental Price is', round(y_pred[0]))

The rental Price is 21030
