# IMPORT DATA AND DEPENDENCIES

In [None]:
# First run only: install the extra packages with `%pip install lazypredict optuna "dask[dataframe]"`.

In [2]:
# import the libraries

import pandas as pd
import numpy as np
import math

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

import optuna
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import mean_squared_error
from lazypredict.Supervised import LazyRegressor


# Set the global seaborn style and colour palette once, so that plots drawn
# afterwards share the same look.
sns.set_style('darkgrid')
sns.set_palette('husl')

In [3]:
# Read the house-rent CSV; the relative path is resolved against the
# notebook's working directory.
DATA_PATH = 'House_Rent_Dataset.csv'
data = pd.read_csv(DATA_PATH)

# DATA PREPROCESSING

In [4]:
# Preview the first five rows to see the raw columns and how they are formatted.
data.head()

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [7]:
# (number of rows, number of columns) of the raw dataset.
data.shape

(4746, 12)

In [8]:
# Count missing values per column in the raw dataset.
data.isnull().sum()

Posted On            0
BHK                  0
Rent                 0
Size                 0
Floor                0
Area Type            0
Area Locality        0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
dtype: int64

In [9]:
# Report how many distinct values each column holds (cardinality, not
# per-value frequencies).
for col_name, n_distinct in data.nunique().items():
    print(f'There are {n_distinct} different {col_name}')

There are 81 different Posted On
There are 6 different BHK
There are 243 different Rent
There are 615 different Size
There are 480 different Floor
There are 3 different Area Type
There are 2235 different Area Locality
There are 6 different City
There are 3 different Furnishing Status
There are 3 different Tenant Preferred
There are 8 different Bathroom
There are 3 different Point of Contact


In [10]:
# Split the free-text `Floor` column ("<floor> out of <total>") into two new
# columns. Values that do not match the pattern produce NaN in both columns.
pattern = r'(.*) out of (.*)'

floor_parts = data['Floor'].str.extract(pattern)
data['apartment_floor'] = floor_parts[0]
data['no_of_floors'] = floor_parts[1]

In [12]:
# List the distinct extracted floor values; besides numeric strings this shows
# text labels and NaN for rows where the pattern did not match.
data['apartment_floor'].unique()

array(['Ground', '1', '2', '4', '3', '5', '7', '8', 'Upper Basement',
       '11', 'Lower Basement', '6', '14', '43', '13', '18', '17', '9',
       '19', '60', '34', '12', '26', '25', '53', '16', '10', '39', '32',
       '47', '28', '20', '15', '65', '40', '37', '22', '21', '30', '35',
       '33', '44', '41', '46', '27', '45', '48', '50', '24', '23', '29',
       '49', '36', '76', nan], dtype=object)

In [13]:
# List the distinct extracted building heights (numeric strings, plus NaN for
# rows where the pattern did not match).
data['no_of_floors'].unique()

array(['2', '3', '1', '4', '5', '14', '8', '6', '19', '10', '7', '13',
       '78', '18', '12', '24', '31', '21', '23', '20', '9', '22', '58',
       '16', '66', '48', '40', '44', '42', '41', '60', '32', '30', '29',
       '89', '15', '11', '28', '17', '45', '35', '75', '38', '51', '43',
       '25', '27', '26', '76', '36', '37', '55', '68', '77', '50', '59',
       '62', '39', '52', '54', '33', '46', '85', '71', '81', '34', nan],
      dtype=object)

In [19]:
# Indicator columns for the three text floor labels, created in this order.
floor_flag_labels = {
    'is_upper_basement': 'Upper Basement',
    'is_ground': 'Ground',
    'is_lower_basement': 'Lower Basement',
}
for flag_col, floor_label in floor_flag_labels.items():
    # Equality against a label is False for NaN, so missing floors get 0.
    data[flag_col] = data['apartment_floor'].eq(floor_label).astype('int64')

# Replace the text labels with 0 so the column holds only numeric-looking
# values; the indicator columns above keep the distinction.
# NOTE: this overwrites `apartment_floor`, so re-running this cell alone would
# recompute every flag as 0. Re-run from the extraction cell instead.
data['apartment_floor'] = [
    0 if floor in ('Lower Basement', 'Upper Basement', 'Ground') else floor
    for floor in data['apartment_floor']
]

In [20]:
# Spot-check the new floor columns and indicator flags on the first four rows.
data.head(4)

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,apartment_floor,no_of_floors,is_upper_basement,is_ground,is_lower_basement
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,0,2,0,1,0
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3,0,0,0
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3,0,0,0
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner,1,2,0,0,0


In [22]:
# Remove the columns that are not used as model inputs in this notebook.
# Rebinding `data` (instead of `inplace=True`) together with `errors='ignore'`
# keeps the cell re-runnable: a second run no longer raises KeyError because
# the columns are already gone.
data = data.drop(columns=['Posted On', 'Area Locality', 'Floor'], errors='ignore')

In [23]:
# Re-check missing values: rows whose `Floor` text did not match the
# extraction pattern now show up as NaN in the two derived floor columns.
data.isnull().sum()


BHK                  0
Rent                 0
Size                 0
Area Type            0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
apartment_floor      4
no_of_floors         4
is_upper_basement    0
is_ground            0
is_lower_basement    0
dtype: int64

In [24]:
# Discard every row that contains a missing value, then renumber the index
# from 0 so later positional concatenation lines up.
data = data.dropna().reset_index(drop=True)

In [28]:
# Columns treated as categorical; every other column (including the target
# `Rent` at this point) is treated as numeric.
cat_cols = ['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred', 'Point of Contact']
is_categorical = data.columns.isin(cat_cols)
num_cols = data.columns[~is_categorical].tolist()
num_cols

['BHK',
 'Rent',
 'Size',
 'Bathroom',
 'apartment_floor',
 'no_of_floors',
 'is_upper_basement',
 'is_ground',
 'is_lower_basement']

In [30]:
# Exclude the target from the numeric feature list. Filtering (instead of
# `list.remove` plus `.drop(columns=['Rent'])`) makes the cell idempotent:
# the original raised on a second run because `Rent` was already gone.
num_cols = [col for col in num_cols if col != 'Rent']

# Standardise each numeric feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so statistics from the held-out rows influence the training
# features. Consider fitting on the training split only.
scaler = StandardScaler()
num_df = pd.DataFrame(
    data=scaler.fit_transform(data[num_cols]),
    columns=num_cols,
    index=data.index,  # keep row labels aligned with `data` for the later concat
)

In [31]:
# Take an explicit copy: assigning into `data[cat_cols]` directly writes to a
# derived frame and triggers pandas' SettingWithCopyWarning.
cat_df = data[cat_cols].copy()

# Fit one encoder per column and keep them all, so the integer codes of any
# column can be mapped back to labels (`encoders[col].inverse_transform`).
# After the loop `encoder` still refers to the encoder of the last column,
# as it did before.
encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    cat_df[col] = encoder.fit_transform(cat_df[col])
    encoders[col] = encoder

In [34]:
# Feature matrix: encoded categorical columns followed by the scaled numeric
# columns, joined side by side on the row index.
X = pd.concat(objs=[cat_df, num_df], axis='columns')
# Target vector as a plain NumPy array.
y = data['Rent'].to_numpy()

In [35]:
# Preview the assembled feature matrix (integer-coded categoricals plus
# standardised numeric columns).
X.head()

Unnamed: 0,Area Type,City,Furnishing Status,Tenant Preferred,Point of Contact,BHK,Size,Bathroom,apartment_floor,no_of_floors,is_upper_basement,is_ground,is_lower_basement
0,2,4,2,1,2,-0.1,0.21,0.04,-0.6,-0.53,-0.07,2.03,-0.05
1,2,4,1,1,2,-0.1,-0.26,-1.09,-0.42,-0.42,-0.07,-0.49,-0.05
2,2,4,1,1,2,-0.1,0.05,-1.09,-0.42,-0.42,-0.07,-0.49,-0.05
3,2,4,2,1,2,-0.1,-0.26,-1.09,-0.42,-0.53,-0.07,-0.49,-0.05
4,1,4,2,0,2,-0.1,-0.19,-1.09,-0.42,-0.53,-0.07,-0.49,-0.05


# DATA VISUALIZATION

# BASE MODEL TRAINING

In [36]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state =23)
# Fit LazyRegressor's collection of regressors on the training split and score
# each on the held-out split. With ignore_warnings=False, per-model failures
# are printed (see the recorded output) rather than hidden.
reg = LazyRegressor(verbose=0,ignore_warnings=False, custom_metric=None )
# NOTE(review): judging by the recorded output, `predictions` holds the
# per-model metrics table rather than predicted values — confirm against the
# lazypredict docs before relying on the name.
models,predictions = reg.fit(X_train, X_test, y_train, y_test)

 74%|███████▍  | 31/42 [00:06<00:01,  6.82it/s]

QuantileRegressor model failed to execute
Solver interior-point is not anymore available in SciPy >= 1.11.0.


100%|██████████| 42/42 [00:08<00:00,  5.10it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 380
[LightGBM] [Info] Number of data points in the train set: 3793, number of used features: 11
[LightGBM] [Info] Start training from score 35384.621144





In [37]:
# Display the second object returned by `reg.fit`; the recorded output shows it
# to be the per-model comparison table (Adjusted R-Squared, R-Squared, RMSE,
# Time Taken).
predictions

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,0.65,0.66,31167.21,0.81
BaggingRegressor,0.63,0.63,32308.18,0.1
LGBMRegressor,0.62,0.63,32444.61,0.06
HistGradientBoostingRegressor,0.62,0.63,32502.81,0.24
XGBRegressor,0.59,0.6,33801.14,0.19
ExtraTreesRegressor,0.58,0.59,34196.19,0.66
GradientBoostingRegressor,0.55,0.55,35512.27,0.3
GammaRegressor,0.52,0.52,36713.17,0.22
PoissonRegressor,0.49,0.49,37856.68,0.07
ElasticNet,0.45,0.46,39273.49,0.02


# HYPER-PARAMETER OPTIMIZATION

# MODEL RE-TRAINING