In [48]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

In [2]:
# Load the rent listings; the relative path is resolved against the
# notebook's current working directory.
df_house = pd.read_csv('House_Rent_Dataset.csv')
# Preview the first rows (a cell's last expression is rendered as its output).
df_house.head()

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [3]:
# Summarise column dtypes and non-null counts.
df_house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4746 non-null   object
 1   BHK                4746 non-null   int64 
 2   Rent               4746 non-null   int64 
 3   Size               4746 non-null   int64 
 4   Floor              4746 non-null   object
 5   Area Type          4746 non-null   object
 6   Area Locality      4746 non-null   object
 7   City               4746 non-null   object
 8   Furnishing Status  4746 non-null   object
 9   Tenant Preferred   4746 non-null   object
 10  Bathroom           4746 non-null   int64 
 11  Point of Contact   4746 non-null   object
dtypes: int64(4), object(8)
memory usage: 445.1+ KB


In [7]:
# Total number of missing values in the whole frame: the first sum() counts
# per column, the second adds those per-column counts together.
df_house.isna().sum().sum()

0

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df_house.describe()

Unnamed: 0,BHK,Rent,Size,Bathroom
count,4746.0,4746.0,4746.0,4746.0
mean,2.08386,34993.45,967.490729,1.965866
std,0.832256,78106.41,634.202328,0.884532
min,1.0,1200.0,10.0,1.0
25%,2.0,10000.0,550.0,1.0
50%,2.0,16000.0,850.0,2.0
75%,3.0,33000.0,1200.0,2.0
max,6.0,3500000.0,8000.0,10.0


In [13]:
# Names of every object-dtype column, collected as a plain Python list.
categorical_columns = df_house.select_dtypes(include='object').columns.tolist()

In [25]:
def data_encoding(x):
    """Label-encode a sequence of categorical values.

    A fresh ``LabelEncoder`` is fitted on ``x`` and the result of its
    ``fit_transform`` call is returned. The fitted encoder itself is
    discarded, so the mapping back to the original labels is not kept.
    """
    return LabelEncoder().fit_transform(x)

In [29]:
# Replace every categorical column with integer label codes. The
# `data_encoding` helper is reused here instead of repeating its body in an
# inline lambda, so the encoding logic lives in exactly one place.
#
# Gotcha: this overwrites `df_house` in place, so `categorical_columns` must
# already hold the object-dtype column names before this cell runs; once the
# columns are integers, select_dtypes(include='object') no longer finds them.
#
# NOTE(review): label codes give nominal features (e.g. 'City') an arbitrary
# numeric order that the regressors will treat as meaningful - consider
# one-hot encoding if that turns out to hurt the linear/SVR models.
df_house[categorical_columns] = df_house[categorical_columns].apply(data_encoding)

In [30]:
# Display the frame to inspect the result of the encoding step.
df_house

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,0,2,10000,1100,0,0,0,0,0,0,2,0
1,0,2,20000,800,0,0,0,0,0,0,1,0
2,0,2,17000,1000,0,0,0,0,0,0,1,0
3,0,2,10000,800,0,0,0,0,0,0,1,0
4,0,2,7500,850,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4741,0,2,15000,1000,0,0,0,0,0,0,2,0
4742,0,3,29000,2000,0,0,0,0,0,0,3,0
4743,0,3,35000,1750,0,0,0,0,0,0,3,0
4744,0,3,45000,1500,0,0,0,0,0,0,2,0


In [32]:
# List the available column names.
df_house.columns

Index(['Posted On', 'BHK', 'Rent', 'Size', 'Floor', 'Area Type',
       'Area Locality', 'City', 'Furnishing Status', 'Tenant Preferred',
       'Bathroom', 'Point of Contact'],
      dtype='object')

In [33]:
# Feature matrix: the explicitly listed predictor columns, in this order.
# 'Posted On' is not among them, and 'Rent' is kept aside as the target.
x = df_house.loc[:, [
    'BHK',
    'Size',
    'Floor',
    'Area Type',
    'Area Locality',
    'City',
    'Furnishing Status',
    'Tenant Preferred',
    'Bathroom',
    'Point of Contact',
]]

# Target vector: the rent to be predicted.
y = df_house.loc[:, 'Rent']

In [37]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Sanity-check the split: one shape per line, in the order
# x_train, y_train, x_test, y_test.
print(
    *(part.shape for part in (x_train, y_train, x_test, y_test)),
    sep='\n',
)

(3796, 10)
(3796,)
(950, 10)
(950,)


In [42]:
# Linear Regression: fit on the training split, then predict on the held-out
# test split.
model = LinearRegression()
model.fit(x_train, y_train)
# `pred` is a notebook-global that the later model cells overwrite.
pred = model.predict(x_test)
# Report the mean absolute error, rounded to two decimals.
print('MAE of Linear Regression:', round(mean_absolute_error(y_test, pred), 2))

MAE of Linear Regression: 26191.96


In [45]:
# Decision Tree: fit on the training split, then predict on the held-out test
# split.
# random_state is pinned (same seed as the train/test split) because the tree
# permutes features at each split; without a seed, ties can resolve
# differently and the reported error changes from run to run.
model2 = DecisionTreeRegressor(random_state=42)
model2.fit(x_train, y_train)
# `pred` is a notebook-global shared with the other model cells.
pred = model2.predict(x_test)
# Report the mean absolute error, rounded to two decimals.
print('MAE of Decision Tree:', round(mean_absolute_error(y_test, pred), 2))

MAE of Decision Tree: 20592.75


In [46]:
# Support Vector Regression: fit on the training split, then predict on the
# held-out test split.
model3 = SVR()
model3.fit(x_train, y_train)
# `pred` is a notebook-global shared with the other model cells.
pred = model3.predict(x_test)
# Report both error metrics, rounded to two decimals. The RMSE line used to
# call round() with no arguments, which raises TypeError before printing.
print('MAE of SVR:', round(mean_absolute_error(y_test, pred), 2))
print('RMSE of SVR:', round(root_mean_squared_error(y_test, pred), 2))

MAE of SVR: 24160.5


In [51]:
# Random Forest: fit on the training split, then predict on the held-out test
# split.
# random_state is pinned (same seed as the train/test split) because the
# forest bootstraps rows and samples features randomly; without a seed the
# reported errors change from run to run.
model4 = RandomForestRegressor(random_state=42)
model4.fit(x_train, y_train)
# `pred` is a notebook-global shared with the other model cells.
pred = model4.predict(x_test)
# Report both error metrics, rounded to two decimals. The labels now name the
# model actually evaluated here (they previously read 'MAE of SVR' and
# 'MAE of RMSE').
print('MAE of Random Forest:', round(mean_absolute_error(y_test, pred), 2))
print('RMSE of Random Forest:', round(root_mean_squared_error(y_test, pred), 2))

MAE of SVR: 20118.77
MAE of RMSE: 54763.71
