###  Importing the Packages Which are Essential

In [3]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler

### Loading the cleaned DataFrame

In [4]:
df = pd.read_csv('cleaned_data.csv')

In [5]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,price,Price_sqft,locality,Division,region,latitude,longitude,area,Status,neworold,parking,Bedrooms,Bathrooms,Balcony,Lift,Furnished_status,type_of_building
0,0,5600000,4148.148,Noida Extension,Noida,east,28.60885,77.46056,1350.0,Under Construction,New Property,0,3,3,0,2,Semi-Furnished,Flat
1,1,8800000,5906.04,Sector 79,Gurgaon,west,28.374236,76.952416,1490.0,Under Construction,New Property,0,3,3,0,2,Semi-Furnished,Flat


### Removing the unwanted columns for prediction

In [6]:
df.columns

Index(['Unnamed: 0', 'price', 'Price_sqft', 'locality', 'Division', 'region',
       'latitude', 'longitude', 'area', 'Status', 'neworold', 'parking',
       'Bedrooms', 'Bathrooms', 'Balcony', 'Lift', 'Furnished_status',
       'type_of_building'],
      dtype='object')

In [7]:
# Columns which want to be dropped
cols = ['Unnamed: 0','latitude', 'longitude','locality','Status']
df = df.drop(columns=cols)

In [8]:
df.head()

Unnamed: 0,price,Price_sqft,Division,region,area,neworold,parking,Bedrooms,Bathrooms,Balcony,Lift,Furnished_status,type_of_building
0,5600000,4148.148,Noida,east,1350.0,New Property,0,3,3,0,2,Semi-Furnished,Flat
1,8800000,5906.04,Gurgaon,west,1490.0,New Property,0,3,3,0,2,Semi-Furnished,Flat
2,12820000,6918.239,Ghaziabad,east,2050.0,New Property,1,4,5,0,0,Unfurnished,Flat
3,3810000,3628.571,Greater Noida,east,1050.0,New Property,1,2,2,3,2,Unfurnished,Flat
4,6200000,4592.593,Noida,east,1350.0,Resale,1,2,2,3,3,Semi-Furnished,Flat


In [9]:
df = df.dropna()

### Encoding

##### We have 2 options to do it. We are trying both approches based on best results we are going the use the data which we needed

In [10]:
df['Division'].value_counts()

Division
Gurgaon                1603
Ghaziabad              1581
Noida                   943
Greater Noida           767
Faridabad               410
New Delhi - West        400
New Delhi - South       310
New Delhi - Dwarka      180
New Delhi - East        175
New Delhi - Rohini       81
New Delhi - North        53
New Delhi - Central      11
New Delhi                 7
Gurgaon - South           5
Gurgaon - North           1
Name: count, dtype: int64

In [11]:
# For our convinionace we maping huge forms to short forms
short_forms = {
    'New Delhi - West' : 'ND_West',
'New Delhi - South'    : 'ND_South',
'New Delhi - Dwarka'   : 'ND_Dwarka',
'New Delhi - East'     : 'ND_East',
'New Delhi - Rohini'   : 'ND_Rohini',
'New Delhi - North'    : 'ND_North',
'New Delhi - Central'  : 'ND_Central',
'Gurgaon - South'      : 'G_South',
'Gurgaon - North'      : 'G_North'
}
df['Division'] = df['Division'].replace(short_forms)

In [12]:
df['Division'].value_counts()

Division
Gurgaon          1603
Ghaziabad        1581
Noida             943
Greater Noida     767
Faridabad         410
ND_West           400
ND_South          310
ND_Dwarka         180
ND_East           175
ND_Rohini          81
ND_North           53
ND_Central         11
New Delhi           7
G_South             5
G_North             1
Name: count, dtype: int64

In [13]:
df = df[df['Division'] != 'Division']

In [14]:
df['Division'].value_counts()

Division
Gurgaon          1603
Ghaziabad        1581
Noida             943
Greater Noida     767
Faridabad         410
ND_West           400
ND_South          310
ND_Dwarka         180
ND_East           175
ND_Rohini          81
ND_North           53
ND_Central         11
New Delhi           7
G_South             5
G_North             1
Name: count, dtype: int64

#### One hot encoding

In [15]:
df.head()

Unnamed: 0,price,Price_sqft,Division,region,area,neworold,parking,Bedrooms,Bathrooms,Balcony,Lift,Furnished_status,type_of_building
0,5600000,4148.148,Noida,east,1350.0,New Property,0,3,3,0,2,Semi-Furnished,Flat
1,8800000,5906.04,Gurgaon,west,1490.0,New Property,0,3,3,0,2,Semi-Furnished,Flat
2,12820000,6918.239,Ghaziabad,east,2050.0,New Property,1,4,5,0,0,Unfurnished,Flat
3,3810000,3628.571,Greater Noida,east,1050.0,New Property,1,2,2,3,2,Unfurnished,Flat
4,6200000,4592.593,Noida,east,1350.0,Resale,1,2,2,3,3,Semi-Furnished,Flat


In [16]:
df.columns

Index(['price', 'Price_sqft', 'Division', 'region', 'area', 'neworold',
       'parking', 'Bedrooms', 'Bathrooms', 'Balcony', 'Lift',
       'Furnished_status', 'type_of_building'],
      dtype='object')

In [17]:
x_one_encode = pd.get_dummies(df,columns=['Division', 'region','neworold','Furnished_status', 'type_of_building'],drop_first=True)

In [18]:
x_one_encode

Unnamed: 0,price,Price_sqft,area,parking,Bedrooms,Bathrooms,Balcony,Lift,Division_G_North,Division_G_South,...,Division_New Delhi,Division_Noida,region_east,region_north,region_south,region_west,neworold_Resale,Furnished_status_Semi-Furnished,Furnished_status_Unfurnished,type_of_building_Individual House
0,5600000,4148.148,1350.0,0,3,3,0,2,False,False,...,False,True,True,False,False,False,False,True,False,False
1,8800000,5906.040,1490.0,0,3,3,0,2,False,False,...,False,False,False,False,False,True,False,True,False,False
2,12820000,6918.239,2050.0,1,4,5,0,0,False,False,...,False,False,True,False,False,False,False,False,True,False
3,3810000,3628.571,1050.0,1,2,2,3,2,False,False,...,False,False,True,False,False,False,False,False,True,False
4,6200000,4592.593,1350.0,1,2,2,3,3,False,False,...,False,True,True,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6522,6200000,5933.014,1045.0,1,2,2,4,2,False,False,...,False,True,True,False,False,False,True,True,False,False
6523,4510000,4254.717,1060.0,0,2,2,3,0,False,False,...,False,False,True,False,False,False,True,True,False,False
6524,7000000,3688.093,1898.0,0,4,3,5,0,False,False,...,False,False,True,False,False,False,True,True,False,False
6525,6500000,4642.857,1400.0,1,3,3,2,3,False,False,...,False,False,True,False,False,False,True,True,False,False


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6527 entries, 0 to 6526
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             6527 non-null   int64  
 1   Price_sqft        6527 non-null   float64
 2   Division          6527 non-null   object 
 3   region            6527 non-null   object 
 4   area              6527 non-null   float64
 5   neworold          6527 non-null   object 
 6   parking           6527 non-null   int64  
 7   Bedrooms          6527 non-null   int64  
 8   Bathrooms         6527 non-null   int64  
 9   Balcony           6527 non-null   int64  
 10  Lift              6527 non-null   int64  
 11  Furnished_status  6527 non-null   object 
 12  type_of_building  6527 non-null   object 
dtypes: float64(2), int64(6), object(5)
memory usage: 663.0+ KB


In [20]:
# Converting Every thing into numerics
x_one_encode = x_one_encode.astype(float)
x_one_encode # converted to float beacause all the values are objects

Unnamed: 0,price,Price_sqft,area,parking,Bedrooms,Bathrooms,Balcony,Lift,Division_G_North,Division_G_South,...,Division_New Delhi,Division_Noida,region_east,region_north,region_south,region_west,neworold_Resale,Furnished_status_Semi-Furnished,Furnished_status_Unfurnished,type_of_building_Individual House
0,5600000.0,4148.148,1350.0,0.0,3.0,3.0,0.0,2.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,8800000.0,5906.040,1490.0,0.0,3.0,3.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,12820000.0,6918.239,2050.0,1.0,4.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3810000.0,3628.571,1050.0,1.0,2.0,2.0,3.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6200000.0,4592.593,1350.0,1.0,2.0,2.0,3.0,3.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6522,6200000.0,5933.014,1045.0,1.0,2.0,2.0,4.0,2.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
6523,4510000.0,4254.717,1060.0,0.0,2.0,2.0,3.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
6524,7000000.0,3688.093,1898.0,0.0,4.0,3.0,5.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
6525,6500000.0,4642.857,1400.0,1.0,3.0,3.0,2.0,3.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [21]:
x_one_encode = x_one_encode.astype(int)
x_one_encode

Unnamed: 0,price,Price_sqft,area,parking,Bedrooms,Bathrooms,Balcony,Lift,Division_G_North,Division_G_South,...,Division_New Delhi,Division_Noida,region_east,region_north,region_south,region_west,neworold_Resale,Furnished_status_Semi-Furnished,Furnished_status_Unfurnished,type_of_building_Individual House
0,5600000,4148,1350,0,3,3,0,2,0,0,...,0,1,1,0,0,0,0,1,0,0
1,8800000,5906,1490,0,3,3,0,2,0,0,...,0,0,0,0,0,1,0,1,0,0
2,12820000,6918,2050,1,4,5,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,3810000,3628,1050,1,2,2,3,2,0,0,...,0,0,1,0,0,0,0,0,1,0
4,6200000,4592,1350,1,2,2,3,3,0,0,...,0,1,1,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6522,6200000,5933,1045,1,2,2,4,2,0,0,...,0,1,1,0,0,0,1,1,0,0
6523,4510000,4254,1060,0,2,2,3,0,0,0,...,0,0,1,0,0,0,1,1,0,0
6524,7000000,3688,1898,0,4,3,5,0,0,0,...,0,0,1,0,0,0,1,1,0,0
6525,6500000,4642,1400,1,3,3,2,3,0,0,...,0,0,1,0,0,0,1,1,0,0


In [22]:
y = x_one_encode['price']
x_one_encode = x_one_encode.drop(columns=['price', 'Price_sqft'])
x_one_encode

Unnamed: 0,area,parking,Bedrooms,Bathrooms,Balcony,Lift,Division_G_North,Division_G_South,Division_Ghaziabad,Division_Greater Noida,...,Division_New Delhi,Division_Noida,region_east,region_north,region_south,region_west,neworold_Resale,Furnished_status_Semi-Furnished,Furnished_status_Unfurnished,type_of_building_Individual House
0,1350,0,3,3,0,2,0,0,0,0,...,0,1,1,0,0,0,0,1,0,0
1,1490,0,3,3,0,2,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,2050,1,4,5,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
3,1050,1,2,2,3,2,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
4,1350,1,2,2,3,3,0,0,0,0,...,0,1,1,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6522,1045,1,2,2,4,2,0,0,0,0,...,0,1,1,0,0,0,1,1,0,0
6523,1060,0,2,2,3,0,0,0,0,1,...,0,0,1,0,0,0,1,1,0,0
6524,1898,0,4,3,5,0,0,0,1,0,...,0,0,1,0,0,0,1,1,0,0
6525,1400,1,3,3,2,3,0,0,1,0,...,0,0,1,0,0,0,1,1,0,0


In [23]:
y

0        5600000
1        8800000
2       12820000
3        3810000
4        6200000
          ...   
6522     6200000
6523     4510000
6524     7000000
6525     6500000
6526     6500000
Name: price, Length: 6527, dtype: int64

#### Model Creation

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [25]:
models = {
    'xgb' : XGBRegressor(learning_rate= 0.05),
    'random forrest' : RandomForestRegressor(n_estimators= 1000),
    'linear reg' : LinearRegression(),
}

In [26]:
x_train, x_test, y_train, y_test = train_test_split(x_one_encode,y, test_size=0.33, random_state= 42)

In [27]:
result = []

for key, model in models.items():
    model.fit(x_train,y_train)
    pred = model.predict(x_test)
    
    r2 = r2_score(y_true= y_test, y_pred= pred)
    mae = mean_absolute_error(y_true= y_test, y_pred= pred)
    mse = mean_squared_error(y_true= y_test, y_pred= pred)
    result.append({key : [r2, mae, mse]})

In [28]:
for idx in result:
    print(idx)

{'xgb': [0.8629834055900574, 938588.625, 1640180219904.0]}
{'random forrest': [0.8561932090127927, 937579.9241002228, 1721463317593.9985]}
{'linear reg': [0.812008598753073, 1148699.3726691618, 2250382607442.1475]}
