###  Importing the Packages Which are Essential

In [138]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

### Loading the cleaned DataFrame

In [139]:
df = pd.read_csv('delhi_house_data_cleaned_df.csv')

In [140]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,price,Price_sqft,locality,Division,region,latitude,longitude,area,Status,neworold,parking,Bedrooms,Bathrooms,Balcony,Lift,Furnished_status,type_of_building
0,0,5600000,4148.148,Noida Extension,Noida,east,28.60885,77.46056,1350.0,Under Construction,New Property,0,3,3,0,2,Semi-Furnished,Flat
1,1,8800000,5906.04,Sector 79,Gurgaon,west,28.37423557,76.95241631,1490.0,Under Construction,New Property,0,3,3,0,2,Semi-Furnished,Flat


### Removing the unwanted columns for prediction

In [141]:
df.columns

Index(['Unnamed: 0', 'price', 'Price_sqft', 'locality', 'Division', 'region',
       'latitude', 'longitude', 'area', 'Status', 'neworold', 'parking',
       'Bedrooms', 'Bathrooms', 'Balcony', 'Lift', 'Furnished_status',
       'type_of_building'],
      dtype='object')

In [142]:
# Columns which want to be dropped
cols = ['Unnamed: 0','latitude', 'longitude','locality','Status']
df = df.drop(columns=cols)

In [143]:
df.head()

Unnamed: 0,price,Price_sqft,Division,region,area,neworold,parking,Bedrooms,Bathrooms,Balcony,Lift,Furnished_status,type_of_building
0,5600000,4148.148,Noida,east,1350.0,New Property,0,3,3,0,2,Semi-Furnished,Flat
1,8800000,5906.04,Gurgaon,west,1490.0,New Property,0,3,3,0,2,Semi-Furnished,Flat
2,12820000,6918.239,Ghaziabad,east,2050.0,New Property,1,4,5,0,0,Unfurnished,Flat
3,3810000,3628.571,Greater Noida,east,1050.0,New Property,1,2,2,3,2,Unfurnished,Flat
4,6200000,4592.593,Noida,east,1350.0,Resale,1,2,2,3,3,Semi-Furnished,Flat


In [144]:
df = df.dropna()

### Encoding

##### We have 2 options to do it. We are trying both approches based on best results we are going the use the data which we needed

In [145]:
df['Division'].value_counts()

Division
Gurgaon                3206
Ghaziabad              3162
Noida                  1886
Greater Noida          1534
Faridabad               820
New Delhi - West        800
New Delhi - South       620
New Delhi - Dwarka      360
New Delhi - East        350
New Delhi - Rohini      162
New Delhi - North       106
New Delhi - Central      22
New Delhi                14
Gurgaon - South          10
Gurgaon - North           2
Division                  1
Name: count, dtype: int64

In [146]:
# For our convinionace we maping huge forms to short forms
short_forms = {
    'New Delhi - West' : 'ND_West',
'New Delhi - South'    : 'ND_South',
'New Delhi - Dwarka'   : 'ND_Dwarka',
'New Delhi - East'     : 'ND_East',
'New Delhi - Rohini'   : 'ND_Rohini',
'New Delhi - North'    : 'ND_North',
'New Delhi - Central'  : 'ND_Central',
'Gurgaon - South'      : 'G_South',
'Gurgaon - North'      : 'G_North'
}
df['Division'] = df['Division'].replace(short_forms)

In [147]:
df['Division'].value_counts()

Division
Gurgaon          3206
Ghaziabad        3162
Noida            1886
Greater Noida    1534
Faridabad         820
ND_West           800
ND_South          620
ND_Dwarka         360
ND_East           350
ND_Rohini         162
ND_North          106
ND_Central         22
New Delhi          14
G_South            10
G_North             2
Division            1
Name: count, dtype: int64

In [148]:
df = df[df['Division'] != 'Division']

In [149]:
df['Division'].value_counts()

Division
Gurgaon          3206
Ghaziabad        3162
Noida            1886
Greater Noida    1534
Faridabad         820
ND_West           800
ND_South          620
ND_Dwarka         360
ND_East           350
ND_Rohini         162
ND_North          106
ND_Central         22
New Delhi          14
G_South            10
G_North             2
Name: count, dtype: int64

#### One hot encoding

In [150]:
df.head()

Unnamed: 0,price,Price_sqft,Division,region,area,neworold,parking,Bedrooms,Bathrooms,Balcony,Lift,Furnished_status,type_of_building
0,5600000,4148.148,Noida,east,1350.0,New Property,0,3,3,0,2,Semi-Furnished,Flat
1,8800000,5906.04,Gurgaon,west,1490.0,New Property,0,3,3,0,2,Semi-Furnished,Flat
2,12820000,6918.239,Ghaziabad,east,2050.0,New Property,1,4,5,0,0,Unfurnished,Flat
3,3810000,3628.571,Greater Noida,east,1050.0,New Property,1,2,2,3,2,Unfurnished,Flat
4,6200000,4592.593,Noida,east,1350.0,Resale,1,2,2,3,3,Semi-Furnished,Flat


In [151]:
df.columns

Index(['price', 'Price_sqft', 'Division', 'region', 'area', 'neworold',
       'parking', 'Bedrooms', 'Bathrooms', 'Balcony', 'Lift',
       'Furnished_status', 'type_of_building'],
      dtype='object')

In [152]:
x_one_encode = pd.get_dummies(df,columns=['Division', 'region','neworold','Furnished_status', 'type_of_building'],drop_first=True)

In [153]:
x_one_encode

Unnamed: 0,price,Price_sqft,area,parking,Bedrooms,Bathrooms,Balcony,Lift,Division_G_North,Division_G_South,Division_Ghaziabad,Division_Greater Noida,Division_Gurgaon,Division_ND_Central,Division_ND_Dwarka,Division_ND_East,Division_ND_North,Division_ND_Rohini,Division_ND_South,Division_ND_West,Division_New Delhi,Division_Noida,region_east,region_north,region_south,region_west,neworold_Resale,Furnished_status_Semi-Furnished,Furnished_status_Unfurnished,type_of_building_Individual House
0,5600000,4148.148,1350.0,0,3,3,0,2,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False
1,8800000,5906.04,1490.0,0,3,3,0,2,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False
2,12820000,6918.239,2050.0,1,4,5,0,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False
3,3810000,3628.571,1050.0,1,2,2,3,2,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False
4,6200000,4592.593,1350.0,1,2,2,3,3,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13051,6200000,5933.014,1045.0,1,2,2,4,2,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,True,False,False
13052,4510000,4254.717,1060.0,0,2,2,3,0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,True,False,False
13053,7000000,3688.093,1898.0,0,4,3,5,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,True,False,False
13054,6500000,4642.857,1400.0,1,3,3,2,3,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,True,False,False


In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13054 entries, 0 to 13055
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             13054 non-null  object
 1   Price_sqft        13054 non-null  object
 2   Division          13054 non-null  object
 3   region            13054 non-null  object
 4   area              13054 non-null  object
 5   neworold          13054 non-null  object
 6   parking           13054 non-null  object
 7   Bedrooms          13054 non-null  object
 8   Bathrooms         13054 non-null  object
 9   Balcony           13054 non-null  object
 10  Lift              13054 non-null  object
 11  Furnished_status  13054 non-null  object
 12  type_of_building  13054 non-null  object
dtypes: object(13)
memory usage: 1.4+ MB


In [155]:
# Converting Every thing into numerics
x_one_encode = x_one_encode.astype(float)
x_one_encode # converted to float beacause all the values are objects

Unnamed: 0,price,Price_sqft,area,parking,Bedrooms,Bathrooms,Balcony,Lift,Division_G_North,Division_G_South,Division_Ghaziabad,Division_Greater Noida,Division_Gurgaon,Division_ND_Central,Division_ND_Dwarka,Division_ND_East,Division_ND_North,Division_ND_Rohini,Division_ND_South,Division_ND_West,Division_New Delhi,Division_Noida,region_east,region_north,region_south,region_west,neworold_Resale,Furnished_status_Semi-Furnished,Furnished_status_Unfurnished,type_of_building_Individual House
0,5600000.0,4148.148,1350.0,0.0,3.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,8800000.0,5906.040,1490.0,0.0,3.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,12820000.0,6918.239,2050.0,1.0,4.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3810000.0,3628.571,1050.0,1.0,2.0,2.0,3.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6200000.0,4592.593,1350.0,1.0,2.0,2.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13051,6200000.0,5933.014,1045.0,1.0,2.0,2.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
13052,4510000.0,4254.717,1060.0,0.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
13053,7000000.0,3688.093,1898.0,0.0,4.0,3.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
13054,6500000.0,4642.857,1400.0,1.0,3.0,3.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [157]:
x_one_encode = x_one_encode.astype(int)
x_one_encode

Unnamed: 0,price,Price_sqft,area,parking,Bedrooms,Bathrooms,Balcony,Lift,Division_G_North,Division_G_South,Division_Ghaziabad,Division_Greater Noida,Division_Gurgaon,Division_ND_Central,Division_ND_Dwarka,Division_ND_East,Division_ND_North,Division_ND_Rohini,Division_ND_South,Division_ND_West,Division_New Delhi,Division_Noida,region_east,region_north,region_south,region_west,neworold_Resale,Furnished_status_Semi-Furnished,Furnished_status_Unfurnished,type_of_building_Individual House
0,5600000,4148,1350,0,3,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0
1,8800000,5906,1490,0,3,3,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2,12820000,6918,2050,1,4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
3,3810000,3628,1050,1,2,2,3,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
4,6200000,4592,1350,1,2,2,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13051,6200000,5933,1045,1,2,2,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,0
13052,4510000,4254,1060,0,2,2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0
13053,7000000,3688,1898,0,4,3,5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0
13054,6500000,4642,1400,1,3,3,2,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0
