In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


# Code For Neural Network
Back propagation and declaration of the NN

In [89]:
import pandas as pd
import numpy as np

# Dataset

In [130]:
# Load the raw housing data; the path is relative to the notebook's working directory.
df = pd.read_csv('Bengaluru_House_Data.csv')
# Preview the first rows to see the column layout.
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [131]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

# Preprocessing and Wrangling

In [132]:
# Remove the 'society' column entirely rather than dropping the rows where it is missing.
df = df.drop(columns=["society"])

In [133]:
# (rows, columns) after removing 'society'.
df.shape

(13320, 8)

In [134]:
# Re-check the per-column missing-value counts now that 'society' is gone.
df.isna().sum()

area_type         0
availability      0
location          1
size             16
total_sqft        0
bath             73
balcony         609
price             0
dtype: int64

In [135]:
# Discard every row that still has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [136]:
# (rows, columns) after removing rows with missing values.
df.shape

(12710, 8)

## NOTE:
            Here in the dataframe 'df' there are 8 columns, of which 1 is the 'price' column (the target); that means there are 7 feature columns.

In [137]:
# Column labels of the cleaned frame.
df.columns

Index(['area_type', 'availability', 'location', 'size', 'total_sqft', 'bath',
       'balcony', 'price'],
      dtype='object')

In [138]:
# Inspect column dtypes to see which columns are still stored as strings (object).
df.dtypes

area_type        object
availability     object
location         object
size             object
total_sqft       object
bath            float64
balcony         float64
price           float64
dtype: object

#  
We need to convert the non-numeric (object) columns to int or float so that the data can be used to train a model

### 1. One-hot encoding 'area_type'

In [139]:
# One-hot encode 'area_type': the source column is removed and the
# 'area_<category>' indicator columns are appended after the remaining columns.
df = pd.get_dummies(df, columns=['area_type'], prefix='area')
df.head()

Unnamed: 0,availability,location,size,total_sqft,bath,balcony,price,area_Built-up Area,area_Carpet Area,area_Plot Area,area_Super built-up Area
0,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,0,0,0,1
1,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,0,0,1,0
2,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,1,0,0,0
3,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,0,0,0,1
4,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0,0,0,0,1


### Changing 'availability' into a binary format, i.e., 0 or 1
        If the house is ready to move then 1, else 0

In [140]:
# Binary flag: 1 where the listing says 'Ready To Move', 0 for every other value.
is_ready = df['availability'] == 'Ready To Move'
df['availability'] = np.where(is_ready, 1, 0)
df.head()

Unnamed: 0,availability,location,size,total_sqft,bath,balcony,price,area_Built-up Area,area_Carpet Area,area_Plot Area,area_Super built-up Area
0,0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,0,0,0,1
1,1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,0,0,1,0
2,1,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,1,0,0,0
3,1,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,0,0,0,1
4,1,Kothanur,2 BHK,1200,2.0,1.0,51.0,0,0,0,1


### As the location column also contains many unique values, let's one-hot encode that column too

In [141]:
# One-hot encode 'location'; each distinct value becomes a 'location_<value>'
# indicator column and the original string column is removed.
df = pd.get_dummies(df, columns=['location'])
df.head()

Unnamed: 0,availability,size,total_sqft,bath,balcony,price,area_Built-up Area,area_Carpet Area,area_Plot Area,area_Super built-up Area,...,"location_ravindra nagar, T.dasarahalli peenya",location_rr nagar,location_sankeswari,location_sapthagiri Layout,location_sarjapura main road,location_singapura paradise,location_t.c palya,location_tc.palya,location_vinayakanagar,location_whitefiled
0,0,2 BHK,1056,2.0,1.0,39.07,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,4 Bedroom,2600,5.0,3.0,120.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,3 BHK,1440,2.0,3.0,62.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,3 BHK,1521,3.0,1.0,95.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,2 BHK,1200,2.0,1.0,51.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### The remaining two columns, 'size' and 'total_sqft', hold strings in various formats. Let's convert them based on our observations
            According to my observation, the 'total_sqft' column holds values in different units, such as sq. yards, acres, etc., due to discrepancies in human input.
            And the 'size' column consists of the number of bedrooms the house has, in the form of a string

In [142]:
# Check which columns are still non-numeric after the encoding steps.
df.dtypes

availability                     int64
size                            object
total_sqft                      object
bath                           float64
balcony                        float64
                                ...   
location_singapura paradise      uint8
location_t.c palya               uint8
location_tc.palya                uint8
location_vinayakanagar           uint8
location_whitefiled              uint8
Length: 1275, dtype: object

In [143]:
import re

# Pull the two string columns out as plain Python lists so they can be parsed
# element by element: `a` holds the 'size' strings, `b` the 'total_sqft' strings.
a = df['size'].tolist()
b = df['total_sqft'].tolist()

In [148]:
# (rows, columns) after the one-hot encoding steps.
df.shape

(12710, 1275)

In [144]:
# Square feet in one unit, keyed by the start of the unit text once it has been
# lower-cased and stripped of everything but letters ("Sq. Meter" -> "sqmeter").
SQFT_PER_UNIT = (
    ("acre", 43560.0),
    ("sqm", 10.763910),
    ("sqy", 9.0),
    ("cent", 435.6),      # 1/100 acre (was 435.54)
    ("gunt", 1089.0),     # guntha, 1/40 acre
    ("ground", 2400.0),   # customary South Indian "ground" - confirm against the data source
    ("perch", 272.25),    # 1/160 acre
)

# A leading non-negative number followed by whatever text trails it.
LEADING_NUMBER = re.compile(r"\s*(\d+(?:\.\d*)?|\.\d+)\s*(.*)$", re.DOTALL)


def to_sqft(raw):
    """Convert one 'total_sqft' entry to a float number of square feet.

    Accepted forms:
      * a bare number ("1056"), returned unchanged;
      * a range ("2100 - 2850"), for which the lower bound is returned
        (this is what the previous split-on-space logic produced);
      * a number followed by a unit listed in SQFT_PER_UNIT ("1.5Acres",
        "34.46Sq. Meter"), scaled to square feet.

    Raises:
        ValueError: if the entry has no leading number or an unknown unit.
    """
    match = LEADING_NUMBER.match(str(raw))
    if match is None:
        raise ValueError("total_sqft value has no leading number: %r" % (raw,))
    quantity = float(match.group(1))
    remainder = match.group(2).strip()
    if not remainder or remainder.startswith("-"):
        return quantity
    unit = re.sub(r"[^a-z]", "", remainder.lower())
    for prefix, factor in SQFT_PER_UNIT:
        if unit.startswith(prefix):
            return quantity * factor
    raise ValueError("unsupported total_sqft unit in %r" % (raw,))


# 'size' looks like "2 BHK" / "4 Bedroom": keep the leading count as a float.
# str() lets the cell be re-run after the values have already been converted.
a = [float(str(size_text).split()[0]) for size_text in a]
b = [to_sqft(sqft_text) for sqft_text in b]

In [151]:
# Write the parsed numeric lists back over the original string columns.
for column_name, parsed_values in (('size', a), ('total_sqft', b)):
    df[column_name] = parsed_values

In [152]:
# Confirm that 'size' and 'total_sqft' are now numeric.
df.dtypes

availability                     int64
size                           float64
total_sqft                     float64
bath                           float64
balcony                        float64
                                ...   
location_singapura paradise      uint8
location_t.c palya               uint8
location_tc.palya                uint8
location_vinayakanagar           uint8
location_whitefiled              uint8
Length: 1275, dtype: object

In [153]:
# Preview the frame after the string columns have been converted.
df.head()

Unnamed: 0,availability,size,total_sqft,bath,balcony,price,area_Built-up Area,area_Carpet Area,area_Plot Area,area_Super built-up Area,...,"location_ravindra nagar, T.dasarahalli peenya",location_rr nagar,location_sankeswari,location_sapthagiri Layout,location_sarjapura main road,location_singapura paradise,location_t.c palya,location_tc.palya,location_vinayakanagar,location_whitefiled
0,0,2.0,1056.0,2.0,1.0,39.07,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,4.0,2600.0,5.0,3.0,120.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,3.0,1440.0,2.0,3.0,62.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,3.0,1521.0,3.0,1.0,95.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,2.0,1200.0,2.0,1.0,51.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


#  
Here we can see that every column in our DataFrame is now of an int or float type

In [156]:
# (rows, columns) of the fully numeric frame, target column included.
df.shape

(12710, 1275)

In [164]:
# Feature matrix: every column except the 'price' target. `df` itself keeps 'price'.
X = df.drop(columns=['price'])
X.head()

Unnamed: 0,availability,size,total_sqft,bath,balcony,area_Built-up Area,area_Carpet Area,area_Plot Area,area_Super built-up Area,location_ Anekal,...,"location_ravindra nagar, T.dasarahalli peenya",location_rr nagar,location_sankeswari,location_sapthagiri Layout,location_sarjapura main road,location_singapura paradise,location_t.c palya,location_tc.palya,location_vinayakanagar,location_whitefiled
0,0,2.0,1056.0,2.0,1.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,4.0,2600.0,5.0,3.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,3.0,1440.0,2.0,3.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,3.0,1521.0,3.0,1.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2.0,1200.0,2.0,1.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
# Target: the 'price' column, kept as a single-column DataFrame.
Y = df.loc[:, ['price']]
Y.head()

Unnamed: 0,price
0,39.07
1,120.0
2,62.0
3,95.0
4,51.0


In [None]:
import pandas as pd
import numpy as np
import re
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split as tts


# Square feet in one unit, keyed by the start of the unit text once it has been
# lower-cased and stripped of everything but letters ("Sq. Meter" -> "sqmeter").
SQFT_PER_UNIT = (
    ("acre", 43560.0),
    ("sqm", 10.763910),
    ("sqy", 9.0),
    ("cent", 435.6),      # 1/100 acre (was 435.54)
    ("gunt", 1089.0),     # guntha, 1/40 acre
    ("ground", 2400.0),   # customary South Indian "ground" - confirm against the data source
    ("perch", 272.25),    # 1/160 acre
)

# A leading non-negative number followed by whatever text trails it.
LEADING_NUMBER = re.compile(r"\s*(\d+(?:\.\d*)?|\.\d+)\s*(.*)$", re.DOTALL)


def to_sqft(raw):
    """Convert one 'total_sqft' entry to a float number of square feet.

    Accepted forms:
      * a bare number ("1056"), returned unchanged;
      * a range ("2100 - 2850"), for which the lower bound is returned
        (this is what the previous split-on-space logic produced);
      * a number followed by a unit listed in SQFT_PER_UNIT ("1.5Acres",
        "34.46Sq. Meter"), scaled to square feet.

    Raises:
        ValueError: if the entry has no leading number or an unknown unit.
    """
    match = LEADING_NUMBER.match(str(raw))
    if match is None:
        raise ValueError("total_sqft value has no leading number: %r" % (raw,))
    quantity = float(match.group(1))
    remainder = match.group(2).strip()
    if not remainder or remainder.startswith("-"):
        return quantity
    unit = re.sub(r"[^a-z]", "", remainder.lower())
    for prefix, factor in SQFT_PER_UNIT:
        if unit.startswith(prefix):
            return quantity * factor
    raise ValueError("unsupported total_sqft unit in %r" % (raw,))


df = pd.read_csv('Bengaluru_House_Data.csv')

# Drop the 'society' column, then every row that still has a missing value.
df = df.drop(["society"], axis = 1)
df = df.dropna()

# One-hot encode 'area_type' into 'area_<category>' indicator columns.
df = pd.concat([df, pd.get_dummies(df[['area_type']], prefix = 'area')], axis = 1)
df.drop(['area_type'], axis = 1, inplace = True)

# 1 where the listing is 'Ready To Move', 0 otherwise.
df[['availability']] = np.where(df[['availability']] == 'Ready To Move', 1, 0)

# One-hot encode 'location' into 'location_<value>' indicator columns.
df = pd.concat([df, pd.get_dummies(df[['location']])], axis = 1)
df.drop(['location'], axis = 1, inplace = True)

# 'size' looks like "2 BHK" / "4 Bedroom": keep the leading count as a float.
# 'total_sqft' is normalised to square feet by to_sqft().
a = [float(str(size_text).split()[0]) for size_text in df['size']]
b = [to_sqft(sqft_text) for sqft_text in df['total_sqft']]

df['size'] = a
df['total_sqft'] = b


# Split into features (everything but 'price') and target ('price').
X = df.drop(columns=['price'])
Y = df[['price']]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = tts(X, Y, test_size = 0.2, random_state = 0)

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.1,
                max_depth = 14, alpha = 10)
xg_reg.fit(xtrain,ytrain)

# R^2 on the held-out rows.
preds = xg_reg.predict(xtest)

print(r2_score(ytest, preds))

# R^2 on the training rows, for comparison with the held-out score.
train_pred = xg_reg.predict(xtrain)
print(r2_score(ytrain, train_pred))

In [166]:
# Verify that no missing values remain in any column.
df.isna().sum()

availability                   0
size                           0
total_sqft                     0
bath                           0
balcony                        0
                              ..
location_singapura paradise    0
location_t.c palya             0
location_tc.palya              0
location_vinayakanagar         0
location_whitefiled            0
Length: 1275, dtype: int64

In [167]:
import xgboost as xgb

In [180]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [169]:
from sklearn.model_selection import train_test_split as tts

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = tts(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [226]:
# XGBoost regressor with a squared-error objective, learning rate 0.1,
# a maximum tree depth of 14 and alpha set to 10.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=14,
    alpha=10,
)

In [227]:
# Fit on the training split (fit returns the estimator itself), then predict
# prices for the held-out rows.
preds = xg_reg.fit(xtrain, ytrain).predict(xtest)

In [228]:
# R^2 of the predictions on the held-out test rows.
print(r2_score(ytest, preds))

0.7125311198844106


In [229]:
# R^2 on the training rows, for comparison with the held-out score above;
# a much higher value here than on the test rows points to overfitting.
train_pred = xg_reg.predict(xtrain)
print(r2_score(ytrain, train_pred))

0.9583774438740239
