In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df1= pd.read_csv("Bengaluru_House_Data.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df1.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [5]:
# Drop the columns that are not used as features, then drop every row that
# still contains a missing value (only about 3 to 5 percent of the data
# points) and renumber the index.

df2 = (
    df1
    .drop(columns=['area_type', 'availability', 'society'])
    .dropna()
    .reset_index(drop=True)
)
df2.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0
5,Whitefield,2 BHK,1170,2.0,1.0,38.0
6,Marathahalli,3 BHK,1310,3.0,1.0,63.25
7,Whitefield,3 BHK,1800,2.0,2.0,70.0
8,Whitefield,4 Bedroom,2785,5.0,3.0,295.0
9,7th Phase JP Nagar,2 BHK,1000,2.0,1.0,38.0


In [6]:
# preprocessing size column

df2['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

In [7]:
# Keep only the leading number of each size label, e.g. "2 BHK" -> 2.0.

df2['size'] = df2['size'].str.split(' ').str[0].astype(float)
df2.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2.0,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4.0,2600,5.0,3.0,120.0
2,Uttarahalli,3.0,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3.0,1521,3.0,1.0,95.0
4,Kothanur,2.0,1200,2.0,1.0,51.0
5,Whitefield,2.0,1170,2.0,1.0,38.0
6,Marathahalli,3.0,1310,3.0,1.0,63.25
7,Whitefield,3.0,1800,2.0,2.0,70.0
8,Whitefield,4.0,2785,5.0,3.0,295.0
9,7th Phase JP Nagar,2.0,1000,2.0,1.0,38.0


In [8]:
# preprocessing total_sqft attribute
# analysing what types of values are there and displaying only range like values

def is_float(x):
    """Return True when ``float(x)`` succeeds and False when it does not.

    Parameters
    ----------
    x : object
        Value to probe, typically a raw ``total_sqft`` cell.

    Returns
    -------
    bool
        False for values ``float`` cannot convert (e.g. "2100 - 2850"),
        True otherwise.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a float"; anything else
        # (KeyboardInterrupt, SystemExit, ...) must propagate to the caller.
        return False

    return True

# ~ is the element-wise NOT operator: it inverts the boolean mask, so only the rows whose total_sqft is NOT a plain float are shown

df2[~df2['total_sqft'].apply(is_float)]

Unnamed: 0,location,size,total_sqft,bath,balcony,price
27,Yelahanka,4.0,2100 - 2850,4.0,0.0,186.000
114,Hebbal,4.0,3067 - 8156,4.0,0.0,477.000
129,8th Phase JP Nagar,2.0,1042 - 1105,2.0,0.0,54.005
153,Sarjapur,2.0,1145 - 1340,2.0,0.0,43.490
176,KR Puram,2.0,1015 - 1540,2.0,0.0,56.800
...,...,...,...,...,...,...
12384,Whitefield,2.0,850 - 1060,2.0,0.0,38.190
12399,Talaghattapura,3.0,1804 - 2273,3.0,0.0,122.000
12466,Harlur,2.0,1200 - 1470,2.0,0.0,72.760
12660,Hoodi,2.0,1133 - 1384,2.0,0.0,59.135


In [9]:
# converting range like values in string to numeric values
# range value is converted to mean value

# values with different units (eg: "50 sq meters") converted to none so that
# later it is removed from the dataframe

def convertSqftToNum(x):
    """Convert one raw ``total_sqft`` value to a float.

    Parameters
    ----------
    x : str or number
        A plain number ("1056"), a range written as "low - high", or any
        other text. Values that are already numeric pass through ``float``
        unchanged, so the conversion can be applied more than once.

    Returns
    -------
    float or None
        The number itself, the midpoint of a "low - high" range, or None
        when the value cannot be interpreted (e.g. "34.46Sq. Meter").
    """
    try:
        if isinstance(x, str):
            tokens = x.split('-')
            if len(tokens) == 2:
                # Range such as "2100 - 2850": use the mean of both ends.
                # Kept inside the try so a malformed end ("12 - abc")
                # yields None instead of raising.
                return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except (TypeError, ValueError):
        return None
    
# Convert total_sqft to numbers, then discard the rows that could not be
# converted (they come back as missing values) and renumber the index.
df2 = (
    df2
    .assign(total_sqft=df2['total_sqft'].map(convertSqftToNum))
    .dropna()
    .reset_index(drop=True)
)
df2.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2.0,1056.0,2.0,1.0,39.07
1,Chikka Tirupathi,4.0,2600.0,5.0,3.0,120.0
2,Uttarahalli,3.0,1440.0,2.0,3.0,62.0
3,Lingadheeranahalli,3.0,1521.0,3.0,1.0,95.0
4,Kothanur,2.0,1200.0,2.0,1.0,51.0
5,Whitefield,2.0,1170.0,2.0,1.0,38.0
6,Marathahalli,3.0,1310.0,3.0,1.0,63.25
7,Whitefield,3.0,1800.0,2.0,2.0,70.0
8,Whitefield,4.0,2785.0,5.0,3.0,295.0
9,7th Phase JP Nagar,2.0,1000.0,2.0,1.0,38.0


In [10]:
# Trim surrounding whitespace from the location names, then count the rows
# per location (ascending).

df2['location'] = df2['location'].str.strip()
location_count = df2.groupby("location")["location"].count().sort_values()
location_count

location
1 Annasandrapalya                         1
Kengeri Satellite Town ( BDA SITE)        1
Kengeri Satellite Town KHB Apartment      1
Kengeri Satellite Town Stage II           1
Kirloskar layout, Basaveshwarnagar        1
                                       ... 
Thanisandra                             233
Kanakpura Road                          259
Electronic City                         302
Sarjapur  Road                          372
Whitefield                              514
Name: location, Length: 1248, dtype: int64

In [11]:
# there are too many unique values in location which is not great for any conversion
# methods like one-hot encoding.

# so the check_location function below replaces every location that has 10 or fewer
# data points with the single label 'others'

def check_location(x):
    """Return ``x`` unchanged, or 'others' when ``location_count[x]`` is 10 or less."""
    return x if location_count[x] > 10 else 'others'
    
# Relabel the rare locations, then recount the rows per (re-labelled) location.
df2['location'] = df2['location'].map(check_location)
location_count = df2.groupby("location")["location"].agg("count")
location_count

location
1st Block Jayanagar            12
1st Phase JP Nagar             21
2nd Phase Judicial Layout      11
2nd Stage Nagarbhavi           23
5th Phase JP Nagar             37
                             ... 
Yelahanka                     206
Yelahanka New Town             40
Yelenahalli                    12
Yeshwanthpur                   78
others                       2741
Name: location, Length: 236, dtype: int64

In [12]:
df2.head(20)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2.0,1056.0,2.0,1.0,39.07
1,Chikka Tirupathi,4.0,2600.0,5.0,3.0,120.0
2,Uttarahalli,3.0,1440.0,2.0,3.0,62.0
3,Lingadheeranahalli,3.0,1521.0,3.0,1.0,95.0
4,Kothanur,2.0,1200.0,2.0,1.0,51.0
5,Whitefield,2.0,1170.0,2.0,1.0,38.0
6,Marathahalli,3.0,1310.0,3.0,1.0,63.25
7,Whitefield,3.0,1800.0,2.0,2.0,70.0
8,Whitefield,4.0,2785.0,5.0,3.0,295.0
9,7th Phase JP Nagar,2.0,1000.0,2.0,1.0,38.0


## Outlier Detection

In [13]:
# assumption: a home has at least 300 sqft per bedroom; rows below that threshold are treated as unlikely (outliers)

df2[df2["total_sqft"]/ df2["size"] < 300 ]

Unnamed: 0,location,size,total_sqft,bath,balcony,price
51,Murugeshpalya,6.0,1407.0,4.0,1.0,150.0
61,Devarachikkanahalli,8.0,1350.0,7.0,0.0,85.0
63,others,3.0,500.0,3.0,2.0,100.0
71,Kaval Byrasandra,2.0,460.0,1.0,0.0,22.0
81,Rajaji Nagar,6.0,710.0,6.0,3.0,160.0
...,...,...,...,...,...,...
12575,Laggere,7.0,1590.0,9.0,3.0,132.0
12577,others,9.0,1178.0,9.0,1.0,75.0
12632,Margondanahalli,5.0,1375.0,5.0,1.0,125.0
12654,Vidyaranyapura,5.0,774.0,5.0,3.0,70.0


In [14]:
# The anomalous rows are few compared to the total number of data points,
# so they are removed.
# .copy() turns the filtered result into an independent frame; without it,
# adding a column to df2 afterwards triggers pandas' SettingWithCopyWarning.

df2 = df2[~(df2["total_sqft"] / df2["size"] < 300)].copy()

In [15]:
# price_per_sqft (price * 100000 / total_sqft) is a helper feature used only
# for outlier removal: within each location, only the rows lying within one
# standard deviation of that location's mean are retained.

df2 = df2.assign(price_per_sqft=df2["price"] * 100000 / df2["total_sqft"])

def remove_pps_outliers(df):
    """Drop price-per-sqft outliers separately within each location.

    For every ``location`` group, only rows whose ``price_per_sqft`` lies
    strictly between ``mean - sd`` and ``mean + sd`` of that group are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The retained rows of all groups, in group order, with a fresh
        0..n-1 index. An input without rows yields an empty frame that
        still has the input's columns.
    """
    kept = []
    for _, subdf in df.groupby("location"):
        pps = subdf["price_per_sqft"]
        m = pps.mean()
        # Population standard deviation (ddof=0), which is what np.std
        # computes; pandas' own default would be the sample std (ddof=1).
        sd = pps.std(ddof=0)
        kept.append(subdf[(pps > (m - sd)) & (pps < (m + sd))])

    if not kept:
        # pd.concat rejects an empty list.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of re-copying the accumulated frame per group.
    return pd.concat(kept, ignore_index=True)

print(df2.shape)
df2=remove_pps_outliers(df2)
print(df2.shape)

(12013, 7)
(9852, 7)


In [16]:
# Outlier removal based on bathrooms: a home with n bedrooms is assumed to
# have at most n+1 bathrooms.

bath_ok = df2["bath"] < df2["size"] + 2
df2 = df2[bath_ok]
print(df2.shape)

# price_per_sqft was added only to remove outliers and is redundant as a
# feature, so it is dropped again before the index is renumbered.
df2 = df2.drop(columns="price_per_sqft").reset_index(drop=True)
df2

(9771, 7)


Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,1st Block Jayanagar,4.0,2850.0,4.0,1.0,428.00
1,1st Block Jayanagar,3.0,1630.0,3.0,2.0,194.00
2,1st Block Jayanagar,3.0,1875.0,2.0,3.0,235.00
3,1st Block Jayanagar,3.0,1200.0,2.0,0.0,130.00
4,1st Block Jayanagar,2.0,1235.0,2.0,2.0,148.00
...,...,...,...,...,...,...
9766,others,2.0,1256.0,2.0,1.0,65.00
9767,others,2.0,1353.0,2.0,2.0,110.00
9768,others,1.0,812.0,1.0,0.0,26.00
9769,others,3.0,1440.0,2.0,2.0,63.93


In [17]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9771 entries, 0 to 9770
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    9771 non-null   object 
 1   size        9771 non-null   float64
 2   total_sqft  9771 non-null   float64
 3   bath        9771 non-null   float64
 4   balcony     9771 non-null   float64
 5   price       9771 non-null   float64
dtypes: float64(5), object(1)
memory usage: 458.1+ KB


In [63]:
# One-hot encode the location column, append the indicator columns to the
# numeric features, and drop the columns that are not used for modelling.
dummies = pd.get_dummies(df2['location'])
df3 = (
    pd.concat([df2, dummies], axis=1)
    .drop(columns=["location", 'balcony'])
    .reset_index(drop=True)
)
df3.head()

Unnamed: 0,size,total_sqft,bath,price,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,others
0,4.0,2850.0,4.0,428.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3.0,1630.0,3.0,194.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.0,1875.0,2.0,235.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,1200.0,2.0,130.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,1235.0,2.0,148.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
print(df3.shape)

(9771, 240)


In [65]:
# Target is the price column; every other column is a feature.
y = df3['price']
X = df3.drop(columns='price')

print(len(X), len(y))

9771 9771


## Model Building

In [66]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Candidate estimators, each paired with the hyper-parameter grid to search.
# The three grids contain 2 + 4 + 4 = 10 parameter combinations in total.
# NOTE(review): the 'normalize' option and the 'mse' criterion were removed in
# scikit-learn 1.2 ('mse' was renamed 'squared_error') - confirm the installed
# version before re-running this cell.

algos = {
    "linear_regression": {
        "model": LinearRegression(),
        "params": {'normalize': [True, False]},
    },
    "lasso": {
        "model": Lasso(),
        "params": {'alpha': [1, 2], 'selection': ['random', 'cyclic']},
    },
    "decision_tree": {
        "model": DecisionTreeRegressor(),
        "params": {'criterion': ['mse', 'friedman_mse'], 'splitter': ['best', 'random']},
    },
}


In [67]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)


def _search_best(name, spec):
    """Grid-search one entry of ``algos`` on (X, y) and summarise its best result."""
    search = GridSearchCV(spec['model'], spec['params'], cv=cv, return_train_score=False)
    search.fit(X, y)
    return {
        'model': name,
        'best_score': search.best_score_,
        'best_params': search.best_params_,
    }


# One summary row per candidate algorithm, in the order they appear in algos.
scores = pd.DataFrame(
    [_search_best(name, spec) for name, spec in algos.items()],
    columns=['model', 'best_score', 'best_params'],
)
scores

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.818171,{'normalize': False}
1,lasso,0.667126,"{'alpha': 2, 'selection': 'random'}"
2,decision_tree,0.693339,"{'criterion': 'mse', 'splitter': 'random'}"


In [97]:
# From the scores above, linear regression is the best of the models tried,
# and its best setting was normalize=False, i.e. the estimator's default.

def predict_price(location,sqft,bath,bhk,model=None):
    """Predict the price of a home from its location, area, baths and bedrooms.

    Parameters
    ----------
    location : str
        Location name. A name that is not one of the feature columns is
        treated as 'others', the label given to rare locations during
        preprocessing.
    sqft : float
        Total area, stored in the ``total_sqft`` feature.
    bath : float
        Number of bathrooms, stored in the ``bath`` feature.
    bhk : float
        Number of bedrooms, stored in the ``size`` feature.
    model : fitted estimator, optional
        Estimator to predict with. When omitted, a ``LinearRegression`` is
        fitted on the notebook-level ``X`` and ``y`` for this call.

    Returns
    -------
    numpy.ndarray
        One-element array holding the predicted price.

    Raises
    ------
    ValueError
        If the location is unknown and ``X`` has no 'others' column.
    """
    if model is None:
        # Default settings: the grid search preferred normalize=False, and
        # the `normalize` argument no longer exists in scikit-learn >= 1.2.
        model = LinearRegression()
        model.fit(X, y)

    loc = location.strip()
    if loc not in X.columns:
        loc = 'others'
    if loc not in X.columns:
        raise ValueError("unknown location: %r" % (location,))

    # Single all-zero feature row that carries the training column names.
    features = pd.DataFrame(np.zeros((1, len(X.columns))), columns=X.columns)
    features.loc[0, ['size', 'total_sqft', 'bath']] = [bhk, sqft, bath]
    features.loc[0, loc] = 1

    return model.predict(features)
 

In [100]:
predict_price("Indira Nagar",1000,2,2)

array([148.7265625])

In [112]:
predict_price("1st Block Jayanagar",1000,3,3)

array([204.015625])

## Exporting model

In [123]:
import pickle

# predict_price keeps its fitted estimator in a local variable, so no `mdl`
# exists at notebook level; fit the final model here before pickling it.
# The default settings match the best grid-search result (normalize=False).
mdl = LinearRegression()
mdl.fit(X, y)

with open('house_price_prediction.pickle','wb') as f:
    pickle.dump(mdl,f)

In [124]:
import json

# Save the feature column names, in training order, next to the model.
columns = {'data_columns': list(X.columns)}

with open('house_prediction_columns.json', 'w') as f:
    json.dump(columns, f)