In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [113]:
df = pd.read_csv("BHP.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [114]:
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [115]:
df = df.dropna()
df.isna().sum()

area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

## Feature Engineering - Column(area_type)

In [116]:
unique_area = df.area_type.unique()
unique_area
#len(unique_area)

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [117]:
df.groupby("area_type")["area_type"].agg("count")

area_type
Built-up  Area          1158
Carpet  Area              52
Plot  Area               269
Super built-up  Area    6017
Name: area_type, dtype: int64

In [118]:
df = df.drop(["area_type"],axis = 1)
df.head()

Unnamed: 0,availability,location,size,society,total_sqft,bath,balcony,price
0,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
3,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
5,Ready To Move,Whitefield,2 BHK,DuenaTa,1170,2.0,1.0,38.0
11,Ready To Move,Whitefield,4 Bedroom,Prrry M,2785,5.0,3.0,295.0


## Feature Engineering - Column(availability)

#### We do not including availability column as it has time bound which we can not set for our future predictions. We will going to drop it, after checking all our columns.

In [119]:
df = df.drop(["availability"],axis = "columns")
df.head()

Unnamed: 0,location,size,society,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
3,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
5,Whitefield,2 BHK,DuenaTa,1170,2.0,1.0,38.0
11,Whitefield,4 Bedroom,Prrry M,2785,5.0,3.0,295.0


## Feature Engineering - Column(location)

In [120]:
unique_location = df.location.unique()
len(unique_location)

634

In [121]:
location_stats = df.groupby("location")["location"].agg("count").sort_values(ascending = False)
location_stats

location
Whitefield                        397
Sarjapur  Road                    310
Electronic City                   236
Kanakpura Road                    216
Thanisandra                       212
                                 ... 
Kalhalli                            1
 Bhoganhalli                        1
KSRTC Layout                        1
KR Garden                           1
Yeshwanthpur Industrial Suburb      1
Name: location, Length: 634, dtype: int64

In [122]:
location_stats_less_than_10 = location_stats[location_stats<=10]
len(location_stats_less_than_10)

491

In [123]:
df1 = df.copy()

In [124]:
len(df1.location.unique())

634

In [125]:
# Setting all the location_stats_less_than_10 data points to other
df1.location = df1.location.apply(lambda x :"other" if x in location_stats_less_than_10 else x)

In [126]:
len(df1.location.unique())

144

## Feature Engineering - Column(size)

In [127]:
df1["size"].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 RK', '4 BHK',
       '1 BHK', '5 BHK', '11 BHK', '5 Bedroom', '9 BHK', '2 Bedroom',
       '6 BHK', '7 BHK', '6 Bedroom'], dtype=object)

In [128]:
# Sperate the numerical data from above columns, i.e. Number of bedrooms from size column
# And adding that result in new column "bhk"

df2 = df1.copy()
df2["bhk"] = df2["size"].apply(lambda x: int (x.split(" ")[0]))

In [129]:
df2["bhk"].unique()

array([ 2,  4,  3,  1,  5, 11,  9,  6,  7], dtype=int64)

In [130]:
df2 = df2.drop(["size"],axis = 1)

## Feature Engineering - Column(society)


#### Also, drop the society column for simplicity

In [131]:
df2.shape

(7496, 7)

In [132]:
df3= df2.drop(["society"],axis = "columns")

In [133]:
df3.shape

(7496, 6)

## Feature Engineering - Column(total_sqft)


In [134]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [135]:
df3[~df3["total_sqft"].apply(is_float)]

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
30,Yelahanka,2100 - 2850,4.0,0.0,186.000,4
122,Hebbal,3067 - 8156,4.0,0.0,477.000,4
137,8th Phase JP Nagar,1042 - 1105,2.0,0.0,54.005,2
165,Sarjapur,1145 - 1340,2.0,0.0,43.490,2
188,KR Puram,1015 - 1540,2.0,0.0,56.800,2
...,...,...,...,...,...,...
12975,Whitefield,850 - 1060,2.0,0.0,38.190,2
12990,Talaghattapura,1804 - 2273,3.0,0.0,122.000,3
13059,Harlur,1200 - 1470,2.0,0.0,72.760,2
13265,Hoodi,1133 - 1384,2.0,0.0,59.135,2


In [136]:
def convert_sqft_to_num(x):
    tokens = x.split("-")
    if len(tokens) == 2:
        return(float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None
    


In [137]:
df4 = df3.copy()
df4["total_sqft"] = df4["total_sqft"].apply(convert_sqft_to_num)
df4.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
5,Whitefield,1170.0,2.0,1.0,38.0,2
11,Whitefield,2785.0,5.0,3.0,295.0,4


In [138]:
df4.shape

(7496, 6)

In [139]:
df4.dtypes

location       object
total_sqft    float64
bath          float64
balcony       float64
price         float64
bhk             int64
dtype: object

## Feature Engineering - Column(balcony)

In [140]:
df4.balcony.unique()

array([1., 3., 2., 0.])

### Outliers Detection

#### By analysing Price_per_sqft, areawise

In [141]:
df5 = df4.copy()
df5["price_per_sqft"] = df5["price"] *100000 / df5["total_sqft"]

In [142]:
df5.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,price_per_sqft
count,7481.0,7496.0,7496.0,7496.0,7496.0,7481.0
mean,1507.516043,2.458378,1.629936,96.857122,2.529482,5992.058235
std,935.083274,0.881128,0.780127,105.38249,0.733228,9160.840277
min,11.0,1.0,0.0,8.0,1.0,371.428571
25%,1115.0,2.0,1.0,49.5,2.0,4279.131007
50%,1310.0,2.0,2.0,69.755,2.0,5319.148936
75%,1656.0,3.0,2.0,105.0,3.0,6696.629213
max,36000.0,9.0,3.0,2700.0,11.0,672727.272727


In [143]:
# Now, reducing our data through standard deviation

def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    
    for key,subdf in df.groupby("location"):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        
        reduced_df = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft <= (m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index = True)
        
    return df_out

In [144]:
df6 = remove_pps_outliers(df5)
df6.shape

(6104, 7)

#### By analysing Basic area need for a bedroom

In [145]:
df6[df6.total_sqft / df6.bhk <300]

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
643,Bisuvanahalli,873.0,2.0,1.0,40.0,3,4581.901489
704,Bommenahalli,832.0,3.0,1.0,77.0,3,9254.807692
1028,Electronic City,550.0,1.0,1.0,16.0,2,2909.090909
1040,Electronic City,550.0,1.0,1.0,15.0,2,2727.272727
1065,Electronic City,550.0,1.0,1.0,15.0,2,2727.272727
1130,Electronic City,550.0,1.0,1.0,15.0,2,2727.272727
1202,Electronic City,550.0,1.0,1.0,16.0,2,2909.090909
1208,Electronic City,550.0,2.0,1.0,16.0,2,2909.090909
1263,Electronic City Phase II,875.0,2.0,2.0,40.0,3,4571.428571
2642,Kengeri Satellite Town,560.0,2.0,1.0,16.6,2,2964.285714


In [146]:
df7 = df6[~(df6.total_sqft / df6.bhk <300)]
df7.shape

(6078, 7)

In [147]:
df7.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,price_per_sqft
count,6078.0,6078.0,6078.0,6078.0,6078.0,6078.0
mean,1485.054945,2.432215,1.632445,91.24626,2.507733,5680.877561
std,842.670544,0.853568,0.776579,88.046733,0.714588,2178.567471
min,350.0,1.0,0.0,10.0,1.0,1250.0
25%,1117.0,2.0,1.0,50.0,2.0,4350.661384
50%,1305.0,2.0,2.0,69.0,2.0,5289.456592
75%,1645.0,3.0,2.0,100.0,3.0,6482.166856
max,36000.0,9.0,3.0,2000.0,11.0,23404.109762


#### By, having bath > bhk+2

In [148]:
df7[df7.bath > df7.bhk+2]

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
3814,Thanisandra,1806.0,6.0,2.0,116.0,3,6423.03433
4965,other,11338.0,9.0,1.0,1000.0,6,8819.897689


In [149]:
df7.shape

(6078, 7)

In [150]:
df8 = df7[~(df7.bath > df7.bhk+2)]

In [151]:
df8.shape

(6076, 7)

In [152]:
df9 = df8.drop(["price_per_sqft"],axis= 1)

In [153]:
df9.columns

Index(['location', 'total_sqft', 'bath', 'balcony', 'price', 'bhk'], dtype='object')

### Get Dummies of Location

In [154]:
dummies = pd.get_dummies(df9.location)

In [155]:
df10 = pd.concat([df9,dummies.drop(["other"],axis = 1)],axis = 1)

In [156]:
df11 =df10.drop(["location","balcony"],axis = 1)

Index(['total_sqft', 'bath', 'price', 'bhk', '1st Phase JP Nagar',
       '5th Phase JP Nagar', '7th Phase JP Nagar', '8th Phase JP Nagar',
       '9th Phase JP Nagar', 'AECS Layout',
       ...
       'Thubarahalli', 'Tumkur Road', 'Uttarahalli', 'Varthur', 'Vijayanagar',
       'Vittasandra', 'Whitefield', 'Yelahanka', 'Yelahanka New Town',
       'Yeshwanthpur'],
      dtype='object', length=147)

## Now, Model Training

In [157]:
X = df11.drop(["price"],axis=1)
y = df11.price

In [158]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [159]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [162]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

In [163]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [164]:
def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        "randomforestregressor":{
            "model" : RandomForestRegressor(),
            "params" :{
                "max_features" : [1,2,3,5,6,7,8,10],
                "max_depth" : [10,12,15,20,50,70,100]
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])



In [165]:
find_best_model_using_gridsearchcv(X,y)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.694118,{'normalize': False}
1,lasso,0.654563,"{'alpha': 2, 'selection': 'random'}"
2,randomforestregressor,0.788378,"{'max_depth': 50, 'max_features': 7}"


In [167]:
model = RandomForestRegressor(max_depth=122,max_features=3)

In [168]:
model.fit(X_train,y_train)

RandomForestRegressor(max_depth=122, max_features=3)

In [169]:
model.score(X_train,y_train)

0.956029371511671

In [170]:
model.score(X_test,y_test)

0.8031596192638237

In [171]:
cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
cross_val_score(RandomForestRegressor(),X,y,cv = cv)

array([0.75406342, 0.77457477, 0.79240768, 0.72771561, 0.69484956])

In [172]:
X.columns

Index(['total_sqft', 'bath', 'bhk', '1st Phase JP Nagar', '5th Phase JP Nagar',
       '7th Phase JP Nagar', '8th Phase JP Nagar', '9th Phase JP Nagar',
       'AECS Layout', 'Abbigere',
       ...
       'Thubarahalli', 'Tumkur Road', 'Uttarahalli', 'Varthur', 'Vijayanagar',
       'Vittasandra', 'Whitefield', 'Yelahanka', 'Yelahanka New Town',
       'Yeshwanthpur'],
      dtype='object', length=146)

In [177]:
def predict_price(location,sqft,bath,bhk):
    loc_index = np.where(X.columns==location)[0][0]
    
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] =1
        
    return model.predict([x])[0]

In [178]:
predict_price("7th Phase JP Nagar",1000,2,2)



57.304828999495356

## Export the tested model to a pickle file

In [179]:

import pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(model,f)

## Export location and column information to a file that will be useful later on in our prediction application

In [180]:

import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))