In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='dark')

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,BaggingRegressor, StackingRegressor

from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score
from sklearn.metrics import make_scorer
import os

In [5]:
# to increase no. of rows and column visibility in outputs
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [6]:
# to see all the comands result in a single kernal 
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [7]:
train = pd.read_csv('./data/Housing Train dataset.csv')
train.shape

(1991, 11)

In [8]:
train.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Dealer,0,0,2,944.88189,1,1,Undri,18.452663,73.93104,42.0
1,Dealer,1,1,2,1020.087884,0,1,Wakad,18.5993,73.7625,65.0
2,Dealer,0,1,2,1058.114928,1,1,Sus,18.539812,73.737678,65.0
3,Dealer,1,0,1,400.0,0,1,Kharadi,18.5438,73.9438,30.0
4,Dealer,0,1,3,2293.077091,1,1,Kharadi,18.5438,73.9438,210.0


In [9]:
train['POSTED_BY'].value_counts()

Dealer     1532
Owner       434
Builder      25
Name: POSTED_BY, dtype: int64

In [10]:
train['UNDER_CONSTRUCTION'].value_counts()

0    1608
1     383
Name: UNDER_CONSTRUCTION, dtype: int64

In [11]:
train['RERA'].value_counts()

0    1165
1     826
Name: RERA, dtype: int64

In [12]:
train['BHK_NO.'].value_counts()

2    1012
3     498
1     380
4      87
5      11
8       2
6       1
Name: BHK_NO., dtype: int64

In [13]:
train['READY_TO_MOVE'].value_counts()

1    1608
0     383
Name: READY_TO_MOVE, dtype: int64

In [14]:
train['RESALE'].value_counts()

1    1892
0      99
Name: RESALE, dtype: int64

In [15]:
train['ADDRESS'].value_counts().head(10)

Baner       101
Wagholi      79
Kharadi      78
Hadapsar     71
NIBM         71
Wakad        71
Moshi        58
Undri        56
Balewadi     56
Ravet        55
Name: ADDRESS, dtype: int64

In [16]:
df1 = train[(train['SQUARE_FT'] < 8000) & (train['SQUARE_FT'] > 100) & (train['BHK_NO.'] < 6)]

In [17]:
df1.drop(['LONGITUDE','LATITUDE','POSTED_BY'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [18]:
train.shape,df1.shape

((1991, 11), (1975, 8))

In [19]:
df1.head()

Unnamed: 0,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,TARGET(PRICE_IN_LACS)
0,0,0,2,944.88189,1,1,Undri,42.0
1,1,1,2,1020.087884,0,1,Wakad,65.0
2,0,1,2,1058.114928,1,1,Sus,65.0
3,1,0,1,400.0,0,1,Kharadi,30.0
4,0,1,3,2293.077091,1,1,Kharadi,210.0


In [20]:
df1.ADDRESS = df1.ADDRESS.apply(lambda x : x.strip())
location_stats = df1.groupby('ADDRESS')['ADDRESS'].agg('count').sort_values(ascending=False)
location_stats

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


ADDRESS
Baner                                                98
Wagholi                                              79
Kharadi                                              77
Wakad                                                71
NIBM                                                 71
Hadapsar                                             70
Moshi                                                58
Balewadi                                             56
Undri                                                56
Ravet                                                55
Hinjewadi                                            51
Pimple Saudagar                                      43
Punawale                                             39
Keshav Nagar                                         35
Kalyani Nagar                                        31
Tathawade                                            28
Kondhwa                                              28
Bavdhan                                 

In [21]:
len(location_stats[location_stats <10])

267

In [22]:
location_stats_less_than_10 = location_stats[location_stats <10]
location_stats_less_than_10

ADDRESS
Handewadi                                            9
Prabhat Road                                         7
Dighi                                                7
Ambegaon                                             7
Bhumkar Nagar                                        7
Hinjewadi Phase 1                                    7
Katraj Kondhwa Road                                  7
Anand Nagar                                          7
Ambegaon Budruk                                      7
Model Colony                                         7
Pandhari Nagar                                       6
Warje                                                6
Mhada Colony                                         6
Ambegaon Bk                                          6
Kate Wasti                                           6
Manjari Khurd                                        6
Pimpri Chinchwad                                     6
Sukhsagar Nagar                                      6
Pi

In [23]:
df1.ADDRESS = df1.ADDRESS.apply(lambda x : 'other' if x in location_stats_less_than_10 else x)
df1['ADDRESS'].nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


45

In [24]:
df1[df1['SQUARE_FT']/df1['BHK_NO.'] < 300].head()
df1[df1['SQUARE_FT']/df1['BHK_NO.'] < 300].shape

Unnamed: 0,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,TARGET(PRICE_IN_LACS)
28,1,1,2,530.237891,0,0,Tathawade,55.5
63,1,1,2,345.276221,0,0,Ravet,34.5
90,1,1,3,832.208727,0,0,Hinjewadi,74.0
170,0,0,3,700.0,1,1,other,42.0
174,1,0,2,581.222057,0,1,other,39.0


(34, 8)

In [25]:
df2 = df1.copy()
df2['price_per_sqft'] = df2['TARGET(PRICE_IN_LACS)']*100000/df2['SQUARE_FT']

In [26]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('ADDRESS'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out
df3 = remove_pps_outliers(df2)
df3.shape

(1468, 9)

In [27]:
df4 = df3.drop('price_per_sqft',axis=1)
df4.head(3)

Unnamed: 0,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,TARGET(PRICE_IN_LACS)
0,0,0,2,960.052968,1,1,Aundh,87.0
1,0,0,4,4700.229789,1,1,Aundh,450.0
2,0,0,2,1000.0,1,1,Aundh,87.0


In [28]:
dummies = pd.get_dummies(df4.ADDRESS)
dummies.head()

Unnamed: 0,Aundh,Balewadi,Baner,Baner Pashan Link Road,Bavdhan,Chakan,Chikhali,Chinchwad,Dhanori,Dhayari,Eon Free Zone,Hadapsar,Hinjewadi,Kalyani Nagar,Kaspate Vasti,Keshav Nagar,Kharadi,Kondhwa,Koregaon Park,Kothrud,Lohegaon,Magarpatta,Mahalunge,Mohamadwadi,Moshi,NIBM,Narhe,Pashan,Pimple Nilakh,Pimple Saudagar,Pisoli,Punawale,Rahatani,Ravet,Shankar Kalat Nagar,Sinhgad Road,Sus,Tathawade,Undri,Viman Nagar,Wagholi,Wakad,Wanowrie,Wanwadi,other
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
df6 = pd.concat([df4,dummies.drop('other',axis=1)],axis=1) 
df6.head()

Unnamed: 0,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,TARGET(PRICE_IN_LACS),Aundh,Balewadi,Baner,Baner Pashan Link Road,Bavdhan,Chakan,Chikhali,Chinchwad,Dhanori,Dhayari,Eon Free Zone,Hadapsar,Hinjewadi,Kalyani Nagar,Kaspate Vasti,Keshav Nagar,Kharadi,Kondhwa,Koregaon Park,Kothrud,Lohegaon,Magarpatta,Mahalunge,Mohamadwadi,Moshi,NIBM,Narhe,Pashan,Pimple Nilakh,Pimple Saudagar,Pisoli,Punawale,Rahatani,Ravet,Shankar Kalat Nagar,Sinhgad Road,Sus,Tathawade,Undri,Viman Nagar,Wagholi,Wakad,Wanowrie,Wanwadi
0,0,0,2,960.052968,1,1,Aundh,87.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,4,4700.229789,1,1,Aundh,450.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,2,1000.0,1,1,Aundh,87.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,565.035597,1,1,Aundh,50.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,2,1057.082452,1,1,Aundh,85.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [30]:
df7 = df6.drop('ADDRESS',axis=1)
df7.head()

Unnamed: 0,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,TARGET(PRICE_IN_LACS),Aundh,Balewadi,Baner,Baner Pashan Link Road,Bavdhan,Chakan,Chikhali,Chinchwad,Dhanori,Dhayari,Eon Free Zone,Hadapsar,Hinjewadi,Kalyani Nagar,Kaspate Vasti,Keshav Nagar,Kharadi,Kondhwa,Koregaon Park,Kothrud,Lohegaon,Magarpatta,Mahalunge,Mohamadwadi,Moshi,NIBM,Narhe,Pashan,Pimple Nilakh,Pimple Saudagar,Pisoli,Punawale,Rahatani,Ravet,Shankar Kalat Nagar,Sinhgad Road,Sus,Tathawade,Undri,Viman Nagar,Wagholi,Wakad,Wanowrie,Wanwadi
0,0,0,2,960.052968,1,1,87.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,4,4700.229789,1,1,450.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,2,1000.0,1,1,87.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,565.035597,1,1,50.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,2,1057.082452,1,1,85.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
df7.shape

(1468, 51)

In [32]:
target = 'TARGET(PRICE_IN_LACS)'

features = [col for col in df7.columns if col not in ([target])]

X = df7[features]
y = df7[target]
trn, val = train_test_split(df7, test_size = 0.2, random_state = 7)

##### Input for model
X_trn, X_val = trn[features], val[features]

##### Target column
y_trn, y_val = trn[target], val[target]

In [33]:
lr = LinearRegression()
lr.fit(X_trn,y_trn)
y_pred = lr.predict(X_val)
print('Root Mean Squared Error :',np.sqrt(mean_squared_error(y_val,y_pred)))
print('R2 Score:',r2_score(y_val,y_pred))
lr.score(X_val,y_val)

LinearRegression()

Root Mean Squared Error : 14.95987132946657
R2 Score: 0.9148667331996202


0.9148667331996202

In [34]:
from sklearn.model_selection import ShuffleSplit

In [35]:
cv = ShuffleSplit(n_splits = 5, test_size = 0.2, random_state = 0)
cross_val_score (LinearRegression(), X,y , cv =cv)

array([0.92021805, 0.91695522, 0.89255494, 0.92265614, 0.92219045])

In [36]:
rf = RandomForestRegressor(random_state = 1999, n_jobs = -1,n_estimators = 500)

rf.fit(X_trn, y_trn)
preds = rf.predict(X_val)
preds = np.abs(preds)

error2 = np.sqrt(mean_squared_error(y_val,preds))
print(f'Root mean_squared_error is : {error2}')
print('R2 Score:',r2_score(y_val,preds))

RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=1999)

Root mean_squared_error is : 18.02092741993701
R2 Score: 0.8764627704678329


In [37]:
cv = ShuffleSplit(n_splits = 5, test_size = 0.2, random_state = 0)
cross_val_score(rf, X,y , cv =cv)

array([0.89521509, 0.89931993, 0.90611676, 0.90529756, 0.90067866])

In [38]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
rfc = RandomForestRegressor(random_state=7)
param_grid = { 
'n_estimators': [100,200,300,400,500],
'max_features': ['auto', 'sqrt', 'log2'],
'max_depth' : [4,5,6,7,8]
}
grid_search_model = GridSearchCV(rfc, param_grid=param_grid,cv =cv)
grid_search_model.fit(X_trn, y_trn)

grid_search_model.best_params_

NameError: name 'GridSearchCV' is not defined

In [None]:
grid_search_model.best_score_

In [None]:
grd = grid_search_model.best_estimator_
preds = grd.predict(X_val)
preds = np.abs(preds)
error2 = np.sqrt(mean_squared_error(y_val,preds))
print(f'Root mean_squared_error is : {error2}')
print('R2 Score:',r2_score(y_val,preds))

In [41]:
def predict_price(location,underconstruction,rera,bhk,sqft,readytomove,resale):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = underconstruction
    x[1] = rera
    x[2] = bhk
    x[3] = sqft
    x[4] = readytomove
    x[5] = resale
    if loc_index >= 0:
        x[loc_index] = 1

    return rf_grd.predict([x])[0]

In [42]:
predict_price('Hinjewadi',1,0,2,500,0,0)

39.0781558604313

In [44]:
import pickle
with open('./models/pune_house_prices_model.pkl','wb') as f:
    pickle.dump(rf_grd,f)

In [None]:
import pickle
pickle.dump(rf,open('./pune_house_prices_model.pkl','wb'))

In [42]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("./models/columns.json","w") as f:
    f.write(json.dumps(columns))

660

In [40]:
rf_grd = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=8, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=7, verbose=0, warm_start=False)
rf_grd.fit(X_trn, y_trn)

RandomForestRegressor(max_depth=8, random_state=7)