In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import json

import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor

In [2]:
data = pd.read_csv('../data/clean_data.csv')
data.head()

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,2.0,130.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1235.0,2.0,148.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X = data.drop('price',axis='columns')
y = data.price

In [5]:
print(X.shape)
print(y.shape)

(7239, 243)
(7239,)


In [111]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [112]:
lr = LinearRegression()

In [113]:
lr.fit(X_train,y_train)
lr.score(X_test,y_test)

0.8655871771675594

In [114]:
cv = ShuffleSplit(n_splits=6,test_size=0.2,random_state=1)

cross_val_score(LinearRegression(),X,y,cv=cv)

array([0.86558718, 0.80842556, 0.81545876, 0.86348895, 0.80592203,
       0.81210919])

In [115]:
def find_best_model_using_gridsearch(X,y):

    algos = {

        'linear regression': {
            'model': LinearRegression(),
            'params': {
                'normalize':[True,False]
            }
        },

        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection':['random','cyclic']
            }
        },

        'decision tree':  {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }

    }

    scores = []

    cv = ShuffleSplit(n_splits = 5,test_size = 0.2,random_state = 1)

    for algo_name, config in algos.items():

        gs = GridSearchCV(config['model'],config['params'],cv=cv,return_train_score=False)

        gs.fit(X,y)

        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [116]:
find_best_model_using_gridsearch(X,y)

Unnamed: 0,model,best_score,best_params
0,linear regression,0.831776,{'normalize': True}
1,lasso,0.663672,"{'alpha': 1, 'selection': 'random'}"
2,decision tree,0.745021,"{'criterion': 'friedman_mse', 'splitter': 'ran..."


In [117]:
data.head(3)

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [118]:
loc_idx = np.where(X.columns == '1st Block Jayanagar')
print('Doesn\'t exist' if len(loc_idx[0]) == 0 else loc_idx[0][0])

3


In [119]:
X.head()

Unnamed: 0,total_sqft,bath,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,2.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1235.0,2.0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [137]:
# def predict_price(location,sqft,bath,bhk):

#     loc_idx = np.where(data.columns == location)

#     x = np.zeros(len(X.columns))

#     x[0] = sqft
#     x[1] = bath
#     x[2] = bhk

#     result = 'You passed a wrong location'

#     if len(loc_idx[0]) != 0:

#         if loc_idx[0][0] >= 3:

#             x[loc_idx[0][0]] = 1

#             return lr.predict([x])[0]

#         else:

#             return result
    
#     else: return result


def predict_price(location,sqft,bath,bhk):
    try: 
        loc_index = np.where(X.columns==location)[0][0]

        x = np.zeros(len(X.columns))
        x[0] = sqft
        x[1] = bath
        x[2] = bhk
        if loc_index >= 0:
            x[loc_index] = 1    

        return lr.predict([x])[0]

    except:
        
        return "Please pass the correct location"


In [121]:
test_example = np.array([X_train.loc[1].values])
test_example.shape

(1, 243)

In [129]:
lr.predict(test_example)[0]

255.09489280017397

In [123]:
y_train.loc[1]

194.0

In [138]:
predict_price('micky',1000,2,2)

'Please pass the correct location'

In [139]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

87.24836395331339

In [140]:
predict_price('1st Phase JP Nagar',1000, 3, 3)

89.92218481730399

In [141]:
predict_price('Indira Nagar',1000, 2, 2)

176.61342940485662

In [107]:
data.head()

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,2.0,130.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1235.0,2.0,148.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [143]:
predict_price('1st Block Jayanagar',2850,4,4)

354.250285439689