# Importing useful libraries and scripts

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import json

import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor

# Helper functions

In [11]:
def find_best_model_using_gridsearch(X,y):
    '''
    Compare several regression models using grid search cross validation.

    For each candidate algorithm (linear regression, lasso, decision tree) a
    GridSearchCV is fitted on (X, y) over 5 random 80/20 shuffle splits, and the
    best cross-validated score together with the matching parameters is recorded.

    Parameters
    ----------
    X : independent variables (feature matrix) accepted by scikit-learn estimators
    y : dependent variable (target values) aligned with X

    Returns
    -------
    pandas.DataFrame
        Model comparison table with the columns 'model', 'best_score' and
        'best_params', one row per algorithm.
    '''

    # scikit-learn 1.2 removed both the `normalize` argument of LinearRegression
    # and the 'mse' criterion name (renamed to 'squared_error'). Detect which API
    # is installed so the grid is valid on old and new releases alike; passing a
    # removed parameter would make GridSearchCV.fit raise.
    legacy_api = 'normalize' in LinearRegression().get_params()

    if legacy_api:
        linear_params = {'normalize': [True, False]}
        tree_criteria = ['mse', 'friedman_mse']
    else:
        # `normalize` has no direct replacement on the estimator itself, so the
        # intercept switch is searched instead.
        linear_params = {'fit_intercept': [True, False]}
        tree_criteria = ['squared_error', 'friedman_mse']

    # Creating a models dictionary containing all the parameters and model names

    algos = {

        'linear regression': {
            'model': LinearRegression(),
            'params': linear_params
        },

        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection':['random','cyclic']
            }
        },

        'decision tree':  {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': tree_criteria,
                'splitter': ['best','random']
            }
        }

    }

    scores = []

    # Performing shuffle-split cross validation and finding the best model with the best parameters

    cv = ShuffleSplit(n_splits = 5,test_size = 0.2,random_state = 1)

    for algo_name, config in algos.items():

        gs = GridSearchCV(config['model'],config['params'],cv=cv,return_train_score=False)

        gs.fit(X,y)

        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [19]:
def predict_price(location,sqft,bath,bhk,X,lr):
    '''
    Predict the house price for the given fields using a fitted regression model.

    Parameters
    ----------
    location : name of the one-hot encoded location column in X (case sensitive)
    sqft, bath, bhk : numeric inputs written to the first three feature positions
    X : DataFrame whose columns define the feature layout the model was trained on;
        the first three columns are expected to be the numeric features, in the
        order sqft, bath, bhk, followed by the location columns
    lr : fitted model exposing `predict`

    Returns
    -------
    The first element of `lr.predict` for the assembled feature row, or the string
    "Please pass the correct location" when `location` is not a location column of X.

    Errors raised by the model itself are propagated instead of being reported as
    a wrong location.
    '''

    # Number of leading numeric (non-location) feature columns
    n_numeric = 3

    matches = np.where(X.columns==location)[0]

    # Unknown name, or a name that points at one of the numeric columns (which
    # would silently overwrite sqft/bath/bhk with 1) is not a valid location.
    if len(matches) == 0 or matches[0] < n_numeric:
        return "Please pass the correct location"

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    x[matches[0]] = 1

    return lr.predict([x])[0]


# Reading and loading the data that was saved after EDA

In [3]:
# Load the cleaned dataset that was saved after EDA and preview its first rows
data = pd.read_csv('../data/clean_data.csv')
data.head()

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,2.0,130.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1235.0,2.0,148.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Train test splitting

In [4]:
# The target (y) is the price column; the features (X) are every other column

y = data['price']
X = data.drop(columns='price')

In [5]:
# Sanity check: dimensions of the feature matrix followed by the target vector

print(X.shape, y.shape, sep='\n')

(7239, 243)
(7239,)


In [6]:
# Hold out 20% of the rows as a test set using sklearn's train_test_split.
# The fixed random_state keeps the split reproducible between runs.

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

# Model training and validating

In [13]:
# Instantiating a linear regression model with scikit-learn's default settings

lr = LinearRegression()

In [14]:
# Fitting the model on the training split

lr.fit(X_train,y_train)

# Checking the model on the held-out test split
# (for regressors, `score` returns the R^2 coefficient of determination)

lr.score(X_test,y_test)

0.8655871771675594

In [9]:
# Cross validate to check whether the score depends on one particular split (overfitting).
# Note: ShuffleSplit draws repeated random 80/20 splits rather than K disjoint folds.

# 6 splits here

cv = ShuffleSplit(n_splits=6,test_size=0.2,random_state=1)

cross_val_score(LinearRegression(),X,y,cv=cv)

array([0.86558718, 0.80842556, 0.81545876, 0.86348895, 0.80592203,
       0.81210919])

> The scores are consistent across all six splits (roughly 0.81 to 0.87), which suggests that our model did not overfit to one particular split

In [12]:
# Compare linear regression, lasso and decision tree regressors via grid search on the full dataset
find_best_model_using_gridsearch(X,y)

Unnamed: 0,model,best_score,best_params
0,linear regression,0.831776,{'normalize': True}
1,lasso,0.663611,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision tree,0.737879,"{'criterion': 'friedman_mse', 'splitter': 'ran..."


> From the above table we can see that linear regression achieves the best cross-validated score of the three models on this dataset

In [15]:
# Preview the data again to recall the column layout (numeric fields, then one-hot location columns)

data.head(3)

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Finding the column position of a one-hot encoded location by its name
# Case: the location we passed is valid

loc_idx = np.where(X.columns == '1st Block Jayanagar')
print(loc_idx[0][0] if loc_idx[0].size else 'Doesn\'t exist')

3


In [17]:
# Finding the column position of a one-hot encoded location by its name
# Case: the location we passed is invalid

loc_idx = np.where(X.columns == 'Micky')
print(loc_idx[0][0] if loc_idx[0].size else 'Doesn\'t exist')

Doesn't exist


# Predicting 

In [18]:
# Preview the independent variables: the numeric fields come first, followed by the location columns

X.head()

Unnamed: 0,total_sqft,bath,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,2.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1235.0,2.0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Build a single-row 2D array from the training row labelled 1 to use as a prediction example

test_example = X_train.loc[1].values.reshape(1, -1)
test_example.shape

(1, 243)

In [21]:
# Predicting the price for the single example row initialized above

lr.predict(test_example)[0]

255.09489280017397

In [23]:
# The actual price of the example we predicted.
# The prediction differs from this value because the model is not perfectly accurate.

y_train.loc[1]

194.0

In [24]:
# Passing a location that is not among the feature columns should return the error message

predict_price('micky',1000,2,2,X,lr)

'Please pass the correct location'

In [27]:
# Predicting for a few known locations: (location, sqft, bath, bhk)

for example in [
    ('1st Phase JP Nagar', 1000, 2, 2),
    ('1st Phase JP Nagar', 1000, 3, 3),
    ('Indira Nagar', 1000, 2, 2),
    ('1st Block Jayanagar', 2850, 4, 4),
]:
    print(predict_price(*example, X, lr))

87.24836395331339
89.92218481730399
176.61342940485662
354.250285439689


# Saving the model and columns

In [28]:
# Persist the fitted linear regression model to disk with pickle

with open('../models/house_price_prediction_model.pickle','wb') as model_file:
    pickle.dump(lr,model_file)

In [29]:
# Saving the lower-cased feature column names; they are used while building the UI
# and make the prediction system easier to drive

columuns = {'data_columns': list(map(str.lower, X.columns))}
columns_json = json.dumps(columuns)

with open('../models/columns.json','w') as columns_file:
    columns_file.write(columns_json)