# Bangalore House Price Prediction - Outlier Detection

#### This notebook only trains ML models, comparing several ML algorithms

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Lift pandas' display truncation for both columns and rows
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

In [3]:
# The cleaned dataset, as loaded here, has six numeric columns:
# [bath, balcony, price, total_sqft_int, bhk, price_per_sqft]
# NOTE(review): the categorical columns (area_type, availability, location)
# and their one-hot encodings do not show up in the recorded shape or samples
# of this file — confirm which export of the data this notebook should read.

clean_data_path = 'dataset/clean_data.csv'
df = pd.read_csv(clean_data_path)
df.shape

(7120, 6)

In [4]:
# Peek at five randomly chosen rows (unseeded, so the rows differ per run)
df.sample(n=5)

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk,price_per_sqft
2308,3.0,3.0,82.5,1618.0,3,5098.887515
2546,3.0,2.0,105.0,2017.0,3,5205.751116
1790,3.0,0.0,200.0,2400.0,4,8333.333333
2572,2.0,1.0,65.0,1056.0,2,6155.30303
6410,2.0,2.0,35.0,850.0,2,4117.647059


## Split Dataset in train and test

In [5]:
# Target is the `price` column; every remaining column is a feature.
# NOTE(review): price_per_sqft looks derived from price (in the sampled rows,
# price * 100000 / total_sqft_int reproduces it), so keeping it in X may leak
# the target into the features — confirm before trusting the scores.
y = df.loc[:, 'price']
X = df.drop(columns=['price'])
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (7120, 5)
Shape of y: (7120,)


In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=51,
    test_size=0.2,
)
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (5696, 5)
Shape of y_train: (5696,)
Shape of X_test: (1424, 5)
Shape of y_test: (1424,)


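## Feature Scaling

Scaling is not done in a separate step: each pipeline below starts with a `StandardScaler`, so the features are scaled when the pipeline is fitted and again whenever it predicts.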

## Machine Learning Model Training

In [7]:
# Linear regression: standardize -> project onto 2 principal components -> fit
pipeline_lr = Pipeline(steps=[
    ('scalar1', StandardScaler()),
    ('pca1', PCA(n_components=2)),
    ('lr_reg', LinearRegression()),
])

In [8]:
# Support vector regression: standardize -> project onto 2 principal components -> fit
pipeline_svr = Pipeline(steps=[
    ('scalar2', StandardScaler()),
    ('pca2', PCA(n_components=2)),
    ('svr_reg', SVR()),
])

In [9]:
# Random forest: standardize -> project onto 2 principal components -> fit.
# The forest is seeded (same seed as the train/test split) so that scores,
# predictions and the pickled model are reproducible across
# Restart Kernel -> Run All; without a seed every run trains a different forest.
pipeline_rfr = Pipeline([('scalar3', StandardScaler()),
                        ('pca3', PCA(n_components=2)),
                        ('rf_reg', RandomForestRegressor(random_state=51))])

In [10]:
# Order matters: each position is later used as the key into pipe_dict
pipelines = [
    pipeline_lr,   # 0
    pipeline_svr,  # 1
    pipeline_rfr,  # 2
]

In [11]:
# Trackers for the model-selection loop: best score so far, the index of the
# winning pipeline, and the winning pipeline object itself
best_accuracy, best_regressor, best_pipeline = 0.0, 0, ""

In [12]:
# Display names keyed by each pipeline's position in `pipelines`
pipe_dict = dict(enumerate(['Linear Regression', 'SVR', 'RandomForest']))

# Train every pipeline on the training split
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [13]:
# Report each pipeline's score on the held-out split. For scikit-learn
# regressors `score` is the R^2 coefficient of determination, so the
# "Accuracy" printed below is an R^2 value rather than a classification accuracy.
for i, model in enumerate(pipelines):
    test_score = model.score(X_test, y_test)
    print("{} Test Accuracy: {}".format(pipe_dict[i], test_score))

Linear Regression Test Accuracy: 0.6839996247076763
SVR Test Accuracy: 0.24985115349908416
RandomForest Test Accuracy: 0.8861548634981415


In [14]:
# Pick the pipeline with the highest test score.
# The trackers are reset here so that re-running this cell on its own always
# gives the same answer. Starting from -inf (rather than 0.0) means a winner
# is still chosen when every score is <= 0, which R^2 allows.
best_accuracy = float("-inf")
best_regressor = 0
best_pipeline = None

for i, model in enumerate(pipelines):
    # Score once per pipeline: scoring re-runs prediction over the whole test set
    score = model.score(X_test, y_test)
    if score > best_accuracy:
        best_accuracy = score
        best_pipeline = model
        best_regressor = i

print("Regressor with Best Accuracy:{}".format(pipe_dict[best_regressor]))

Regressor with Best Accuracy:RandomForest


## Test Model

In [15]:
# Feature names, in the column order the pipelines were fitted on
X.columns.tolist()

['bath', 'balcony', 'total_sqft_int', 'bhk', 'price_per_sqft']

In [16]:
def predict_house_price(model,bath,balcony,total_sqft_int,bhk,price_per_sqft):
    """Predict the price of a single house from its feature values.

    The features are packed into a one-row DataFrame whose column names and
    order match the training features, so a model fitted on a DataFrame
    receives named features instead of an anonymous array. No scaling is done
    here: the pipelines in this notebook scale their own input.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``
    bath, balcony, total_sqft_int, bhk, price_per_sqft : numeric
        Feature values for the house.

    Returns
    -------
    The first (only) element of ``model.predict`` for the one-row input,
    in the same unit as the training target.
    """
    # Must match the column order of the training features
    feature_names = ['bath', 'balcony', 'total_sqft_int', 'bhk', 'price_per_sqft']

    # One row, float dtype (the same dtype the previous zero-filled array had)
    features = pd.DataFrame(
        [[bath, balcony, total_sqft_int, bhk, price_per_sqft]],
        columns=feature_names,
        dtype=float,
    )

    return model.predict(features)[0]

In [17]:
# Draw three random rows to use as reference values for the predictions below
df.sample(n=3)

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk,price_per_sqft
2772,3.0,2.0,132.0,2026.0,3,6515.301086
973,4.0,1.0,180.0,2400.0,4,7500.0
4828,2.0,2.0,65.0,1240.0,3,5241.935484


In [18]:
# Feature values for one hypothetical house, passed to the random-forest pipeline
listing = dict(bath=3, balcony=2, total_sqft_int=1440, bhk=3, price_per_sqft=4569)
predict_house_price(model=pipeline_rfr, **listing)



np.float64(65.81480000000009)

In [19]:
# A second hypothetical house: larger area and higher price per square foot
listing = dict(bath=3, balcony=2, total_sqft_int=1850, bhk=3, price_per_sqft=8378)
predict_house_price(model=pipeline_rfr, **listing)



np.float64(154.6)

## Save & Load Model

In [20]:
import pickle

In [21]:
# Save the fitted random-forest pipeline to disk.
# The context manager closes (and therefore flushes) the file even if
# pickling fails, instead of leaving an open handle behind.
with open('rfr.pkl', 'wb') as model_file:
    pickle.dump(pipeline_rfr, model_file)

In [22]:
# Load the pipeline back from disk.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files that
# were produced by this notebook or come from a source you trust.
with open('rfr.pkl', 'rb') as model_file:
    rfr_model = pickle.load(model_file)

In [23]:
# Same house as the earlier example, scored with the pipeline reloaded from disk;
# the result should match the in-memory pipeline's prediction
listing = dict(bath=3, balcony=2, total_sqft_int=1850, bhk=3, price_per_sqft=8378)
predict_house_price(model=rfr_model, **listing)



np.float64(154.6)