# Name: Gurjot Singh
# Batch: COE 30
# Roll Number: 401853006
### Data Science Project: House Cost Prediction using hybrid machine learning model

In [1]:
# Importing Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.regression import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans

In [2]:
# Reading Dataset

data = pd.read_csv('housing.csv')

In [3]:
# Viewing data

data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Checking for Null Values

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
# NaN values removal

data.dropna(inplace = True)

In [6]:
# Getting the summary of Data

data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [7]:
# One hot encoding and label encoding

Ocean_Prox_col = pd.get_dummies(data.ocean_proximity, prefix = 'Ocean')

data = pd.concat([data, Ocean_Prox_col], axis = 1)
data.drop(columns = ['ocean_proximity', 'households'], inplace = True)

In [8]:
# Viewing data

data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,median_income,median_house_value,Ocean_<1H OCEAN,Ocean_INLAND,Ocean_ISLAND,Ocean_NEAR BAY,Ocean_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,3.8462,342200.0,0,0,0,1,0


In [9]:
# Viewing column names

data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'median_income', 'median_house_value',
       'Ocean_<1H OCEAN', 'Ocean_INLAND', 'Ocean_ISLAND', 'Ocean_NEAR BAY',
       'Ocean_NEAR OCEAN'],
      dtype='object')

In [10]:
# Putting feature variable to X

X = data[['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'median_income', 'Ocean_<1H OCEAN', 'Ocean_INLAND', 'Ocean_ISLAND', 'Ocean_NEAR BAY', 'Ocean_NEAR OCEAN']]

# Putting response variable to y

y = data['median_house_value']

In [11]:
# Test - Train split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.65, random_state = 1111)

In [12]:
# Viewing shapes of data

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(13281, 12)
(7152, 12)
(13281,)
(7152,)


In [13]:
# Changing column names

X_train.columns = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'median_income', 'Ocean_<1H OCEAN', 'Ocean_INLAND', 'Ocean_ISLAND', 'Ocean_NEAR BAY', 'Ocean_NEAR OCEAN']
y_train.columns = ['median_house_value']

# Preparing training dataset

train_df = pd.concat([X_train, y_train], axis = 1)

In [14]:
# Comparing various models using Pycaret

setup(data = data, target = 'median_house_value', session_id = 1111)
compare_models(exclude = ['lightgbm', 'rf', 'et', 'gbr', 'dt'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lr,Linear Regression,49834.9238,4712504064.0,68617.6297,0.6473,0.3747,0.2875,0.371
lasso,Lasso Regression,49835.3105,4712514508.8,68617.7125,0.6473,0.3747,0.2875,0.03
ridge,Ridge Regression,49840.3965,4712759244.8,68619.5898,0.6473,0.3748,0.2876,0.007
llar,Lasso Least Angle Regression,49824.4853,4713761631.6137,68627.1193,0.6472,0.3699,0.2868,0.008
br,Bayesian Ridge,49857.0226,4715891212.6887,68642.892,0.6471,0.3751,0.2877,0.011
en,Elastic Net,52725.6023,5107081523.2,71436.6219,0.618,0.3692,0.3187,0.033
huber,Huber Regressor,52043.129,5531323859.9689,74344.4552,0.5861,0.3918,0.2816,0.073
omp,Orthogonal Matching Pursuit,62675.6645,7014124839.0864,83720.9643,0.4754,0.427,0.3855,0.008
lar,Least Angle Regression,63243.1918,7794222081.1323,84495.4161,0.4097,0.543,0.3866,0.008
ada,AdaBoost Regressor,75934.2095,7929508045.7281,89016.0088,0.4059,0.4747,0.5217,0.259


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [15]:
# Traing Linear Model as a whole

lm = LinearRegression()
lm.fit(X_train, y_train)

# Making predictions using the model

y_pred = lm.predict(X_test)

# Checking R2 score
r_squared = r2_score(y_test, y_pred)
print('r_square_value :',r_squared)

r_square_value : 0.6403466623772675


## Methodology: 
<br>
<div style="text-align: justify">A variety of different regression models were tested for the given dataset including Linear, Huber, Orthogonal Matching Point, etc. (refer table below) and linear regression (multivarialte) was found to be the best model among all applied models having a R2 score of 0.6479.</div>

![image](image1.png)

In [16]:
# Hybrid clustering approach: finding best number of clusters using comparison of R2 scores by following hybrid model on training data, and making predictions on test data.

all_centers = []
all_rsquared = []
all_lms = []

for clusters in range(1, 51):
    
    kmeans = KMeans(n_clusters = clusters, random_state = 16).fit(X_train)
    
    all_centers.append(kmeans.cluster_centers_)
    
    dframes = []

    for i in range(clusters):
        dframes.append(pd.DataFrame(columns = train_df.columns))

    labels = list(kmeans.labels_)

    for i in range(len(labels)):
        dframes[labels[i]].loc[len(dframes[labels[i]].index)] = list(train_df.iloc[i,:].values)

    Xs = []
    ys = []
    lms = []

    for i in range(clusters):
        Xs.append(dframes[i][['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'median_income', 'Ocean_<1H OCEAN', 'Ocean_INLAND', 'Ocean_ISLAND', 'Ocean_NEAR BAY', 'Ocean_NEAR OCEAN']])
        ys.append(dframes[i][['median_house_value']])
        lms.append(LinearRegression())
        lms[i].fit(Xs[i], ys[i])

    all_lms.append(lms)
    
    y_pred = []

    for i in range(len(X_test)):
        temp = np.array(X_test.iloc[i,:]).reshape(1, -1)
        y_pred.append(lms[int(kmeans.predict(temp))].predict(temp))

    y_pred = np.array(y_pred).reshape(X_test.shape[0], )

    r_squared = r2_score(y_test, y_pred)
    
    all_rsquared.append(r_squared)
    
    print("Clusters : " + str(clusters) + ", r_square_value : ", r_squared, "\n")

Clusters : 1, r_square_value :  0.6403466623772675 

Clusters : 2, r_square_value :  0.6550016045378284 

Clusters : 3, r_square_value :  0.6584258043228772 

Clusters : 4, r_square_value :  0.6645807690855483 

Clusters : 5, r_square_value :  0.6644486724978962 

Clusters : 6, r_square_value :  0.6649349356010337 

Clusters : 7, r_square_value :  0.6647880102844352 

Clusters : 8, r_square_value :  0.6670920119310297 

Clusters : 9, r_square_value :  0.6724807594276986 

Clusters : 10, r_square_value :  0.6734091526993119 

Clusters : 11, r_square_value :  0.6748620901739408 

Clusters : 12, r_square_value :  0.6700350194197229 

Clusters : 13, r_square_value :  0.6695728086012087 

Clusters : 14, r_square_value :  0.6735388885871829 

Clusters : 15, r_square_value :  0.6805542175356816 

Clusters : 16, r_square_value :  0.6800458350203731 

Clusters : 17, r_square_value :  0.6792996661230705 

Clusters : 18, r_square_value :  0.6806192391245172 

Clusters : 19, r_square_value :  0.67

In [17]:
# Saving appropriate cluster information and linear model informations accordingly.

max_index = all_rsquared.index(max(all_rsquared))
output_clusters = pd.DataFrame(columns = X_train.columns)
output_lms = pd.DataFrame(columns = X_train.columns.insert(0, "Intercept"))

for i in range(max_index + 1):
    output_clusters.loc[len(output_clusters.index)] = list(all_centers[max_index][i])
    output_lms.loc[len(output_lms.index)] = list(all_lms[max_index][i].intercept_) + list(all_lms[max_index][i].coef_.tolist()[0])
    
output_clusters.to_csv("ClusterInfo.csv", index = False)
output_lms.to_csv("LMSInfo.csv", index = False)