# Predict House Prices in King County, Washington  

## Overview 

King County, Washington is the most populous county in Washington. It is also the 12th most populous in the United States. Washington's most populous city is Seattle.

Our goal is to predict the price of a house based on houses sold between May 2014 to May 2015 in King County, Washington State, USA.  

In [1]:
#imports 
import pandas as pd
import numpy as np
import opendatasets as od
#import os
#import pickle
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# 1. Data Wrangling

In [2]:
#get the data from API
#https://www.kaggle.com/swathiachath/kc-housesales-data > data with explanations 
housingAPI = od.download("https://www.kaggle.com/shivachandel/kc-house-data")

Skipping, found downloaded files in "./kc-house-data" (use force=True to force download)


In [3]:
#get the data and read it 
housingData = pd.read_csv('./kc-house-data/kc_house_data.csv')

In [4]:
#check for duplicates
housingData.duplicated()

duplicate = housingData[housingData.duplicated()]
print(duplicate)

Empty DataFrame
Columns: [id, date, price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition, grade, sqft_above, sqft_basement, yr_built, yr_renovated, zipcode, lat, long, sqft_living15, sqft_lot15]
Index: []

[0 rows x 21 columns]


In [5]:
#check for any null values
housingData.isnull().values.any()

#check how many null values
housingData.isnull().sum()

# 2 null values are in the sqft_above column > index 10 & 17
housingData[housingData['sqft_above'].isna()]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
10,1736800520,20150403T000000,662500.0,3,2.5,3560,9796,1.0,0,0,...,8,,1700,1965,0,98007,47.6007,-122.145,2210,8925
17,6865200140,20140529T000000,485000.0,4,1.0,1600,4300,1.5,0,0,...,7,,0,1916,0,98103,47.6648,-122.343,1610,4300


In [6]:
#Remove the rows where sqft_above is not NA
newHousingDF = housingData[housingData['sqft_above'].notna()]

newHousingDF.isnull().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

# 2 Preprocessing and Training


## 2.1 Train/Test split

In [7]:
#X = newHousingDF[["sqft_living", 'grade', 'lat', 'long']]
X = newHousingDF.drop(columns=['date', 'price', 'id', 'bathrooms', 'sqft_basement', 'yr_renovated'])
y = newHousingDF['price']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=47)

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(15127, 15) (6484, 15) (15127,) (6484,)


### Metrics 

In [10]:
def r_squared(y, ypred):
    """R-squared score.
    Arguments:
    y -- the observed values
    ypred -- the predicted values
    """
    ybar = np.sum(y) / len(y) #yes, we could use np.mean(y)
    sum_sq_tot = np.sum((y - ybar)**2) #total sum of squares error
    sum_sq_res = np.sum((y - ypred)**2) #residual sum of squares error
    R2 = 1.0 - sum_sq_res / sum_sq_tot
    return R2

In [11]:
#Calculate the mean of `y_train`
train_mean = y_train.mean()
train_mean

541317.1673828254

In [12]:
#init dummy regressor
dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)
dumb_reg.constant_

array([[541317.16738283]])

In [13]:
y_tr_pred = dumb_reg.predict(X_train)
y_tr_pred[:5]

array([541317.16738283, 541317.16738283, 541317.16738283, 541317.16738283,
       541317.16738283])

In [14]:
y_te_pred = train_mean * np.ones(len(y_test))
r_squared(y_test, y_te_pred)

-0.0001319455771289224

In [15]:
#mean squared error function
def mse(y, ypred):
    """Mean square error.

    Arguments:
    y -- the observed values
    ypred -- the predicted values
    """
    sq_error = (y - ypred)**2
    mse = np.mean(sq_error)
    return mse

In [16]:
#root mean square error
np.sqrt([mse(y_train, y_tr_pred), mse(y_test, y_te_pred)])

array([371174.53555269, 357539.02472642])

## 2.2 Linear regression

In [17]:
#Let's start with the stats model
import statsmodels.api as sm


X_stats = sm.add_constant(X)
model = sm.OLS(y, X_stats).fit()

print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.697
Model:                            OLS   Adj. R-squared:                  0.697
Method:                 Least Squares   F-statistic:                     3309.
Date:                Tue, 19 Apr 2022   Prob (F-statistic):               0.00
Time:                        15:45:40   Log-Likelihood:            -2.9468e+05
No. Observations:               21611   AIC:                         5.894e+05
Df Residuals:                   21595   BIC:                         5.895e+05
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const           7.79e+06   2.94e+06      2.651

  x = pd.concat(x[::order], 1)


In [18]:
#Linear model

X_tr = X_train
X_te = X_test

#scale data
scaler = StandardScaler()
scaler.fit(X)
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)

In [19]:
#create linear regression object and train the train on the model
lm = LinearRegression().fit(X_tr_scaled, y_train)

In [20]:
#predict it using the train and test 
y_tr_pred = lm.predict(X_tr_scaled)
y_te_pred = lm.predict(X_te_scaled)

In [21]:
#root mean square error
np.sqrt([mse(y_train, y_tr_pred), mse(y_test, y_te_pred)])

array([205344.01069285, 194733.20654819])

<b>We are going to use GridSearchCV.</b>

In [22]:
#get the params for the gridsearch 
lm.get_params().keys()

dict_keys(['copy_X', 'fit_intercept', 'n_jobs', 'normalize', 'positive'])

In [23]:
linear_params = {'fit_intercept': [True], 'normalize':[True]}
#'copy_X': [True]
gs = GridSearchCV(lm, linear_params, cv = 5)
gs.fit(X_tr, y_train)

print(gs.best_params_)

{'fit_intercept': True, 'normalize': True}


In [24]:
#get the best scores 
print(gs.best_score_)

0.692841236373096


<b>We will now get the Accuracy</b> 

In [25]:
lm.score(X_te_scaled, y_test)

0.7033185264118567

<b>The accuracy is 70%.</b>

In [26]:
#root mean square error
np.sqrt([mse(y_train, y_tr_pred), mse(y_test, y_te_pred)])

array([205344.01069285, 194733.20654819])

## 2.3 Ridge regression

In [27]:
from sklearn.linear_model import Ridge
from sklearn import metrics

ridge = Ridge().fit(X_tr_scaled, y_train)

#predict it using the train and test 
y_tr_pred = ridge.predict(X_tr_scaled)
y_te_pred = ridge.predict(X_te_scaled)

In [28]:
#root mean square error
np.sqrt([mse(y_train, y_tr_pred), mse(y_test, y_te_pred)])

array([205344.0124379, 194733.0943854])

<b>We are going to use Gridsearchcv</b>

In [29]:
ridge.get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'random_state', 'solver', 'tol'])

In [30]:
ridge_params = {'fit_intercept': [True], 'normalize':[True]}

In [31]:
ridge_gs = GridSearchCV(ridge, ridge_params, cv = 5)
ridge_gs.fit(X_tr, y_train)

print(ridge_gs.best_params_)

{'fit_intercept': True, 'normalize': True}


In [32]:
#get the best scores 
print(ridge_gs.best_score_)

0.6138004019250871


In [33]:
ridge_gs.score(X_test, y_test)

0.6278121617753638

In [34]:
#root mean square error
np.sqrt([mse(y_train, y_tr_pred), mse(y_test, y_te_pred)])

array([205344.0124379, 194733.0943854])

## 2.4 Random Forest

In [35]:
from sklearn.ensemble import RandomForestRegressor

X_tr = X_train
X_te = X_test

rf = RandomForestRegressor()
rf.fit(X_tr, y_train)

RandomForestRegressor()

In [36]:
#predict it using the train and test 
y_tr_pred = rf.predict(X_tr_scaled)
y_te_pred = rf.predict(X_te_scaled)

In [37]:
#root mean square error
np.sqrt([mse(y_train, y_tr_pred), mse(y_test, y_te_pred)])

array([470484.33212226, 455802.54071825])

<b>We are going to use Gridsearchcv</b>

In [38]:
rf.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [39]:
rf_params = {'max_depth': [100], 'max_features': ['auto'], 'random_state': [47]}

In [40]:
rf_gs = GridSearchCV(rf, rf_params, cv = 5)
rf_gs.fit(X_tr, y_train) 

print(rf_gs.best_params_)

{'max_depth': 100, 'max_features': 'auto', 'random_state': 47}


In [41]:
#get the best scores 
print(rf_gs.best_score_)

0.8868405496935114


In [42]:
rf_gs.score(X_test, y_test)

0.8557264893512168

In [43]:
#root mean square error
np.sqrt([mse(y_train, y_tr_pred), mse(y_test, y_te_pred)])

array([470484.33212226, 455802.54071825])

<b>The Random Forest model had the highest score of 86% and the highest root mean square error. We will use this model for the housing prices.</b>

# 3 Modeling

In [47]:
#Making an prediction
prediction1 = rf.predict(X)
print(prediction1)

[232690.63       524543.46       216714.83333333 ... 376417.93
 412130.         338663.9       ]


In [48]:
prediction2 = rf.predict(X_te)
print(prediction2)

[1540557.2         228076.29        333467.         ...  637949.86666667
  371988.09        436193.35      ]


In [49]:
prediction3 = rf.predict(X_te_scaled)
print(prediction3)

[233746.  227933.5 387807.  ... 230717.  230717.  227933.5]
