### King County House Price Prediction

In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
# import the relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score,recall_score
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

---

In [3]:
# Expand the sale date into separate calendar components.
def SplitDate(df):
    """Parse the ``date`` column and append ``year``, ``month`` and ``day``.

    The frame is modified in place: ``date`` is overwritten with its
    ``pd.to_datetime`` conversion and the three component columns are added.
    Nothing is returned.
    """
    parsed = pd.to_datetime(df['date'])
    df['date'] = parsed
    for part in ('year', 'month', 'day'):
        df[part] = getattr(parsed.dt, part)

In [4]:
# Remove the columns that are not used as model inputs.
def Drop(df):
    """Drop the ``id``, ``zipcode`` and ``date`` columns from ``df`` in place.

    Columns are removed one at a time, in that order; nothing is returned.
    """
    for column in ("id", "zipcode", "date"):
        df.drop(columns=column, inplace=True)

In [5]:
# Visualize pairwise correlations between the columns of a frame.
def HeatMap(df):
    """Draw an annotated heat map of ``df.corr()`` on a 20x15 inch figure."""
    correlations = df.corr()
    fig, ax = plt.subplots(figsize=(20, 15))
    sns.heatmap(correlations, annot=True, cmap=plt.cm.BrBG, ax=ax)
    ax.set_title("Correlation matrix among features")
    plt.show()

In [6]:
# Keep only the columns that are used for modelling.
def FeatureSelection(df):
    """Return a new frame without the columns excluded from modelling.

    The input frame is left untouched; ``DataFrame.drop`` returns a copy.
    """
    excluded = [
        "day",
        "year",
        "sqft_lot15",
        "sqft_living15",
        "sqft_basement",
        "condition",
        "yr_built",
        "bathrooms",
    ]
    return df.drop(columns=excluded)

In [7]:
# Separate the model inputs from the prediction target.
def DetermineXY(df):
    """Split ``df`` into features and target.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is ``df`` without the ``price`` column and
        ``y`` is a single-column DataFrame holding ``price``.
    """
    target = df[['price']]
    features = df.drop(columns='price')
    return features, target

In [8]:
# Standardize the inputs using statistics learned from the training split only.
def standardization(x_train,x_test):
    """Fit a ``StandardScaler`` on ``x_train`` and apply it to both splits.

    Returns the transformed training data followed by the transformed test
    data; the scaler is never fitted on ``x_test``.
    """
    scaler = StandardScaler().fit(x_train)
    return scaler.transform(x_train), scaler.transform(x_test)

---

In [9]:
# Randomized hyperparameter search for the Decision Tree model.
def CVDecisionTree(x_train, y_train, random_state=0):
    """Return the best decision-tree hyperparameters found by randomized search.

    Parameters
    ----------
    x_train, y_train
        Training features and target passed to ``RandomizedSearchCV.fit``.
    random_state : int or None, default 0
        Seed for both the estimator and the search so that repeated runs
        sample the same 25 candidates and report the same result. Pass
        ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search (keys ``splitter``,
        ``max_depth``, ``min_samples_leaf``).
    """
    params = {
        'splitter': ["best", "random"],
        'max_depth': np.arange(5, 30),
        'min_samples_leaf': np.arange(1, 20),
    }
    dt = DecisionTreeRegressor(random_state=random_state)
    # 25 sampled candidates, each scored with 4-fold cross-validation.
    cv2 = RandomizedSearchCV(dt, params, cv=4, n_iter=25, random_state=random_state)
    cv2.fit(x_train, y_train)
    return cv2.best_params_

In [10]:
# Randomized hyperparameter search for the Random Forest model.
def CVRandomForest(x_train, y_train, random_state=0):
    """Return the best random-forest hyperparameters found by randomized search.

    Parameters
    ----------
    x_train, y_train
        Training features and target passed to ``RandomizedSearchCV.fit``.
    random_state : int or None, default 0
        Seed for both the estimator and the search so that repeated runs
        sample the same 25 candidates and report the same result. Pass
        ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search (keys ``n_estimators``,
        ``max_depth``, ``min_samples_leaf``).
    """
    params = {
        'n_estimators': np.arange(100, 250, 30),
        'max_depth': np.arange(5, 15),
        'min_samples_leaf': np.arange(1, 10),
    }
    rf = RandomForestRegressor(random_state=random_state)
    # 25 sampled candidates, each scored with 4-fold cross-validation.
    cv2 = RandomizedSearchCV(rf, params, cv=4, n_iter=25, random_state=random_state)
    cv2.fit(x_train, y_train)
    return cv2.best_params_

In [11]:
# Randomized hyperparameter search for the XGBoost model.
def CVXGBoost(x_train, y_train, random_state=0):
    """Return the best XGBoost hyperparameters found by randomized search.

    Parameters
    ----------
    x_train, y_train
        Training features and target passed to ``RandomizedSearchCV.fit``.
    random_state : int or None, default 0
        Seed for both the estimator and the search so that repeated runs
        sample the same 25 candidates and report the same result. Pass
        ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search (keys ``n_estimators``,
        ``max_depth``, ``max_leaves``).
    """
    params = {
        'n_estimators': np.arange(100, 250, 30),
        'max_depth': np.arange(5, 15),
        'max_leaves': np.arange(1, 10),
    }
    xg = XGBRegressor(random_state=random_state)
    # 25 sampled candidates, each scored with 4-fold cross-validation.
    cv2 = RandomizedSearchCV(xg, params, cv=4, n_iter=25, random_state=random_state)
    cv2.fit(x_train, y_train)
    return cv2.best_params_

---

In [12]:
# Read the King County house sales CSV into a DataFrame.
# NOTE(review): the file name contains a space before ".csv" - confirm it matches the file on disk.
data = pd.read_csv(filepath_or_buffer="kc_house_data .csv")

Call the data preparation functions

In [13]:
# Run the full preparation pipeline on the raw frame.
def DataPreperation(data):
    """Turn the raw frame into train/test splits, raw and standardized.

    Steps: split the date into components, drop unused columns, select
    features, separate inputs from the ``price`` target, make a 70/30
    train/test split (``random_state=0``) and standardize the inputs.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame as loaded from the CSV. It is not modified.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, x_train_sc, x_test_sc)``.
    """
    # Work on a private copy: SplitDate and Drop modify their argument in
    # place, and mutating the caller's frame makes the calling cell fail when
    # it is re-run (the 'date' column would already be gone).
    prepared = data.copy()
    SplitDate(prepared)
    Drop(prepared)
    selected = FeatureSelection(prepared)
    x, y = DetermineXY(selected)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
    x_train_sc, x_test_sc = standardization(x_train, x_test)
    return x_train, x_test, y_train, y_test, x_train_sc, x_test_sc

In [14]:
 # Run the preparation pipeline once and keep both the raw and the standardized train/test splits.
 x_train, x_test, y_train, y_test,x_train_sc,x_test_sc=DataPreperation(data)

Linear Regression

In [15]:
# Fit and evaluate the Linear Regression model.
def LinerRegrition(x_train, x_test, y_train, y_test):
    """Fit ``LinearRegression`` on the training split and report its quality.

    Prints the train and test scores (``model.score``, i.e. R^2 for
    scikit-learn regressors), the first five true and predicted prices and
    the mean absolute error on the test split.

    Returns
    -------
    float
        The score on the test split.
    """
    model = LinearRegression().fit(x_train, y_train)
    predictions = model.predict(x_test)
    train_score = model.score(x_train, y_train)
    test_score = model.score(x_test, y_test)
    test_mae = mean_absolute_error(y_test, predictions)

    print("The training data score= ", train_score)
    print("The testing data score= ", test_score)
    print("The reall price for this model: ", y_test[:5])
    print("The price prediction for this model: ", predictions[:5])
    print("The mean absolute error= ", test_mae)
    return test_score

In [16]:
# Linear regression: fit on the standardized features and keep the test score.
print("The Liner Regrition Model")
LinerRegrition_accurecy = LinerRegrition(
    x_train=x_train_sc,
    x_test=x_test_sc,
    y_train=y_train,
    y_test=y_test,
)

The Liner Regrition Model
The training data score=  0.67436894355064
The testing data score=  0.6584779324386063
The reall price for this model:             price
17384   297000.0
722    1578000.0
2680    562100.0
18754   631500.0
14554   780000.0
The price prediction for this model:  [[ 429521.10287413]
 [1418060.59361937]
 [ 492676.02025756]
 [ 498615.07677153]
 [1111732.7467045 ]]
The mean absolute error=  131908.37577056943


---

Decision Tree

In [17]:
# Decision tree: randomized search for hyperparameters on the training split.
DT_best_params = CVDecisionTree(x_train=x_train, y_train=y_train)
print("the best parameters to use for this model", DT_best_params)

the best parameters to use for this model {'splitter': 'best', 'min_samples_leaf': 3, 'max_depth': 12}


In [18]:
# Fit and evaluate the Decision Tree model.
def DecisionTree(x_train, x_test, y_train, y_test, params=None):
    """Fit ``DecisionTreeRegressor`` on the training split and report its quality.

    Prints the train and test scores (``dt.score``), the first five true and
    predicted prices and the mean absolute error on the test split.

    Parameters
    ----------
    x_train, x_test, y_train, y_test
        Train/test features and targets.
    params : dict, optional
        Keyword arguments for ``DecisionTreeRegressor``, e.g. the dictionary
        returned by ``CVDecisionTree``. When omitted, the original hard-coded
        settings are used so existing calls behave as before.

    Returns
    -------
    float
        The score on the test split.
    """
    if params is None:
        # NOTE(review): these defaults differ from the search result recorded
        # in the notebook output (min_samples_leaf=3, max_depth=12); pass the
        # search result via `params` to actually use the tuned values.
        params = {'splitter': 'best', 'min_samples_leaf': 12, 'max_depth': 8}
    dt = DecisionTreeRegressor(**params)
    dt.fit(x_train, y_train)
    y_pred = dt.predict(x_test)
    score_train = dt.score(x_train, y_train)
    score_test = dt.score(x_test, y_test)
    print("The training data score= ", score_train)
    print("The testing data score= ", score_test)
    print("The reall price for this model: ", y_test[0:5])
    print("The price prediction for this model: ", y_pred[0:5])
    print("The mean absolute error= ", mean_absolute_error(y_test, y_pred))
    return score_test

In [19]:
# Decision tree: fit on the standardized features and keep the test score.
print("The Decision Tree Model")
DecisionTree_accurecy = DecisionTree(
    x_train=x_train_sc,
    x_test=x_test_sc,
    y_train=y_train,
    y_test=y_test,
)

The Decision Tree Model
The training data score=  0.8264735428426069
The testing data score=  0.782475437577127
The reall price for this model:             price
17384   297000.0
722    1578000.0
2680    562100.0
18754   631500.0
14554   780000.0
The price prediction for this model:  [ 326870.65217391 1504752.          499934.97076736  499934.97076736
  809260.86956522]
The mean absolute error=  94092.3335461281


---

Random Forest

In [None]:
# Random forest: randomized search for hyperparameters on the training split.
RF_best_params = CVRandomForest(x_train=x_train, y_train=y_train)
print("the best parameters to use for this model", RF_best_params)

In [None]:
# Fit and evaluate the Random Forest model.
def RandomForest(x_train, x_test, y_train, y_test, params=None):
    """Fit ``RandomForestRegressor`` on the training split and report its quality.

    Prints the train and test scores (``rf.score``), the first five true and
    predicted prices and the mean absolute error on the test split.

    Parameters
    ----------
    x_train, x_test, y_train, y_test
        Train/test features and targets.
    params : dict, optional
        Keyword arguments for ``RandomForestRegressor``, e.g. the dictionary
        returned by ``CVRandomForest``. When omitted, the original hard-coded
        settings are used so existing calls behave as before.

    Returns
    -------
    float
        The score on the test split.
    """
    if params is None:
        # NOTE(review): n_estimators=200 is not one of the values explored by
        # CVRandomForest (100, 130, 160, 190, 220); pass the search result via
        # `params` to use tuned values.
        params = {'n_estimators': 200, 'min_samples_leaf': 7, 'max_depth': 10}
    rf = RandomForestRegressor(**params)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    score_train = rf.score(x_train, y_train)
    score_test = rf.score(x_test, y_test)
    print("The training data score= ", score_train)
    print("The testing data score= ", score_test)
    print("The reall price for this model: ", y_test[0:5])
    print("The price prediction for this model: ", y_pred[0:5])
    print("The mean absolute error= ", mean_absolute_error(y_test, y_pred))
    return score_test

In [None]:
# Random forest: fit on the unscaled features and keep the test score.
print("The Random Forest Model")
RandomForest_accurecy = RandomForest(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

---

XGBoost

In [None]:
# XGBoost: randomized search for hyperparameters on the training split.
XG_best_params = CVXGBoost(x_train=x_train, y_train=y_train)
print(XG_best_params)

In [None]:
# Fit and evaluate the XGBoost model.
def XGBoost(x_train, x_test, y_train, y_test, params=None):
    """Fit ``XGBRegressor`` on the training split and report its quality.

    Prints the train and test scores (``xg.score``), the first five true and
    predicted prices and the mean absolute error on the test split.

    Parameters
    ----------
    x_train, x_test, y_train, y_test
        Train/test features and targets.
    params : dict, optional
        Keyword arguments for ``XGBRegressor``, e.g. the dictionary returned
        by ``CVXGBoost``. When omitted, the original hard-coded settings are
        used so existing calls behave as before.

    Returns
    -------
    float
        The score on the test split.
    """
    if params is None:
        # NOTE(review): max_leaves=13 and max_depth=3 both lie outside the
        # ranges explored by CVXGBoost (max_leaves 1-9, max_depth 5-14); pass
        # the search result via `params` to use tuned values.
        params = {'n_estimators': 120, 'max_leaves': 13, 'max_depth': 3}
    xg = XGBRegressor(**params)
    xg.fit(x_train, y_train)
    y_pred = xg.predict(x_test)
    score_train = xg.score(x_train, y_train)
    score_test = xg.score(x_test, y_test)
    print("The training data score= ", score_train)
    print("The testing data score= ", score_test)
    print("The reall price for this model: ", y_test[0:5])
    print("The price prediction for this model: ", y_pred[0:5])
    print("The mean absolute error= ", mean_absolute_error(y_test, y_pred))
    return score_test

In [None]:
# XGBoost: fit on the unscaled features and keep the test score.
print("The XGBoost Model")
XGBoost_accurecy = XGBoost(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

---

The best Model

In [None]:
# Report the model with the highest test score.
# Each entry pairs a model name with the score its evaluation function returned.
array = [
    ["LinerRegrition", LinerRegrition_accurecy],
    ["DecisionTree", DecisionTree_accurecy],
    ["RandomForest", RandomForest_accurecy],
    ["XGBoost", XGBoost_accurecy],
]
best = [score for _, score in array]
# Select name and score together; the previous version printed the loop
# variable after the loop had finished, so the name was always the last entry
# in the list no matter which score was highest.
best_name, best_score = max(array, key=lambda entry: entry[1])
print("The best Model is:", best_name, " with accuracy = ", best_score)
 

Pickle

In [None]:
# Fit an XGBoost regressor at notebook scope so it can be serialized afterwards.
XGB = XGBRegressor(max_depth=3, max_leaves=13, n_estimators=120)
XGB.fit(x_train, y_train)

In [None]:
# Serialize the fitted XGB model to the file 'XGBoost' in the working directory.
with open('XGBoost', mode='wb') as model_file:
    pickle.dump(XGB, model_file)