# House prices 6: Machine learning models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Render matplotlib figures directly in the notebook output.
%matplotlib inline
# NOTE: this silences *every* warning for the rest of the session, including
# any emitted by the models fitted further down.
warnings.filterwarnings("ignore")
# Apply seaborn's default plot theme.
sns.set()

In [2]:
from sklearn.linear_model import LinearRegression as Linear
from sklearn.linear_model import BayesianRidge as Bayesian
from sklearn.linear_model import ARDRegression as ARD
from sklearn.linear_model import HuberRegressor as Huber
from sklearn.linear_model import RANSACRegressor as RANSAC
from sklearn.linear_model import PoissonRegressor as Poisson
from sklearn.linear_model import TweedieRegressor as Tweedie
from sklearn.linear_model import GammaRegressor as Gamma
from sklearn.linear_model import SGDRegressor as SGD
from sklearn.tree import DecisionTreeRegressor as DecisionTree
from sklearn.tree import ExtraTreeRegressor as ExtraTree
from sklearn.ensemble import RandomForestRegressor as RandomForest
from sklearn.ensemble import AdaBoostRegressor as AdaBoost
from sklearn.ensemble import BaggingRegressor as Bagging
from sklearn.svm import SVR as SVR
from sklearn.neighbors import KNeighborsRegressor as KNeighbors
from sklearn.gaussian_process import GaussianProcessRegressor as Gaussian
from sklearn.neural_network import MLPRegressor as Neural

from sklearn.model_selection import KFold as KFold
from sklearn.model_selection import cross_val_score as VScore
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Read the training table and split it into the feature matrix (`train`)
# and the regression target (`target`). Going by the file name, the features
# are presumably PCA components produced by an earlier notebook -- confirm.
dataset = pd.read_csv("pca_train.csv")
target = dataset["target"]
train = dataset.drop(columns=["target"])

def writeCSV(number):
    """Dump the global `train` and `test` frames to numbered CSV files.

    The files are named ``train<number>.csv`` and ``test<number>.csv`` and
    are written relative to the current working directory.

    NOTE(review): `test` is not defined in the visible part of this
    notebook; it presumably comes from another cell -- confirm before use.
    """
    suffix = str(number) + ".csv"
    train.to_csv("train" + suffix)
    test.to_csv("test" + suffix)

def updateJoinedData():
    """Rebuild the global joined frame and its numeric / non-numeric views.

    Concatenates the frames held in the global `dataframes` and publishes
    three module-level names:

    * ``all_data``    -- the concatenated frame,
    * ``numerical``   -- only the columns with a numeric dtype,
    * ``categorical`` -- every other column.

    NOTE(review): `dataframes` is not defined in the visible part of this
    notebook; it presumably comes from another cell -- confirm before use.
    """
    global all_data, numerical, categorical
    joined = pd.concat(dataframes)
    number_kind = np.number
    all_data = joined
    numerical = joined.select_dtypes(include=number_kind)
    categorical = joined.select_dtypes(exclude=number_kind)

In [5]:
def getAccuracy(model, data, target, parameters=""):
    """Cross-validate one regressor and summarise its error.

    The estimator is fitted once per fold (10 folds) and both metrics are
    computed from those same fits, so the MSE and R2 figures always describe
    the same fitted models.

    Parameters
    ----------
    model : str
        Name of an estimator class visible from this notebook, for example
        ``"Linear"`` or ``"RandomForest"``. It is instantiated with its
        default arguments.
    data
        Feature matrix, passed to scikit-learn unchanged.
    target
        Target values, passed to scikit-learn unchanged.
    parameters : str or dict, optional
        Hyper-parameters applied with ``set_params``. Either a keyword
        argument string such as ``"alpha=0.1, max_iter=500"`` or a dict such
        as ``{"alpha": 0.1}``. Empty means "keep the defaults".

    Returns
    -------
    tuple
        ``(folds, mean_score, r2_mean)``: the per-fold mean squared errors
        (as positive numbers), their mean, and the mean R2 over the folds.
    """
    # SECURITY: eval() runs arbitrary code. It is kept so existing callers
    # that pass any resolvable expression keep working, but only trusted,
    # hard-coded names must ever reach this function.
    model = eval(model)()
    if parameters:
        if isinstance(parameters, str):
            # Turn "a=1, b=2" into {"a": 1, "b": 2}; same trust caveat as above.
            parameters = eval("dict(" + parameters + ")")
        model.set_params(**parameters)

    scoring = ("neg_mean_squared_error", "r2")
    # One cross-validation pass for both metrics instead of one pass each.
    results = cross_validate(model, data, target, cv=10, n_jobs=1, scoring=scoring)

    # scikit-learn reports the *negated* MSE; flip it back to a positive error.
    folds = np.abs(results["test_" + scoring[0]])
    mean_score = np.mean(folds)
    r2_mean = np.mean(results["test_" + scoring[1]])
    return folds, mean_score, r2_mean

In [6]:
def tryModel(model, data, target, parameters=""):
    """Evaluate a single model with ``getAccuracy`` and print a short report.

    The arguments are forwarded to ``getAccuracy`` unchanged. The report
    shows the per-fold scores, their mean in scientific notation, and the
    mean R2 expressed as a percentage. Nothing is returned.
    """
    fold_mse, avg_mse, avg_r2 = getAccuracy(model, data, target, parameters)
    r2_percent = avg_r2*100
    print("KFold Scores:  ", fold_mse, "\n")
    print("MEAN SQUARED ERROR:  ", "{:.4e}".format(avg_mse), "\n")
    print("R2 ACCURACY:  ", format(r2_percent, ".5g"), "%")

In [9]:
def testModels(data, target):    
    folds = [] #kfolds
    means = [] #squared errordef tryModel(model, data, target, parameters=""):
    scores = [] #r2 score
    models = ["Linear", "Bayesian", "ARD", "Huber", "RANSAC", "Poisson", "Tweedie", "Gamma",
              "SGD", "DecisionTree", "ExtraTree", "RandomForest", "AdaBoost", "Bagging",
              "SVR", "KNeighbors", "Gaussian", "Neural"]
    
    #Compute each model and print the progress
    unit = 100/len(models)
    progress = unit
    for i in models:
        try:
            print("Running", i + "...")
            folds.append(getAccuracy(i, data, target)[0])
            means.append(getAccuracy(i, data, target)[1])
            scores.append(getAccuracy(i, data, target)[2])
            print(i, "model successfully computed. Progress: ", "{0:.3g}".format(progress), "%")
            progress += unit
        except:
            print("ERROR!!! The model ", i, "was NOT computed")
    print("="*64, "\n")
    
    #Decide what model is the best and and printed on screen
    max_value = max(scores)
    index = scores.index(max_value)
    k_folds = folds[index]
    mean_score = means[index]
    print("BEST MODEL:  ", models[index], "\n")
    print("KFold Scores:  ", k_folds, "\n")
    print("MEAN SQUARED ERROR:  ", format(mean_score, ".4e"), "\n")
    print("R2 ACCURACY: ",  "{0:.5g}".format(max_value*100), "%") 

In [39]:
# Ordinary least-squares linear regression (sklearn LinearRegression), default settings.
tryModel("Linear", train, target)

KFold Scores:   [6.08420692e+08 8.43337772e+08 4.90951296e+08 1.20777784e+09
 1.17860883e+09 6.80784037e+08 1.35494059e+09 5.53800915e+08
 2.71885669e+09 6.20504111e+08] 

MEAN SQUARED ERROR:   1.0258e+09 

R2 ACCURACY:   84.092 %


In [20]:
# Huber regression (sklearn HuberRegressor), default settings.
tryModel("Huber", train, target)

KFold Scores:   [4.88684805e+08 6.78487868e+08 4.94489485e+08 1.23975702e+09
 1.25609100e+09 7.65946204e+08 5.77720587e+08 6.02834592e+08
 2.73622837e+09 5.44865014e+08] 

MEAN SQUARED ERROR:   9.3851e+08 

R2 ACCURACY:   85.867 %


In [27]:
# Tweedie generalized linear model (sklearn TweedieRegressor), default settings.
tryModel("Tweedie", train, target)

KFold Scores:   [5.28528460e+08 8.65803026e+08 6.13400406e+08 1.35840709e+09
 1.58911233e+09 9.00385865e+08 6.96572433e+08 7.06438396e+08
 2.86175764e+09 6.03869962e+08] 

MEAN SQUARED ERROR:   1.0724e+09 

R2 ACCURACY:   83.828 %


In [30]:
# Random forest (sklearn RandomForestRegressor), default settings.
tryModel("RandomForest", train, target)

KFold Scores:   [3.81928602e+08 8.33771432e+08 4.75626418e+08 1.59687013e+09
 1.20698728e+09 4.35818191e+08 4.02789568e+08 5.22975010e+08
 1.99257966e+09 4.73318544e+08] 

MEAN SQUARED ERROR:   8.3227e+08 

R2 ACCURACY:   87.717 %


In [31]:
# AdaBoost (sklearn AdaBoostRegressor), default settings.
tryModel("AdaBoost", train, target)

KFold Scores:   [6.94873551e+08 1.16285637e+09 8.24054304e+08 1.45013181e+09
 1.84505248e+09 6.77937634e+08 7.86256768e+08 7.78508615e+08
 2.31661685e+09 1.01955904e+09] 

MEAN SQUARED ERROR:   1.1556e+09 

R2 ACCURACY:   82.652 %


In [32]:
# Bagging ensemble (sklearn BaggingRegressor), default settings.
tryModel("Bagging", train, target)

KFold Scores:   [4.26140426e+08 9.01248479e+08 6.17420111e+08 1.71251074e+09
 1.17093418e+09 4.76220725e+08 4.61381192e+08 6.10276412e+08
 2.19809767e+09 5.41073365e+08] 

MEAN SQUARED ERROR:   9.1153e+08 

R2 ACCURACY:   85.941 %


In [10]:
# Evaluate every candidate model on the training data and report the one
# with the highest mean R2.
testModels(train, target)

Running Linear...
Linear model successfully computed. Progress:  5.56 %
Running Bayesian...
Bayesian model successfully computed. Progress:  11.1 %
Running ARD...
ARD model successfully computed. Progress:  16.7 %
Running Huber...
Huber model successfully computed. Progress:  22.2 %
Running RANSAC...
RANSAC model successfully computed. Progress:  27.8 %
Running Poisson...
Poisson model successfully computed. Progress:  33.3 %
Running Tweedie...
Tweedie model successfully computed. Progress:  38.9 %
Running Gamma...
Gamma model successfully computed. Progress:  44.4 %
Running SGD...
SGD model successfully computed. Progress:  50 %
Running DecisionTree...
DecisionTree model successfully computed. Progress:  55.6 %
Running ExtraTree...
ExtraTree model successfully computed. Progress:  61.1 %
Running RandomForest...
RandomForest model successfully computed. Progress:  66.7 %
Running AdaBoost...
AdaBoost model successfully computed. Progress:  72.2 %
Running Bagging...
Bagging model success