# Regression Models

## Imports

In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_squared_error

from yellowbrick.regressor import PredictionError

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nbimporter

import TrainTestEvalSplit as split
from sklearn.model_selection import KFold

import pickle
from sklearn.utils import check_X_y

from sklearn.model_selection import RandomizedSearchCV

## Import File

In [2]:
# Load the full dataset and convert the "Date" column from "%Y-%m-%d" text to datetimes.
# NOTE(review): the preview output shows an "Unnamed: 0" column — presumably a row index
# written by an earlier to_csv call; confirm whether it should reach the models.
full_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Full.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"], format="%Y-%m-%d")
)

# Show the first rows as the cell output.
full_df.head()

Unnamed: 0,Date,Hour,weekday,is_weekend,Sensor,SensorLongitude,SensorLatitude,CrowdednessCount,Lon_4.8971927,Lon_4.8973336,...,month_cos,day_sin,day_cos,hour_sin,hour_cos,Nieuwmarkt score,Nieuwezijds Kolk score,Dam score,Spui score,Centraal Station score
0,2018-03-11,100,6.0,1.0,GAWW-04,4.897908,52.373283,886,0,0,...,6.123234000000001e-17,0.188227,0.982126,0.258819,0.965926,0.0,0.0,102.996844,0.0,472.993853
1,2018-03-11,2100,6.0,1.0,GAWW-07,4.900441,52.374414,1603,0,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.998829,198.995171,1266.930956,133.98973,3859.981463
2,2018-03-11,2100,6.0,1.0,GAWW-08,4.897193,52.37165,21,1,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.997145,198.996668,1266.966573,133.995346,3859.909232
3,2018-03-11,2100,6.0,1.0,GAWW-09,4.898479,52.37504,88,0,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.997014,198.997601,1266.952991,133.991938,3859.978146
4,2018-03-11,2100,6.0,1.0,GAWW-10,4.898808,52.372369,49,0,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.998943,198.995907,1266.951383,133.993174,3859.941786


## Train/Test/Eval split

In [3]:
# Ratio handed to split.trainTestSplit in the next cell
# (presumably the training fraction — confirm against TrainTestEvalSplit).
size = 0.8

# Shuffled 10-fold splitter with a fixed seed so fold assignment is repeatable.
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [4]:
# Split the full frame into train / test / eval features and targets, plus the
# date collections that the cross-validation loops below index into.
(
    x_train, y_train,
    x_test, y_test,
    x_eval, y_eval,
    train_dates, eval_dates,
) = split.trainTestSplit(full_df, size)

## Models

### Baseline: Linear Regression
Implemented with the [scikit-learn version](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression).

#### Training

In [34]:
# Baseline linear regression model.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell needs an older scikit-learn release to run.
base = LinearRegression(
    fit_intercept=True,
    normalize=True,
    copy_X=False,
    n_jobs=1,
)

In [35]:
# Cross-validate the baseline over the training dates using the folds from `kf`.
# Folds are drawn over dates, so all rows sharing a date land on the same side
# of each split. Per-fold scores are collected and averaged over the number of
# folds actually run, instead of a hard-coded 10 that must track `n_splits`.
fold_r2 = []
fold_rmse = []

for train_index, val_index in kf.split(train_dates):
    # Look up each fold's dates once and reuse them for features and targets.
    fit_dates = train_dates[train_index]
    val_dates = train_dates[val_index]

    x_fit = x_train[x_train["Date"].isin(fit_dates)].drop(columns=["Date"])
    y_fit = y_train[y_train["Date"].isin(fit_dates)]["CrowdednessCount"]
    x_val = x_train[x_train["Date"].isin(val_dates)].drop(columns=["Date"])
    y_val = y_train[y_train["Date"].isin(val_dates)]["CrowdednessCount"]

    base.fit(x_fit, y_fit)
    fold_r2.append(base.score(x_val, y_val))

    y_pred_base = base.predict(x_val)
    # mean_squared_error expects (y_true, y_pred).
    fold_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_base)))

mean_score = sum(fold_r2) / len(fold_r2)
mean_rmse = sum(fold_rmse) / len(fold_rmse)

print("Average R^2 Score: ", mean_score)
print("Average RMSE Score: ", mean_rmse)

Average R^2 Score:  0.5865941580351348
Average RMSE Score:  632.3231658220328


#### Hyperparameter Tuning

##### fit_intercept

In [31]:
# Compare three linear-regression configurations on the evaluation split:
# with intercept, without intercept, and with intercept plus normalize=True.
# Each configuration is cross-validated over the evaluation dates with `kf`.
fit_bool = [True, False, "normalize"]

for b in fit_bool:
    if b == "normalize":
        base = LinearRegression(fit_intercept=True, normalize=True)
    else:
        base = LinearRegression(fit_intercept=b)

    # Per-fold scores; averaged over the folds actually run rather than a literal 10.
    fold_r2 = []
    fold_rmse = []

    for train_index, val_index in kf.split(eval_dates):
        # Look up each fold's dates once and reuse them for features and targets.
        fit_dates = eval_dates[train_index]
        val_dates = eval_dates[val_index]

        x_fit = x_eval[x_eval["Date"].isin(fit_dates)].drop(columns=["Date"])
        y_fit = y_eval[y_eval["Date"].isin(fit_dates)]["CrowdednessCount"]
        x_val = x_eval[x_eval["Date"].isin(val_dates)].drop(columns=["Date"])
        y_val = y_eval[y_eval["Date"].isin(val_dates)]["CrowdednessCount"]

        base.fit(x_fit, y_fit)
        fold_r2.append(base.score(x_val, y_val))

        y_pred_base = base.predict(x_val)
        # mean_squared_error expects (y_true, y_pred).
        fold_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_base)))

    mean_score = sum(fold_r2) / len(fold_r2)
    mean_rmse = sum(fold_rmse) / len(fold_rmse)

    print("Test for fit_intercept bool: ", b)
    print("Average R^2 Score: ", mean_score)
    print("Average RMSE Score: ", mean_rmse)
    print("\n")

Test for fit_intercept bool:  True
Average R^2 Score:  0.5556509211325003
Average RMSE Score:  624.1457396143729


Test for fit_intercept bool:  False
Average R^2 Score:  0.555648204905386
Average RMSE Score:  624.1471777564127


Test for fit_intercept bool:  normalize
Average R^2 Score:  0.5436915486502217
Average RMSE Score:  631.5665833984187




##### Copy_X

In [32]:
# Cross-validate the normalized baseline on the evaluation split with copy_X
# switched on and off.
fit_bool = [True, False]

for b in fit_bool:
    base = LinearRegression(fit_intercept=True, normalize=True, copy_X=b)

    # Per-fold scores; averaged over the folds actually run rather than a literal 10.
    fold_r2 = []
    fold_rmse = []

    for train_index, val_index in kf.split(eval_dates):
        # Look up each fold's dates once and reuse them for features and targets.
        fit_dates = eval_dates[train_index]
        val_dates = eval_dates[val_index]

        # Each fold gets freshly filtered frames, so copy_X=False cannot leak
        # modified data into the next fold.
        x_fit = x_eval[x_eval["Date"].isin(fit_dates)].drop(columns=["Date"])
        y_fit = y_eval[y_eval["Date"].isin(fit_dates)]["CrowdednessCount"]
        x_val = x_eval[x_eval["Date"].isin(val_dates)].drop(columns=["Date"])
        y_val = y_eval[y_eval["Date"].isin(val_dates)]["CrowdednessCount"]

        base.fit(x_fit, y_fit)
        fold_r2.append(base.score(x_val, y_val))

        y_pred_base = base.predict(x_val)
        # mean_squared_error expects (y_true, y_pred).
        fold_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_base)))

    mean_score = sum(fold_r2) / len(fold_r2)
    mean_rmse = sum(fold_rmse) / len(fold_rmse)

    print("Test for copy_X bool: ", b)
    print("Average R^2 Score: ", mean_score)
    print("Average RMSE Score: ", mean_rmse)
    print("\n")

Test for copy_X bool:  True
Average R^2 Score:  0.5436915486502217
Average RMSE Score:  631.5665833984187


Test for copy_X bool:  False
Average R^2 Score:  0.5436915486502217
Average RMSE Score:  631.5665833984187




##### N-Jobs

In [33]:
# Cross-validate the normalized baseline on the evaluation split for several
# n_jobs settings. The accumulators are created per setting inside the loop;
# the extra initialisation that used to precede the loop was never read.
fit_jobs = [1, 10, 100,-1]

for n in fit_jobs:
    base = LinearRegression(fit_intercept=True, normalize=True, copy_X=False, n_jobs=n)

    # Per-fold scores; averaged over the folds actually run rather than a literal 10.
    fold_r2 = []
    fold_rmse = []

    for train_index, val_index in kf.split(eval_dates):
        # Look up each fold's dates once and reuse them for features and targets.
        fit_dates = eval_dates[train_index]
        val_dates = eval_dates[val_index]

        x_fit = x_eval[x_eval["Date"].isin(fit_dates)].drop(columns=["Date"])
        y_fit = y_eval[y_eval["Date"].isin(fit_dates)]["CrowdednessCount"]
        x_val = x_eval[x_eval["Date"].isin(val_dates)].drop(columns=["Date"])
        y_val = y_eval[y_eval["Date"].isin(val_dates)]["CrowdednessCount"]

        base.fit(x_fit, y_fit)
        fold_r2.append(base.score(x_val, y_val))

        y_pred_base = base.predict(x_val)
        # mean_squared_error expects (y_true, y_pred).
        fold_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_base)))

    mean_score = sum(fold_r2) / len(fold_r2)
    mean_rmse = sum(fold_rmse) / len(fold_rmse)

    print("Test number of jobs: ", n)
    print("Average R^2 Score: ", mean_score)
    print("Average RMSE Score: ", mean_rmse)
    print("\n")

Test number of jobs:  1
Average R^2 Score:  0.5436915486502217
Average RMSE Score:  631.5665833984187


Test number of jobs:  10
Average R^2 Score:  0.5436915486502217
Average RMSE Score:  631.5665833984187


Test number of jobs:  100
Average R^2 Score:  0.5436915486502217
Average RMSE Score:  631.5665833984187


Test number of jobs:  -1
Average R^2 Score:  0.5436915486502217
Average RMSE Score:  631.5665833984187




### Random Forest Regressor
Implemented the [Sklearn Version](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

#### Training

In [44]:
# Random forest with only the seed set explicitly, so repeated runs build the same trees.
rfg = RandomForestRegressor(
    random_state=42,
)

In [45]:
# Cross-validate the random forest over the training dates using the folds from `kf`.
# Folds are drawn over dates, so all rows sharing a date land on the same side
# of each split. Per-fold scores are collected and averaged over the number of
# folds actually run, instead of a hard-coded 10 that must track `n_splits`.
fold_r2 = []
fold_rmse = []

for train_index, val_index in kf.split(train_dates):
    # Look up each fold's dates once and reuse them for features and targets.
    fit_dates = train_dates[train_index]
    val_dates = train_dates[val_index]

    x_fit = x_train[x_train["Date"].isin(fit_dates)].drop(columns=["Date"])
    y_fit = y_train[y_train["Date"].isin(fit_dates)]["CrowdednessCount"]
    x_val = x_train[x_train["Date"].isin(val_dates)].drop(columns=["Date"])
    y_val = y_train[y_train["Date"].isin(val_dates)]["CrowdednessCount"]

    rfg.fit(x_fit, y_fit)
    fold_r2.append(rfg.score(x_val, y_val))

    y_pred_rfg = rfg.predict(x_val)
    # mean_squared_error expects (y_true, y_pred).
    fold_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_rfg)))

mean_score = sum(fold_r2) / len(fold_r2)
mean_rmse = sum(fold_rmse) / len(fold_rmse)

print("Average R^2 Score: ", mean_score)
print("Average RMSE Score: ", mean_rmse)



Average R^2 Score:  0.8751634823422115
Average RMSE Score:  344.03229874790736


#### Hyperparameter Tuning

##### N_estimators

In [46]:
# Cross-validate the random forest on the evaluation split for several forest sizes.
estimators = [310, 320, 330]

for n in estimators:
    rfg = RandomForestRegressor(n_estimators=n, random_state=42)

    # Per-fold scores; averaged over the folds actually run rather than a literal 10.
    fold_r2 = []
    fold_rmse = []

    for train_index, val_index in kf.split(eval_dates):
        # Look up each fold's dates once and reuse them for features and targets.
        fit_dates = eval_dates[train_index]
        val_dates = eval_dates[val_index]

        x_fit = x_eval[x_eval["Date"].isin(fit_dates)].drop(columns=["Date"])
        y_fit = y_eval[y_eval["Date"].isin(fit_dates)]["CrowdednessCount"]
        x_val = x_eval[x_eval["Date"].isin(val_dates)].drop(columns=["Date"])
        y_val = y_eval[y_eval["Date"].isin(val_dates)]["CrowdednessCount"]

        rfg.fit(x_fit, y_fit)
        fold_r2.append(rfg.score(x_val, y_val))

        # NOTE(review): these are random-forest predictions stored under the
        # baseline's variable name; the name is kept so the notebook namespace is
        # unchanged — consider renaming once later cells have been checked.
        y_pred_base = rfg.predict(x_val)
        # mean_squared_error expects (y_true, y_pred).
        fold_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_base)))

    mean_score = sum(fold_r2) / len(fold_r2)
    mean_rmse = sum(fold_rmse) / len(fold_rmse)

    print("Test number of estimators: ", n)
    print("Average R^2 Score: ", mean_score)
    print("Average RMSE Score: ", mean_rmse)
    print("\n")

Test number of estimators:  310
Average R^2 Score:  0.7869756734292173
Average RMSE Score:  416.8075579449549


Test number of estimators:  320
Average R^2 Score:  0.7868046354319738
Average RMSE Score:  416.9523126613989


Test number of estimators:  330
Average R^2 Score:  0.7867256944723233
Average RMSE Score:  417.01173205436606




##### Criterion
Chose `mse` as the split criterion because training with `mae` took too long.

##### Bootstrap

In [47]:
# Cross-validate the 320-tree random forest on the evaluation split with
# bootstrap sampling switched on and off.
boot = [True, False]

for b in boot:
    rfg = RandomForestRegressor(random_state=42, n_estimators=320, criterion="mse", bootstrap=b)

    # Per-fold scores; averaged over the folds actually run rather than a literal 10.
    fold_r2 = []
    fold_rmse = []

    for train_index, val_index in kf.split(eval_dates):
        # Look up each fold's dates once and reuse them for features and targets.
        fit_dates = eval_dates[train_index]
        val_dates = eval_dates[val_index]

        x_fit = x_eval[x_eval["Date"].isin(fit_dates)].drop(columns=["Date"])
        y_fit = y_eval[y_eval["Date"].isin(fit_dates)]["CrowdednessCount"]
        x_val = x_eval[x_eval["Date"].isin(val_dates)].drop(columns=["Date"])
        y_val = y_eval[y_eval["Date"].isin(val_dates)]["CrowdednessCount"]

        rfg.fit(x_fit, y_fit)
        fold_r2.append(rfg.score(x_val, y_val))

        # NOTE(review): random-forest predictions stored under the baseline's
        # variable name; kept so the notebook namespace is unchanged.
        y_pred_base = rfg.predict(x_val)
        # mean_squared_error expects (y_true, y_pred).
        fold_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_base)))

    mean_score = sum(fold_r2) / len(fold_r2)
    mean_rmse = sum(fold_rmse) / len(fold_rmse)

    print("Test bootstrap: ", b)
    print("Average R^2 Score: ", mean_score)
    print("Average RMSE Score: ", mean_rmse)
    print("\n")

Test bootstrap:  True
Average R^2 Score:  0.7868046354319738
Average RMSE Score:  416.9523126613989


Test bootstrap:  False
Average R^2 Score:  0.6855598259921123
Average RMSE Score:  508.7618752609642




##### oob_score

In [48]:
# Cross-validate the bootstrapped 320-tree random forest on the evaluation
# split with oob_score switched on and off.
oobs = [True, False]

for oob in oobs:
    rfg = RandomForestRegressor(random_state=42, n_estimators=320, criterion="mse", bootstrap=True, oob_score=oob)

    # Per-fold scores; averaged over the folds actually run rather than a literal 10.
    fold_r2 = []
    fold_rmse = []

    for train_index, val_index in kf.split(eval_dates):
        # Look up each fold's dates once and reuse them for features and targets.
        fit_dates = eval_dates[train_index]
        val_dates = eval_dates[val_index]

        x_fit = x_eval[x_eval["Date"].isin(fit_dates)].drop(columns=["Date"])
        y_fit = y_eval[y_eval["Date"].isin(fit_dates)]["CrowdednessCount"]
        x_val = x_eval[x_eval["Date"].isin(val_dates)].drop(columns=["Date"])
        y_val = y_eval[y_eval["Date"].isin(val_dates)]["CrowdednessCount"]

        rfg.fit(x_fit, y_fit)
        fold_r2.append(rfg.score(x_val, y_val))

        # NOTE(review): random-forest predictions stored under the baseline's
        # variable name; kept so the notebook namespace is unchanged.
        y_pred_base = rfg.predict(x_val)
        # mean_squared_error expects (y_true, y_pred).
        fold_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_base)))

    mean_score = sum(fold_r2) / len(fold_r2)
    mean_rmse = sum(fold_rmse) / len(fold_rmse)

    print("Test Oob score: ", oob)
    print("Average R^2 Score: ", mean_score)
    print("Average RMSE Score: ", mean_rmse)
    print("\n")

Test Oob score:  True
Average R^2 Score:  0.7868046354319738
Average RMSE Score:  416.9523126613989


Test Oob score:  False
Average R^2 Score:  0.7868046354319738
Average RMSE Score:  416.9523126613989




##### n_jobs

In [49]:
# Cross-validate the bootstrapped 320-tree random forest on the evaluation
# split for several n_jobs settings.
jobs = [-1, 1, 5, 10]

for job in jobs:
    rfg = RandomForestRegressor(random_state=42, n_estimators=320, criterion="mse", bootstrap=True, oob_score=False,
                               n_jobs=job)

    # Per-fold scores; averaged over the folds actually run rather than a literal 10.
    fold_r2 = []
    fold_rmse = []

    for train_index, val_index in kf.split(eval_dates):
        # Look up each fold's dates once and reuse them for features and targets.
        fit_dates = eval_dates[train_index]
        val_dates = eval_dates[val_index]

        x_fit = x_eval[x_eval["Date"].isin(fit_dates)].drop(columns=["Date"])
        y_fit = y_eval[y_eval["Date"].isin(fit_dates)]["CrowdednessCount"]
        x_val = x_eval[x_eval["Date"].isin(val_dates)].drop(columns=["Date"])
        y_val = y_eval[y_eval["Date"].isin(val_dates)]["CrowdednessCount"]

        rfg.fit(x_fit, y_fit)
        fold_r2.append(rfg.score(x_val, y_val))

        # NOTE(review): random-forest predictions stored under the baseline's
        # variable name; kept so the notebook namespace is unchanged.
        y_pred_base = rfg.predict(x_val)
        # mean_squared_error expects (y_true, y_pred).
        fold_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_base)))

    mean_score = sum(fold_r2) / len(fold_r2)
    mean_rmse = sum(fold_rmse) / len(fold_rmse)

    print("Test jobs: ", job)
    print("Average R^2 Score: ", mean_score)
    print("Average RMSE Score: ", mean_rmse)
    print("\n")

Test jobs:  -1
Average R^2 Score:  0.7868046354319738
Average RMSE Score:  416.9523126613989


Test jobs:  1
Average R^2 Score:  0.7868046354319738
Average RMSE Score:  416.9523126613989


Test jobs:  5
Average R^2 Score:  0.7868046354319738
Average RMSE Score:  416.9523126613989


Test jobs:  10
Average R^2 Score:  0.7868046354319738
Average RMSE Score:  416.9523126613989




### XGBoost Regressor
Implemented through XGBoost's [scikit-learn API](https://xgboost.readthedocs.io/en/latest/python/python_api.html).

#### Training

In [11]:
# Gradient-boosted tree regressor with a fixed seed.
xgbr = xgb.XGBRegressor(
    max_depth=6,
    learning_rate=0.08,
    n_estimators=500,
    verbosity=1,
    booster="gbtree",
    n_jobs=50,
    random_state=42,
)

In [12]:
# Cross-validate the XGBoost regressor over the training dates using the folds from `kf`.
# Each fold's frames go through check_X_y, which validates them and returns the
# arrays that are then passed to the model. Per-fold scores are averaged over
# the folds actually run, instead of a hard-coded 10 that must track `n_splits`.
fold_r2 = []
fold_rmse = []

for train_index, val_index in kf.split(train_dates):
    # Look up each fold's dates once and reuse them for features and targets.
    fit_dates = train_dates[train_index]
    val_dates = train_dates[val_index]

    x_train_con, y_train_con = check_X_y(
        X=x_train[x_train["Date"].isin(fit_dates)].drop(columns=["Date"]),
        y=y_train[y_train["Date"].isin(fit_dates)]["CrowdednessCount"],
    )

    xgbr.fit(x_train_con, y_train_con)

    x_val_con, y_val_con = check_X_y(
        X=x_train[x_train["Date"].isin(val_dates)].drop(columns=["Date"]),
        y=y_train[y_train["Date"].isin(val_dates)]["CrowdednessCount"],
    )

    fold_r2.append(xgbr.score(x_val_con, y_val_con))

    y_pred_xgbr = xgbr.predict(x_val_con)
    # mean_squared_error expects (y_true, y_pred).
    fold_rmse.append(np.sqrt(mean_squared_error(y_val_con, y_pred_xgbr)))

mean_score = sum(fold_r2) / len(fold_r2)
mean_rmse = sum(fold_rmse) / len(fold_rmse)

print("Average R^2 Score: ", mean_score)
print("Average RMSE Score: ", mean_rmse)

Average R^2 Score:  0.8266354773156328
Average RMSE Score:  404.7650130646047
