# Classification Models

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nbimporter

import TrainTestEvalSplit as split

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

#Classification Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from yellowbrick.classifier import ClassPredictionError

import pickle

from sklearn.model_selection import KFold
from sklearn.utils import check_X_y

Importing Jupyter notebook from TrainTestEvalSplit.ipynb


## Import File

In [2]:
full_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Full.csv")
full_df["Date"] = pd.to_datetime(full_df["Date"], format="%Y-%m-%d")

full_df.head()

Unnamed: 0,Date,Hour,weekday,is_weekend,Sensor,SensorLongitude,SensorLatitude,CrowdednessCount,Lon_4.8971927,Lon_4.8973336,...,month_cos,day_sin,day_cos,hour_sin,hour_cos,Nieuwmarkt score,Nieuwezijds Kolk score,Dam score,Spui score,Centraal Station score
0,2018-03-11,100,6.0,1.0,GAWW-04,4.897908,52.373283,886,0,0,...,6.123234000000001e-17,0.188227,0.982126,0.258819,0.965926,0.0,0.0,102.996844,0.0,472.993853
1,2018-03-11,2100,6.0,1.0,GAWW-07,4.900441,52.374414,1603,0,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.998829,198.995171,1266.930956,133.98973,3859.981463
2,2018-03-11,2100,6.0,1.0,GAWW-08,4.897193,52.37165,21,1,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.997145,198.996668,1266.966573,133.995346,3859.909232
3,2018-03-11,2100,6.0,1.0,GAWW-09,4.898479,52.37504,88,0,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.997014,198.997601,1266.952991,133.991938,3859.978146
4,2018-03-11,2100,6.0,1.0,GAWW-10,4.898808,52.372369,49,0,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.998943,198.995907,1266.951383,133.993174,3859.941786


## Train/Test/Eval

In [3]:
size = 0.9
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [4]:
full_df = split.clasCrowdednessCounts(full_df)

In [5]:
x_train, y_train, x_test, y_test, x_eval, y_eval, train_dates, eval_dates = split.trainTestSplit(full_df, size)

## Models

### Random Forrest Classifier 
Implemented the [Sklearn Version](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier)

#### Training

In [11]:
rfc = RandomForestClassifier(random_state=42)

In [12]:
labels = [1, 2, 3, 4]

mean_acc = 0

mean_precision = 0
mean_recall = 0
mean_f1_score = 0

for train_index, val_index in kf.split(train_dates):
    
    x_train_con, y_train_con = check_X_y(X=x_train[x_train["Date"].isin(train_dates[train_index])].drop(columns={"Date"}),
                                        y=y_train[y_train["Date"].isin(train_dates[train_index])]["CrowdednessCount"])
    rfc.fit(x_train_con,y_train_con)
    
    x_val_con, y_val_con = check_X_y(X=x_train[x_train["Date"].isin(train_dates[val_index])].drop(columns={"Date"}),
                                    y=y_train[y_train["Date"].isin(train_dates[val_index])]["CrowdednessCount"])
    
    y_pred_base = rfc.predict(x_val_con)
    
    mean_acc += accuracy_score(y_val_con, y_pred_base)

    mean_precision += precision_score(y_val_con, y_pred_base, average=None)
    mean_recall += recall_score(y_val_con, y_pred_base, average=None)
    mean_f1_score += f1_score(y_val_con, y_pred_base, average=None)

mean_acc = round(((mean_acc / 10) * 100),2)
mean_precision = (mean_precision / 10) * 100
mean_recall = (mean_recall / 10) * 100
mean_f1_score = (mean_f1_score / 10) * 100

print("Mean Accuracy Score: ", mean_acc, "\n")

for i in range(len(labels)):
    print("For label {0}".format(labels[i]))
    print("Mean Precision Score: ", round(mean_precision[i], 2))
    print("Mean Recall Score: ", round(mean_recall[i], 2))
    print("Mean F1 Score: ", round(mean_f1_score[i], 2), "\n")



Mean Accuracy Score:  85.64 

For label 1
Mean Precision Score:  84.5
Mean Recall Score:  84.73
Mean F1 Score:  84.59 

For label 2
Mean Precision Score:  82.95
Mean Recall Score:  79.31
Mean F1 Score:  81.08 

For label 3
Mean Precision Score:  86.12
Mean Recall Score:  87.32
Mean F1 Score:  86.71 

For label 4
Mean Precision Score:  88.57
Mean Recall Score:  91.29
Mean F1 Score:  89.89 



#### Hyperparameter Tuning

##### Estimators

In [31]:
estimators = [210, 220, 230]

for n in estimators:
    rfc = RandomForestClassifier(n_estimators=n, random_state=42)
    
    labels = [1, 2, 3, 4]

    mean_acc = 0

    mean_precision = 0
    mean_recall = 0
    mean_f1_score = 0

    for train_index, val_index in kf.split(eval_dates):

        x_train_con, y_train_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[train_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[train_index])]["CrowdednessCount"])
        rfc.fit(x_train_con,y_train_con)

        x_val_con, y_val_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[val_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[val_index])]["CrowdednessCount"])

        y_pred_base = rfc.predict(x_val_con)

        mean_acc += accuracy_score(y_val_con, y_pred_base)

        mean_precision += precision_score(y_val_con, y_pred_base, average=None)
        mean_recall += recall_score(y_val_con, y_pred_base, average=None)
        mean_f1_score += f1_score(y_val_con, y_pred_base, average=None)

    mean_acc = round(((mean_acc / 10) * 100),2)
    mean_precision = (mean_precision / 10) * 100
    mean_recall = (mean_recall / 10) * 100
    mean_f1_score = (mean_f1_score / 10) * 100
    
    print("Results for ", n, " estimators")

    print("Mean Accuracy Score: ", mean_acc, "\n")

    for i in range(len(labels)):
        print("For label {0}".format(labels[i]))
        print("Mean Precision Score: ", round(mean_precision[i], 2))
        print("Mean Recall Score: ", round(mean_recall[i], 2))
        print("Mean F1 Score: ", round(mean_f1_score[i], 2), "\n")
        
    print("\n")

Results for  210  estimators
Mean Accuracy Score:  79.32 

For label 1
Mean Precision Score:  81.87
Mean Recall Score:  79.39
Mean F1 Score:  80.3 

For label 2
Mean Precision Score:  76.05
Mean Recall Score:  73.8
Mean F1 Score:  74.46 

For label 3
Mean Precision Score:  78.03
Mean Recall Score:  81.04
Mean F1 Score:  79.09 

For label 4
Mean Precision Score:  82.1
Mean Recall Score:  86.2
Mean F1 Score:  83.0 



Results for  220  estimators
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For label 2
Mean Precision Score:  76.12
Mean Recall Score:  73.62
Mean F1 Score:  74.42 

For label 3
Mean Precision Score:  78.02
Mean Recall Score:  81.0
Mean F1 Score:  79.08 

For label 4
Mean Precision Score:  82.18
Mean Recall Score:  86.2
Mean F1 Score:  83.03 



Results for  230  estimators
Mean Accuracy Score:  79.3 

For label 1
Mean Precision Score:  81.87
Mean Recall Score:  79.63
Mean F1 Score:  80.44 

For label 

##### Criterion

In [32]:
criterion = ["gini", "entropy"]

for c in criterion:
    rfc = RandomForestClassifier(n_estimators=220, criterion=c, random_state=42)
    
    labels = [1, 2, 3, 4]

    mean_acc = 0

    mean_precision = 0
    mean_recall = 0
    mean_f1_score = 0

    for train_index, val_index in kf.split(eval_dates):

        x_train_con, y_train_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[train_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[train_index])]["CrowdednessCount"])
        rfc.fit(x_train_con,y_train_con)

        x_val_con, y_val_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[val_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[val_index])]["CrowdednessCount"])

        y_pred_base = rfc.predict(x_val_con)

        mean_acc += accuracy_score(y_val_con, y_pred_base)

        mean_precision += precision_score(y_val_con, y_pred_base, average=None)
        mean_recall += recall_score(y_val_con, y_pred_base, average=None)
        mean_f1_score += f1_score(y_val_con, y_pred_base, average=None)

    mean_acc = round(((mean_acc / 10) * 100),2)
    mean_precision = (mean_precision / 10) * 100
    mean_recall = (mean_recall / 10) * 100
    mean_f1_score = (mean_f1_score / 10) * 100
    
    print("Results for criterion: ", c)

    print("Mean Accuracy Score: ", mean_acc, "\n")

    for i in range(len(labels)):
        print("For label {0}".format(labels[i]))
        print("Mean Precision Score: ", round(mean_precision[i], 2))
        print("Mean Recall Score: ", round(mean_recall[i], 2))
        print("Mean F1 Score: ", round(mean_f1_score[i], 2), "\n")
        
    print("\n")

Results for criterion:  gini
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For label 2
Mean Precision Score:  76.12
Mean Recall Score:  73.62
Mean F1 Score:  74.42 

For label 3
Mean Precision Score:  78.02
Mean Recall Score:  81.0
Mean F1 Score:  79.08 

For label 4
Mean Precision Score:  82.18
Mean Recall Score:  86.2
Mean F1 Score:  83.03 



Results for criterion:  entropy
Mean Accuracy Score:  78.67 

For label 1
Mean Precision Score:  80.38
Mean Recall Score:  79.48
Mean F1 Score:  79.64 

For label 2
Mean Precision Score:  75.48
Mean Recall Score:  73.75
Mean F1 Score:  74.2 

For label 3
Mean Precision Score:  78.11
Mean Recall Score:  80.44
Mean F1 Score:  78.79 

For label 4
Mean Precision Score:  81.63
Mean Recall Score:  84.29
Mean F1 Score:  81.67 





##### Max Features

In [34]:
features = ["auto", "log2", None]

for f in features:
    rfc = RandomForestClassifier(n_estimators=220, criterion="gini", max_features=f, random_state=42)
    
    labels = [1, 2, 3, 4]

    mean_acc = 0

    mean_precision = 0
    mean_recall = 0
    mean_f1_score = 0

    for train_index, val_index in kf.split(eval_dates):

        x_train_con, y_train_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[train_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[train_index])]["CrowdednessCount"])
        rfc.fit(x_train_con,y_train_con)

        x_val_con, y_val_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[val_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[val_index])]["CrowdednessCount"])

        y_pred_base = rfc.predict(x_val_con)

        mean_acc += accuracy_score(y_val_con, y_pred_base)

        mean_precision += precision_score(y_val_con, y_pred_base, average=None)
        mean_recall += recall_score(y_val_con, y_pred_base, average=None)
        mean_f1_score += f1_score(y_val_con, y_pred_base, average=None)

    mean_acc = round(((mean_acc / 10) * 100),2)
    mean_precision = (mean_precision / 10) * 100
    mean_recall = (mean_recall / 10) * 100
    mean_f1_score = (mean_f1_score / 10) * 100
    
    print("Results for feature handling: ", f)

    print("Mean Accuracy Score: ", mean_acc, "\n")

    for i in range(len(labels)):
        print("For label {0}".format(labels[i]))
        print("Mean Precision Score: ", round(mean_precision[i], 2))
        print("Mean Recall Score: ", round(mean_recall[i], 2))
        print("Mean F1 Score: ", round(mean_f1_score[i], 2), "\n")
        
    print("\n")

Results for feature handling:  auto
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For label 2
Mean Precision Score:  76.12
Mean Recall Score:  73.62
Mean F1 Score:  74.42 

For label 3
Mean Precision Score:  78.02
Mean Recall Score:  81.0
Mean F1 Score:  79.08 

For label 4
Mean Precision Score:  82.18
Mean Recall Score:  86.2
Mean F1 Score:  83.03 



Results for feature handling:  log2
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For label 2
Mean Precision Score:  76.12
Mean Recall Score:  73.62
Mean F1 Score:  74.42 

For label 3
Mean Precision Score:  78.02
Mean Recall Score:  81.0
Mean F1 Score:  79.08 

For label 4
Mean Precision Score:  82.18
Mean Recall Score:  86.2
Mean F1 Score:  83.03 



Results for feature handling:  None
Mean Accuracy Score:  77.62 

For label 1
Mean Precision Score:  79.86
Mean Recall Score:  78.19
Mean F1 Sco

##### Bootstrap

In [35]:
bootstrap = [True, False]

for b in bootstrap:
    rfc = RandomForestClassifier(n_estimators=220, criterion="gini", max_features="auto", bootstrap=b, random_state=42)
    
    labels = [1, 2, 3, 4]

    mean_acc = 0

    mean_precision = 0
    mean_recall = 0
    mean_f1_score = 0

    for train_index, val_index in kf.split(eval_dates):

        x_train_con, y_train_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[train_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[train_index])]["CrowdednessCount"])
        rfc.fit(x_train_con,y_train_con)

        x_val_con, y_val_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[val_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[val_index])]["CrowdednessCount"])

        y_pred_base = rfc.predict(x_val_con)

        mean_acc += accuracy_score(y_val_con, y_pred_base)

        mean_precision += precision_score(y_val_con, y_pred_base, average=None)
        mean_recall += recall_score(y_val_con, y_pred_base, average=None)
        mean_f1_score += f1_score(y_val_con, y_pred_base, average=None)

    mean_acc = round(((mean_acc / 10) * 100),2)
    mean_precision = (mean_precision / 10) * 100
    mean_recall = (mean_recall / 10) * 100
    mean_f1_score = (mean_f1_score / 10) * 100
    
    print("Results for Boostrap: ", b)

    print("Mean Accuracy Score: ", mean_acc, "\n")

    for i in range(len(labels)):
        print("For label {0}".format(labels[i]))
        print("Mean Precision Score: ", round(mean_precision[i], 2))
        print("Mean Recall Score: ", round(mean_recall[i], 2))
        print("Mean F1 Score: ", round(mean_f1_score[i], 2), "\n")
        
    print("\n")

Results for Boostrap:  True
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For label 2
Mean Precision Score:  76.12
Mean Recall Score:  73.62
Mean F1 Score:  74.42 

For label 3
Mean Precision Score:  78.02
Mean Recall Score:  81.0
Mean F1 Score:  79.08 

For label 4
Mean Precision Score:  82.18
Mean Recall Score:  86.2
Mean F1 Score:  83.03 



Results for Boostrap:  False
Mean Accuracy Score:  78.06 

For label 1
Mean Precision Score:  79.44
Mean Recall Score:  79.03
Mean F1 Score:  78.9 

For label 2
Mean Precision Score:  74.43
Mean Recall Score:  72.72
Mean F1 Score:  73.13 

For label 3
Mean Precision Score:  77.78
Mean Recall Score:  80.37
Mean F1 Score:  78.55 

For label 4
Mean Precision Score:  81.76
Mean Recall Score:  83.51
Mean F1 Score:  81.33 





##### OOB_Score

In [38]:
oob = [True, False]

for o in oob:
    rfc = RandomForestClassifier(n_estimators=220, criterion="gini", max_features="auto", bootstrap=True, random_state=42,
                                oob_score=o)
    
    labels = [1, 2, 3, 4]

    mean_acc = 0

    mean_precision = 0
    mean_recall = 0
    mean_f1_score = 0

    for train_index, val_index in kf.split(eval_dates):

        x_train_con, y_train_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[train_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[train_index])]["CrowdednessCount"])
        rfc.fit(x_train_con,y_train_con)

        x_val_con, y_val_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[val_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[val_index])]["CrowdednessCount"])

        y_pred_base = rfc.predict(x_val_con)

        mean_acc += accuracy_score(y_val_con, y_pred_base)

        mean_precision += precision_score(y_val_con, y_pred_base, average=None)
        mean_recall += recall_score(y_val_con, y_pred_base, average=None)
        mean_f1_score += f1_score(y_val_con, y_pred_base, average=None)

    mean_acc = round(((mean_acc / 10) * 100),2)
    mean_precision = (mean_precision / 10) * 100
    mean_recall = (mean_recall / 10) * 100
    mean_f1_score = (mean_f1_score / 10) * 100
    
    print("Results for Oob: ", o)

    print("Mean Accuracy Score: ", mean_acc, "\n")

    for i in range(len(labels)):
        print("For label {0}".format(labels[i]))
        print("Mean Precision Score: ", round(mean_precision[i], 2))
        print("Mean Recall Score: ", round(mean_recall[i], 2))
        print("Mean F1 Score: ", round(mean_f1_score[i], 2), "\n")
        
    print("\n")

Results for Oob:  True
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For label 2
Mean Precision Score:  76.12
Mean Recall Score:  73.62
Mean F1 Score:  74.42 

For label 3
Mean Precision Score:  78.02
Mean Recall Score:  81.0
Mean F1 Score:  79.08 

For label 4
Mean Precision Score:  82.18
Mean Recall Score:  86.2
Mean F1 Score:  83.03 



Results for Oob:  False
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For label 2
Mean Precision Score:  76.12
Mean Recall Score:  73.62
Mean F1 Score:  74.42 

For label 3
Mean Precision Score:  78.02
Mean Recall Score:  81.0
Mean F1 Score:  79.08 

For label 4
Mean Precision Score:  82.18
Mean Recall Score:  86.2
Mean F1 Score:  83.03 





##### N_jobs

In [39]:
jobs = [-1, 1, 10]

for n in jobs:
    rfc = RandomForestClassifier(n_estimators=220, criterion="gini", max_features="auto", bootstrap=True, random_state=42,
                                oob_score=False, n_jobs=n)
    
    labels = [1, 2, 3, 4]

    mean_acc = 0

    mean_precision = 0
    mean_recall = 0
    mean_f1_score = 0

    for train_index, val_index in kf.split(eval_dates):

        x_train_con, y_train_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[train_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[train_index])]["CrowdednessCount"])
        rfc.fit(x_train_con,y_train_con)

        x_val_con, y_val_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[val_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[val_index])]["CrowdednessCount"])

        y_pred_base = rfc.predict(x_val_con)

        mean_acc += accuracy_score(y_val_con, y_pred_base)

        mean_precision += precision_score(y_val_con, y_pred_base, average=None)
        mean_recall += recall_score(y_val_con, y_pred_base, average=None)
        mean_f1_score += f1_score(y_val_con, y_pred_base, average=None)

    mean_acc = round(((mean_acc / 10) * 100),2)
    mean_precision = (mean_precision / 10) * 100
    mean_recall = (mean_recall / 10) * 100
    mean_f1_score = (mean_f1_score / 10) * 100
    
    print("Results jobs: ", n)

    print("Mean Accuracy Score: ", mean_acc, "\n")

    for i in range(len(labels)):
        print("For label {0}".format(labels[i]))
        print("Mean Precision Score: ", round(mean_precision[i], 2))
        print("Mean Recall Score: ", round(mean_recall[i], 2))
        print("Mean F1 Score: ", round(mean_f1_score[i], 2), "\n")
        
    print("\n")

Results jobs:  -1
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For label 2
Mean Precision Score:  76.12
Mean Recall Score:  73.62
Mean F1 Score:  74.42 

For label 3
Mean Precision Score:  78.02
Mean Recall Score:  81.0
Mean F1 Score:  79.08 

For label 4
Mean Precision Score:  82.18
Mean Recall Score:  86.2
Mean F1 Score:  83.03 



Results jobs:  1
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For label 2
Mean Precision Score:  76.12
Mean Recall Score:  73.62
Mean F1 Score:  74.42 

For label 3
Mean Precision Score:  78.02
Mean Recall Score:  81.0
Mean F1 Score:  79.08 

For label 4
Mean Precision Score:  82.18
Mean Recall Score:  86.2
Mean F1 Score:  83.03 



Results jobs:  10
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For label 2
Mean Precision Score:  76.12
Mea

##### Warm start

In [41]:
warm = [True, False]

for w in warm:
    rfc = RandomForestClassifier(n_estimators=220, criterion="gini", max_features="auto", bootstrap=True, random_state=42,
                                oob_score=False, n_jobs=1, warm_start=w)
    
    labels = [1, 2, 3, 4]

    mean_acc = 0

    mean_precision = 0
    mean_recall = 0
    mean_f1_score = 0

    for train_index, val_index in kf.split(eval_dates):

        x_train_con, y_train_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[train_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[train_index])]["CrowdednessCount"])
        rfc.fit(x_train_con,y_train_con)

        x_val_con, y_val_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[val_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[val_index])]["CrowdednessCount"])

        y_pred_base = rfc.predict(x_val_con)

        mean_acc += accuracy_score(y_val_con, y_pred_base)

        mean_precision += precision_score(y_val_con, y_pred_base, average=None)
        mean_recall += recall_score(y_val_con, y_pred_base, average=None)
        mean_f1_score += f1_score(y_val_con, y_pred_base, average=None)

    mean_acc = round(((mean_acc / 10) * 100),2)
    mean_precision = (mean_precision / 10) * 100
    mean_recall = (mean_recall / 10) * 100
    mean_f1_score = (mean_f1_score / 10) * 100
    
    print("Results warm start: ", w)

    print("Mean Accuracy Score: ", mean_acc, "\n")

    for i in range(len(labels)):
        print("For label {0}".format(labels[i]))
        print("Mean Precision Score: ", round(mean_precision[i], 2))
        print("Mean Recall Score: ", round(mean_recall[i], 2))
        print("Mean F1 Score: ", round(mean_f1_score[i], 2), "\n")
        
    print("\n")

  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Results warm start:  True
Mean Accuracy Score:  97.17 

For label 1
Mean Precision Score:  97.07
Mean Recall Score:  97.7
Mean F1 Score:  97.37 

For label 2
Mean Precision Score:  97.16
Mean Recall Score:  97.35
Mean F1 Score:  97.25 

For label 3
Mean Precision Score:  96.15
Mean Recall Score:  97.65
Mean F1 Score:  96.82 

For label 4
Mean Precision Score:  98.32
Mean Recall Score:  96.35
Mean F1 Score:  97.2 



Results warm start:  False
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For label 2
Mean Precision Score:  76.12
Mean Recall Score:  73.62
Mean F1 Score:  74.42 

For label 3
Mean Precision Score:  78.02
Mean Recall Score:  81.0
Mean F1 Score:  79.08 

For label 4
Mean Precision Score:  82.18
Mean Recall Score:  86.2
Mean F1 Score:  83.03 





##### weight

In [42]:
weight = ["balanced", "balanced_subsample", None]

for w in weight:
    rfc = RandomForestClassifier(n_estimators=220, criterion="gini", max_features="auto", bootstrap=True, random_state=42,
                                oob_score=False, n_jobs=1, warm_start=False, class_weight=w)
    
    labels = [1, 2, 3, 4]

    mean_acc = 0

    mean_precision = 0
    mean_recall = 0
    mean_f1_score = 0

    for train_index, val_index in kf.split(eval_dates):

        x_train_con, y_train_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[train_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[train_index])]["CrowdednessCount"])
        rfc.fit(x_train_con,y_train_con)

        x_val_con, y_val_con = check_X_y(X=x_eval[x_eval["Date"].isin(eval_dates[val_index])].drop(columns={"Date"}),
                                            y=y_eval[y_eval["Date"].isin(eval_dates[val_index])]["CrowdednessCount"])

        y_pred_base = rfc.predict(x_val_con)

        mean_acc += accuracy_score(y_val_con, y_pred_base)

        mean_precision += precision_score(y_val_con, y_pred_base, average=None)
        mean_recall += recall_score(y_val_con, y_pred_base, average=None)
        mean_f1_score += f1_score(y_val_con, y_pred_base, average=None)

    mean_acc = round(((mean_acc / 10) * 100),2)
    mean_precision = (mean_precision / 10) * 100
    mean_recall = (mean_recall / 10) * 100
    mean_f1_score = (mean_f1_score / 10) * 100
    
    print("Results weight: ", w)

    print("Mean Accuracy Score: ", mean_acc, "\n")

    for i in range(len(labels)):
        print("For label {0}".format(labels[i]))
        print("Mean Precision Score: ", round(mean_precision[i], 2))
        print("Mean Recall Score: ", round(mean_recall[i], 2))
        print("Mean F1 Score: ", round(mean_f1_score[i], 2), "\n")
        
    print("\n")

Results weight:  balanced
Mean Accuracy Score:  78.71 

For label 1
Mean Precision Score:  81.77
Mean Recall Score:  79.32
Mean F1 Score:  80.24 

For label 2
Mean Precision Score:  75.19
Mean Recall Score:  73.73
Mean F1 Score:  74.07 

For label 3
Mean Precision Score:  77.17
Mean Recall Score:  80.52
Mean F1 Score:  78.26 

For label 4
Mean Precision Score:  81.78
Mean Recall Score:  84.75
Mean F1 Score:  81.97 



Results weight:  balanced_subsample
Mean Accuracy Score:  78.75 

For label 1
Mean Precision Score:  81.04
Mean Recall Score:  79.56
Mean F1 Score:  79.98 

For label 2
Mean Precision Score:  75.56
Mean Recall Score:  72.93
Mean F1 Score:  73.83 

For label 3
Mean Precision Score:  77.47
Mean Recall Score:  80.62
Mean F1 Score:  78.5 

For label 4
Mean Precision Score:  81.97
Mean Recall Score:  85.09
Mean F1 Score:  82.28 



Results weight:  None
Mean Accuracy Score:  79.36 

For label 1
Mean Precision Score:  81.9
Mean Recall Score:  79.8
Mean F1 Score:  80.53 

For la

### XGBoost 
Implement [XGBoost](https://xgboost.readthedocs.io/en/latest/python/python_intro.html)

#### Training

In [43]:
xgb = xgb.XGBClassifier(random_state=42)

In [44]:
labels = [1, 2, 3, 4]

mean_acc = 0

mean_precision = 0
mean_recall = 0
mean_f1_score = 0

for train_index, val_index in kf.split(train_dates):
    
    xgb.fit(x_train[x_train["Date"].isin(train_dates[train_index])].drop(columns={"Date"}),
            y_train[y_train["Date"].isin(train_dates[train_index])]["CrowdednessCount"])
    
    y_pred_base = xgb.predict(x_train[x_train["Date"].isin(train_dates[val_index])].drop(columns={"Date"}))
    
    mean_acc += accuracy_score(y_train[y_train["Date"].isin(train_dates[val_index])]["CrowdednessCount"], y_pred_base)

    mean_precision += precision_score(y_train[y_train["Date"].isin(train_dates[val_index])]["CrowdednessCount"], y_pred_base, average=None)
    mean_recall += recall_score(y_train[y_train["Date"].isin(train_dates[val_index])]["CrowdednessCount"], y_pred_base, average=None)
    mean_f1_score += f1_score(y_train[y_train["Date"].isin(train_dates[val_index])]["CrowdednessCount"], y_pred_base, average=None)

mean_acc = round(((mean_acc / 10) * 100),2)
mean_precision = (mean_precision / 10) * 100
mean_recall = (mean_recall / 10) * 100
mean_f1_score = (mean_f1_score / 10) * 100

print("Mean Accuracy Score: ", mean_acc, "\n")

for i in range(len(labels)):
    print("For label {0}".format(labels[i]))
    print("Mean Precision Score: ", round(mean_precision[i], 2))
    print("Mean Recall Score: ", round(mean_recall[i], 2))
    print("Mean F1 Score: ", round(mean_f1_score[i], 2), "\n")

Mean Accuracy Score:  81.24 

For label 1
Mean Precision Score:  85.74
Mean Recall Score:  79.09
Mean F1 Score:  82.24 

For label 2
Mean Precision Score:  77.31
Mean Recall Score:  75.55
Mean F1 Score:  76.41 

For label 3
Mean Precision Score:  81.23
Mean Recall Score:  77.62
Mean F1 Score:  79.36 

For label 4
Mean Precision Score:  80.92
Mean Recall Score:  93.09
Mean F1 Score:  86.49 

