# Day 09. Exercise 00
# Regularization

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/dayofweek.csv')
df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,-0.533442,0.945382,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,-0.629151,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,-0.597248,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,-0.565345,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
X=df.drop(columns=['dayofweek'])
y=df['dayofweek']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=21, stratify=y)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [5]:
def crossval(n_splits, X, y, model):
	kf = KFold(n_splits=n_splits, shuffle=True, random_state=21)
	l_rmse=list()
	for ind_train,ind_test in kf.split(X):
		t_X_train, t_X_test = X.iloc[ind_train], X.iloc[ind_test]
		t_y_train, t_y_test = y.iloc[ind_train], y.iloc[ind_test]
		
		model.fit(t_X_train, t_y_train)
		t_y_pred_train = model.predict(t_X_train)
		t_y_pred_test = model.predict(t_X_test)

		t_rmse_train = accuracy_score(t_y_train, t_y_pred_train)
		t_rmse_test = accuracy_score(t_y_test, t_y_pred_test)

		l_rmse.append(t_rmse_test)

		print(f"train -  {t_rmse_train}   |   test -  {t_rmse_test}")
	print(f"Average accuracy on crossval is {np.mean(l_rmse)}")

In [6]:
model_logreg = LogisticRegression(random_state=21, fit_intercept=False)
model_logreg.fit(X_train, y_train)
accuracy_score(model_logreg.predict(X_test), y_test)

0.6331360946745562

In [7]:
%%time
crossval(10, X, y, model_logreg)

train -  0.6282135794330916   |   test -  0.7396449704142012
train -  0.6526038233355307   |   test -  0.6153846153846154
train -  0.6539222148978246   |   test -  0.6094674556213018
train -  0.6361239288068556   |   test -  0.5443786982248521
train -  0.6453526697429136   |   test -  0.6331360946745562
train -  0.6453526697429136   |   test -  0.5798816568047337
train -  0.6291172595520421   |   test -  0.5714285714285714
train -  0.6442687747035574   |   test -  0.6190476190476191
train -  0.6363636363636364   |   test -  0.6011904761904762
train -  0.6403162055335968   |   test -  0.6130952380952381
Average accuracy on crossval is 0.6126655395886165
CPU times: user 2.06 s, sys: 6.19 ms, total: 2.07 s
Wall time: 209 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [None]:
model_logreg_none = LogisticRegression(penalty=None, random_state=21, fit_intercept=False)

In [9]:
%%time
crossval(10, X, y, model_logreg_none)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

train -  0.6611733684904416   |   test -  0.7514792899408284
train -  0.6638101516150297   |   test -  0.6331360946745562
train -  0.6585365853658537   |   test -  0.621301775147929
train -  0.6598549769281477   |   test -  0.5739644970414202
train -  0.6671061305207646   |   test -  0.6627218934911243
train -  0.6690837178642056   |   test -  0.6153846153846154
train -  0.658102766798419   |   test -  0.6190476190476191
train -  0.6561264822134387   |   test -  0.6369047619047619
train -  0.6574440052700923   |   test -  0.6071428571428571
train -  0.6745718050065876   |   test -  0.6369047619047619
Average accuracy on crossval is 0.6357988165680473
CPU times: user 4.96 s, sys: 9.52 ms, total: 4.97 s
Wall time: 506 ms


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

In [None]:
model_logreg_l1 = LogisticRegression(penalty='l1', random_state=21, fit_intercept=False, solver='saga')

In [11]:
%%time
crossval(10, X, y, model_logreg_l1)



train -  0.6242584047462096   |   test -  0.7337278106508875
train -  0.6486486486486487   |   test -  0.621301775147929




train -  0.6493078444297956   |   test -  0.5976331360946746
train -  0.6328279499011207   |   test -  0.5325443786982249




train -  0.6433750823994726   |   test -  0.6272189349112426
train -  0.6506262359920897   |   test -  0.5798816568047337




train -  0.6304347826086957   |   test -  0.5714285714285714
train -  0.6436100131752306   |   test -  0.6309523809523809
train -  0.6330698287220027   |   test -  0.6011904761904762
train -  0.6469038208168643   |   test -  0.6190476190476191
Average accuracy on crossval is 0.611492673992674
CPU times: user 2.39 s, sys: 2.38 ms, total: 2.39 s
Wall time: 1.91 s




In [12]:
model_logreg_l2 = LogisticRegression(penalty='l2', random_state=21, fit_intercept=False)

In [13]:
%%time
crossval(10, X, y, model_logreg_l2)

train -  0.6282135794330916   |   test -  0.7396449704142012
train -  0.6526038233355307   |   test -  0.6153846153846154
train -  0.6539222148978246   |   test -  0.6094674556213018
train -  0.6361239288068556   |   test -  0.5443786982248521
train -  0.6453526697429136   |   test -  0.6331360946745562
train -  0.6453526697429136   |   test -  0.5798816568047337
train -  0.6291172595520421   |   test -  0.5714285714285714
train -  0.6442687747035574   |   test -  0.6190476190476191
train -  0.6363636363636364   |   test -  0.6011904761904762
train -  0.6403162055335968   |   test -  0.6130952380952381
Average accuracy on crossval is 0.6126655395886165
CPU times: user 2.94 s, sys: 7.94 ms, total: 2.95 s
Wall time: 325 ms


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [14]:
model_svm = SVC(random_state=21, kernel='linear', probability=True)
model_svm.fit(X_train, y_train)
accuracy_score(model_svm.predict(X_test), y_test)

0.7159763313609467

In [15]:
%%time
crossval(10, X, y, model_svm)

train -  0.6934739617666447   |   test -  0.7692307692307693
train -  0.7112722478576137   |   test -  0.6863905325443787
train -  0.7013843111404087   |   test -  0.6745562130177515
train -  0.6994067237969677   |   test -  0.6094674556213018
train -  0.7007251153592617   |   test -  0.6982248520710059
train -  0.7066578773895847   |   test -  0.727810650887574
train -  0.7002635046113307   |   test -  0.6547619047619048
train -  0.7114624505928854   |   test -  0.6369047619047619
train -  0.6956521739130435   |   test -  0.6845238095238095
train -  0.7108036890645586   |   test -  0.6428571428571429
Average accuracy on crossval is 0.67847280924204
CPU times: user 2.88 s, sys: 5.06 ms, total: 2.88 s
Wall time: 2.88 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [16]:
model_svm_c1 = SVC(random_state=21, kernel='linear', probability=True, C=1)

model_svm_c10 = SVC(random_state=21, kernel='linear', probability=True, C=10)

model_svm_c01 = SVC(random_state=21, kernel='linear', probability=True, C=0.1)

In [17]:
%%time
crossval(10, X, y, model_svm_c1)

train -  0.6934739617666447   |   test -  0.7692307692307693
train -  0.7112722478576137   |   test -  0.6863905325443787
train -  0.7013843111404087   |   test -  0.6745562130177515
train -  0.6994067237969677   |   test -  0.6094674556213018
train -  0.7007251153592617   |   test -  0.6982248520710059
train -  0.7066578773895847   |   test -  0.727810650887574
train -  0.7002635046113307   |   test -  0.6547619047619048
train -  0.7114624505928854   |   test -  0.6369047619047619
train -  0.6956521739130435   |   test -  0.6845238095238095
train -  0.7108036890645586   |   test -  0.6428571428571429
Average accuracy on crossval is 0.67847280924204
CPU times: user 2.88 s, sys: 2.07 ms, total: 2.88 s
Wall time: 2.87 s


In [18]:
%%time
crossval(10, X, y, model_svm_c10)

train -  0.7646671061305208   |   test -  0.8047337278106509
train -  0.7778510217534608   |   test -  0.7218934911242604
train -  0.7771918259723137   |   test -  0.7692307692307693
train -  0.7798286090969018   |   test -  0.7041420118343196
train -  0.7640079103493738   |   test -  0.8165680473372781
train -  0.7679630850362558   |   test -  0.7692307692307693
train -  0.7687747035573123   |   test -  0.7083333333333334
train -  0.7760210803689065   |   test -  0.6964285714285714
train -  0.7786561264822134   |   test -  0.75
train -  0.7779973649538867   |   test -  0.7023809523809523
Average accuracy on crossval is 0.7442941673710906
CPU times: user 4.41 s, sys: 120 μs, total: 4.41 s
Wall time: 4.4 s


In [19]:
%%time
crossval(10, X, y, model_svm_c01)

train -  0.5886618325642716   |   test -  0.6804733727810651
train -  0.5781147000659196   |   test -  0.591715976331361
train -  0.5866842452208306   |   test -  0.5621301775147929
train -  0.5899802241265656   |   test -  0.47337278106508873
train -  0.5728411338167436   |   test -  0.5443786982248521
train -  0.5675675675675675   |   test -  0.5680473372781065
train -  0.6001317523056654   |   test -  0.5178571428571429
train -  0.6001317523056654   |   test -  0.5714285714285714
train -  0.5718050065876152   |   test -  0.5654761904761905
train -  0.5856389986824769   |   test -  0.5535714285714286
Average accuracy on crossval is 0.56284516765286
CPU times: user 3.05 s, sys: 2.05 ms, total: 3.05 s
Wall time: 3.04 s


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [20]:
model_tree = DecisionTreeClassifier(max_depth=10, random_state=21)
model_tree.fit(X_train, y_train)
accuracy_score(model_tree.predict(X_test), y_test)

0.7396449704142012

In [21]:
%%time
crossval(10, X, y, model_tree)

train -  0.8088332234673699   |   test -  0.7810650887573964
train -  0.8220171390903098   |   test -  0.7633136094674556
train -  0.8180619644034278   |   test -  0.7396449704142012
train -  0.8233355306526038   |   test -  0.757396449704142
train -  0.8134475939353988   |   test -  0.7869822485207101
train -  0.8213579433091628   |   test -  0.8165680473372781
train -  0.8142292490118577   |   test -  0.7321428571428571
train -  0.8194993412384717   |   test -  0.7202380952380952
train -  0.8241106719367589   |   test -  0.7619047619047619
train -  0.8254281949934124   |   test -  0.7559523809523809
Average accuracy on crossval is 0.761520850943928
CPU times: user 53.7 ms, sys: 996 μs, total: 54.7 ms
Wall time: 54.1 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [22]:
model_tree_depth5 = DecisionTreeClassifier(max_depth=5, random_state=21)

model_tree_depth15 = DecisionTreeClassifier(max_depth=15, random_state=21)

model_tree_depth20 = DecisionTreeClassifier(max_depth=20, random_state=21)

model_tree_depth25 = DecisionTreeClassifier(max_depth=25, random_state=21)

In [23]:
%%time
crossval(10, X, y, model_tree_depth5)

train -  0.6051417270929466   |   test -  0.6804733727810651
train -  0.6104152933421226   |   test -  0.6272189349112426
train -  0.6301911667765326   |   test -  0.5976331360946746
train -  0.6163480553724456   |   test -  0.5266272189349113
train -  0.6176664469347396   |   test -  0.5621301775147929
train -  0.6077785102175346   |   test -  0.6331360946745562
train -  0.616600790513834   |   test -  0.5416666666666666
train -  0.6258234519104084   |   test -  0.6130952380952381
train -  0.6284584980237155   |   test -  0.5892857142857143
train -  0.61133069828722   |   test -  0.5952380952380952
Average accuracy on crossval is 0.5966504649196956
CPU times: user 47.2 ms, sys: 13 μs, total: 47.2 ms
Wall time: 46.7 ms


In [24]:
%%time
crossval(10, X, y, model_tree_depth15)

train -  0.9419907712590639   |   test -  0.8875739644970414
train -  0.946605141727093   |   test -  0.8816568047337278
train -  0.954515491100857   |   test -  0.863905325443787
train -  0.9571522742254449   |   test -  0.8579881656804734
train -  0.952537903757416   |   test -  0.863905325443787
train -  0.956493078444298   |   test -  0.9171597633136095
train -  0.9472990777338604   |   test -  0.8333333333333334
train -  0.9552042160737813   |   test -  0.8511904761904762
train -  0.9466403162055336   |   test -  0.8392857142857143
train -  0.9472990777338604   |   test -  0.8333333333333334
Average accuracy on crossval is 0.8629332206255282
CPU times: user 60.2 ms, sys: 1.99 ms, total: 62.2 ms
Wall time: 61.2 ms


In [25]:
%%time
crossval(10, X, y, model_tree_depth20)

train -  0.99340804218853   |   test -  0.893491124260355
train -  0.98681608437706   |   test -  0.8994082840236687
train -  0.990112063282795   |   test -  0.893491124260355
train -  0.984179301252472   |   test -  0.8875739644970414
train -  0.992089650626236   |   test -  0.9053254437869822
train -  0.986156888595913   |   test -  0.9230769230769231
train -  0.9901185770750988   |   test -  0.875
train -  0.9920948616600791   |   test -  0.8869047619047619
train -  0.9888010540184453   |   test -  0.8511904761904762
train -  0.9855072463768116   |   test -  0.8630952380952381
Average accuracy on crossval is 0.8878557340095803
CPU times: user 60.6 ms, sys: 1.99 ms, total: 62.6 ms
Wall time: 61 ms


In [26]:
%%time
crossval(10, X, y, model_tree_depth25)

train -  0.999340804218853   |   test -  0.8875739644970414
train -  0.998681608437706   |   test -  0.9053254437869822
train -  0.999340804218853   |   test -  0.8875739644970414
train -  0.998681608437706   |   test -  0.9053254437869822
train -  0.999340804218853   |   test -  0.9053254437869822
train -  0.997363216875412   |   test -  0.9467455621301775
train -  1.0   |   test -  0.8928571428571429
train -  1.0   |   test -  0.8928571428571429
train -  0.997364953886693   |   test -  0.8630952380952381
train -  0.9993412384716732   |   test -  0.8869047619047619
Average accuracy on crossval is 0.8973584108199493
CPU times: user 60.9 ms, sys: 1.99 ms, total: 62.9 ms
Wall time: 61.7 ms


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [27]:
model_forest = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
model_forest.fit(X_train, y_train)
accuracy_score(model_forest.predict(X_test), y_test)

0.908284023668639

In [28]:
%%time
crossval(10, X, y, model_forest)

train -  0.967699406723797   |   test -  0.9230769230769231
train -  0.96044825313118   |   test -  0.9230769230769231
train -  0.958470665787739   |   test -  0.8698224852071006
train -  0.966381015161503   |   test -  0.8520710059171598
train -  0.97363216875412   |   test -  0.9230769230769231
train -  0.9630850362557679   |   test -  0.9467455621301775
train -  0.955862977602108   |   test -  0.8869047619047619
train -  0.9683794466403162   |   test -  0.8630952380952381
train -  0.9703557312252964   |   test -  0.9047619047619048
train -  0.9703557312252964   |   test -  0.8809523809523809
Average accuracy on crossval is 0.8973584108199493
CPU times: user 687 ms, sys: 19 ms, total: 706 ms
Wall time: 703 ms


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [29]:
model_forest_n25_d14 = RandomForestClassifier(n_estimators=25, max_depth=14, random_state=21)

model_forest_n75_d14 = RandomForestClassifier(n_estimators=75, max_depth=14, random_state=21)

model_forest_n50_d5 = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=21)

model_forest_n50_d20 = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=21)

In [30]:
%%time
crossval(10, X, y, model_forest_n25_d14)

train -  0.967699406723797   |   test -  0.9171597633136095
train -  0.9512195121951219   |   test -  0.9171597633136095
train -  0.951878707976269   |   test -  0.8698224852071006
train -  0.955833882663151   |   test -  0.8165680473372781
train -  0.9611074489123269   |   test -  0.9171597633136095
train -  0.956493078444298   |   test -  0.9349112426035503
train -  0.9525691699604744   |   test -  0.8809523809523809
train -  0.9598155467720685   |   test -  0.8511904761904762
train -  0.9552042160737813   |   test -  0.875
train -  0.9677206851119895   |   test -  0.8511904761904762
Average accuracy on crossval is 0.8831114398422091
CPU times: user 386 ms, sys: 5 ms, total: 391 ms
Wall time: 390 ms


In [31]:
%%time
crossval(10, X, y, model_forest_n75_d14)

train -  0.9670402109426499   |   test -  0.9230769230769231
train -  0.963744232036915   |   test -  0.9408284023668639
train -  0.9650626235992089   |   test -  0.8698224852071006
train -  0.968358602504944   |   test -  0.8520710059171598
train -  0.975609756097561   |   test -  0.9230769230769231
train -  0.965721819380356   |   test -  0.9408284023668639
train -  0.9644268774703557   |   test -  0.875
train -  0.969038208168643   |   test -  0.8630952380952381
train -  0.9756258234519104   |   test -  0.8988095238095238
train -  0.9703557312252964   |   test -  0.8809523809523809
Average accuracy on crossval is 0.8967561284868978
CPU times: user 1.04 s, sys: 18 ms, total: 1.05 s
Wall time: 1.05 s


In [32]:
%%time
crossval(10, X, y, model_forest_n50_d5)

train -  0.6084377059986816   |   test -  0.7218934911242604
train -  0.5754779169413315   |   test -  0.5857988165680473
train -  0.6328279499011207   |   test -  0.5976331360946746
train -  0.6064601186552406   |   test -  0.5088757396449705
train -  0.6044825313117996   |   test -  0.5621301775147929
train -  0.6090969017798286   |   test -  0.591715976331361
train -  0.6073781291172595   |   test -  0.5654761904761905
train -  0.6119894598155468   |   test -  0.625
train -  0.6106719367588933   |   test -  0.6071428571428571
train -  0.6054018445322793   |   test -  0.5595238095238095
Average accuracy on crossval is 0.5925190194420963
CPU times: user 145 ms, sys: 2 ms, total: 147 ms
Wall time: 146 ms


In [33]:
%%time
crossval(10, X, y, model_forest_n50_d20)

train -  0.990771259063942   |   test -  0.9112426035502958
train -  0.988793671720501   |   test -  0.9408284023668639
train -  0.987475280158207   |   test -  0.893491124260355
train -  0.988793671720501   |   test -  0.8698224852071006
train -  0.990112063282795   |   test -  0.9230769230769231
train -  0.987475280158207   |   test -  0.9349112426035503
train -  0.9888010540184453   |   test -  0.8928571428571429
train -  0.9894598155467721   |   test -  0.8809523809523809
train -  0.9907773386034255   |   test -  0.9047619047619048
train -  0.9907773386034255   |   test -  0.8928571428571429
Average accuracy on crossval is 0.9044801352493661
CPU times: user 190 ms, sys: 5 ms, total: 195 ms
Wall time: 193 ms


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [34]:
model_best = model_forest

In [35]:
pred_y = model_best.predict(X)

In [36]:
accuracy_score(pred_y, y)

0.961447212336892

In [37]:
pred_y

array([4, 4, 4, ..., 3, 3, 3], shape=(1686,))

In [38]:
temp_df = pd.DataFrame(y).reset_index().drop(columns=['index'])
temp_df['ans_model'] = pred_y
temp_df

Unnamed: 0,dayofweek,ans_model
0,4,4
1,4,4
2,4,4
3,4,4
4,4,4
...,...,...
1681,3,3
1682,3,3
1683,3,3
1684,3,3


In [39]:
temp_df[temp_df['dayofweek'] != temp_df['ans_model']].dayofweek.value_counts() / len(pred_y)

dayofweek
2    0.007711
5    0.007117
1    0.006524
6    0.005931
0    0.005338
3    0.002966
4    0.002966
Name: count, dtype: float64

In [40]:
joblib.dump(model_best, 'model_ex00.joblib')

['model_ex00.joblib']