# Day 09. Exercise 00
# Regularization

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import joblib
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [5]:
data = pd.read_csv('../data/dayofweek.csv')
data = data.drop(columns=['Unnamed: 0'])
data

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,-0.533442,0.945382,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,-0.629151,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,-0.597248,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,-0.565345,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
X = data.drop(columns=['dayofweek'])
y = data['dayofweek']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21, test_size=0.2, stratify=y)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [67]:
%%timeit -n 1 -r 1
log_reg = LogisticRegression(random_state=21, fit_intercept=False)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    log_reg.fit(X_ctrain, y_ctrain)

    train_pred = log_reg.predict(X_ctrain)
    valid_pred = log_reg.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)
    
    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)
    
    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")


train - 0.64056 |   valid - 0.65926
train - 0.63561 |   valid - 0.62222
train - 0.64468 |   valid - 0.60000
train - 0.64056 |   valid - 0.64444
train - 0.65375 |   valid - 0.60741
train - 0.62902 |   valid - 0.60000
train - 0.66117 |   valid - 0.60000
train - 0.63726 |   valid - 0.54074
train - 0.63756 |   valid - 0.66418
train - 0.64745 |   valid - 0.61194
Average accuracy on crossval is 0.61502
Std is 0.03399
605 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [68]:
%%timeit -n 1 -r 1
log_reg = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, max_iter=200)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    log_reg.fit(X_ctrain, y_ctrain)

    train_pred = log_reg.predict(X_ctrain)
    valid_pred = log_reg.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)
    
    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)
    
    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 0.65952 |   valid - 0.67407
train - 0.67189 |   valid - 0.60741
train - 0.65787 |   valid - 0.62963
train - 0.66035 |   valid - 0.66667
train - 0.66777 |   valid - 0.60741
train - 0.64716 |   valid - 0.62963
train - 0.66859 |   valid - 0.59259
train - 0.66447 |   valid - 0.60741
train - 0.65898 |   valid - 0.70149
train - 0.66969 |   valid - 0.62687
Average accuracy on crossval is 0.63432
Std is 0.03340
1.35 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [69]:
%%timeit -n 1 -r 1
log_reg = LogisticRegression(random_state=21, fit_intercept=False, penalty='l2', max_iter=200)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    log_reg.fit(X_ctrain, y_ctrain)

    train_pred = log_reg.predict(X_ctrain)
    valid_pred = log_reg.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)
    
    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)
    
    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 0.64056 |   valid - 0.65926
train - 0.63561 |   valid - 0.62222
train - 0.64468 |   valid - 0.60000
train - 0.64056 |   valid - 0.64444
train - 0.65375 |   valid - 0.60741
train - 0.62902 |   valid - 0.60000
train - 0.66117 |   valid - 0.60000
train - 0.63726 |   valid - 0.54074
train - 0.63756 |   valid - 0.66418
train - 0.64745 |   valid - 0.61194
Average accuracy on crossval is 0.61502
Std is 0.03399
586 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [70]:
%%timeit -n 1 -r 1
log_reg = LogisticRegression(random_state=21, fit_intercept=False, penalty='l1', max_iter=400, solver='saga')

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    log_reg.fit(X_ctrain, y_ctrain)

    train_pred = log_reg.predict(X_ctrain)
    valid_pred = log_reg.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)
    
    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)
    
    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 0.64056 |   valid - 0.63704
train - 0.63561 |   valid - 0.60000
train - 0.64551 |   valid - 0.61481
train - 0.64303 |   valid - 0.64444
train - 0.64716 |   valid - 0.60000
train - 0.62819 |   valid - 0.60000
train - 0.65705 |   valid - 0.58519
train - 0.64551 |   valid - 0.56296
train - 0.63756 |   valid - 0.67164
train - 0.63344 |   valid - 0.55970
Average accuracy on crossval is 0.60758
Std is 0.03367
9.42 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [71]:
%%timeit -n 1 -r 1
svc = SVC(probability=True, kernel='linear', random_state=21)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    svc.fit(X_ctrain, y_ctrain)

    train_pred = svc.predict(X_ctrain)
    valid_pred = svc.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")


train - 0.70651 |   valid - 0.68148
train - 0.68920 |   valid - 0.64444
train - 0.69744 |   valid - 0.66667
train - 0.68920 |   valid - 0.65926
train - 0.69497 |   valid - 0.63704
train - 0.68673 |   valid - 0.68148
train - 0.69827 |   valid - 0.61481
train - 0.70486 |   valid - 0.57778
train - 0.68863 |   valid - 0.72388
train - 0.71005 |   valid - 0.64179
Average accuracy on crossval is 0.65286
Std is 0.03800
2.32 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [72]:
%%timeit -n 1 -r 1
svc = SVC(probability=True, kernel='linear', random_state=21, C=0.1)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    svc.fit(X_ctrain, y_ctrain)

    train_pred = svc.predict(X_ctrain)
    valid_pred = svc.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 0.57049 |   valid - 0.55556
train - 0.56884 |   valid - 0.59259
train - 0.57543 |   valid - 0.54074
train - 0.56142 |   valid - 0.60000
train - 0.59110 |   valid - 0.57037
train - 0.57873 |   valid - 0.53333
train - 0.59687 |   valid - 0.54074
train - 0.59439 |   valid - 0.52593
train - 0.56590 |   valid - 0.58209
train - 0.58731 |   valid - 0.53731
Average accuracy on crossval is 0.55787
Std is 0.02522
7.02 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [73]:
%%timeit -n 1 -r 1
svc = SVC(probability=True, kernel='linear', random_state=21, C=10)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    svc.fit(X_ctrain, y_ctrain)

    train_pred = svc.predict(X_ctrain)
    valid_pred = svc.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 0.76175 |   valid - 0.71852
train - 0.76340 |   valid - 0.71852
train - 0.76834 |   valid - 0.74074
train - 0.78071 |   valid - 0.76296
train - 0.75927 |   valid - 0.68889
train - 0.72630 |   valid - 0.74815
train - 0.76834 |   valid - 0.67407
train - 0.76340 |   valid - 0.64444
train - 0.77183 |   valid - 0.78358
train - 0.78418 |   valid - 0.70149
Average accuracy on crossval is 0.71814
Std is 0.04026
5.87 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [74]:
%%timeit -n 1 -r 1
svc = SVC(probability=True, kernel='linear', random_state=21, C=100)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    svc.fit(X_ctrain, y_ctrain)

    train_pred = svc.predict(X_ctrain)
    valid_pred = svc.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 0.79143 |   valid - 0.74074
train - 0.80214 |   valid - 0.75556
train - 0.78566 |   valid - 0.75556
train - 0.80214 |   valid - 0.74815
train - 0.78566 |   valid - 0.74815
train - 0.77824 |   valid - 0.80000
train - 0.79637 |   valid - 0.71852
train - 0.80132 |   valid - 0.71852
train - 0.78501 |   valid - 0.79851
train - 0.79242 |   valid - 0.71642
Average accuracy on crossval is 0.75001
Std is 0.02849
16.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [75]:
%%timeit -n 1 -r 1
tree = DecisionTreeClassifier(max_depth=10, random_state=21)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    tree.fit(X_ctrain, y_ctrain)

    train_pred = tree.predict(X_ctrain)
    valid_pred = tree.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 0.80874 |   valid - 0.77037
train - 0.79802 |   valid - 0.70370
train - 0.81286 |   valid - 0.72593
train - 0.80049 |   valid - 0.74815
train - 0.80956 |   valid - 0.68889
train - 0.78978 |   valid - 0.74074
train - 0.80627 |   valid - 0.60741
train - 0.82688 |   valid - 0.71111
train - 0.78995 |   valid - 0.79104
train - 0.80313 |   valid - 0.70896
Average accuracy on crossval is 0.71963
Std is 0.04791
62 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [76]:
%%timeit -n 1 -r 1
tree = DecisionTreeClassifier(max_depth=30, random_state=21)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    tree.fit(X_ctrain, y_ctrain)

    train_pred = tree.predict(X_ctrain)
    valid_pred = tree.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 1.00000 |   valid - 0.90370
train - 1.00000 |   valid - 0.85926
train - 1.00000 |   valid - 0.90370
train - 1.00000 |   valid - 0.92593
train - 1.00000 |   valid - 0.88148
train - 1.00000 |   valid - 0.88889
train - 1.00000 |   valid - 0.88148
train - 1.00000 |   valid - 0.88148
train - 1.00000 |   valid - 0.88806
train - 1.00000 |   valid - 0.86567
Average accuracy on crossval is 0.88797
Std is 0.01840
65 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [77]:
%%timeit -n 1 -r 1
tree = DecisionTreeClassifier(max_depth=30, random_state=21, min_samples_split=2)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    tree.fit(X_ctrain, y_ctrain)

    train_pred = tree.predict(X_ctrain)
    valid_pred = tree.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 1.00000 |   valid - 0.90370
train - 1.00000 |   valid - 0.85926
train - 1.00000 |   valid - 0.90370
train - 1.00000 |   valid - 0.92593
train - 1.00000 |   valid - 0.88148
train - 1.00000 |   valid - 0.88889
train - 1.00000 |   valid - 0.88148
train - 1.00000 |   valid - 0.88148
train - 1.00000 |   valid - 0.88806
train - 1.00000 |   valid - 0.86567
Average accuracy on crossval is 0.88797
Std is 0.01840
69 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [78]:
%%timeit -n 1 -r 1
rfc = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    rfc.fit(X_ctrain, y_ctrain)

    train_pred = rfc.predict(X_ctrain)
    valid_pred = rfc.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 0.97939 |   valid - 0.85185
train - 0.96620 |   valid - 0.85926
train - 0.96208 |   valid - 0.91852
train - 0.97115 |   valid - 0.91852
train - 0.97197 |   valid - 0.88148
train - 0.96538 |   valid - 0.86667
train - 0.96455 |   valid - 0.88889
train - 0.96867 |   valid - 0.87407
train - 0.96458 |   valid - 0.93284
train - 0.96787 |   valid - 0.86567
Average accuracy on crossval is 0.88578
Std is 0.02673
2.39 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [79]:
%%timeit -n 1 -r 1
rfc = RandomForestClassifier(n_estimators=50, max_depth=30, random_state=21)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    rfc.fit(X_ctrain, y_ctrain)

    train_pred = rfc.predict(X_ctrain)
    valid_pred = rfc.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 1.00000 |   valid - 0.91111
train - 1.00000 |   valid - 0.87407
train - 1.00000 |   valid - 0.94815
train - 1.00000 |   valid - 0.94074
train - 1.00000 |   valid - 0.92593
train - 1.00000 |   valid - 0.94074
train - 1.00000 |   valid - 0.95556
train - 1.00000 |   valid - 0.88148
train - 1.00000 |   valid - 0.94776
train - 1.00000 |   valid - 0.86567
Average accuracy on crossval is 0.91912
Std is 0.03215
2.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [80]:
%%timeit -n 1 -r 1
rfc = RandomForestClassifier(n_estimators=80, max_depth=30, random_state=21)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    rfc.fit(X_ctrain, y_ctrain)

    train_pred = rfc.predict(X_ctrain)
    valid_pred = rfc.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 1.00000 |   valid - 0.91111
train - 1.00000 |   valid - 0.88148
train - 1.00000 |   valid - 0.95556
train - 1.00000 |   valid - 0.94815
train - 1.00000 |   valid - 0.92593
train - 1.00000 |   valid - 0.92593
train - 1.00000 |   valid - 0.95556
train - 1.00000 |   valid - 0.87407
train - 1.00000 |   valid - 0.94776
train - 1.00000 |   valid - 0.88060
Average accuracy on crossval is 0.92061
Std is 0.03063
3.87 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [81]:
%%timeit -n 1 -r 1
rfc = RandomForestClassifier(n_estimators=200, max_depth=30, random_state=21)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    rfc.fit(X_ctrain, y_ctrain)

    train_pred = rfc.predict(X_ctrain)
    valid_pred = rfc.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 1.00000 |   valid - 0.91111
train - 1.00000 |   valid - 0.88148
train - 1.00000 |   valid - 0.95556
train - 1.00000 |   valid - 0.92593
train - 1.00000 |   valid - 0.92593
train - 1.00000 |   valid - 0.93333
train - 1.00000 |   valid - 0.95556
train - 1.00000 |   valid - 0.88889
train - 1.00000 |   valid - 0.94776
train - 1.00000 |   valid - 0.89552
Average accuracy on crossval is 0.92211
Std is 0.02575
9.17 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [82]:
%%timeit -n 1 -r 1
rfc = RandomForestClassifier(n_estimators=400, max_depth=30, random_state=21)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_accuracies = []
valid_accuracies = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_ctrain, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_ctrain, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    rfc.fit(X_ctrain, y_ctrain)

    train_pred = rfc.predict(X_ctrain)
    valid_pred = rfc.predict(X_valid)

    train_accuracy = accuracy_score(y_ctrain, train_pred)
    valid_accuracy = accuracy_score(y_valid, valid_pred)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print(f"train - {train_accuracy:.5f} |   valid - {valid_accuracy:.5f}")

average_accuracy = np.mean(valid_accuracies)
std_accuracy = np.std(valid_accuracies)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

train - 1.00000 |   valid - 0.91111
train - 1.00000 |   valid - 0.87407
train - 1.00000 |   valid - 0.95556
train - 1.00000 |   valid - 0.93333
train - 1.00000 |   valid - 0.92593
train - 1.00000 |   valid - 0.93333
train - 1.00000 |   valid - 0.94074
train - 1.00000 |   valid - 0.88889
train - 1.00000 |   valid - 0.94776
train - 1.00000 |   valid - 0.88060
Average accuracy on crossval is 0.91913
Std is 0.02749
6.62 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [51]:
rfc = RandomForestClassifier(n_estimators=200, max_depth=30, random_state=21)

rfc.fit(X_train, y_train)

In [52]:
y_pred = rfc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f'Accurcay: {accuracy:.5f}')

Accurcay: 0.93787


In [54]:
y_final = rfc.predict(X)
                     
accuracy_final = accuracy_score(y, y_final)

print(f'Accurcay: {accuracy_final:.5f}')

Accurcay: 0.98754


In [55]:
joblib.dump(rfc, '../data/best_forest.pkl')

['../data/best_forest.pkl']

In [56]:
conf_matrix = confusion_matrix(y, y_final)
conf_matrix

array([[129,   1,   0,   0,   0,   1,   5],
       [  0, 271,   0,   2,   0,   1,   0],
       [  0,   0, 147,   2,   0,   0,   0],
       [  1,   0,   0, 394,   0,   1,   0],
       [  0,   0,   0,   0, 101,   2,   1],
       [  0,   0,   0,   2,   0, 268,   1],
       [  0,   0,   0,   0,   0,   1, 355]])

In [60]:
conf_matrix_df = pd.DataFrame(conf_matrix, 
                               index=[f'True day {i}' for i in range(conf_matrix.shape[0])], 
                               columns=[f'day {i}' for i in range(conf_matrix.shape[1])])
conf_matrix_df

Unnamed: 0,day 0,day 1,day 2,day 3,day 4,day 5,day 6
True day 0,129,1,0,0,0,1,5
True day 1,0,271,0,2,0,1,0
True day 2,0,0,147,2,0,0,0
True day 3,1,0,0,394,0,1,0
True day 4,0,0,0,0,101,2,1
True day 5,0,0,0,2,0,268,1
True day 6,0,0,0,0,0,1,355


In [63]:
error_rates = {}
for i in range(conf_matrix.shape[0]):
    total_samples = np.sum(conf_matrix[i])  # Total samples for class i
    errors = total_samples - conf_matrix[i, i]  # Errors for class i
    error_rate = errors / total_samples * 100 if total_samples > 0 else 0
    error_rates[i] = error_rate

error_rates_df = pd.DataFrame(list(error_rates.items()), columns=['Class', 'Error Rate (%)'])
error_rates_df

Unnamed: 0,Class,Error Rate (%)
0,0,5.147059
1,1,1.094891
2,2,1.342282
3,3,0.505051
4,4,2.884615
5,5,1.107011
6,6,0.280899


In [64]:
max_error_class = error_rates_df.loc[error_rates_df['Error Rate (%)'].idxmax()]
print(f"Class with the highest error rate: {max_error_class['Class']} with an error rate of {max_error_class['Error Rate (%)']:.2f}%")

Class with the highest error rate: 0.0 with an error rate of 5.15%
