# Day 09. Exercise 00
# Regularization

## 0. Imports

In [63]:
import pandas as pd
import numpy as np
import joblib

In [64]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


## 1. Preprocessing

1. Считайте файл `dayofweek.csv`, который вы использовали в предыдущий день, в кадр данных.
2. Используя `train_test_split` с параметрами `test_size=0.2`, `random_state=21`, получите `X_train`, `y_train`, `X_test`, `y_test`. Используйте дополнительный параметр `stratify`.

In [65]:
df = pd.read_csv('../data/dayofweek.csv')
x = df.drop('dayofweek', axis=1)
y = df.dayofweek
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21, stratify=y)

## 2. Logreg regularization

### a. Default regularization

1. Обучите базовую модель с единственными параметрами `random_state=21`, `fit_intercept=False`.
2. Используйте стратифицированную K-кратную кросс-валидацию с `10` разбиениями для оценки точности модели.


Результат кода, в котором вы обучали и оценивали базовую модель, должен быть примерно таким (используйте `%%time`, чтобы получить информацию о том, сколько времени занял запуск ячейки):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [66]:
model = LogisticRegression(random_state=21, fit_intercept=False)
model.fit(x_train, y_train)
print(f'accuracy train = {accuracy_score(y_train, model.predict(x_train))}')
print(f'accuracy test = {accuracy_score(y_test, model.predict(x_test))}')
model.fit(x, y)
print(f'accuracy = {accuracy_score(y, model.predict(x))}')

accuracy train = 0.6454005934718101
accuracy test = 0.6331360946745562
accuracy = 0.6405693950177936


In [67]:
def crossval(x, y, model):
    scores_train = []
    scores_test = []
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

    for train_index, test_index in skf.split(x, y):
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(x_train, y_train)
        scores_train.append(accuracy_score(y_train, model.predict(x_train)))
        scores_test.append(accuracy_score(y_test, model.predict(x_test)))

    for i in range(10):
        print(f'train - {scores_train[i]:.5f} | valid - {scores_test[i]:.5f}')
    print(f'Average accuracy on crossval is {np.mean(scores_test):.5f}')
    print(f'Std is {np.std(scores_test):.5f}')

In [68]:
%%time
crossval(x, y, model)

train - 0.63546 | valid - 0.65089
train - 0.65326 | valid - 0.60947
train - 0.63942 | valid - 0.63314
train - 0.63283 | valid - 0.57988
train - 0.65590 | valid - 0.57988
train - 0.64535 | valid - 0.62130
train - 0.63834 | valid - 0.60714
train - 0.63702 | valid - 0.59524
train - 0.64295 | valid - 0.68452
train - 0.63900 | valid - 0.56548
Average accuracy on crossval is 0.61269
Std is 0.03441
CPU times: total: 15.6 ms
Wall time: 199 ms


### b. Optimizing regularization parameters

1. В ячейках ниже попробуйте разные значения штрафа: `none`, `l1`, `l2` - вы можете менять и значения решателя.

In [69]:
%%time
model = LogisticRegression(random_state=21, fit_intercept=False, penalty='l1', solver='liblinear')
crossval(x, y, model)

train - 0.62887 | valid - 0.63314
train - 0.63349 | valid - 0.60947
train - 0.62360 | valid - 0.64497
train - 0.62624 | valid - 0.57988
train - 0.63744 | valid - 0.55621
train - 0.62558 | valid - 0.59172
train - 0.62912 | valid - 0.60119
train - 0.61397 | valid - 0.54762
train - 0.61792 | valid - 0.64286
train - 0.63241 | valid - 0.55952
Average accuracy on crossval is 0.59666
Std is 0.03422
CPU times: total: 109 ms
Wall time: 130 ms


In [70]:
%%time
model = LogisticRegression(random_state=21, fit_intercept=False, penalty='l2', solver='liblinear')
crossval(x, y, model)

train - 0.62360 | valid - 0.64497
train - 0.62953 | valid - 0.59763
train - 0.61833 | valid - 0.61538
train - 0.61833 | valid - 0.58580
train - 0.63151 | valid - 0.56213
train - 0.61898 | valid - 0.60355
train - 0.62121 | valid - 0.59524
train - 0.61462 | valid - 0.55952
train - 0.61067 | valid - 0.63095
train - 0.61858 | valid - 0.54762
Average accuracy on crossval is 0.59428
Std is 0.02991
CPU times: total: 78.1 ms
Wall time: 83.1 ms


## 3. SVM regularization

### a. Default regularization

1. Обучите базовую модель с единственными параметрами `probability=True`, `kernel='linear'`, `random_state=21`.
2. Используйте стратифицированную K-кратную кросс-валидацию с `10` разбиениями для оценки точности модели.
3. Формат результатов кода, в котором вы обучали и оценивали базовую модель, должен быть аналогичен тому, что вы получили для logreg.

In [71]:
model = SVC(probability=True, kernel='linear', random_state=21)
model.fit(x_train, y_train)
print(f'accuracy train = {accuracy_score(y_train, model.predict(x_train))}')
print(f'accuracy test = {accuracy_score(y_test, model.predict(x_test))}')
model.fit(x, y)
print(f'accuracy = {accuracy_score(y, model.predict(x))}')

accuracy train = 0.6891691394658753
accuracy test = 0.7159763313609467
accuracy = 0.702846975088968


In [72]:
%%time
crossval(x, y, model)

train - 0.70138 | valid - 0.71598
train - 0.69677 | valid - 0.68639
train - 0.70402 | valid - 0.71006
train - 0.69941 | valid - 0.63905
train - 0.71127 | valid - 0.62130
train - 0.70336 | valid - 0.69822
train - 0.69038 | valid - 0.67857
train - 0.70487 | valid - 0.69048
train - 0.69895 | valid - 0.71429
train - 0.70026 | valid - 0.61905
Average accuracy on crossval is 0.67734
Std is 0.03553
CPU times: total: 2.69 s
Wall time: 2.74 s


### b. Optimizing regularization parameters

1. В ячейках ниже попробуйте разные значения параметра `C`.

In [73]:
%%time
model = SVC(probability=True, kernel='linear', random_state=21, C=0.1)
crossval(x, y, model)

train - 0.58075 | valid - 0.59763
train - 0.57877 | valid - 0.54438
train - 0.57284 | valid - 0.57396
train - 0.58603 | valid - 0.61538
train - 0.59328 | valid - 0.50296
train - 0.56823 | valid - 0.57396
train - 0.57115 | valid - 0.54762
train - 0.57971 | valid - 0.58929
train - 0.58696 | valid - 0.58929
train - 0.59289 | valid - 0.54762
Average accuracy on crossval is 0.56821
Std is 0.03118
CPU times: total: 2.89 s
Wall time: 2.91 s


In [74]:
%%time
model = SVC(probability=True, kernel='linear', random_state=21, C=0.01)
crossval(x, y, model)

train - 0.40804 | valid - 0.41420
train - 0.41859 | valid - 0.38462
train - 0.43771 | valid - 0.41420
train - 0.44034 | valid - 0.44970
train - 0.39684 | valid - 0.36095
train - 0.43705 | valid - 0.48521
train - 0.44137 | valid - 0.43452
train - 0.39789 | valid - 0.40476
train - 0.44137 | valid - 0.42857
train - 0.43412 | valid - 0.45238
Average accuracy on crossval is 0.42291
Std is 0.03380
CPU times: total: 3.47 s
Wall time: 3.46 s


In [75]:
%%time
model = SVC(probability=True, kernel='linear', random_state=21, C=10)
crossval(x, y, model)

train - 0.77521 | valid - 0.75740
train - 0.77587 | valid - 0.73964
train - 0.77983 | valid - 0.75740
train - 0.78049 | valid - 0.75148
train - 0.78510 | valid - 0.69822
train - 0.76664 | valid - 0.75740
train - 0.78195 | valid - 0.77381
train - 0.78195 | valid - 0.75595
train - 0.77800 | valid - 0.78571
train - 0.76680 | valid - 0.71429
Average accuracy on crossval is 0.74913
Std is 0.02470
CPU times: total: 4.06 s
Wall time: 4.14 s


## 4. Tree

### a. Default regularization

1. Обучите базовую модель с единственным параметром `max_depth=10` и `random_state=21`.
2. Для оценки точности модели используйте стратифицированную K-кратную кросс-валидацию с разбиением `10`.
3. Формат результатов кода, в котором вы обучали и оценивали базовую модель, должен быть аналогичен тому, что вы получили для logreg.

In [76]:
model = DecisionTreeClassifier(max_depth=10, random_state=21)
model.fit(x_train, y_train)
print(f'accuracy train = {accuracy_score(y_train, model.predict(x_train))}')
print(f'accuracy test = {accuracy_score(y_test, model.predict(x_test))}')
model.fit(x, y)
print(f'accuracy = {accuracy_score(y, model.predict(x))}')

accuracy train = 0.7848664688427299
accuracy test = 0.7396449704142012
accuracy = 0.8196915776986952


In [77]:
%%time
crossval(x, y, model)

train - 0.82004 | valid - 0.79290
train - 0.82663 | valid - 0.69822
train - 0.82927 | valid - 0.76331
train - 0.81806 | valid - 0.71598
train - 0.82268 | valid - 0.74556
train - 0.80554 | valid - 0.77515
train - 0.83333 | valid - 0.75595
train - 0.81555 | valid - 0.76786
train - 0.81225 | valid - 0.77381
train - 0.81752 | valid - 0.69048
Average accuracy on crossval is 0.74792
Std is 0.03306
CPU times: total: 46.9 ms
Wall time: 60.1 ms


### b. Optimizing regularization parameters

1. В ячейках ниже попробуйте разные значения параметра `max_depth`.
2. В качестве бонуса поиграйте с другими параметрами регуляризации, пытаясь найти наилучшую комбинацию.

In [78]:
%%time
model = DecisionTreeClassifier(max_depth=1, random_state=21)
crossval(x, y, model)

train - 0.35662 | valid - 0.36686
train - 0.35992 | valid - 0.33728
train - 0.35860 | valid - 0.34911
train - 0.35662 | valid - 0.36686
train - 0.36058 | valid - 0.33136
train - 0.35333 | valid - 0.39645
train - 0.35837 | valid - 0.35119
train - 0.35837 | valid - 0.35119
train - 0.35705 | valid - 0.36310
train - 0.35705 | valid - 0.36310
Average accuracy on crossval is 0.35765
Std is 0.01731
CPU times: total: 31.2 ms
Wall time: 42.6 ms


In [79]:
%%time
model = DecisionTreeClassifier(max_depth=5, random_state=21)
crossval(x, y, model)

train - 0.62096 | valid - 0.62130
train - 0.61042 | valid - 0.55030
train - 0.62426 | valid - 0.59763
train - 0.60976 | valid - 0.59172
train - 0.61305 | valid - 0.53846
train - 0.60382 | valid - 0.56213
train - 0.61199 | valid - 0.61905
train - 0.61199 | valid - 0.59524
train - 0.60738 | valid - 0.64286
train - 0.62187 | valid - 0.55952
Average accuracy on crossval is 0.58782
Std is 0.03255
CPU times: total: 46.9 ms
Wall time: 53.6 ms


In [80]:
%%time
model = DecisionTreeClassifier(max_depth=20, random_state=21)
crossval(x, y, model)

train - 0.98682 | valid - 0.89349
train - 0.98550 | valid - 0.85207
train - 0.99473 | valid - 0.93491
train - 0.99341 | valid - 0.89941
train - 0.98484 | valid - 0.88757
train - 0.98286 | valid - 0.88166
train - 0.98617 | valid - 0.89881
train - 0.99341 | valid - 0.90476
train - 0.98551 | valid - 0.92262
train - 0.98946 | valid - 0.87500
Average accuracy on crossval is 0.89503
Std is 0.02228
CPU times: total: 46.9 ms
Wall time: 65.5 ms


In [81]:
%%time
model = DecisionTreeClassifier(random_state=21)
param_grid = {'criterion': ['gini', 'entropy', 'log_loss'],
              'splitter': ['best', 'random'],
              'max_depth': np.arange(5, 30, 5)}

gs = GridSearchCV(model, param_grid, scoring='accuracy', n_jobs=-1)
gs.fit(x, y)

print(f'Best params: {gs.best_params_}')
print(f'Best score: {gs.best_score_}')

Best params: {'criterion': 'entropy', 'max_depth': np.int64(20), 'splitter': 'random'}
Best score: 0.5166944673678296
CPU times: total: 15.6 ms
Wall time: 156 ms


## 5. Random forest

### a. Default regularization

1. Обучите базовую модель с единственными параметрами `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Для оценки точности модели используйте стратифицированную K-кратную кросс-валидацию с разбиением на `10`.
3. Формат результатов кода, в котором вы обучали и оценивали базовую модель, должен быть аналогичен тому, что вы получили для logreg.

In [82]:
model = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
model.fit(x_train, y_train)
print(f'accuracy train = {accuracy_score(y_train, model.predict(x_train))}')
print(f'accuracy test = {accuracy_score(y_test, model.predict(x_test))}')
model.fit(x, y)
print(f'accuracy = {accuracy_score(y, model.predict(x))}')

accuracy train = 0.9651335311572701
accuracy test = 0.908284023668639
accuracy = 0.966785290628707


In [83]:
%%time
crossval(x, y, model)

train - 0.97034 | valid - 0.90533
train - 0.96704 | valid - 0.87574
train - 0.96902 | valid - 0.91124
train - 0.97429 | valid - 0.89349
train - 0.96243 | valid - 0.86982
train - 0.96638 | valid - 0.94083
train - 0.97036 | valid - 0.92262
train - 0.97036 | valid - 0.91667
train - 0.96838 | valid - 0.89881
train - 0.97563 | valid - 0.88690
Average accuracy on crossval is 0.90214
Std is 0.02069
CPU times: total: 703 ms
Wall time: 786 ms


### b. Optimizing regularization parameters

1. В новых ячейках попробуйте разные значения параметров `max_depth` и `n_estimators`.
2. В качестве бонуса поиграйте с другими параметрами регуляризации, пытаясь найти наилучшую комбинацию.

In [84]:
%%time
model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=21)
crossval(x, y, model)

train - 0.60382 | valid - 0.58580
train - 0.61503 | valid - 0.59763
train - 0.61239 | valid - 0.60947
train - 0.58339 | valid - 0.56213
train - 0.59591 | valid - 0.52071
train - 0.58668 | valid - 0.55621
train - 0.59157 | valid - 0.55952
train - 0.56983 | valid - 0.58929
train - 0.59618 | valid - 0.57143
train - 0.60277 | valid - 0.57738
Average accuracy on crossval is 0.57296
Std is 0.02388
CPU times: total: 594 ms
Wall time: 624 ms


In [85]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=21)
crossval(x, y, model)

train - 0.88991 | valid - 0.85799
train - 0.86157 | valid - 0.77515
train - 0.86882 | valid - 0.83432
train - 0.89321 | valid - 0.82840
train - 0.90112 | valid - 0.79290
train - 0.87805 | valid - 0.84615
train - 0.88538 | valid - 0.85714
train - 0.86957 | valid - 0.85714
train - 0.88406 | valid - 0.84524
train - 0.89065 | valid - 0.79167
Average accuracy on crossval is 0.82861
Std is 0.02934
CPU times: total: 1.34 s
Wall time: 1.39 s


In [86]:
%%time
model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=21)
crossval(x, y, model)

train - 0.99934 | valid - 0.92308
train - 0.99802 | valid - 0.92308
train - 0.99934 | valid - 0.93491
train - 0.99736 | valid - 0.93491
train - 0.99868 | valid - 0.92308
train - 0.99802 | valid - 0.94675
train - 0.99736 | valid - 0.94643
train - 0.99736 | valid - 0.91667
train - 0.99671 | valid - 0.93452
train - 0.99736 | valid - 0.92262
Average accuracy on crossval is 0.93060
Std is 0.00996
CPU times: total: 2.95 s
Wall time: 3.33 s


In [87]:
%%time
model = RandomForestClassifier(n_estimators=20, max_depth=20, random_state=21)
crossval(x, y, model)

train - 0.99209 | valid - 0.92899
train - 0.99407 | valid - 0.91716
train - 0.99341 | valid - 0.92899
train - 0.99341 | valid - 0.92308
train - 0.99473 | valid - 0.90533
train - 0.99407 | valid - 0.93491
train - 0.99473 | valid - 0.92857
train - 0.99802 | valid - 0.91667
train - 0.99473 | valid - 0.93452
train - 0.98814 | valid - 0.91071
Average accuracy on crossval is 0.92289
Std is 0.00957
CPU times: total: 344 ms
Wall time: 375 ms


In [88]:
%%time
model = RandomForestClassifier(random_state=21)
param_grid = {'n_estimators': np.arange(20, 100, 20),
              'criterion': ['gini', 'entropy', 'log_loss'],
              'max_depth': np.arange(5, 30, 5)}

gs = GridSearchCV(model, param_grid, scoring='accuracy', n_jobs=-1)
gs.fit(x, y)

print(f'Best params: {gs.best_params_}')
print(f'Best score: {gs.best_score_}')

Best params: {'criterion': 'entropy', 'max_depth': np.int64(20), 'n_estimators': np.int64(20)}
Best score: 0.5398346004600285
CPU times: total: 234 ms
Wall time: 3.66 s


## 6. Predictions

1. Выберите лучшую модель и используйте ее для прогнозирования тестового набора данных.
2. Рассчитайте итоговую точность.
3. Проанализируйте: для какого дня недели ваша модель делает больше всего ошибок (в % от общего числа образцов этого класса в тестовом наборе данных).
4. Сохраните модель.

In [89]:
model = RandomForestClassifier(n_estimators=20, max_depth=20, random_state=21)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(f'Final accuracy = {accuracy_score(y_test, y_pred)}')

Final accuracy = 0.9112426035502958


In [90]:
cm = confusion_matrix(y_test, y_pred, labels=np.unique(y_test))
cm_df = pd.DataFrame(cm, index=np.unique(y_test), columns=np.unique(y_test))

print('Confusion Matrix:')
print(cm_df)

total_samples = y_test.value_counts().sort_index()

errors = {}
for true_label in np.unique(y_test):
    errors[true_label] = cm_df.loc[true_label].sum() - cm_df.loc[true_label, true_label]

errors_df = pd.DataFrame({'Total Samples': total_samples, 'Errors': pd.Series(errors)})
errors_df['Error Rate'] = (errors_df['Errors'] / errors_df['Total Samples']) * 100
print(errors_df)

Confusion Matrix:
    0   1   2   3   4   5   6
0  20   1   0   1   0   1   4
1   1  50   0   2   0   2   0
2   0   0  28   2   0   0   0
3   0   0   1  78   0   0   1
4   0   0   0   0  18   3   0
5   0   0   0   2   0  48   4
6   0   0   0   1   0   4  66
   Total Samples  Errors  Error Rate
0             27       7   25.925926
1             55       5    9.090909
2             30       2    6.666667
3             80       2    2.500000
4             21       3   14.285714
5             54       6   11.111111
6             71       5    7.042254


In [91]:
joblib.dump(model, 'model.pkl')

['model.pkl']