# Day 09. Exercise 00
# Regularization

## 0. Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import joblib

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used on the previous day into a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
# Load the day-of-week dataset from the shared datasets folder and display it.
df = pd.read_csv(filepath_or_buffer='../../datasets/dayofweek.csv')
df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,-0.533442,0.945382,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,-0.629151,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,-0.597248,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,-0.565345,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# Separate the target column from the features, then make a stratified 80/20 split.
y = df['dayofweek']
X = df.drop(columns='dayofweek')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)


In [4]:
# Number of non-missing labels in the training split.
y_train.notna().sum()

np.int64(1348)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model using only the parameters `random_state=21` and `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [5]:
def crossval(n_splits, X, y, model):
    """Evaluate ``model`` with stratified K-fold cross-validation and print the scores.

    For every fold the model is re-fitted on the training part and its
    ``score`` is computed on both the training and the validation part.
    One line per fold is printed, followed by the mean (``np.mean``) and the
    standard deviation (``np.std``) of the validation scores.

    Parameters
    ----------
    n_splits : int
        Number of folds passed to ``StratifiedKFold``.
    X : pandas.DataFrame or array-like
        Feature matrix. Objects exposing ``.iloc`` are sliced positionally
        with it; anything else is converted with ``np.asarray`` and indexed
        directly.
    y : pandas.Series or array-like
        Target labels, handled the same way as ``X``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in
        place, so after the call it holds the fit from the last fold.

    Returns
    -------
    None
        Results are only printed.
    """
    def take(data, positions):
        # Positional row selection that works for pandas objects and plain arrays/lists.
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    kf = StratifiedKFold(n_splits=n_splits)
    valid_scores = []

    # Fold-local names deliberately differ from the notebook-level
    # X_train / y_train so those globals are never shadowed here.
    for fit_idx, valid_idx in kf.split(X, y):
        X_fit, X_valid = take(X, fit_idx), take(X, valid_idx)
        y_fit, y_valid = take(y, fit_idx), take(y, valid_idx)

        model.fit(X_fit, y_fit)

        train_score = model.score(X_fit, y_fit)
        valid_score = model.score(X_valid, y_valid)

        # Only the validation scores feed the summary printed below.
        valid_scores.append(valid_score)
        print(f'train - {train_score:.5f} | valid - {valid_score:.5f}')
    print(f'Average accuracy on crossval is {np.mean(valid_scores):.5f}')
    print(f"Std is {np.std(valid_scores):.5f}")

In [6]:
# Baseline logistic regression: fit on the training split and report its score on that same split.
lg = LogisticRegression(fit_intercept=False, random_state=21).fit(X_train, y_train)
lg.score(X_train, y_train)

0.6454005934718101

In [7]:
%%time
lg = LogisticRegression(random_state=21, fit_intercept=False)
crossval(10,X_train,y_train,lg)

train - 0.62819 | valid - 0.59259
train - 0.64716 | valid - 0.62963
train - 0.63479 | valid - 0.57037
train - 0.65540 | valid - 0.61481
train - 0.63314 | valid - 0.57778
train - 0.64056 | valid - 0.59259
train - 0.64221 | valid - 0.65926
train - 0.65952 | valid - 0.56296
train - 0.64333 | valid - 0.59701
train - 0.63591 | valid - 0.62687
Average accuracy on crossval is 0.60239
Std is 0.02852
CPU times: user 2.19 s, sys: 90.3 ms, total: 2.28 s
Wall time: 1.4 s


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [8]:
%%time
lg = LogisticRegression(random_state=21, fit_intercept=False)
lg.fit(X_train, y_train)
crossval(10,X_train,y_train,lg)

train - 0.62819 | valid - 0.59259
train - 0.64716 | valid - 0.62963
train - 0.63479 | valid - 0.57037
train - 0.65540 | valid - 0.61481
train - 0.63314 | valid - 0.57778
train - 0.64056 | valid - 0.59259
train - 0.64221 | valid - 0.65926
train - 0.65952 | valid - 0.56296
train - 0.64333 | valid - 0.59701
train - 0.63591 | valid - 0.62687
Average accuracy on crossval is 0.60239
Std is 0.02852
CPU times: user 1.96 s, sys: 72.4 ms, total: 2.03 s
Wall time: 1.11 s


In [9]:
%%time
lg = LogisticRegression(penalty='l1',solver='liblinear',random_state=21, fit_intercept=False)
lg.fit(X_train, y_train)
crossval(10,X_train,y_train,lg)

train - 0.61830 | valid - 0.54815
train - 0.62737 | valid - 0.62222
train - 0.60511 | valid - 0.54074
train - 0.63644 | valid - 0.62222
train - 0.62407 | valid - 0.55556
train - 0.62325 | valid - 0.58519
train - 0.61253 | valid - 0.63704
train - 0.64716 | valid - 0.58519
train - 0.63015 | valid - 0.59701
train - 0.61367 | valid - 0.59701
Average accuracy on crossval is 0.58903
Std is 0.03129
CPU times: user 321 ms, sys: 7.68 ms, total: 329 ms
Wall time: 343 ms


In [10]:
%%time
lg = LogisticRegression(penalty='l2',solver='liblinear',random_state=21, fit_intercept=False)
lg.fit(X_train, y_train)
crossval(10,X_train,y_train,lg)


train - 0.61006 | valid - 0.56296
train - 0.61665 | valid - 0.61481
train - 0.61336 | valid - 0.59259
train - 0.62902 | valid - 0.60741
train - 0.60923 | valid - 0.55556
train - 0.61500 | valid - 0.57778
train - 0.61665 | valid - 0.61481
train - 0.64056 | valid - 0.53333
train - 0.62109 | valid - 0.58209
train - 0.61120 | valid - 0.57463
Average accuracy on crossval is 0.58160
Std is 0.02532
CPU times: user 225 ms, sys: 5.26 ms, total: 231 ms
Wall time: 232 ms


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model using only the parameters `probability=True`, `kernel='linear'`, and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [11]:
%%time
svc = SVC(probability=True, kernel='linear',random_state=21)
crossval(10,X_train,y_train,svc)

train - 0.70486 | valid - 0.65926
train - 0.69662 | valid - 0.75556
train - 0.69415 | valid - 0.62222
train - 0.70239 | valid - 0.65185
train - 0.69085 | valid - 0.65185
train - 0.68920 | valid - 0.64444
train - 0.69250 | valid - 0.72593
train - 0.70074 | valid - 0.62222
train - 0.69605 | valid - 0.61940
train - 0.71087 | valid - 0.63433
Average accuracy on crossval is 0.65871
Std is 0.04359
CPU times: user 7 s, sys: 56.9 ms, total: 7.06 s
Wall time: 7.39 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [12]:
%%time
svc = SVC(C=0.5,probability=True, kernel='linear',random_state=21)
crossval(10,X_train,y_train,svc)

train - 0.66694 | valid - 0.63704
train - 0.66612 | valid - 0.73333
train - 0.67271 | valid - 0.60741
train - 0.67354 | valid - 0.62963
train - 0.67766 | valid - 0.64444
train - 0.66529 | valid - 0.61481
train - 0.66200 | valid - 0.68889
train - 0.66529 | valid - 0.57037
train - 0.67463 | valid - 0.59701
train - 0.66804 | valid - 0.61194
Average accuracy on crossval is 0.63349
Std is 0.04471
CPU times: user 7.04 s, sys: 48.1 ms, total: 7.09 s
Wall time: 7.1 s


In [13]:
%%time
svc = SVC(C=1,probability=True, kernel='linear',random_state=21)
crossval(10,X_train,y_train,svc)

train - 0.70486 | valid - 0.65926
train - 0.69662 | valid - 0.75556
train - 0.69415 | valid - 0.62222
train - 0.70239 | valid - 0.65185
train - 0.69085 | valid - 0.65185
train - 0.68920 | valid - 0.64444
train - 0.69250 | valid - 0.72593
train - 0.70074 | valid - 0.62222
train - 0.69605 | valid - 0.61940
train - 0.71087 | valid - 0.63433
Average accuracy on crossval is 0.65871
Std is 0.04359
CPU times: user 7.01 s, sys: 36.1 ms, total: 7.04 s
Wall time: 7.15 s


In [14]:
%%time
svc = SVC(C=5,probability=True, kernel='linear',random_state=21)
crossval(10,X_train,y_train,svc)

train - 0.71723 | valid - 0.65185
train - 0.76257 | valid - 0.81481
train - 0.77906 | valid - 0.68148
train - 0.73537 | valid - 0.67407
train - 0.72135 | valid - 0.73333
train - 0.72960 | valid - 0.70370
train - 0.72712 | valid - 0.71852
train - 0.75021 | valid - 0.65926
train - 0.76112 | valid - 0.67164
train - 0.76359 | valid - 0.68657
Average accuracy on crossval is 0.69952
Std is 0.04542
CPU times: user 7.31 s, sys: 89.6 ms, total: 7.4 s
Wall time: 7.65 s


## 4. Tree

### a. Default regularization

1. Train a baseline model using only the parameters `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [15]:
%%time
dt = DecisionTreeClassifier(max_depth=10,random_state=21)
crossval(10,X_train,y_train,dt)

train - 0.81039 | valid - 0.74074
train - 0.77741 | valid - 0.74074
train - 0.83347 | valid - 0.70370
train - 0.79720 | valid - 0.76296
train - 0.82440 | valid - 0.75556
train - 0.80379 | valid - 0.68889
train - 0.80709 | valid - 0.76296
train - 0.80132 | valid - 0.65926
train - 0.80807 | valid - 0.75373
train - 0.80478 | valid - 0.68657
Average accuracy on crossval is 0.72551
Std is 0.03562
CPU times: user 127 ms, sys: 2.07 ms, total: 129 ms
Wall time: 132 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [16]:
%%time
dt = DecisionTreeClassifier(max_depth=5,random_state=21)
crossval(10,X_train,y_train,dt)

train - 0.59522 | valid - 0.53333
train - 0.56307 | valid - 0.53333
train - 0.60181 | valid - 0.55556
train - 0.59604 | valid - 0.57037
train - 0.60264 | valid - 0.57778
train - 0.57955 | valid - 0.53333
train - 0.58368 | valid - 0.54815
train - 0.59275 | valid - 0.51111
train - 0.58237 | valid - 0.56716
train - 0.60132 | valid - 0.50000
Average accuracy on crossval is 0.54301
Std is 0.02423
CPU times: user 122 ms, sys: 5.26 ms, total: 127 ms
Wall time: 232 ms


In [17]:
%%time
dt = DecisionTreeClassifier(max_depth=10,random_state=21)
crossval(10,X_train,y_train,dt)

train - 0.81039 | valid - 0.74074
train - 0.77741 | valid - 0.74074
train - 0.83347 | valid - 0.70370
train - 0.79720 | valid - 0.76296
train - 0.82440 | valid - 0.75556
train - 0.80379 | valid - 0.68889
train - 0.80709 | valid - 0.76296
train - 0.80132 | valid - 0.65926
train - 0.80807 | valid - 0.75373
train - 0.80478 | valid - 0.68657
Average accuracy on crossval is 0.72551
Std is 0.03562
CPU times: user 123 ms, sys: 3.04 ms, total: 126 ms
Wall time: 128 ms


In [18]:
%%time
dt = DecisionTreeClassifier(max_depth=21,random_state=21)
crossval(10,X_train,y_train,dt)

train - 0.99176 | valid - 0.86667
train - 0.99176 | valid - 0.90370
train - 0.99093 | valid - 0.85926
train - 0.99176 | valid - 0.89630
train - 0.99093 | valid - 0.88148
train - 0.98516 | valid - 0.88148
train - 0.98928 | valid - 0.91852
train - 0.99423 | valid - 0.87407
train - 0.99588 | valid - 0.86567
train - 0.99012 | valid - 0.88060
Average accuracy on crossval is 0.88278
Std is 0.01757
CPU times: user 133 ms, sys: 5.12 ms, total: 138 ms
Wall time: 146 ms


In [19]:
%%time
dt = DecisionTreeClassifier(max_depth=22,random_state=21)
crossval(10,X_train,y_train,dt)

train - 0.99423 | valid - 0.85926
train - 0.99340 | valid - 0.88889
train - 0.99423 | valid - 0.86667
train - 0.99588 | valid - 0.89630
train - 0.99423 | valid - 0.88148
train - 0.99093 | valid - 0.85926
train - 0.99176 | valid - 0.91852
train - 0.99588 | valid - 0.88148
train - 0.99753 | valid - 0.86567
train - 0.99176 | valid - 0.88060
Average accuracy on crossval is 0.87981
Std is 0.01754
CPU times: user 130 ms, sys: 3.15 ms, total: 134 ms
Wall time: 136 ms


In [20]:
%%time
dt = DecisionTreeClassifier(max_depth=24,random_state=21)
crossval(10,X_train,y_train,dt)

train - 1.00000 | valid - 0.85926
train - 0.99835 | valid - 0.92593
train - 0.99753 | valid - 0.86667
train - 0.99918 | valid - 0.91111
train - 0.99835 | valid - 0.88148
train - 0.99670 | valid - 0.86667
train - 0.99588 | valid - 0.92593
train - 0.99835 | valid - 0.88148
train - 0.99918 | valid - 0.87313
train - 0.99753 | valid - 0.88060
Average accuracy on crossval is 0.88722
Std is 0.02346
CPU times: user 127 ms, sys: 5.18 ms, total: 132 ms
Wall time: 131 ms


## 5. Random forest

### a. Default regularization

1. Train a baseline model using only the parameters `n_estimators=50`, `max_depth=14`, and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [21]:
%%time
rf = RandomForestClassifier(n_estimators=50,max_depth=14, random_state=21)
crossval(10,X_train, y_train, rf)

train - 0.96455 | valid - 0.88148
train - 0.96208 | valid - 0.91852
train - 0.96785 | valid - 0.86667
train - 0.96455 | valid - 0.89630
train - 0.96538 | valid - 0.91111
train - 0.96538 | valid - 0.88148
train - 0.97115 | valid - 0.91852
train - 0.96867 | valid - 0.85185
train - 0.97364 | valid - 0.88060
train - 0.97941 | valid - 0.86567
Average accuracy on crossval is 0.88722
Std is 0.02204
CPU times: user 1.81 s, sys: 15.9 ms, total: 1.83 s
Wall time: 1.84 s


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [22]:
%%time
rf = RandomForestClassifier(n_estimators=50,max_depth=14, random_state=21)
crossval(10,X_train, y_train, rf)

train - 0.96455 | valid - 0.88148
train - 0.96208 | valid - 0.91852
train - 0.96785 | valid - 0.86667
train - 0.96455 | valid - 0.89630
train - 0.96538 | valid - 0.91111
train - 0.96538 | valid - 0.88148
train - 0.97115 | valid - 0.91852
train - 0.96867 | valid - 0.85185
train - 0.97364 | valid - 0.88060
train - 0.97941 | valid - 0.86567
Average accuracy on crossval is 0.88722
Std is 0.02204
CPU times: user 1.74 s, sys: 31 ms, total: 1.77 s
Wall time: 1.98 s


In [23]:
%%time
rf = RandomForestClassifier(n_estimators=50,max_depth=27, random_state=21)
crossval(10,X_train, y_train, rf)

train - 1.00000 | valid - 0.89630
train - 1.00000 | valid - 0.93333
train - 1.00000 | valid - 0.90370
train - 1.00000 | valid - 0.93333
train - 0.99918 | valid - 0.91852
train - 0.99918 | valid - 0.88889
train - 1.00000 | valid - 0.92593
train - 1.00000 | valid - 0.90370
train - 1.00000 | valid - 0.94030
train - 0.99918 | valid - 0.87313
Average accuracy on crossval is 0.91171
Std is 0.02092
CPU times: user 1.99 s, sys: 46.3 ms, total: 2.03 s
Wall time: 2.34 s


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze which weekday your model makes the most errors for (as a percentage of the total number of samples of that class in your test dataset).
4. Save the model.

In [24]:
%%time
rf = RandomForestClassifier(n_estimators=50,max_depth=27, random_state=21)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

CPU times: user 195 ms, sys: 1.92 ms, total: 197 ms
Wall time: 245 ms


0.9260355029585798

In [25]:
%%time
error = pd.DataFrame({'real': y_test, 'pred': y_pred})
error['error'] = error['real'] != error['pred']
days = (error.groupby('real')['error'].mean().sort_values(ascending=False))*100
days

CPU times: user 3.64 ms, sys: 0 ns, total: 3.64 ms
Wall time: 3.84 ms


real
0    25.925926
4    14.285714
1    10.909091
5     7.407407
2     6.666667
3     2.500000
6     1.408451
Name: error, dtype: float64

In [26]:
# Persist the fitted forest next to the notebook.
joblib.dump(value=rf, filename='model.pkl')

['model.pkl']