# Day 09. Exercise 00
# Regularization

## 0. Imports

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import joblib

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [3]:
df = pd.read_csv("../data/dayofweek.csv")
df.head()

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1686 entries, 0 to 1685
Data columns (total 44 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   numTrials         1686 non-null   float64
 1   hour              1686 non-null   float64
 2   dayofweek         1686 non-null   int64  
 3   uid_user_0        1686 non-null   float64
 4   uid_user_1        1686 non-null   float64
 5   uid_user_10       1686 non-null   float64
 6   uid_user_11       1686 non-null   float64
 7   uid_user_12       1686 non-null   float64
 8   uid_user_13       1686 non-null   float64
 9   uid_user_14       1686 non-null   float64
 10  uid_user_15       1686 non-null   float64
 11  uid_user_16       1686 non-null   float64
 12  uid_user_17       1686 non-null   float64
 13  uid_user_18       1686 non-null   float64
 14  uid_user_19       1686 non-null   float64
 15  uid_user_2        1686 non-null   float64
 16  uid_user_20       1686 non-null   float64


In [5]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [7]:
def trained_and_evaluated(model, X, y):
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
    train_scores, valid_scores = [], []
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_valid_pred = model.predict(X_valid)

        train_accuracy = accuracy_score(y_train, y_train_pred)
        valid_accuracy = accuracy_score(y_valid, y_valid_pred)

        train_scores.append(train_accuracy)
        valid_scores.append(valid_accuracy)

        print(f'train -  {train_accuracy:.5f}   |   valid -  {valid_accuracy:.5f}')

    avg_train_accuracy = np.mean(train_scores)
    avg_valid_accuracy = np.mean(valid_scores)
    std_valid_accuracy = np.std(valid_scores)

    print(f'Average accuracy on crossval is {avg_valid_accuracy:.5f}')
    print(f'Std is {std_valid_accuracy:.5f}')

In [8]:
%%time
trained_and_evaluated(LogisticRegression(random_state=21, fit_intercept=False), X_train, y_train)

train -  0.64056   |   valid -  0.65926
train -  0.63561   |   valid -  0.62222
train -  0.64468   |   valid -  0.60000
train -  0.64056   |   valid -  0.64444
train -  0.65375   |   valid -  0.60741
train -  0.62902   |   valid -  0.60000
train -  0.66117   |   valid -  0.60000
train -  0.63726   |   valid -  0.54074
train -  0.63756   |   valid -  0.66418
train -  0.64745   |   valid -  0.61194
Average accuracy on crossval is 0.61502
Std is 0.03399
CPU times: total: 141 ms
Wall time: 161 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [9]:
%%time
clf = LogisticRegression(penalty=None, solver='lbfgs', max_iter=1000)
trained_and_evaluated(clf, X_train, y_train)

train -  0.65870   |   valid -  0.67407
train -  0.67189   |   valid -  0.60741
train -  0.65787   |   valid -  0.62963
train -  0.65870   |   valid -  0.66667
train -  0.66777   |   valid -  0.60741
train -  0.64798   |   valid -  0.62963
train -  0.66859   |   valid -  0.59259
train -  0.66612   |   valid -  0.60741
train -  0.66063   |   valid -  0.70149
train -  0.66886   |   valid -  0.62687
Average accuracy on crossval is 0.63432
Std is 0.03340
CPU times: total: 516 ms
Wall time: 568 ms


In [10]:
%%time
clf = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
clf.fit(X_train, y_train)
trained_and_evaluated(clf, X_train, y_train)

train -  0.62242   |   valid -  0.62963
train -  0.61913   |   valid -  0.59259
train -  0.62984   |   valid -  0.60741
train -  0.62407   |   valid -  0.62222
train -  0.63067   |   valid -  0.57037
train -  0.61748   |   valid -  0.59259
train -  0.65128   |   valid -  0.60000
train -  0.63314   |   valid -  0.53333
train -  0.62850   |   valid -  0.65672
train -  0.62685   |   valid -  0.58209
Average accuracy on crossval is 0.59870
Std is 0.03222
CPU times: total: 203 ms
Wall time: 192 ms


In [11]:
%%time
clf = LogisticRegression(penalty='l2', solver='saga', max_iter=1000)
trained_and_evaluated(clf, X_train, y_train)

train -  0.64386   |   valid -  0.65926
train -  0.64798   |   valid -  0.62963
train -  0.64386   |   valid -  0.60741
train -  0.64468   |   valid -  0.64444
train -  0.65870   |   valid -  0.60741
train -  0.63397   |   valid -  0.60000
train -  0.66364   |   valid -  0.60000
train -  0.63726   |   valid -  0.54074
train -  0.64086   |   valid -  0.67910
train -  0.64827   |   valid -  0.60448
Average accuracy on crossval is 0.61725
Std is 0.03645
CPU times: total: 625 ms
Wall time: 650 ms


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [12]:
%%time
svc = SVC(probability=True, kernel='linear', random_state=21)
trained_and_evaluated(svc, X_train, y_train)

train -  0.70651   |   valid -  0.68148
train -  0.68920   |   valid -  0.64444
train -  0.69744   |   valid -  0.66667
train -  0.68920   |   valid -  0.65926
train -  0.69497   |   valid -  0.63704
train -  0.68673   |   valid -  0.68148
train -  0.69827   |   valid -  0.61481
train -  0.70486   |   valid -  0.57778
train -  0.68863   |   valid -  0.72388
train -  0.71005   |   valid -  0.64179
Average accuracy on crossval is 0.65286
Std is 0.03800
CPU times: total: 1.7 s
Wall time: 1.7 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [13]:
%%time
svc = SVC(C=0.1, probability=True, kernel='linear', random_state=21)
trained_and_evaluated(svc, X_train, y_train)

train -  0.57049   |   valid -  0.55556
train -  0.56884   |   valid -  0.59259
train -  0.57543   |   valid -  0.54074
train -  0.56142   |   valid -  0.60000
train -  0.59110   |   valid -  0.57037
train -  0.57873   |   valid -  0.53333
train -  0.59687   |   valid -  0.54074
train -  0.59439   |   valid -  0.52593
train -  0.56590   |   valid -  0.58209
train -  0.58731   |   valid -  0.53731
Average accuracy on crossval is 0.55787
Std is 0.02522
CPU times: total: 1.48 s
Wall time: 1.58 s


In [14]:
%%time
svc = SVC(C=0.5, probability=True, kernel='linear', random_state=21)
trained_and_evaluated(svc, X_train, y_train)

train -  0.67436   |   valid -  0.65185
train -  0.66612   |   valid -  0.62963
train -  0.67024   |   valid -  0.65926
train -  0.66777   |   valid -  0.62963
train -  0.67189   |   valid -  0.62963
train -  0.66035   |   valid -  0.62222
train -  0.68261   |   valid -  0.59259
train -  0.67766   |   valid -  0.57037
train -  0.67298   |   valid -  0.70896
train -  0.67792   |   valid -  0.60448
Average accuracy on crossval is 0.62986
Std is 0.03638
CPU times: total: 1.56 s
Wall time: 1.56 s


In [15]:
%%time
svc = SVC(C=1, probability=True, kernel='linear', random_state=21)
trained_and_evaluated(svc, X_train, y_train)

train -  0.70651   |   valid -  0.68148
train -  0.68920   |   valid -  0.64444
train -  0.69744   |   valid -  0.66667
train -  0.68920   |   valid -  0.65926
train -  0.69497   |   valid -  0.63704
train -  0.68673   |   valid -  0.68148
train -  0.69827   |   valid -  0.61481
train -  0.70486   |   valid -  0.57778
train -  0.68863   |   valid -  0.72388
train -  0.71005   |   valid -  0.64179
Average accuracy on crossval is 0.65286
Std is 0.03800
CPU times: total: 1.56 s
Wall time: 1.6 s


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [16]:
%%time
tree_model = DecisionTreeClassifier(max_depth=10, random_state=21)
trained_and_evaluated(tree_model, X_train, y_train)

train -  0.80874   |   valid -  0.77037
train -  0.79802   |   valid -  0.70370
train -  0.81286   |   valid -  0.72593
train -  0.80049   |   valid -  0.74815
train -  0.80956   |   valid -  0.68889
train -  0.78978   |   valid -  0.74074
train -  0.80627   |   valid -  0.60741
train -  0.82688   |   valid -  0.71111
train -  0.78995   |   valid -  0.79104
train -  0.80313   |   valid -  0.70896
Average accuracy on crossval is 0.71963
Std is 0.04791
CPU times: total: 46.9 ms
Wall time: 46 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [17]:
param_grid = {
    'max_depth': [2, 3, 4, 5, 10],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [5, 10, 15],
    'random_state' : [21]
}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

In [18]:
%%time
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, 
                           scoring='accuracy', cv=cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f'Best parametrs {grid_search.best_params_}')
print(f'Best score {grid_search.best_score_}')

Best parametrs {'max_depth': 10, 'max_features': None, 'max_leaf_nodes': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 21}
Best score 0.5482310668877833
CPU times: total: 1.66 s
Wall time: 4.73 s


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [19]:
%%time
forest_model = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
trained_and_evaluated(forest_model, X_train, y_train)

train -  0.97939   |   valid -  0.85185
train -  0.96620   |   valid -  0.85926
train -  0.96208   |   valid -  0.91852
train -  0.97115   |   valid -  0.91852
train -  0.97197   |   valid -  0.88148
train -  0.96538   |   valid -  0.86667
train -  0.96455   |   valid -  0.88889
train -  0.96867   |   valid -  0.87407
train -  0.96458   |   valid -  0.93284
train -  0.96787   |   valid -  0.86567
Average accuracy on crossval is 0.88578
Std is 0.02673
CPU times: total: 594 ms
Wall time: 599 ms


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [32]:
%%time
forest_model = RandomForestClassifier(n_estimators=20, max_depth=14, random_state=21)
trained_and_evaluated(rfc, X_train, y_train)

train -  0.97197   |   valid -  0.85926
train -  0.95548   |   valid -  0.83704
train -  0.95548   |   valid -  0.88889
train -  0.95796   |   valid -  0.88889
train -  0.95466   |   valid -  0.85926
train -  0.94641   |   valid -  0.86667
train -  0.95218   |   valid -  0.89630
train -  0.95796   |   valid -  0.83704
train -  0.95387   |   valid -  0.92537
train -  0.95799   |   valid -  0.87313
Average accuracy on crossval is 0.87318
Std is 0.02607
CPU times: total: 266 ms
Wall time: 255 ms


In [34]:
%%time
forest_model = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=21)
trained_and_evaluated(forest_model, X_train, y_train)

train -  0.99670   |   valid -  0.88889
train -  0.99918   |   valid -  0.85926
train -  0.99670   |   valid -  0.95556
train -  0.99835   |   valid -  0.92593
train -  0.99753   |   valid -  0.93333
train -  0.99423   |   valid -  0.91111
train -  0.99670   |   valid -  0.95556
train -  0.99835   |   valid -  0.88148
train -  0.99835   |   valid -  0.94776
train -  0.99506   |   valid -  0.87313
Average accuracy on crossval is 0.91320
Std is 0.03386
CPU times: total: 609 ms
Wall time: 663 ms


In [40]:
%%time
forest_model = RandomForestClassifier(max_features='log2', criterion='entropy', n_estimators=20, max_depth=20, random_state=21)
trained_and_evaluated(forest_model, X_train, y_train)

train -  0.99588   |   valid -  0.89630
train -  0.99753   |   valid -  0.85926
train -  0.99670   |   valid -  0.93333
train -  0.99505   |   valid -  0.91852
train -  0.99670   |   valid -  0.90370
train -  0.99588   |   valid -  0.91852
train -  0.99753   |   valid -  0.93333
train -  0.99753   |   valid -  0.89630
train -  0.99835   |   valid -  0.94030
train -  0.99423   |   valid -  0.86567
Average accuracy on crossval is 0.90652
Std is 0.02643
CPU times: total: 281 ms
Wall time: 292 ms


In [41]:
%%time
forest_model = RandomForestClassifier(max_features='sqrt', criterion='entropy', n_estimators=20, max_depth=20, random_state=21)
trained_and_evaluated(forest_model, X_train, y_train)

train -  0.99340   |   valid -  0.87407
train -  0.99670   |   valid -  0.85185
train -  0.99835   |   valid -  0.93333
train -  0.99588   |   valid -  0.91852
train -  0.99423   |   valid -  0.91111
train -  0.99423   |   valid -  0.91111
train -  0.99588   |   valid -  0.94815
train -  0.99918   |   valid -  0.87407
train -  0.99835   |   valid -  0.94030
train -  0.99835   |   valid -  0.88806
Average accuracy on crossval is 0.90506
Std is 0.03032
CPU times: total: 297 ms
Wall time: 302 ms


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [22]:
forest_model = RandomForestClassifier(max_features='log2', criterion='entropy', n_estimators=20, max_depth=20, random_state=21)
forest_model.fit(X_train, y_train)
y_pred = forest_model.predict(X_test)

In [23]:
accuracy_score(y_test, y_pred)

0.9201183431952663

In [25]:
df_pred = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

In [26]:
df_pred['Error'] = (df_pred['Actual'] != df_pred['Predicted'])*1

error_summary = df_pred.groupby('Actual').agg(
    Total_Samples=('Error', 'size'),
    Total_Errors=('Error', 'sum')
)

error_summary['Error_percent'] = ((error_summary.Total_Errors/error_summary.Total_Samples)*100).round(2)

In [27]:
error_summary.sort_values(by='Error_percent', ascending=False)

Unnamed: 0_level_0,Total_Samples,Total_Errors,Error_percent
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,21,4,19.05
0,27,5,18.52
1,55,7,12.73
5,54,5,9.26
2,30,2,6.67
3,80,3,3.75
6,71,1,1.41


In [29]:
conf_matrix = confusion_matrix(df_pred['Actual'], df_pred['Predicted'])
print(conf_matrix)

[[22  1  0  0  0  1  3]
 [ 2 48  1  2  0  2  0]
 [ 1  0 28  1  0  0  0]
 [ 2  0  0 77  0  0  1]
 [ 0  0  0  0 17  4  0]
 [ 0  0  0  2  0 49  3]
 [ 0  0  0  1  0  0 70]]


In [30]:
joblib.dump(forest_model, '../model/forest_model_ex00.joblib')

['../model/forest_model_ex00.joblib']