# Day 09. Exercise 02
# Metrics

## 0. Imports

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,roc_auc_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [18]:
df = pd.read_csv('../../datasets/day-of-week-not-scaled.csv')
day = pd.read_csv('../../datasets/dayofweek.csv')
df['dayofweek'] = day['dayofweek']
df

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3


In [19]:
X = df.drop('dayofweek',axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, stratify=y, random_state=21)

## 2. SVM

1. Use the best parameters from the previous exercise and train the model of SVM.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [20]:
svc = SVC(C=10,class_weight=None,gamma='auto', kernel='rbf', probability=True, random_state=21)
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)
print(f'accuracy is {accuracy_score(y_test, y_pred):.5f}')
print(f'precision is {precision_score(y_test, y_pred, average='weighted'):.5f}')
print(f'recall is {recall_score(y_test, y_pred, average='weighted'):.5f}')
proba = svc.predict_proba(X_test)
print(f'roc_auc is {roc_auc_score(y_test, proba, average='weighted', multi_class='ovo'):.5f}')

accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878


## 3. Decision tree

1. The same task for decision tree

In [21]:
dt = DecisionTreeClassifier(class_weight='balanced',criterion='gini', max_depth=22, random_state=21)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print(f'accuracy is {accuracy_score(y_test, y_pred):.5f}')
print(f'precision is {precision_score(y_test, y_pred, average='weighted'):.5f}')
print(f'recall is {recall_score(y_test, y_pred, average='weighted'):.5f}')
proba = dt.predict_proba(X_test)
print(f'roc_auc is {roc_auc_score(y_test, proba, average='weighted', multi_class='ovo'):.5f}')

accuracy is 0.89053
precision is 0.89262
recall is 0.89053
roc_auc is 0.93664


## 4. Random forest

1. The same task for random forest.

In [22]:
rf = RandomForestClassifier(class_weight=None,criterion='gini', max_depth=28,n_estimators=50, random_state=21)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
print(f'accuracy is {accuracy_score(y_test, y_pred):.5f}')
print(f'precision is {precision_score(y_test, y_pred, average='weighted'):.5f}')
print(f'recall is {recall_score(y_test, y_pred, average='weighted'):.5f}')
proba = rf.predict_proba(X_test)
print(f'roc_auc is {roc_auc_score(y_test, proba, average='weighted', multi_class='ovo'):.5f}')

accuracy is 0.92899
precision is 0.93009
recall is 0.92899
roc_auc is 0.99033


## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which `labname` and for which `users`.
3. Save the model.

In [23]:
rf = RandomForestClassifier(class_weight=None,criterion='gini', max_depth=28,n_estimators=50, random_state=21)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)


In [24]:
Xd = X_test.copy()
Xd['real'] = y_test
Xd['error'] = Xd['real'] != y_pred
days = (Xd.groupby('real')['error'].mean().sort_values(ascending=False))*100
days.head(7)

real
0    25.925926
4    14.285714
1    10.909091
2     6.666667
5     5.555556
3     2.500000
6     1.408451
Name: error, dtype: float64

In [25]:
labs = [col for col in Xd.columns if col.startswith('labname_')]
labname = []
for lab in labs:
    labval = Xd[Xd[lab] == 1]
    laberr = labval['error'].mean()*100
    labname.append({'labname': lab,
                    'err_rate': laberr})
labname_df = pd.DataFrame(labname).sort_values('err_rate', ascending=False)
labname_df.head(5)
    

Unnamed: 0,labname,err_rate
2,labname_lab03,100.0
3,labname_lab03s,100.0
8,labname_laba06,22.222222
5,labname_laba04,17.142857
4,labname_lab05s,16.666667


In [26]:
users = [col for col in Xd.columns if col.startswith('uid_')]
username = []
for user in users:
    userval = Xd[Xd[user] == 1]
    usererr = userval['error'].mean()*100
    username.append({'labname': user,
                    'err_rate': usererr})
username_df = pd.DataFrame(username).sort_values('err_rate', ascending=False)
username_df.head(5)

Unnamed: 0,labname,err_rate
15,uid_user_22,100.0
8,uid_user_16,40.0
27,uid_user_6,25.0
11,uid_user_19,21.052632
20,uid_user_27,16.666667


In [30]:
joblib.dump(rf,'model.joblib')

['model.joblib']

## 6. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.

In [28]:
def models_metrics(models, params):
    res= []
    for model, param in zip(models, params):
        model.set_params(**param)
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        proba = model.predict_proba(X_test)
        metrics = [{'accuracy': accuracy_score(y_test,y_pred),
                   'precision': precision_score(y_test, y_pred, average='weighted'),
                   'recall': recall_score(y_test, y_pred, average='weighted'),
                   'roc_auc': roc_auc_score(y_test, proba, average='weighted',multi_class='ovo')}]
        res.append(metrics)
    return res

In [29]:

models = [SVC(),DecisionTreeClassifier(),RandomForestClassifier()]
metrics = [{'C': 10,'class_weight': None,'gamma': 'auto', 'kernel': 'rbf', 'probability': True},
           {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22, 'random_state': 21},
           {'class_weight': None, 'criterion': 'gini', 'max_depth': 28,'n_estimators': 50, 'random_state': 21}]
models_metrics(models,metrics)

[[{'accuracy': 0.8875739644970414,
   'precision': 0.8926729169690374,
   'recall': 0.8875739644970414,
   'roc_auc': np.float64(0.9785659753794684)}],
 [{'accuracy': 0.8905325443786982,
   'precision': 0.8926192681313897,
   'recall': 0.8905325443786982,
   'roc_auc': np.float64(0.9366351447213223)}],
 [{'accuracy': 0.9289940828402367,
   'precision': 0.9300865038851309,
   'recall': 0.9289940828402367,
   'roc_auc': np.float64(0.9903274757720744)}]]