# Day 09. Exercise 02
# Metrics

## 0. Imports

In [58]:
import pandas as pd
import numpy as np
import joblib

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

## 1. Preprocessing

1. Создайте тот же кадр данных, что и в предыдущем упражнении.
2. Используя `train_test_split` с параметрами `test_size=0.2`, `random_state=21`, получите `X_train`, `y_train`, `X_test`, `y_test`. Используйте дополнительный параметр `stratify`.

In [60]:
# Load the feature table, then copy the `dayofweek` column over from the
# second file (pandas aligns the assignment on the row index).
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df1 = pd.read_csv('../data/dayofweek.csv')
df['dayofweek'] = df1['dayofweek']

# Target first, then every remaining column as the feature matrix.
y = df['dayofweek']
x = df.drop(columns=['dayofweek'])

# 80/20 split, stratified on the target, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

## 2. SVM

1. Используйте лучшие параметры из предыдущего упражнения и обучите модель SVM.
2. Необходимо вычислить `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` и `recall` должны быть рассчитаны для каждого класса (используйте `average='weighted'`)
 - `ROC AUC` должен быть рассчитан для каждого класса относительно любого другого класса (все возможные парные комбинации), а затем для итоговой метрики должно быть применено средневзвешенное значение
 - код в ячейке должен отображать результат, как показано ниже:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [61]:
# Best SVM hyper-parameters, per the instructions taken from the previous exercise.
best_params = dict(C=10, class_weight=None, gamma='auto', kernel='rbf')

# probability=True is required so that predict_proba is available for ROC AUC.
model = SVC(**best_params, probability=True, random_state=21).fit(x_train, y_train)
y_pred = model.predict(x_test)

In [62]:
# Emit the four metrics as one newline-separated report.
print(
    f"accuracy is {accuracy_score(y_test, y_pred):.5f}",
    f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}",
    f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}",
    # One-vs-one ROC AUC over the predicted class probabilities.
    f"roc_auc is {roc_auc_score(y_test, model.predict_proba(x_test), average='weighted', multi_class='ovo'):.5f}",
    sep="\n",
)

accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878


## 3. Decision tree

1. Аналогичная задача для дерева решений

In [63]:
# Decision-tree hyper-parameters used for this section.
best_params = dict(class_weight='balanced', criterion='gini', max_depth=np.int64(22))

# Fit on the training split and predict the held-out split.
model = DecisionTreeClassifier(**best_params, random_state=21).fit(x_train, y_train)
y_pred = model.predict(x_test)

In [64]:
# Emit the four metrics as one newline-separated report.
print(
    f"accuracy is {accuracy_score(y_test, y_pred):.5f}",
    f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}",
    f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}",
    # One-vs-one ROC AUC over the predicted class probabilities.
    f"roc_auc is {roc_auc_score(y_test, model.predict_proba(x_test), average='weighted', multi_class='ovo'):.5f}",
    sep="\n",
)

accuracy is 0.89053
precision is 0.89262
recall is 0.89053
roc_auc is 0.93664


## 4. Random forest

1. Аналогичная задача для случайного леса.

In [65]:
# Random-forest hyper-parameters used for this section.
best_params = dict(
    class_weight=None,
    criterion='gini',
    max_depth=np.int64(28),
    n_estimators=50,
)

# Fit on the training split and predict the held-out split.
model = RandomForestClassifier(**best_params, random_state=21).fit(x_train, y_train)
y_pred = model.predict(x_test)

In [66]:
# Emit the four metrics as one newline-separated report.
print(
    f"accuracy is {accuracy_score(y_test, y_pred):.5f}",
    f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}",
    f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}",
    # One-vs-one ROC AUC over the predicted class probabilities.
    f"roc_auc is {roc_auc_score(y_test, model.predict_proba(x_test), average='weighted', multi_class='ovo'):.5f}",
    sep="\n",
)

accuracy is 0.92899
precision is 0.93009
recall is 0.92899
roc_auc is 0.99033


## 5. Predictions

1. Выберите лучшую модель.
2. Проанализируйте: для какого `weekday` ваша модель делает больше всего ошибок (в % от общего числа образцов этого класса в вашем полном наборе данных), для какого `labname` и для каких `users`.
3. Сохраните модель.

In [67]:
def error_rate(cm, day_i, y_true=None, y_hat=None):
    """Print per-class error rates (in %) derived from the confusion matrix.

    The error rate of a class is the share of its true samples that were
    predicted as any other class, i.e. ``(row sum - diagonal) / row sum``
    of the confusion matrix (rows are true labels in scikit-learn).

    Parameters
    ----------
    cm : bool
        When truthy, the labelled confusion matrix is printed first.
    day_i : sequence of labels or None
        Classes to report. ``None`` or an empty sequence reports every
        class present in ``y_true``.
    y_true : array-like, optional
        True labels. Defaults to the notebook-level ``y_test``.
    y_hat : array-like, optional
        Predicted labels. Defaults to the notebook-level ``y_pred``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Fall back to the notebook globals only when the caller gave nothing,
    # so existing two-argument calls behave exactly as before.
    if y_true is None:
        y_true = y_test
    if y_hat is None:
        y_hat = y_pred

    # Compute the label set once and reuse it for the matrix and both axes.
    labels = np.unique(y_true)
    cm_df = pd.DataFrame(
        confusion_matrix(y_true, y_hat, labels=labels),
        index=labels,
        columns=labels,
    )

    if cm:
        print(f'Confusion Matrix:\n{cm_df}')

    print('Error rates:')
    if day_i is None or len(day_i) == 0:
        day_i = cm_df.index

    for i in day_i:
        total = cm_df.loc[i].sum()        # all true samples of class i
        wrong = total - cm_df.loc[i, i]   # those not on the diagonal
        print(f'day_{i}: Error Rate: {wrong / total * 100:.5f} %')

def day_error(day_i, data=None, y_true=None, y_hat=None):
    """Print every misclassified sample of the requested classes.

    For each wrong prediction one line is printed with the true label, the
    predicted label and the names of the active (non-zero) ``uid_*`` and
    ``labname_*`` columns of the corresponding row in ``data``.

    The columns are selected by prefix rather than by position among the
    non-zero columns: a positional lookup shifts whenever a column that
    precedes the one-hot block holds a zero, which reports the wrong
    column or raises ``IndexError``.

    Parameters
    ----------
    day_i : sequence of labels or None
        Classes to inspect. ``None`` or an empty sequence inspects every
        class present in ``y_true``.
    data : pandas.DataFrame, optional
        Frame holding the one-hot ``uid_*`` / ``labname_*`` columns, indexed
        by the same row labels as ``y_true``. Defaults to the notebook-level
        ``df``.
    y_true : pandas.Series, optional
        True labels; its index is used to look rows up in ``data``.
        Defaults to the notebook-level ``y_test``.
    y_hat : array-like, optional
        Predicted labels, positionally aligned with ``y_true``. Defaults to
        the notebook-level ``y_pred``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Fall back to the notebook globals only when the caller gave nothing,
    # so existing one-argument calls keep working.
    if data is None:
        data = df
    if y_true is None:
        y_true = y_test
    if y_hat is None:
        y_hat = y_pred

    print('Day errors:')
    # The frame inherits y_true's index, so each row label points back into
    # `data`; np.asarray keeps the predictions positionally aligned.
    y_t_p = pd.DataFrame({'y_test': y_true, 'y_pred': np.asarray(y_hat)})

    if day_i is None or len(day_i) == 0:
        day_i = np.unique(y_true)

    for i in day_i:
        missed = y_t_p[(y_t_p['y_test'] == i) & (y_t_p['y_pred'] != i)]

        for row_label, predicted in missed['y_pred'].items():
            sample = data.loc[row_label]
            active = sample[sample != 0].index
            # First active column carrying the expected one-hot prefix.
            user = next((col for col in active if str(col).startswith('uid_')), 'unknown')
            lab = next((col for col in active if str(col).startswith('labname_')), 'unknown')
            print(f'True dayofweek: {i} | Pred dayofweek: {predicted} | User UID: {user} | Project Name: {lab}')

In [68]:
# Print the confusion matrix and the error rates, then list each
# misclassified test sample, for days 5 and 6 only.
# NOTE(review): the task asks which weekday has the most errors; passing an
# empty list to both helpers would cover every weekday. Confirm that the
# restriction to [5, 6] is intentional.
error_rate(True, [5, 6])
day_error([5, 6])

Confusion Matrix:
    0   1   2   3   4   5   6
0  20   0   0   1   1   1   4
1   2  49   1   1   0   1   1
2   0   0  28   2   0   0   0
3   1   0   1  78   0   0   0
4   0   0   0   0  18   3   0
5   0   0   0   2   0  51   1
6   0   0   0   0   0   1  70
Error rates:
day_5: Error Rate: 5.55556 %
day_6: Error Rate: 1.40845 %
Day errors:
True dayofweek: 5 | Pred dayofweek: 6 | User UID: uid_user_3 | Project Name: labname_project1
True dayofweek: 5 | Pred dayofweek: 3 | User UID: uid_user_19 | Project Name: labname_laba04
True dayofweek: 5 | Pred dayofweek: 3 | User UID: uid_user_31 | Project Name: labname_project1
True dayofweek: 6 | Pred dayofweek: 5 | User UID: uid_user_27 | Project Name: labname_laba04


In [69]:
# Persist the currently bound `model` to 'model.pkl' in the working directory.
# NOTE(review): this relies on `model` still being the estimator chosen as
# best (the last one fitted above) — confirm the cell order before re-running.
joblib.dump(model, 'model.pkl')

['model.pkl']

## 6. Function

1. Напишите функцию, которая принимает список различных моделей и соответствующий список параметров (dicts) и возвращает dict, содержащий все 4 метрики для каждой модели.

In [70]:
def metrics_calculation(class_models, best_params):
    """Fit each model and collect accuracy, precision, recall and ROC AUC.

    Every class in ``class_models`` is instantiated with the matching dict
    from ``best_params``, fitted on the notebook-level ``x_train`` /
    ``y_train`` and evaluated on ``x_test`` / ``y_test``.

    Parameters
    ----------
    class_models : sequence of estimator classes
        Classes (not instances) exposing ``fit``, ``predict`` and
        ``predict_proba``.
    best_params : sequence of dict
        Keyword arguments for the class at the same position.

    Returns
    -------
    dict
        ``{class name: {'accuracy', 'precision', 'recall', 'roc_auc'}}``
        with plain Python floats. Results are keyed by ``__name__``, so
        passing the same class twice keeps only the last entry.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    if len(class_models) != len(best_params):
        raise ValueError(
            f'class_models has {len(class_models)} entries but '
            f'best_params has {len(best_params)}'
        )

    result = {}
    for model_cls, params in zip(class_models, best_params):
        model = model_cls(**params)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        y_proba = model.predict_proba(x_test)

        # float() makes every metric a plain Python float regardless of
        # whether the scorer hands back a numpy scalar.
        result[model_cls.__name__] = {
            'accuracy': float(accuracy_score(y_test, y_pred)),
            'precision': float(precision_score(y_test, y_pred, average='weighted')),
            'recall': float(recall_score(y_test, y_pred, average='weighted')),
            'roc_auc': float(roc_auc_score(y_test, y_proba, average='weighted', multi_class='ovo')),
        }

    return result

In [71]:
# Estimator classes and, position by position, the keyword arguments for each.
class_models = [SVC, DecisionTreeClassifier, RandomForestClassifier]
best_params = [
    dict(random_state=21, probability=True, C=10, class_weight=None, gamma='auto', kernel='rbf'),
    dict(random_state=21, class_weight='balanced', criterion='gini', max_depth=np.int64(22)),
    dict(random_state=21, class_weight=None, criterion='gini', max_depth=np.int64(28), n_estimators=50),
]

result = metrics_calculation(class_models, best_params)
result

{'SVC': {'accuracy': 0.8875739644970414,
  'precision': 0.8926729169690374,
  'recall': 0.8875739644970414,
  'roc_auc': 0.9787793228216216},
 'DecisionTreeClassifier': {'accuracy': 0.8905325443786982,
  'precision': 0.8926192681313897,
  'recall': 0.8905325443786982,
  'roc_auc': 0.9366351447213223},
 'RandomForestClassifier': {'accuracy': 0.9289940828402367,
  'precision': 0.9300865038851309,
  'recall': 0.9289940828402367,
  'roc_auc': 0.9903274757720744}}