# Day 09. Exercise 02
# Metrics

## 0. Imports

In [292]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import numpy as np
from joblib import dump

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [293]:
df = pd.read_csv("../data/day-of-week-not-scaled.csv")
df["dayofweek"] = pd.read_csv("../data/dayofweek.csv", usecols=["dayofweek"])

x = df.drop(columns="dayofweek")
y = df["dayofweek"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21, stratify=y)

## 2. SVM

1. Use the best parameters from the previous exercise and train the model of SVM.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [294]:
model = SVC(random_state=21, C=10, class_weight=None, gamma="auto", kernel="rbf", probability=True)
model.fit(x_train, y_train)
predict = model.predict(x_test)
print(f"accuracy is {accuracy_score(y_test, predict):.5f}")
print(f"precision is {precision_score(y_test, predict, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, predict, average='weighted'):.5f}")

predict_proba = model.predict_proba(x_test)
print(f"roc_auc is {roc_auc_score(y_test, predict_proba, multi_class='ovo', average='weighted'):.5f}")

accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878


## 3. Decision tree

1. The same task for decision tree

In [295]:
model = DecisionTreeClassifier(random_state=21, class_weight="balanced", criterion="gini", max_depth=21)
model.fit(x_train, y_train)
predict = model.predict(x_test)
print(f"accuracy is {accuracy_score(y_test, predict):.5f}")
print(f"precision is {precision_score(y_test, predict, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, predict, average='weighted'):.5f}")

predict_proba = model.predict_proba(x_test)
print(f"roc_auc is {roc_auc_score(y_test, predict_proba, multi_class='ovo', average='weighted'):.5f}")

accuracy is 0.88462
precision is 0.88765
recall is 0.88462
roc_auc is 0.93528


## 4. Random forest

1. The same task for random forest.

In [296]:
model = RandomForestClassifier(random_state=21, class_weight="balanced", criterion="entropy", max_depth=24, n_estimators=100)
model.fit(x_train, y_train)
predict = model.predict(x_test)
print(f"accuracy is {accuracy_score(y_test, predict):.5f}")
print(f"precision is {precision_score(y_test, predict, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, predict, average='weighted'):.5f}")

predict_proba = model.predict_proba(x_test)
print(f"roc_auc is {roc_auc_score(y_test, predict_proba, multi_class='ovo', average='weighted'):.5f}")

accuracy is 0.92604
precision is 0.92754
recall is 0.92604
roc_auc is 0.98939


## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which `labname` and for which `users`.
3. Save the model.

In [306]:
model = RandomForestClassifier(random_state=21, class_weight="balanced", criterion="entropy", max_depth=24, n_estimators=100)
model.fit(x_train, y_train)
predict = model.predict(x)
df["predict"] = predict

In [307]:
df[df.predict != df.dayofweek].dayofweek.value_counts() / len(predict)

0    0.003559
5    0.002966
1    0.002372
4    0.001779
3    0.001779
6    0.001186
2    0.001186
Name: dayofweek, dtype: float64

In [308]:
error = df[df["predict"] != df["dayofweek"]]
users = [i for i in df.columns if i.startswith("uid_")]
labnames = [i for i in df.columns if i.startswith("labname_")]

In [309]:
max_error = 0
max_user = ''
for user in users:
    error_perc = error[user].sum() / len(predict)
    if error_perc > max_error:
        max_error = error_perc
        max_user = user
print(f"max user error: {max_user}, error percent: {(max_error * 100):.3f}%")

max user error: uid_user_2, error percent: 0.178%


In [310]:
max_error = 0
max_user = ''
for lab in labnames:
    error_perc = error[lab].sum() / len(predict)
    if error_perc > max_error:
        max_error = error_perc
        max_lab = lab
print(f"max lab error: {max_lab}, error percent: {(max_error * 100):.3f}%")

max lab error: labname_project1, error percent: 0.593%


In [311]:
dump(model, "model.joblib")

['model.joblib']

## 6. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.

In [303]:
def list_models(models, params):
    results = {}
    for model_, param in zip(models, params):

        model = model_(**param)
        model.fit(x_train, y_train)
        predict = model.predict(x_test)

        predict_proba = model.predict_proba(x_test)

        results[model.__class__.__name__] = {
            "accuracy": accuracy_score(y_test, predict),
            "precision": precision_score(y_test, predict, average="weighted"),
            "recall": recall_score(y_test, predict, average="weighted"),
            "roc_auc": roc_auc_score(y_test, predict_proba, multi_class="ovo", average="weighted")
        }
    
    return results

In [304]:
models = [RandomForestClassifier, SVC, DecisionTreeClassifier]
params = [
    {"class_weight": "balanced",
        "criterion": "entropy",
        "max_depth": 24,
        "n_estimators": 100, "random_state": 21},
    {"C": 10, "class_weight": None, "gamma": "auto", "kernel": "rbf", "probability": True, "random_state": 21},
    {"class_weight": "balanced",
    "criterion": "gini",
    "max_depth": 21,
    "random_state": 21}
]

list_models(models, params)

{'RandomForestClassifier': {'accuracy': 0.9260355029585798,
  'precision': 0.9275374670957044,
  'recall': 0.9260355029585798,
  'roc_auc': 0.9893851880258296},
 'SVC': {'accuracy': 0.8875739644970414,
  'precision': 0.8926729169690374,
  'recall': 0.8875739644970414,
  'roc_auc': 0.9787793228216216},
 'DecisionTreeClassifier': {'accuracy': 0.8846153846153846,
  'precision': 0.8876518218623483,
  'recall': 0.8846153846153846,
  'roc_auc': 0.935280206669359}}