# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [95]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from joblib import dump

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [96]:
df = pd.read_csv("../data/dayofweek-not-scaled.csv")

In [97]:
X = df.drop(['dayofweek'], axis=1)
y = df['dayofweek']

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,
                                                    random_state=21)

In [99]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train,
                                                    random_state=21)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [100]:
# Лучшие параметры для моделей
best_params_svm = {'kernel': 'rbf', 'C': 10.0, 'gamma': 'auto', 'probability': True, 'random_state': 21}
best_params_dt = {'max_depth': 17, 'random_state': 21}
best_params_rf = {'n_estimators': 100, 'max_depth': 28, 'random_state': 21}

# Инициализация моделей с лучшими параметрами
svm_model = SVC(**best_params_svm)
dt_model = DecisionTreeClassifier(**best_params_dt)
rf_model = RandomForestClassifier(**best_params_rf)

# Обучение моделей
svm_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Прогнозирование на валидационном наборе данных
y_val_pred_svm = svm_model.predict(X_valid)
y_val_pred_dt = dt_model.predict(X_valid)
y_val_pred_rf = rf_model.predict(X_valid)

# Оценка метрик для SVM
accuracy_svm = accuracy_score(y_valid, y_val_pred_svm)
precision_svm = precision_score(y_valid, y_val_pred_svm, average='weighted')
recall_svm = recall_score(y_valid, y_val_pred_svm, average='weighted')

print("SVM:")
print(f"accuracy is {accuracy_svm:.5f}")
print(f"precision is {precision_svm:.5f}")
print(f"recall is {recall_svm:.5f}")
print()

# Оценка метрик для Decision Tree
accuracy_dt = accuracy_score(y_valid, y_val_pred_dt)
precision_dt = precision_score(y_valid, y_val_pred_dt, average='weighted')
recall_dt = recall_score(y_valid, y_val_pred_dt, average='weighted')

print("Decision Tree:")
print(f"accuracy is {accuracy_dt:.5f}")
print(f"precision is {precision_dt:.5f}")
print(f"recall is {recall_dt:.5f}")
print()

# Оценка метрик для Random Forest
accuracy_rf = accuracy_score(y_valid, y_val_pred_rf)
precision_rf = precision_score(y_valid, y_val_pred_rf, average='weighted')
recall_rf = recall_score(y_valid, y_val_pred_rf, average='weighted')

print("Random Forest:")
print(f"accuracy is {accuracy_rf:.5f}")
print(f"precision is {precision_rf:.5f}")
print(f"recall is {recall_rf:.5f}")

SVM:
accuracy is 0.87778
precision is 0.88162
recall is 0.87778

Decision Tree:
accuracy is 0.84444
precision is 0.84596
recall is 0.84444

Random Forest:
accuracy is 0.88889
precision is 0.88940
recall is 0.88889


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [101]:
estimators=[
    ('svm', svm_model),
    ('dt', dt_model),
    ('rfc', rf_model)]
weights = [
    [1, 1, 1],  # Все модели имеют одинаковый вес
    [2, 1, 1],  # Первая модель имеет больший вес
    [1, 2, 1],  # Вторая модель имеет больший вес
    [1, 1, 2],   # Третья модель имеет больший вес
    [4, 1, 1],
    [1, 4, 1],
    [1, 1, 4],
    [4, 1, 4]

]

In [102]:
svm_model = SVC(**best_params_svm)
dt_model = DecisionTreeClassifier(**best_params_dt)
rf_model = RandomForestClassifier(**best_params_rf)

# Создание VotingClassifier
voting_clf = VotingClassifier(
    estimators=[
        ('svm', svm_model),
        ('dt', dt_model),
        ('rf', rf_model)
    ],
    voting='soft',  # Используем soft voting (вероятности)
    weights=[1, 1, 1]  # Веса моделей (можно менять)
)

# Обучение VotingClassifier
voting_clf.fit(X_train, y_train)

# Прогнозирование на валидационном наборе данных
y_val_pred_voting = voting_clf.predict(X_valid)

# Оценка метрик для VotingClassifier
accuracy_voting = accuracy_score(y_valid, y_val_pred_voting)
precision_voting = precision_score(y_valid, y_val_pred_voting, average='weighted')
recall_voting = recall_score(y_valid, y_val_pred_voting, average='weighted')

print("VotingClassifier:")
print(f"accuracy is {accuracy_voting:.5f}")
print(f"precision is {precision_voting:.5f}")
print(f"recall is {recall_voting:.5f}")
print()

# Подбор лучших весов
best_accuracy = 0
best_weights = None

weights_range = [
    [1, 1, 1],  # Все модели имеют одинаковый вес
    [2, 1, 1],  # Первая модель имеет больший вес
    [1, 2, 1],  # Вторая модель имеет больший вес
    [1, 1, 2],   # Третья модель имеет больший вес
    [4, 1, 1],
    [1, 4, 1],
    [1, 1, 4],
    [4, 1, 4]

]

# Перебор различных весов
for weights in weights_range:
    voting_clf.set_params(weights=weights)
    voting_clf.fit(X_train, y_train)
    y_val_pred = voting_clf.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_val_pred)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_weights = weights

# Обучение модели с лучшими весами
voting_clf.set_params(weights=best_weights)
voting_clf.fit(X_train, y_train)

# Прогнозирование на тестовом наборе данных
y_test_pred = voting_clf.predict(X_test)

# Оценка метрик на тестовом наборе данных
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')

print("Best VotingClassifier on Test Set:")
print(f"accuracy is {accuracy_test:.5f}")
print(f"precision is {precision_test:.5f}")
print(f"recall is {recall_test:.5f}")
print(f"Best param: {best_weights}")

VotingClassifier:
accuracy is 0.87407
precision is 0.87528
recall is 0.87407

Best VotingClassifier on Test Set:
accuracy is 0.90237
precision is 0.90544
recall is 0.90237
Best param: [4, 1, 4]


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [103]:
svc = SVC(C=10, gamma='auto', probability=True, random_state=21, kernel='rbf')
bc = BaggingClassifier(estimator=svc, n_estimators=10, random_state=21)
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.5f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.5f}")

Accuracy: 0.86391
Precision: 0.86966
Recall: 0.86391


In [104]:
param_grid = {'n_estimators': range(10, 60, 10),
               "max_features": [0.1, 0.5, 1.0],
               "max_samples": [0.1, 0.5, 1.0], 
               'random_state': [21]}

gs = GridSearchCV(bc, param_grid, scoring='accuracy', n_jobs=-1, verbose=2)
gs.fit(X_train, y_train)
print(f'Best param: {gs.best_params_}')
print(f'Best score: {gs.best_score_}')
y_pred = gs.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.5f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.5f}")

Fitting 5 folds for each of 45 candidates, totalling 225 fits


 0.38128768 0.39055556 0.37940138 0.36548665 0.38498708 0.39795866
 0.39518088 0.39704134 0.37104651 0.47216624 0.4638329  0.39426357
        nan        nan 0.58350129 0.59091731 0.58999569 0.58907407
 0.58255383 0.66602929 0.67347115 0.67255814 0.68367786 0.67812661
 0.40630922        nan        nan        nan        nan 0.69387166
 0.69944014 0.71429371 0.71429371 0.71521533 0.82093023 0.82650301
 0.82556417 0.82555986 0.83113264]


Best param: {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 50, 'random_state': 21}
Best score: 0.8311326442721791
Accuracy: 0.88462
Precision: 0.88941
Recall: 0.88462


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [105]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

In [106]:
estimators = [('svm', svm_model), ('dt', dt_model), ('rfc', rf_model)]
results = []

for n_splits in [2, 3, 4, 5, 6, 7]:
    for passthrough in [True, False]:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
        stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(solver='liblinear'), passthrough=passthrough)

        accuracy_scores = []
        precision_scores = []
        recall_scores = []

        for train_index, val_index in cv.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            stacking_model.fit(X_train_fold, y_train_fold)
            y_pred = stacking_model.predict(X_val_fold)

            accuracy_scores.append(accuracy_score(y_val_fold, y_pred))
            precision_scores.append(precision_score(y_val_fold, y_pred, average='weighted', zero_division=0)) # zero_division=0 чтобы избежать предупреждений
            recall_scores.append(recall_score(y_val_fold, y_pred, average='weighted'))

        avg_accuracy = np.mean(accuracy_scores)
        avg_precision = np.mean(precision_scores)
        avg_recall = np.mean(recall_scores)

        # results[(n_splits, passthrough)] = {
        #     'accuracy': avg_accuracy,
        #     'precision': avg_precision,
        #     'recall': avg_recall
        # }
        results.append((n_splits, passthrough, avg_accuracy))

In [107]:
print(results)

[(2, True, np.float64(0.8506493506493507)), (2, False, np.float64(0.8478664192949907)), (3, True, np.float64(0.8701201898277108)), (3, False, np.float64(0.8664087485814506)), (4, True, np.float64(0.886802973977695)), (4, False, np.float64(0.880314608288586)), (5, True, np.float64(0.8998062015503876)), (5, False, np.float64(0.8979586563307494)), (6, True, np.float64(0.8933116076970826)), (6, False, np.float64(0.8961049037864681)), (7, True, np.float64(0.8998144712430427)), (7, False, np.float64(0.9007421150278293))]


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [108]:
best_model = VotingClassifier(estimators=estimators, voting='soft', weights = [4, 1, 4]).fit(X_train, y_train)

In [109]:
df_forecast = pd.DataFrame({"predict":best_model.predict(X_test)}, index= y_test.index)
df_fit = pd.DataFrame({"predict":best_model.predict(X_train)}, index= y_train.index)
df['forecast'] = pd.concat([df_fit, df_forecast])

In [110]:
df['Error'] = (df["forecast"]!=df.dayofweek)*1
error_analysis = df.groupby(['dayofweek']).agg(
    total_samples=('Error', 'size'),
    total_errors=('Error', 'sum')
).reset_index()
error_analysis['perc_error'] = (error_analysis.total_errors/error_analysis.total_samples).round(2)

In [111]:
error_analysis.sort_values(by = 'perc_error', ascending=False)

Unnamed: 0,dayofweek,total_samples,total_errors,perc_error
0,0,136,32,0.24
5,5,271,52,0.19
4,4,104,20,0.19
6,6,356,64,0.18
1,1,274,50,0.18
2,2,149,26,0.17
3,3,396,68,0.17


In [112]:
((df[df.Error == 1].iloc[:,2:31].sum()/df.iloc[:,2:31].sum()).round(2)).sort_values(ascending=False)

uid_user_23    0.50
uid_user_6     0.33
uid_user_17    0.32
uid_user_18    0.29
uid_user_22    0.29
uid_user_16    0.28
uid_user_29    0.25
uid_user_15    0.24
uid_user_3     0.24
uid_user_14    0.23
uid_user_27    0.22
uid_user_4     0.21
uid_user_2     0.21
uid_user_20    0.19
uid_user_24    0.18
dayofweek      0.18
uid_user_26    0.17
uid_user_25    0.16
uid_user_31    0.15
uid_user_1     0.15
uid_user_10    0.14
uid_user_19    0.13
uid_user_28    0.13
uid_user_30    0.13
uid_user_12    0.12
uid_user_13    0.12
uid_user_21    0.09
uid_user_0     0.00
uid_user_11    0.00
dtype: float64

In [113]:
((df[df.Error == 1].iloc[:,33:-2].sum()/df.iloc[:,33:-2].sum()).round(2)).sort_values(ascending=False)

labname_lab03       1.00
labname_lab03s      1.00
labname_lab05s      0.25
labname_laba06      0.23
labname_laba04s     0.22
labname_laba04      0.21
labname_laba06s     0.20
labname_project1    0.18
labname_code_rvw    0.18
labname_laba05      0.16
labname_lab02       0.00
dtype: float64

In [114]:
dump(best_model, '../model/best_model_ex03.joblib')

['../model/best_model_ex03.joblib']