# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
import joblib

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [12]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
data_scaled = pd.read_csv("../data/dayofweek.csv")
df["dayofweek"] = data_scaled["dayofweek"]
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [13]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21, stratify=y, test_size=0.2)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [15]:
svc = SVC(C=10, gamma="auto", kernel="rbf", probability=True, random_state=21)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

print("accuracy is", round(accuracy_score(y_test, y_pred), 5))
print("precision is", round(precision_score(y_test, y_pred, average="weighted"), 5))
print("recall is", round(recall_score(y_test, y_pred, average="weighted"), 5))

accuracy is 0.88757
precision is 0.89267
recall is 0.88757


In [16]:
dtc = DecisionTreeClassifier(
    max_depth=21, class_weight="balanced", random_state=21, criterion="gini"
)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

print("accuracy is", round(accuracy_score(y_test, y_pred), 5))
print("precision is", round(precision_score(y_test, y_pred, average="weighted"), 5))
print("recall is", round(recall_score(y_test, y_pred, average="weighted"), 5))

accuracy is 0.88462
precision is 0.88765
recall is 0.88462


In [17]:
rfc = RandomForestClassifier(
    n_estimators=100, max_depth=24, random_state=21, criterion="entropy", class_weight="balanced"
)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

print("accuracy is", round(accuracy_score(y_test, y_pred), 5))
print("precision is", round(precision_score(y_test, y_pred, average="weighted"), 5))
print("recall is", round(recall_score(y_test, y_pred, average="weighted"), 5))

accuracy is 0.92604
precision is 0.92754
recall is 0.92604


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [18]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=21, stratify=y_train_val
)

In [19]:
vc = VotingClassifier(
    estimators=[("svc", svc), ("dtc", dtc), ("rfc", rfc)],
    voting="hard",
    weights=[1, 1, 1]
)

vc.fit(X_train, y_train)

y_pred_val = vc.predict(X_val)

accuracy = accuracy_score(y_val, y_pred_val)
precision = precision_score(y_val, y_pred_val, average="weighted")
recall = recall_score(y_val, y_pred_val, average="weighted")

print("accuracy:", round(accuracy, 5))
print("precision:", round(precision, 5))
print("recall:", round(recall, 5))

accuracy: 0.9
precision: 0.89993
recall: 0.9


In [20]:
params_combinations = []

for w1 in [1, 2, 3, 4, 5]:
    for w2 in [1, 2, 3, 4, 5]:
        for w3 in [1, 2, 3, 4, 5]:
            params_combinations.append({'weights': [w1, w2, w3], 'voting': 'soft'})  
            params_combinations.append({'weights': [w1, w2, w3], 'voting': 'hard'})  

best_val_accuracy = 0
best_val_precision = 0
best_params = None
best_model = None

for params in params_combinations:
    clf = VotingClassifier(
        estimators=[('svc', svc), ('dtc', dtc), ('rfc', rfc)],
        voting=params['voting'],
        weights=params['weights'],
        n_jobs=-1
    )
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)

    val_acc = accuracy_score(y_val, y_val_pred)
    val_prec = precision_score(y_val, y_val_pred, average='weighted')

    if val_acc > best_val_accuracy or (val_acc == best_val_accuracy and val_prec > best_val_precision):
        best_val_accuracy = val_acc
        best_val_precision = val_prec
        best_params = params
        best_model = clf

print("Лучшие параметры для валидных данных:", best_params)

Лучшие параметры для валидных данных: {'weights': [4, 1, 4], 'voting': 'soft'}


In [21]:
X_train_full = np.vstack((X_train, X_val))
y_train_full = np.hstack((y_train, y_val))
best_model.fit(X_train_full, y_train_full)
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')

print("\nТест Metrics для Best Model:")
print(f"Accuracy: {test_accuracy:.5f}")
print(f"Precision: {test_precision:.5f}")
print(f"Recall: {test_recall:.5f}")


Тест Metrics для Best Model:
Accuracy: 0.92899
Precision: 0.93151
Recall: 0.92899


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [22]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=21, stratify=y_train_val
)

In [23]:
n_estimators_options = [5, 10, 20, 50, 100]
results_n_estimators = []

for n_estimators in n_estimators_options:
    clf = BaggingClassifier(
        base_estimator=svc,
        n_estimators=n_estimators,
        random_state=21,
        n_jobs=-1
    )
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)

    val_acc = accuracy_score(y_val, y_val_pred)
    val_prec = precision_score(y_val, y_val_pred, average='weighted')
    val_rec = recall_score(y_val, y_val_pred, average='weighted')

    results_n_estimators.append({
        'n_estimators': n_estimators,
        'val_accuracy': val_acc,
        'val_precision': val_prec,
        'val_recall': val_rec
    })

df_n_estimators = pd.DataFrame(results_n_estimators)
print("Результаты по n_estimators:")
print(df_n_estimators)

Результаты по n_estimators:
   n_estimators  val_accuracy  val_precision  val_recall
0             5      0.857567       0.864003    0.857567
1            10      0.851632       0.860596    0.851632
2            20      0.863501       0.870604    0.863501
3            50      0.863501       0.870896    0.863501
4           100      0.860534       0.868602    0.860534


In [24]:
best_row = df_n_estimators.sort_values(by=["val_accuracy","val_precision"], ascending=False).iloc[0]
best_n_estimators = int(best_row["n_estimators"])
print("\nЛучший n_estimators:", best_n_estimators)


Лучший n_estimators: 50


In [25]:
max_samples_options = [0.5, 0.7, 1.0]
max_features_options = [0.5, 0.7, 1.0]
bootstrap_options = [True, False]

results_other = []

for max_samples in max_samples_options:
    for max_features in max_features_options:
        for bootstrap in bootstrap_options:
            clf = BaggingClassifier(
                base_estimator=svc,
                n_estimators=best_n_estimators,
                max_samples=max_samples,
                max_features=max_features,
                bootstrap=bootstrap,
                random_state=21,
                n_jobs=-1
            )
            clf.fit(X_train, y_train)
            y_val_pred = clf.predict(X_val)

            val_acc = accuracy_score(y_val, y_val_pred)
            val_prec = precision_score(y_val, y_val_pred, average='weighted')
            
            results_other.append({
                'max_samples': max_samples,
                'max_features': max_features,
                'bootstrap': bootstrap,
                'val_accuracy': val_acc,
                'val_precision': val_prec
            })

df_other = pd.DataFrame(results_other)
print("\nРезультаты по max_samples, max_features, bootstrap:")
print(df_other)


Результаты по max_samples, max_features, bootstrap:
    max_samples  max_features  bootstrap  val_accuracy  val_precision
0           0.5           0.5       True      0.646884       0.741657
1           0.5           0.5      False      0.670623       0.728694
2           0.5           0.7       True      0.727003       0.758283
3           0.5           0.7      False      0.771513       0.803420
4           0.5           1.0       True      0.780415       0.813091
5           0.5           1.0      False      0.839763       0.850355
6           0.7           0.5       True      0.667656       0.747808
7           0.7           0.5      False      0.721068       0.765486
8           0.7           0.7       True      0.759644       0.782360
9           0.7           0.7      False      0.810089       0.829995
10          0.7           1.0       True      0.833828       0.850388
11          0.7           1.0      False      0.851632       0.862143
12          1.0           0.5       T

In [26]:
best_row_other = df_other.sort_values(by=["val_accuracy","val_precision"], ascending=False).iloc[0]
best_other = {
    "max_samples": best_row_other["max_samples"],
    "max_features": best_row_other["max_features"],
    "bootstrap": best_row_other["bootstrap"]
    }
print("\nЛучшие параметры:", best_other)


Лучшие параметры: {'max_samples': 1.0, 'max_features': 1.0, 'bootstrap': False}


In [27]:
best_row_other = df_other.sort_values(by=["val_accuracy","val_precision"], ascending=False).iloc[0]
best_params = {
    "n_estimators": best_n_estimators,
    "max_samples": best_row_other["max_samples"],
    "max_features": best_row_other["max_features"],
    "bootstrap": best_row_other["bootstrap"]
}
print("\nЛучшие параметры:", best_params)


Лучшие параметры: {'n_estimators': 50, 'max_samples': 1.0, 'max_features': 1.0, 'bootstrap': False}


In [28]:
X_train_full = np.vstack((X_train, X_val))
y_train_full = np.hstack((y_train, y_val))

best_model = BaggingClassifier(
    base_estimator=svc,
    n_estimators=best_params["n_estimators"],
    max_samples=best_params["max_samples"],
    max_features=best_params["max_features"],
    bootstrap=best_params["bootstrap"],
    random_state=21,
    n_jobs=-1
)
best_model.fit(X_train_full, y_train_full)
y_test_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')

print("\nТест Metrics для Best Model:")
print(f"Accuracy:  {test_accuracy:.5f}")
print(f"Precision: {test_precision:.5f}")
print(f"Recall:    {test_recall:.5f}")


Тест Metrics для Best Model:
Accuracy:  0.89645
Precision: 0.90063
Recall:    0.89645


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [29]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=21, stratify=y_train_val
)

In [30]:
base_models = [
    ('svc', svc),
    ('dtc', dtc),
    ('rfc', rfc)
]

n_splits_options = [2, 3, 4, 5, 6, 7]
passthrough_options = [True, False]

results_stacking = []

for n_splits in n_splits_options:
        
    for passthrough in passthrough_options:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
        clf = StackingClassifier(
            estimators=base_models,
            final_estimator=LogisticRegression(solver='liblinear'),
            cv=skf,
            passthrough=passthrough,
            n_jobs=-1
        )
        clf.fit(X_train, y_train)
        y_val_pred = clf.predict(X_val)

        val_acc = accuracy_score(y_val, y_val_pred)
        val_prec = precision_score(y_val, y_val_pred, average='weighted')
        val_rec = recall_score(y_val, y_val_pred, average='weighted')

        results_stacking.append({
            'n_splits': n_splits,
            'passthrough': passthrough,
            'val_accuracy': val_acc,
            'val_precision': val_prec,
            'val_recall': val_rec
        })

df_stacking = pd.DataFrame(results_stacking)
print("Результаты StackingClassifier:")
print(df_stacking)

Результаты StackingClassifier:
    n_splits  passthrough  val_accuracy  val_precision  val_recall
0          2         True      0.903704       0.905080    0.903704
1          2        False      0.896296       0.896784    0.896296
2          3         True      0.903704       0.906322    0.903704
3          3        False      0.896296       0.897592    0.896296
4          4         True      0.911111       0.913269    0.911111
5          4        False      0.903704       0.905703    0.903704
6          5         True      0.900000       0.902167    0.900000
7          5        False      0.900000       0.900558    0.900000
8          6         True      0.903704       0.904500    0.903704
9          6        False      0.903704       0.904365    0.903704
10         7         True      0.903704       0.906397    0.903704
11         7        False      0.903704       0.905376    0.903704


In [31]:
best_row = df_stacking.sort_values(by=['val_accuracy','val_precision'], ascending=False).iloc[0]
best_params = {
    'n_splits': best_row['n_splits'],
    'passthrough': best_row['passthrough']
}
print("\nЛучшие параметры StackingClassifier:", best_params)


Лучшие параметры StackingClassifier: {'n_splits': 4, 'passthrough': True}


In [32]:
X_train_full = np.vstack((X_train, X_val))
y_train_full = np.hstack((y_train, y_val))

skf_best = StratifiedKFold(n_splits=int(best_params['n_splits']), shuffle=True, random_state=21)
best_stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(solver='liblinear'),
    cv=skf_best,
    passthrough=best_params['passthrough'],
    n_jobs=-1
)
best_stacking_model.fit(X_train_full, y_train_full)

y_test_pred = best_stacking_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')

print("\nТест Metrics для Best Stacking Model:")
print(f"Accuracy:  {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall:    {test_recall:.4f}")


Тест Metrics для Best Stacking Model:
Accuracy:  0.9408
Precision: 0.9425
Recall:    0.9408


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [33]:
best_model = StackingClassifier(
    estimators=[('svс', svc), ('dtc', dtc), ('rfc', rfc)],
    final_estimator=LogisticRegression(solver='liblinear'),
    cv=StratifiedKFold(n_splits=6, shuffle=True, random_state=21),
    passthrough=True
)
best_model.fit(X_train_full, y_train_full)
y_test_pred = best_model.predict(X_test)

print(f"accuracy is {accuracy_score(y_test, y_test_pred):.5f}")
print(f"precision is {precision_score(y_test, y_test_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_test_pred, average='weighted'):.5f}")

accuracy is 0.93787
precision is 0.93926
recall is 0.93787


In [34]:
df['predicted'] = best_model.predict(X)
df['is_error'] = (df['dayofweek'] != df['predicted']).astype(int)
error_by_weekday = df.groupby('dayofweek')['is_error'].mean() * 100
print("Ошибки по дням недели (%):")
print(error_by_weekday.sort_values(ascending=False))

Ошибки по дням недели (%):
dayofweek
0    3.676471
1    2.554745
3    1.515152
5    1.476015
2    1.342282
4    0.961538
6    0.280899
Name: is_error, dtype: float64


In [35]:
lab_columns = [col for col in df.columns if col.startswith('labname_')]
if lab_columns:
    df['labname'] = df[lab_columns].idxmax(axis=1).str.replace('labname_', '')
    error_by_lab = df.groupby('labname')['is_error'].mean() * 100
    print("\nОшибки по лабораториям (%):")
    print(error_by_lab.sort_values(ascending=False).head(10))
else:
    print("Нет столбцов с labname_")


Ошибки по лабораториям (%):
labname
lab03       100.000000
laba06        4.166667
laba04        3.932584
code_rvw      3.658537
laba06s       3.278689
lab05s        2.777778
laba04s       1.923077
project1      0.841220
laba05        0.000000
lab03s        0.000000
Name: is_error, dtype: float64


In [36]:
uid_columns = [col for col in df.columns if col.startswith('uid_')]

if uid_columns:
    df['uid'] = df[uid_columns].idxmax(axis=1).str.replace('uid_', '')

    error_by_user = df.groupby('uid')['is_error'].mean() * 100
    print("\nОшибки по пользователям (%):")
    print(error_by_user.sort_values(ascending=False).head(10))
else:
    print("Нет столбцов с uid_")


Ошибки по пользователям (%):
uid
user_22    14.285714
user_17    11.764706
user_6      8.333333
user_29     4.687500
user_19     4.395604
user_16     3.125000
user_18     2.857143
user_3      2.816901
user_30     2.564103
user_1      2.173913
Name: is_error, dtype: float64


In [37]:
joblib.dump(best_model, 'best_model.joblib')
print("Модель сохранена как 'best_model.joblib'")

Модель сохранена как 'best_model.joblib'
