## Day 14. Task 04
## Ансамбли
### 0. Импорты

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

### 1. Препроцессинг
 
1. Загрузите снова тот же датафрейм, как и в прошлом задании.
2. Воспользуйтесь train_test_split с параметрами test_size=0.2, random_state=21 и получите X_train, y_train, X_test, y_test. Используйте дополнительный параметр stratify. А затем из полученных X_train, y_train сделайте снова сплит на X_train, y_train, X_valid, y_valid. Таким образом у вас появятся три датасета: обучающая выборка, валидационная выборка и тестовая выборка.

In [3]:
# Load the CSV and separate it into the target y (the 'dayofweek' column)
# and the feature matrix X (all remaining columns).
df = pd.read_csv('day-of-week-not-scaled.csv')
y = df['dayofweek']
X = df.drop('dayofweek', axis=1)

In [4]:
# First split: hold out 20% of the rows as the test set, stratified on the
# target so that both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

In [5]:
# Class shares in the training part after the first split (most frequent first).
y_train.value_counts(normalize=True, sort=True, ascending=False)

3    0.234421
6    0.211424
1    0.162463
5    0.160979
2    0.088279
0    0.080861
4    0.061573
Name: dayofweek, dtype: float64

In [6]:
# Class shares in the held-out test part; they should mirror the training shares
# because the split was stratified.
y_test.value_counts(normalize=True, sort=True, ascending=False)

3    0.236686
6    0.210059
1    0.162722
5    0.159763
2    0.088757
0    0.079882
4    0.062130
Name: dayofweek, dtype: float64

In [7]:
# Second split: carve a stratified 20% validation set out of the training part.
# NOTE(review): this cell overwrites X_train / y_train with the smaller split,
# so running it twice shrinks the training set again — restart the kernel and
# run all cells in order to reproduce the numbers below.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=21,
)

In [8]:
# Class shares in the final training part (after the validation rows were removed).
y_train.value_counts(normalize=True, sort=True, ascending=False)

3    0.234694
6    0.211503
1    0.162338
5    0.161410
2    0.088126
0    0.080705
4    0.061224
Name: dayofweek, dtype: float64

In [9]:
# Class shares in the validation part.
y_valid.value_counts(normalize=True, sort=True, ascending=False)

3    0.233333
6    0.211111
1    0.162963
5    0.159259
2    0.088889
0    0.081481
4    0.062963
Name: dayofweek, dtype: float64

### 2. Индивидуальные классификаторы
 
1. Обучите SVM, дерево классификации и случайный лес опять же с наилучшими параметрами, которые вы обнаружили в task 02. Используйте параметр random_state=21 для всех из них.
2. Оцените accuracy, precision и recall для всех из них на валидационном куске данных.
3. Формат результата каждой ячейки должен выглядеть так:  
*accuracy is 0.87778*  
*precision is 0.88162*  
*recall is 0.87778*  

In [13]:
# SVM with the hyperparameters selected in the earlier tuning task.
# probability=True makes predict_proba available, which the soft-voting
# ensemble built later in this notebook relies on.
svc_params = dict(
    C=10,
    kernel='rbf',
    gamma='auto',
    class_weight=None,
    probability=True,
    random_state=21,
)
svc = SVC(**svc_params).fit(X_train, y_train)

# Validation metrics (precision / recall are weighted by class support).
y_pred = svc.predict(X_valid)
acc = accuracy_score(y_valid, y_pred)
prec = precision_score(y_valid, y_pred, average='weighted')
rec = recall_score(y_valid, y_pred, average='weighted')
print(f"accuracy is {acc:.5f}\nprecision is {prec:.5f}\nrecall is {rec:.5f}")

accuracy is 0.87778
precision is 0.88162
recall is 0.87778


In [14]:
# Decision tree with the hyperparameters selected in the earlier tuning task.
dt_params = dict(
    criterion='gini',
    max_depth=21,
    class_weight='balanced',
    random_state=21,
)
dt = DecisionTreeClassifier(**dt_params).fit(X_train, y_train)

# Validation metrics (precision / recall are weighted by class support).
y_pred = dt.predict(X_valid)
acc = accuracy_score(y_valid, y_pred)
prec = precision_score(y_valid, y_pred, average='weighted')
rec = recall_score(y_valid, y_pred, average='weighted')
print(f"accuracy is {acc:.5f}\nprecision is {prec:.5f}\nrecall is {rec:.5f}")

accuracy is 0.86667
precision is 0.87170
recall is 0.86667


In [15]:
# Random forest with the hyperparameters selected in the earlier tuning task.
rf_params = dict(
    n_estimators=50,
    criterion='gini',
    max_depth=28,
    class_weight=None,
    random_state=21,
)
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Validation metrics (precision / recall are weighted by class support).
y_pred = rf.predict(X_valid)
acc = accuracy_score(y_valid, y_pred)
prec = precision_score(y_valid, y_pred, average='weighted')
rec = recall_score(y_valid, y_pred, average='weighted')
print(f"accuracy is {acc:.5f}\nprecision is {prec:.5f}\nrecall is {rec:.5f}")

accuracy is 0.89259
precision is 0.89361
recall is 0.89259


### 3. Voting classifiers
 
1. Объедините три модели, которые вы только что обучили, в один VotingClassifier. Посчитайте accuracy, precision, recall на валидационной выборке.
2. Поиграйте с разными параметрами VotingClassifier.
3. Посчитайте accuracy, precision и recall для тестовой выборки для того VotingClassifier, у которого наилучшие веса с точки зрения accuracy на валидационной выборке (если несколько моделей имеют одинаковое accuracy, то выберите ту, у которой более высокий precision).

In [16]:
# Exhaustive search over the voting mode and integer weights 1..5 for each of
# the three base models, scored on the validation split.
# Selection rule: highest accuracy; ties are broken by higher weighted precision.
best_accuracy = 0
best_precision = 0
best_recall = 0
best_params = None
voting_estimators = [('SVC', svc), ('DecisionTreeClassifier', dt), ('RandomForestClassifier', rf)]
for voting in ['hard', 'soft']:
    for weight_1 in range(1, 6):
        for weight_2 in range(1, 6):
            for weight_3 in range(1, 6):
                weights = [weight_1, weight_2, weight_3]
                vc = VotingClassifier(estimators=voting_estimators, voting=voting, weights=weights)
                vc.fit(X_train, y_train)
                y_pred = vc.predict(X_valid)

                # Score the candidate once and reuse the values for both the
                # report and the model selection below.
                accuracy = accuracy_score(y_valid, y_pred)
                precision = precision_score(y_valid, y_pred, average='weighted')
                recall = recall_score(y_valid, y_pred, average='weighted')

                print(f'voting is {voting}, weights is {weights}')
                print(f"accuracy is {accuracy:.5f}\nprecision is {precision:.5f}\nrecall is {recall:.5f}")
                print()

                # Tuple comparison == "better accuracy, or equal accuracy and
                # strictly better precision"; the first such candidate wins ties.
                if (accuracy, precision) > (best_accuracy, best_precision):
                    best_accuracy = accuracy
                    best_precision = precision
                    best_recall = recall
                    best_params = {'voting': voting, 'weights': weights}
print(f'Best params: {best_params}, accuracy is {best_accuracy}, precision is {best_precision}, recall is {best_recall}')


voting is hard, weights is [1, 1, 1]
accuracy is 0.89259
precision is 0.89236
recall is 0.89259

voting is hard, weights is [1, 1, 2]
accuracy is 0.89630
precision is 0.89712
recall is 0.89630

voting is hard, weights is [1, 1, 3]
accuracy is 0.89259
precision is 0.89361
recall is 0.89259

voting is hard, weights is [1, 1, 4]
accuracy is 0.89259
precision is 0.89361
recall is 0.89259

voting is hard, weights is [1, 1, 5]
accuracy is 0.89259
precision is 0.89361
recall is 0.89259

voting is hard, weights is [1, 2, 1]
accuracy is 0.88889
precision is 0.89097
recall is 0.88889

voting is hard, weights is [1, 2, 2]
accuracy is 0.90000
precision is 0.90072
recall is 0.90000

voting is hard, weights is [1, 2, 3]
accuracy is 0.89630
precision is 0.89712
recall is 0.89630

voting is hard, weights is [1, 2, 4]
accuracy is 0.89259
precision is 0.89361
recall is 0.89259

voting is hard, weights is [1, 2, 5]
accuracy is 0.89259
precision is 0.89361
recall is 0.89259

voting is hard, weights is [1,

In [17]:
# Refit the selected voting ensemble and report its metrics on the test split.
# NOTE(review): the voting mode and weights are typed in by hand; they are
# presumably the "Best params" printed by the search above — confirm, or read
# them from `best_params` instead.
vc = VotingClassifier(
    estimators=[
        ('SVC', svc),
        ('DecisionTreeClassifier', dt),
        ('RandomForestClassifier', rf),
    ],
    voting='soft',
    weights=[5, 1, 2],
)
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
print(f"accuracy is {acc:.5f}\nprecision is {prec:.5f}\nrecall is {rec:.5f}")

accuracy is 0.90237
precision is 0.90634
recall is 0.90237


### 4. Bagging classifiers
 
1. Используйте SVM с наилучшими параметрами для создания ансамбля BaggingClassifier. Попробуйте различные значения n_estimators. Используйте random_state=21.
2. Поиграйте с разными параметрами BaggingClassifier.
3. Посчитайте accuracy, precision и recall для тестовой выборки для того BaggingClassifier, у которого наилучшие параметры с точки зрения accuracy на валидационной выборке (если несколько моделей имеют одинаковое accuracy, то выберите ту, у которой более высокий precision).

In [19]:
# Sweep the ensemble size (1..100) for a bagging ensemble of the tuned SVM,
# scored on the validation split.
# Selection rule: highest accuracy; ties are broken by higher weighted precision.
best_accuracy = 0
best_precision = 0
best_recall = 0
best_n_estimators = 0
for n_estimators in range(1, 101):
    # The base model is passed positionally on purpose: its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
    # was removed in 1.4, while the first positional slot works on every version.
    bc = BaggingClassifier(svc, n_estimators=n_estimators, random_state=21, n_jobs=-1)
    bc.fit(X_train, y_train)
    y_pred = bc.predict(X_valid)

    # Score the candidate once and reuse the values for the report and selection.
    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='weighted')
    recall = recall_score(y_valid, y_pred, average='weighted')

    print(f'n_estimators is {n_estimators}')
    print(f"accuracy is {accuracy:.5f}\nprecision is {precision:.5f}\nrecall is {recall:.5f}")
    print()

    # Better accuracy, or equal accuracy and strictly better precision.
    if (accuracy, precision) > (best_accuracy, best_precision):
        best_accuracy = accuracy
        best_precision = precision
        best_recall = recall
        best_n_estimators = n_estimators
print(f'Best n_estimators: {best_n_estimators}, accuracy is {best_accuracy}, precision is {best_precision}, recall is {best_recall}')


n_estimators is 1
accuracy is 0.81852
precision is 0.82401
recall is 0.81852

n_estimators is 2
accuracy is 0.85926
precision is 0.86595
recall is 0.85926

n_estimators is 3
accuracy is 0.86667
precision is 0.87315
recall is 0.86667

n_estimators is 4
accuracy is 0.87407
precision is 0.88205
recall is 0.87407

n_estimators is 5
accuracy is 0.87407
precision is 0.88423
recall is 0.87407

n_estimators is 6
accuracy is 0.88148
precision is 0.89157
recall is 0.88148

n_estimators is 7
accuracy is 0.87778
precision is 0.88756
recall is 0.87778

n_estimators is 8
accuracy is 0.87778
precision is 0.88756
recall is 0.87778

n_estimators is 9
accuracy is 0.88148
precision is 0.89076
recall is 0.88148

n_estimators is 10
accuracy is 0.88519
precision is 0.89427
recall is 0.88519

n_estimators is 11
accuracy is 0.87778
precision is 0.88556
recall is 0.87778

n_estimators is 12
accuracy is 0.88148
precision is 0.88818
recall is 0.88148

n_estimators is 13
accuracy is 0.88148
precision is 0.88816
r

In [20]:
# Ensemble size used for the final bagging model below.
# NOTE(review): hand-copied value — presumably equal to `best_n_estimators`
# from the sweep above; confirm.
chosen_n_estimators = 43
print('n_estimators = ', chosen_n_estimators)

n_estimators =  43


In [21]:
# Refit the selected bagging ensemble (43 SVMs) and report its metrics on the
# test split.
# The base model is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on every version.
bc = BaggingClassifier(svc, n_estimators=43, random_state=21, n_jobs=-1)
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
print(f"accuracy is {acc:.5f}\nprecision is {prec:.5f}\nrecall is {rec:.5f}")

accuracy is 0.88166
precision is 0.88674
recall is 0.88166


### 5. Stacking classifiers
 
1. Чтобы в этой подзадаче мы могли сохранить воспроизводимость результатов, вам потребуется вначале создать объект генератора кросс-валидации StratifiedKFold(n_splits=n, shuffle=True, random_state=21), где значение n вам нужно будет оптимизировать. Подробности ниже.
2. Объедините три модели, которые вы обучили ранее, в один StackingClassifier. Посчитайте accuracy, precision, recall на валидационной выборке. Попробуйте разные значения n_splits [2, 3, 4, 5, 6, 7] в генераторе кроссвалидации и разные значения параметра passthrough в самом StackingClassifier.
3. Посчитайте accuracy, precision и recall для тестовой выборки для того StackingClassifier, у которого наилучшие параметры с точки зрения accuracy на валидационной выборке (если несколько моделей имеют одинаковое accuracy, то выберите ту, у которой более высокий precision). Используйте final_estimator=LogisticRegression(solver='liblinear').

In [25]:
# Search over the number of CV folds used to build the meta-features and over
# the `passthrough` flag of the stacking ensemble, scored on the validation split.
# Selection rule: highest accuracy; ties are broken by higher weighted precision.
best_accuracy = 0
best_precision = 0
best_recall = 0
best_params = None
stacking_estimators = [('SVC', svc), ('DecisionTreeClassifier', dt), ('RandomForestClassifier', rf)]
for n_splits in [2, 3, 4, 5, 6, 7]:
    # The splitter depends only on n_splits, so build it once per fold count.
    stacking_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
    for passthrough in [True, False]:
        sc = StackingClassifier(
            estimators=stacking_estimators,
            final_estimator=LogisticRegression(solver='liblinear'),
            cv=stacking_cv,
            passthrough=passthrough,
            n_jobs=-1,
        )
        sc.fit(X_train, y_train)
        y_pred = sc.predict(X_valid)

        # Score the candidate once and reuse the values for the report and selection.
        accuracy = accuracy_score(y_valid, y_pred)
        precision = precision_score(y_valid, y_pred, average='weighted')
        recall = recall_score(y_valid, y_pred, average='weighted')

        print(f'n_splits is {n_splits}, passthrough is {passthrough}')
        print(f"accuracy is {accuracy:.5f}\nprecision is {precision:.5f}\nrecall is {recall:.5f}")
        print()

        # Better accuracy, or equal accuracy and strictly better precision.
        if (accuracy, precision) > (best_accuracy, best_precision):
            best_accuracy = accuracy
            best_precision = precision
            best_recall = recall
            best_params = {'n_splits': n_splits, 'passthrough': passthrough}
print(f'Best params: {best_params}, accuracy is {best_accuracy}, precision is {best_precision}, recall is {best_recall}')


n_splits is 2, passthrough is True
accuracy is 0.90000
precision is 0.90106
recall is 0.90000

n_splits is 2, passthrough is False
accuracy is 0.88889
precision is 0.88997
recall is 0.88889

n_splits is 3, passthrough is True
accuracy is 0.89630
precision is 0.89841
recall is 0.89630

n_splits is 3, passthrough is False
accuracy is 0.90000
precision is 0.90103
recall is 0.90000

n_splits is 4, passthrough is True
accuracy is 0.90370
precision is 0.90666
recall is 0.90370

n_splits is 4, passthrough is False
accuracy is 0.90370
precision is 0.90594
recall is 0.90370

n_splits is 5, passthrough is True
accuracy is 0.91111
precision is 0.91452
recall is 0.91111

n_splits is 5, passthrough is False
accuracy is 0.91111
precision is 0.91319
recall is 0.91111

n_splits is 6, passthrough is True
accuracy is 0.91111
precision is 0.91229
recall is 0.91111

n_splits is 6, passthrough is False
accuracy is 0.90370
precision is 0.90474
recall is 0.90370

n_splits is 7, passthrough is True
accuracy i

In [26]:
# Refit the selected stacking ensemble and report its metrics on the test split.
# NOTE(review): n_splits and passthrough are typed in by hand; they are
# presumably the "Best params" printed by the search above — confirm, or read
# them from `best_params` instead.
sc = StackingClassifier(
    estimators=[
        ('SVC', svc),
        ('DecisionTreeClassifier', dt),
        ('RandomForestClassifier', rf),
    ],
    final_estimator=LogisticRegression(solver='liblinear'),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=21),
    passthrough=True,
)
sc.fit(X_train, y_train)
y_pred = sc.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
print(f"accuracy is {acc:.5f}\nprecision is {prec:.5f}\nrecall is {rec:.5f}")

accuracy is 0.90828
precision is 0.91087
recall is 0.90828


### 6. Прогнозы
 
1. Выберите лучшую модель с точки зрения accuracy (если несколько моделей имеют одинаковое accuracy, то выберите ту, у которой более высокий precision).
2. Проанализируйте: для какого дня недели модель делает больше всего ошибок (в % от общего количества наблюдений этого класса в вашем датасете). Также проанализируйте, на какой лабораторной работе и на каком юзере модель делает больше всего ошибок.
3. Сохраните модель.
 

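*Stacking classifier* — лучшая модель: accuracy на тестовой выборке 0.90828 против 0.90237 у VotingClassifier и 0.88166 у BaggingClassifier.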

In [27]:
# Share of misclassified test rows per day of week, as a percentage of the
# test rows that truly belong to that day.
# Predict once up front instead of re-running the stacked model for every day.
test_pred = sc.predict(X_test)
is_wrong = y_test != test_pred
for day in range(7):
    is_day = y_test == day
    print(f'day:{day}, mistakes: {(is_wrong & is_day).sum() / is_day.sum() * 100:.3f}%')

day:0, mistakes: 29.630%
day:1, mistakes: 5.455%
day:2, mistakes: 10.000%
day:3, mistakes: 3.750%
day:4, mistakes: 9.524%
day:5, mistakes: 16.667%
day:6, mistakes: 4.225%


Чаще всего классификатор ошибается на понедельнике (day 0): 29.63% наблюдений этого класса в тестовой выборке предсказаны неверно.

In [28]:
# Error rate on the test split for every one-hot user column
# (rows where the column equals 1.0 belong to that user).
# Predict once up front and slice the predictions, instead of re-running the
# stacked model for every user.
test_pred = sc.predict(X_test)
for column in X.columns:
    if 'user' in column:
        print(f'user: {column}, mistakes: ', end='')
        # Positional boolean mask, usable on both the Series and the ndarray.
        in_group = (X_test[column] == 1.0).to_numpy()
        if in_group.any():
            print(f'{1 - accuracy_score(y_test[in_group], test_pred[in_group]):.3%}')
        else:
            print('not in test')

user: uid_user_0, mistakes: not in test
user: uid_user_1, mistakes: 0.000%
user: uid_user_10, mistakes: 0.000%
user: uid_user_11, mistakes: not in test
user: uid_user_12, mistakes: 0.000%
user: uid_user_13, mistakes: 11.765%
user: uid_user_14, mistakes: 9.677%
user: uid_user_15, mistakes: 0.000%
user: uid_user_16, mistakes: 20.000%
user: uid_user_17, mistakes: 0.000%
user: uid_user_18, mistakes: 16.667%
user: uid_user_19, mistakes: 15.789%
user: uid_user_2, mistakes: 14.286%
user: uid_user_20, mistakes: 0.000%
user: uid_user_21, mistakes: 7.143%
user: uid_user_22, mistakes: 100.000%
user: uid_user_23, mistakes: 0.000%
user: uid_user_24, mistakes: 9.091%
user: uid_user_25, mistakes: 9.091%
user: uid_user_26, mistakes: 5.882%
user: uid_user_27, mistakes: 0.000%
user: uid_user_28, mistakes: 0.000%
user: uid_user_29, mistakes: 9.091%
user: uid_user_3, mistakes: 21.429%
user: uid_user_30, mistakes: 12.500%
user: uid_user_31, mistakes: 11.111%
user: uid_user_4, mistakes: 7.407%
user: uid_use

Классификатор чаще всего ошибается для *uid_user_22* в 100% случаев, 
*uid_user_0, uid_user_11, uid_user_7* нет в тестовой выборке

In [29]:
# Error rate on the test split for every one-hot lab column
# (rows where the column equals 1.0 belong to that lab).
# Predict once up front and slice the predictions, instead of re-running the
# stacked model for every lab.
test_pred = sc.predict(X_test)
for column in X.columns:
    if 'labname' in column:
        print(f'labname: {column}, mistakes: ', end='')
        # Positional boolean mask, usable on both the Series and the ndarray.
        in_group = (X_test[column] == 1.0).to_numpy()
        if in_group.any():
            print(f'{1 - accuracy_score(y_test[in_group], test_pred[in_group]):.3%}')
        else:
            print('not in test')

labname: labname_code_rvw, mistakes: 7.692%
labname: labname_lab02, mistakes: not in test
labname: labname_lab03, mistakes: 100.000%
labname: labname_lab03s, mistakes: 0.000%
labname: labname_lab05s, mistakes: 16.667%
labname: labname_laba04, mistakes: 22.857%
labname: labname_laba04s, mistakes: 24.000%
labname: labname_laba05, mistakes: 0.000%
labname: labname_laba06, mistakes: 11.111%
labname: labname_laba06s, mistakes: 13.333%
labname: labname_project1, mistakes: 5.914%


Классификатор чаще всего ошибается для *labname_lab03* в 100% случаев, 
*labname_lab02* нет в тестовой выборке

In [30]:
from joblib import dump, load

In [31]:
# Persist the fitted stacking ensemble to disk.
dump(sc, filename='best.joblib')

['best.joblib']

In [32]:
# Read the model back from the file written by the previous cell.
best1 = load(filename='best.joblib')

In [33]:
# Sanity check: the reloaded model should score the same test accuracy as `sc`.
accuracy_score(y_true=y_test, y_pred=best1.predict(X_test))

0.908284023668639