# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [121]:
import pandas as pd
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_auc_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,BaggingClassifier,StackingClassifier
from sklearn.linear_model import LogisticRegression
import joblib

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [26]:
df = pd.read_csv("../data/day-of-week-not-scaled.csv")
df['dayofweek'] = pd.read_csv("../data/dayofweek.csv")['dayofweek']
df

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3


In [27]:
Y = df['dayofweek']
X = df.drop(columns='dayofweek')

In [28]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=21,stratify=Y)
X_train,X_valid,y_train,y_valid = train_test_split(X_train,y_train,test_size=0.2,random_state=21,stratify=y_train)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [52]:
best_param_svm = {'C': 10, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf'}
best_param_tree = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 23}
best_param_random = {'class_weight': 'balanced','criterion': 'gini','max_depth': 35,'n_estimators': 50}
def print_example_metric(mod,X,Y)->dict:
    accuracy = accuracy_score(y_true=Y, y_pred=mod.predict(X))
    pr = precision_score(y_true=y_test, y_pred=mod.predict(X_test), average='weighted')
    rec = recall_score(y_true=y_test, y_pred=mod.predict(X_test), average='weighted')
    #print(f'accuracy is {accuracy}')
    #print(f'precision is {pr}')
    #print(f'recall is {rec}')
    return {'accuracy':accuracy,'precision':pr,'recall':rec}


In [53]:
model_svm = SVC(**best_param_svm,random_state=21,probability=True)
model_svm.fit(X_train,y_train)
print_example_metric(model_svm,X_valid,y_valid)

{'accuracy': 0.8592592592592593,
 'precision': 0.8710631929138204,
 'recall': 0.8698224852071006}

In [54]:
model_tree = DecisionTreeClassifier(**best_param_tree,random_state=21)
model_tree.fit(X_train,y_train)
print_example_metric(model_tree,X_valid,y_valid)


{'accuracy': 0.8592592592592593,
 'precision': 0.8822382391481718,
 'recall': 0.878698224852071}

In [55]:
model_random = RandomForestClassifier(**best_param_random,random_state=21)
model_random.fit(X_train,y_train)
print_example_metric(model_random,X_valid,y_valid)

{'accuracy': 0.9037037037037037,
 'precision': 0.9138567130524126,
 'recall': 0.9112426035502958}

## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [56]:
eclf_hard = VotingClassifier(estimators=[('model1',model_svm),('model2',model_tree),('model3',model_random)],voting='soft')
eclf_hard.fit(X_train,y_train)

In [57]:
print_example_metric(eclf_hard,X_valid,y_valid)

{'accuracy': 0.8851851851851852,
 'precision': 0.8988623209168486,
 'recall': 0.893491124260355}

In [58]:
eclf_soft = VotingClassifier(estimators=[('model1',model_svm),('model2',model_tree),('model3',model_random)],voting='hard')
eclf_soft.fit(X_train,y_train)

In [59]:
print_example_metric(eclf_soft,X_valid,y_valid)

{'accuracy': 0.9074074074074074,
 'precision': 0.9138233325389009,
 'recall': 0.9112426035502958}

In [87]:
weight = [[1,1,1],[2,1,1],[1,2,1],[1,1,2]] # веса модели
best_param = {'model' : None,'accuracy':0,'precision' : 0,'recall' : 0,'weight' : 0}

In [101]:
def func_new(weight_list:list,bst_par:dict):
    for items in weight_list:
        model_vot = VotingClassifier(estimators=[('model1',model_svm),('model2',model_tree),('model3',model_random)],weights=items)
        model_vot.fit(X_train,y_train)
        res = print_example_metric(model_vot,X_valid,y_valid)
        if res['accuracy'] > bst_par['accuracy'] and res['precision'] >  bst_par['precision'] and res['recall'] > bst_par['recall']:
            bst_par['accuracy'] = res['accuracy']
            bst_par['precision'] = res['precision']
            bst_par['recall'] = res['recall']
            bst_par['model'] = model_vot
            bst_par['weights'] = items
    return bst_par
        
          


In [102]:
a = func_new(weight,best_param)
print(a)


{'model': VotingClassifier(estimators=[('model1',
                              SVC(C=10, class_weight='balanced', gamma='auto',
                                  probability=True, random_state=21)),
                             ('model2',
                              DecisionTreeClassifier(class_weight='balanced',
                                                     max_depth=23,
                                                     random_state=21)),
                             ('model3',
                              RandomForestClassifier(class_weight='balanced',
                                                     max_depth=35,
                                                     n_estimators=50,
                                                     random_state=21))],
                 weights=[1, 1, 2]), 'accuracy': 0.9111111111111111, 'precision': 0.912780477818809, 'recall': 0.9112426035502958, 'weight': 0, 'weights': [1, 1, 2]}


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [110]:
best_param_svm = {'model' : None,'accuracy':0,'precision' : 0,'recall' : 0,'n_estimators' : 0}
list_est = range(10,100,10)

In [111]:
def func_new_svm(estimators_list:list,bst_par:dict):
    for items in estimators_list:
        model_bag = BaggingClassifier(model_svm,items,random_state=21)
        model_bag.fit(X_train,y_train)
        res = print_example_metric(model_bag,X_valid,y_valid)
        if res['accuracy'] > bst_par['accuracy'] or (res['precision'] >  bst_par['precision'] and res['recall'] > bst_par['recall']):
            bst_par['accuracy'] = res['accuracy']
            bst_par['precision'] = res['precision']
            bst_par['recall'] = res['recall']
            bst_par['model'] = model_bag
            bst_par['n_estimators'] = items
    return bst_par

In [112]:
a = func_new_svm(list_est,best_param_svm)
print(a)

{'model': BaggingClassifier(estimator=SVC(C=10, class_weight='balanced', gamma='auto',
                                probability=True, random_state=21),
                  n_estimators=90, random_state=21), 'accuracy': 0.8851851851851852, 'precision': 0.872948954319544, 'recall': 0.8698224852071006, 'n_estimators': 90}


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [114]:
best_param_stacking = {
	'model': None,
	'accuracy':0,
	'precision':0,
	'recall':0,
	'n_splits':0
}

In [117]:
for items in range(2,10,2):
    for shuffle in [True,False]:
        skf = StratifiedKFold(n_splits=items,shuffle=True,random_state=21)
        stack = StackingClassifier(estimators=[('model_svm',model_svm),('model_tree',model_tree),('model_random',model_random)],passthrough=shuffle,final_estimator=LogisticRegression(solver='liblinear'))
        for train,val in skf.split(X_train,y_train):
            X_train_fold, X_val_fold = X_train.iloc[train], X_train.iloc[val]
            Y_train_fold, Y_val_fold = y_train.iloc[train], y_train.iloc[val]
            stack.fit(X_train_fold,Y_train_fold)
            result = print_example_metric(stack,X_val_fold,Y_val_fold)
            if result['accuracy'] > best_param_stacking['accuracy'] or (result['precision'] >  best_param_stacking['precision'] and result['recall'] > best_param_stacking['recall']):
                best_param_stacking['accuracy'] = result['accuracy']
                best_param_stacking['precision'] = result['precision']
                best_param_stacking['recall'] = result['recall']
                best_param_stacking['model'] = stack
                best_param_stacking['n_splits'] = items
            


In [118]:
best_param_stacking

{'model': StackingClassifier(estimators=[('model_svm',
                                 SVC(C=10, class_weight='balanced', gamma='auto',
                                     probability=True, random_state=21)),
                                ('model_tree',
                                 DecisionTreeClassifier(class_weight='balanced',
                                                        max_depth=23,
                                                        random_state=21)),
                                ('model_random',
                                 RandomForestClassifier(class_weight='balanced',
                                                        max_depth=35,
                                                        n_estimators=50,
                                                        random_state=21))],
                    final_estimator=LogisticRegression(solver='liblinear')),
 'accuracy': 0.9185185185185185,
 'precision': 0.9094517691685,
 'recall': 0.9053254437869

## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [120]:
model_the_best = best_param_stacking['model']
model_the_best

In [124]:
model = df.copy()
pr_y = model_the_best.predict(X)
model['pr_y'] = pr_y
model


Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek,pr_y
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3,3
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3,3
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3,3
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3,3


In [128]:
model[model['dayofweek'] != model['pr_y']]['dayofweek'].value_counts().sort_values(ascending=True) / len(pr_y)

dayofweek
4    0.001779
2    0.003559
6    0.004745
3    0.008304
5    0.009490
1    0.009490
0    0.010083
Name: count, dtype: float64

In [144]:
error = model[model['pr_y'] != model['dayofweek']]
user = [i for i in model.columns if i.startswith('uid_user_')]
lab = [i for i in model.columns if i.startswith('labname')]
lab


['labname_code_rvw',
 'labname_lab02',
 'labname_lab03',
 'labname_lab03s',
 'labname_lab05s',
 'labname_laba04',
 'labname_laba04s',
 'labname_laba05',
 'labname_laba06',
 'labname_laba06s',
 'labname_project1']

In [145]:
err_mx = 0
use_name = ''
for i in user:
    er = error[i].sum()/len(pr_y)
    if er > err_mx:
        err_mx = er
        use_name = user
print(f'User max error is {use_name[-1]} and it amounts to {err_mx * 100} % ')

User max error is uid_user_8 and it amounts to 0.8303677342823249 % 


In [146]:
er_mx = 0
lab_name = ''
for i in lab:
    er = error[i].sum()/len(pr_y)
    if er > er_mx:
        er_mx = er
        lab_name = lab
print(f'User max error is {lab_name[-1]} and it amounts to {er_mx * 100} % ')

User max error is labname_project1 and it amounts to 2.0759193357058123 % 


In [147]:
joblib.dump(model_the_best,'best_model')

['best_model']