# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df['dayofweek']=pd.read_csv('../data/dayofweek.csv')['dayofweek']
df

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3


In [3]:
X=df.drop(columns=['dayofweek'])
y=df['dayofweek']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=21,stratify=y)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [5]:
def metrics(model, X_test, y_test, show=False):
	pred_y = model.predict(X_test)
	accuracy = accuracy_score(pred_y, y_test)
	precision = precision_score(pred_y, y_test, average='weighted')
	recall = recall_score(pred_y,y_test, average='weighted')
	if show:
		print(f"Accuracy: {accuracy:.5f}")
		print(f"Precision: {precision:.5f}")
		print(f"Recall: {recall:.5f}")
	return {'accuracy':accuracy,'precision':precision,'recall':recall}

In [6]:
model_svm = SVC(C=10, class_weight='balanced', gamma='auto', kernel='rbf',random_state=21,probability=True)
model_svm.fit(X_train, y_train)
metrics(model_svm, X_test, y_test, show=True)

Accuracy: 0.88757
Precision: 0.88778
Recall: 0.88757


{'accuracy': 0.8875739644970414,
 'precision': 0.8877813050077406,
 'recall': 0.8875739644970414}

In [7]:
model_tree = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=30, random_state=21)
model_tree.fit(X_train, y_train)
metrics(model_tree, X_test, y_test, show=True)


Accuracy: 0.86686
Precision: 0.86876
Recall: 0.86686


{'accuracy': 0.8668639053254438,
 'precision': 0.8687628004393987,
 'recall': 0.8668639053254438}

In [8]:
model_forest = RandomForestClassifier(class_weight='balanced', criterion='gini', max_depth=40, n_estimators=50, random_state=21)
model_forest.fit(X_train, y_train)
metrics(model_forest, X_test, y_test, show=True)

Accuracy: 0.92899
Precision: 0.93328
Recall: 0.92899


{'accuracy': 0.9289940828402367,
 'precision': 0.9332817570155058,
 'recall': 0.9289940828402367}

## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [9]:
model_voting_hard = VotingClassifier(estimators=[('forest',model_forest),('svm', model_svm),('tree',model_tree)])
model_voting_hard.fit(X_train, y_train)
metrics(model_voting_hard, X_test, y_test)

{'accuracy': 0.9260355029585798,
 'precision': 0.9287747789508352,
 'recall': 0.9260355029585798}

In [10]:
model_voting_soft = VotingClassifier(estimators=[('forest',model_forest),('svm', model_svm),('tree',model_tree)], voting='soft')
model_voting_soft.fit(X_train, y_train)
metrics(model_voting_soft, X_test, y_test)

{'accuracy': 0.9053254437869822,
 'precision': 0.9080256487623768,
 'recall': 0.9053254437869822}

In [11]:
weights = [
	[1,1,1],
	[2,1,1],
	[1,2,1],
	[1,1,2]
]

best_param = {
	'model': None,
	'accuracy':0,
	'precision':0,
	'recall':0,
	'weights':[]
}

In [12]:
for item_weights in weights:
	t_model_voting = VotingClassifier(estimators=[('forest',model_forest),('svm', model_svm),('tree',model_tree)], weights=item_weights)
	t_model_voting.fit(X_train, y_train)
	t_res=metrics(t_model_voting,X_test,y_test)
	if t_res['accuracy']>best_param['accuracy'] or (t_res['accuracy']==best_param['accuracy'] and t_res['precision']>best_param['precision']):
		best_param['accuracy']=t_res['accuracy']
		best_param['precision']=t_res['precision']
		best_param['recall']=t_res['recall']
		best_param['model']=t_model_voting
		best_param['weights']=item_weights

print(f"Weights: {best_param['weights']}")
print(f"Accuracy: {best_param['accuracy']}")
print(f"Precision: {best_param['precision']}")
print(f"Recall: {best_param['recall']}")

Weights: [2, 1, 1]
Accuracy: 0.9289940828402367
Precision: 0.9319120441073311
Recall: 0.9289940828402367


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [13]:
list_n_estimators = range(10,100,10)
best_param_bagging = {
	'model': None,
	'accuracy':0,
	'precision':0,
	'recall':0,
	'n_estimators':0
}
	

In [14]:
for n_estimators in list_n_estimators:
	t_model_bagging = BaggingClassifier(estimator=model_svm, n_estimators=n_estimators, random_state=21)
	t_model_bagging.fit(X_train, y_train)
	t_res=metrics(t_model_bagging,X_test,y_test)
	if t_res['accuracy']>best_param_bagging['accuracy'] or (t_res['accuracy']==best_param_bagging['accuracy'] and t_res['precision']>best_param_bagging['precision']):
		best_param_bagging['accuracy']=t_res['accuracy']
		best_param_bagging['precision']=t_res['precision']
		best_param_bagging['recall']=t_res['recall']
		best_param_bagging['model']=t_model_bagging
		best_param_bagging['n_estimators']=n_estimators

In [15]:
print(f"N_estimators: {best_param_bagging['n_estimators']}")
print(f"Accuracy: {best_param_bagging['accuracy']}")
print(f"Precision: {best_param_bagging['precision']}")
print(f"Recall: {best_param_bagging['recall']}")

N_estimators: 60
Accuracy: 0.8964497041420119
Precision: 0.8973594069747918
Recall: 0.8964497041420119


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [16]:
list_n_splits = range(2,10,2)

best_param_bagging = {
	'model': None,
	'accuracy':0,
	'precision':0,
	'recall':0,
	'n_splits':0
}

In [17]:
for n_splits in list_n_splits:
	for passthrough in [True, False]:
		skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)

		t_model_stacking = StackingClassifier(
			estimators=[('svс', model_svm), ('forest', model_forest), ('tree', model_tree)],
            passthrough=passthrough,
            final_estimator=LogisticRegression(solver='liblinear')
		)

		for train_index, val_index in skf.split(X_train, y_train):
			X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
			y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

			t_model_stacking.fit(X_train_fold, y_train_fold)
			t_res = metrics(t_model_stacking, X_val_fold, y_val_fold)

			if t_res['accuracy']>best_param_bagging['accuracy'] or (t_res['accuracy']==best_param_bagging['accuracy'] and t_res['precision']>best_param_bagging['precision']):
				best_param_bagging['accuracy']=t_res['accuracy']
				best_param_bagging['precision']=t_res['precision']
				best_param_bagging['recall']=t_res['recall']
				best_param_bagging['model']=t_model_bagging
				best_param_bagging['n_splits']=n_splits




In [18]:
best_param_bagging

{'model': BaggingClassifier(estimator=SVC(C=10, class_weight='balanced', gamma='auto',
                                 probability=True, random_state=21),
                   n_estimators=90, random_state=21),
 'accuracy': 0.9467455621301775,
 'precision': 0.9480653030948889,
 'recall': 0.9467455621301775,
 'n_splits': 8}

## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [19]:
model_best = best_param_bagging['model']

In [20]:
t_df=df.copy()
pred_y=model_best.predict(X)
t_df['pred']=pred_y

In [21]:
t_df[t_df['pred'] != t_df['dayofweek']]['dayofweek'].value_counts() / len(pred_y)

dayofweek
6    0.013049
5    0.009490
1    0.009490
3    0.008304
0    0.004152
4    0.001779
2    0.001186
Name: count, dtype: float64

In [22]:
error = t_df[t_df['pred'] != t_df['dayofweek']]
users_list = [i for i in t_df.columns if i.startswith('uid')]
lab_list = [i for i in t_df.columns if i.startswith('labname')]
max_error = 0
max_user = ''
for user in users_list:
    error_perc = error[user].sum() / len(pred_y)
    if error_perc > max_error:
        max_error = error_perc
        max_user = user
print(f"User with max error: {max_user} | Percent error: {max_error * 100}%")

User with max error: uid_user_25 | Percent error: 0.41518386714116245%


In [23]:
max_error = 0
max_lab = ''
for lab in lab_list:
    error_perc = error[lab].sum() / len(pred_y)
    if error_perc > max_error:
        max_error = error_perc
        max_lab = lab
print(f"Labname with max error: {max_lab} | Percent error: {max_error * 100}%")

Labname with max error: labname_project1 | Percent error: 1.9572953736654803%


In [24]:
joblib.dump(model_best, 'model_ex03.joblib')

['model_ex03.joblib']