# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [1]:
import itertools
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import joblib
# NOTE: this rebinds the name `tqdm` from the module to the notebook progress-bar class;
# it must stay after `import tqdm`.
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
# Load the not-scaled feature table; the bare name on the last line renders it.
data = pd.read_csv(filepath_or_buffer='../data/day-of-week-not-scaled.csv')
data

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# Read the target file and keep only its `dayofweek` column, so `weeks` is
# bound to the label column directly rather than first to the whole frame.
weeks = pd.read_csv('../data/dayofweek.csv')['dayofweek']
weeks

0       4
1       4
2       4
3       4
4       4
       ..
1681    3
1682    3
1683    3
1684    3
1685    3
Name: dayofweek, Length: 1686, dtype: int64

In [4]:
# Features and target under the conventional X / y names.
X, y = data, weeks

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

## 2. SVM gridsearch

1. Using `GridSearchCV`, try different values of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`) and class_weight (`balanced`, `None`). Use `random_state=21` and `probability=True`, and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [5]:
# Candidate SVC hyperparameters: 3 * 6 * 2 * 2 = 72 combinations.
param_grid = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced'],
}

svc = SVC(probability=True, random_state=21)

# 5-fold cross-validated search scored by accuracy; n_jobs=-1 runs fits in parallel.
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator under its own name, then report its parameters.
best_model = grid_search.best_estimator_
print(grid_search.best_params_)

{'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}


In [6]:
# Tabulate the per-combination cross-validation results of the SVC search.
results = pd.DataFrame(data=grid_search.cv_results_)

In [7]:
# Best-ranked combinations first; show only the columns needed for comparison.
sorted_results = results.sort_values('rank_test_score')
sorted_results.loc[:, ['params', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
64,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.876109,1
70,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.863500,2
52,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.816018,3
58,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.808608,4
66,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.721052,5
...,...,...,...
59,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.129792,68
71,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.115693,69
47,"{'C': 1.5, 'class_weight': 'balanced', 'gamma'...",0.079380,70
23,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.062310,71


In [8]:
# Best cross-validated accuracy of the SVC search. This reads whichever search
# `grid_search` currently refers to, so run it before the next section rebinds it.
best_score = grid_search.best_score_
print("Best accuracy:", best_score)

Best accuracy: 0.8761090458488228


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [9]:
tree = DecisionTreeClassifier(random_state=21)

# Candidate tree hyperparameters: 49 depths * 2 class weights * 2 criteria = 196 combinations.
param_grid = {
    'max_depth': list(range(1, 50)),
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy']
}

# 10-fold cross-validated search scored by accuracy.
# n_jobs=-1 runs the 196 * 10 fits in parallel, matching the SVC and random-forest
# searches in this notebook; the estimator is seeded, so scores are unaffected.
grid_search = GridSearchCV(tree, param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

# Keep the winning estimator under its own name.
best_model_dtc = grid_search.best_estimator_

{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 25}


In [10]:
# Tabulate the per-combination cross-validation results of the decision-tree search.
results = pd.DataFrame(data=grid_search.cv_results_)

In [11]:
# Best-ranked combinations first; show only the columns needed for comparison.
sorted_results = results.sort_values('rank_test_score')
sorted_results.loc[:, ['params', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
188,"{'class_weight': 'balanced', 'criterion': 'ent...",0.890182,1
189,"{'class_weight': 'balanced', 'criterion': 'ent...",0.890182,1
190,"{'class_weight': 'balanced', 'criterion': 'ent...",0.890182,1
187,"{'class_weight': 'balanced', 'criterion': 'ent...",0.890182,1
191,"{'class_weight': 'balanced', 'criterion': 'ent...",0.890182,1
...,...,...,...
100,"{'class_weight': 'balanced', 'criterion': 'gin...",0.379784,192
0,"{'class_weight': None, 'criterion': 'gini', 'm...",0.355318,193
49,"{'class_weight': None, 'criterion': 'entropy',...",0.355318,193
98,"{'class_weight': 'balanced', 'criterion': 'gin...",0.298209,195


In [12]:
# Best cross-validated accuracy of the decision-tree search. This reads whichever
# search `grid_search` currently refers to, so run it before the next section rebinds it.
best_score = grid_search.best_score_
print("Best accuracy:", best_score)

Best accuracy: 0.8901824212271974


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [13]:
# Candidate forest hyperparameters: 4 * 49 * 2 * 2 = 784 combinations.
param_grid = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': list(range(1, 50)),
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy'],
}

forest = RandomForestClassifier(random_state=21)

# 10-fold cross-validated search scored by accuracy; n_jobs=-1 runs fits in parallel.
grid_search = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator under its own name, then report its parameters.
best_model_forest = grid_search.best_estimator_
print(grid_search.best_params_)

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22, 'n_estimators': 100}


In [14]:
# Tabulate the per-combination cross-validation results of the random-forest search.
results = pd.DataFrame(data=grid_search.cv_results_)

In [15]:
# Best-ranked combinations first; show only the columns needed for comparison.
sorted_results = results.sort_values('rank_test_score')
sorted_results.loc[:, ['params', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
479,"{'class_weight': 'balanced', 'criterion': 'gin...",0.920625,1
323,"{'class_weight': None, 'criterion': 'entropy',...",0.919132,2
331,"{'class_weight': None, 'criterion': 'entropy',...",0.919127,3
311,"{'class_weight': None, 'criterion': 'entropy',...",0.919127,3
307,"{'class_weight': None, 'criterion': 'entropy',...",0.919127,3
...,...,...,...
197,"{'class_weight': None, 'criterion': 'entropy',...",0.359022,780
592,"{'class_weight': 'balanced', 'criterion': 'ent...",0.340520,781
396,"{'class_weight': 'balanced', 'criterion': 'gin...",0.325650,782
588,"{'class_weight': 'balanced', 'criterion': 'ent...",0.275218,783


In [16]:
# Best cross-validated accuracy of the random-forest search. This reads whichever
# search `grid_search` currently refers to at the time the cell is run.
best_score = grid_search.best_score_
print("Best accuracy:", best_score)

Best accuracy: 0.9206246545052515


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch over the same parameter values as for the random forest above, iterating through the lists of possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value of `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [17]:
# Candidate values for the manual search (same values as the random-forest grid search).
n_estimators = [5, 10, 50, 100]
max_depths = list(range(1, 50))
class_weights = [None, 'balanced']
criteria = ['gini', 'entropy']

# Materialise the Cartesian product up front so the progress bar knows its total
# (4 * 49 * 2 * 2 = 784). itertools.product varies the last iterable fastest, so the
# order is the same as four nested loops: n_estimators outermost, criterion innermost.
combinations = list(itertools.product(n_estimators, max_depths, class_weights, criteria))

results = []

# Silence warnings only while the search runs, instead of for the rest of the kernel session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # One bar over all combinations: it advances after every cross-validation run.
    for n, depth, weight, criterion in tqdm(combinations, desc="grid search"):
        rfc = RandomForestClassifier(n_estimators=n, max_depth=depth,
                                     class_weight=weight, criterion=criterion,
                                     random_state=21)

        # 5-fold CV with folds evaluated in parallel; the metric is spelled out
        # so the result columns below are unambiguous.
        scores = cross_val_score(rfc, X_train, y_train,
                                 scoring='accuracy', cv=5, n_jobs=-1)

        results.append({
            'n_estimators': n,
            'max_depth': depth,
            'class_weight': weight,
            'criterion': criterion,
            'mean_accuracy': scores.mean(),
            'std_accuracy': scores.std()
        })

n_estimators:   0%|          | 0/4 [00:00<?, ?it/s]

max_depth:   0%|          | 0/49 [00:00<?, ?it/s]

max_depth:   0%|          | 0/49 [00:00<?, ?it/s]

max_depth:   0%|          | 0/49 [00:00<?, ?it/s]

max_depth:   0%|          | 0/49 [00:00<?, ?it/s]

In [18]:
# One row per parameter combination from the manual search.
results_df = pd.DataFrame(data=results)

# Highest mean accuracy first.
sorted_results = results_df.sort_values('mean_accuracy', ascending=False)
sorted_results

Unnamed: 0,n_estimators,max_depth,class_weight,criterion,mean_accuracy,std_accuracy
500,50,28,,gini,0.904290,0.010961
708,100,31,,gini,0.903547,0.014380
510,50,30,balanced,gini,0.902817,0.013554
526,50,34,balanced,gini,0.902809,0.013010
740,100,39,,gini,0.902806,0.010460
...,...,...,...,...,...,...
1,5,1,,entropy,0.353832,0.016467
7,5,2,balanced,entropy,0.353110,0.021165
6,5,2,balanced,gini,0.346419,0.029749
2,5,1,balanced,gini,0.283390,0.011062


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [19]:
# Final evaluation: score the tuned random forest on the held-out test split.
y_pred = best_model_forest.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"accuracy: {accuracy:.4f}")

accuracy: 0.9349


In [18]:
# Sanity check only: X = X_train + X_test, so most of these rows are in the training
# split the forest was selected on. The score is therefore optimistically biased and
# must not be reported as the final accuracy; that is the held-out test score.
# Separate names keep the test-set `y_pred` / `accuracy` from being overwritten.
y_pred_all = best_model_forest.predict(X)

accuracy_all = accuracy_score(y, y_pred_all)

print(f"accuracy on full dataset (includes training rows): {accuracy_all:.4f}")

accuracy: 0.9419
