# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore). Don't forget to enrich the table with the 'dayofweek' column from the previous day's .csv-file.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df['dayofweek']=pd.read_csv('../data/dayofweek.csv')['dayofweek']
df

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3


In [3]:
X=df.drop(columns=['dayofweek'])
y=df['dayofweek']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [5]:
model_svm = SVC(random_state=21, probability=True)

param_grid_svm = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None]
}

In [6]:
grid_search_svm = GridSearchCV(estimator=model_svm, param_grid=param_grid_svm, cv=5, scoring='accuracy',verbose=1,n_jobs=-1)
grid_search_svm.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


0,1,2
,estimator,SVC(probabili...ndom_state=21)
,param_grid,"{'C': [0.01, 0.1, ...], 'class_weight': ['balanced', None], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf', ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [7]:
grid_search_svm.best_params_

{'C': 10, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf'}

In [8]:
results_df = pd.DataFrame(grid_search_svm.cv_results_)
results_df.sort_values(by='rank_test_score', ascending=True)[['params','mean_test_score']]

Unnamed: 0,params,mean_test_score
64,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.863916
70,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.859947
58,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.801437
52,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.796703
60,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.727831
...,...,...
65,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.138469
53,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.125817
41,"{'C': 1.5, 'class_weight': 'balanced', 'gamma'...",0.100442
29,"{'C': 1, 'class_weight': 'balanced', 'gamma': ...",0.099667


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [9]:
model_tree = DecisionTreeClassifier(random_state=21)

param_grid_tree = {
    'max_depth': [1,5,10,20,30,40,45,49],
    'class_weight': ['balanced', None],
    'criterion':['entropy','gini']
}

In [10]:
grid_search_tree = GridSearchCV(estimator=model_tree, param_grid=param_grid_tree, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
grid_search_tree.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


0,1,2
,estimator,DecisionTreeC...ndom_state=21)
,param_grid,"{'class_weight': ['balanced', None], 'criterion': ['entropy', 'gini'], 'max_depth': [1, 5, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,30
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,21
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [11]:
grid_search_tree.best_params_

{'class_weight': None, 'criterion': 'gini', 'max_depth': 30}

In [12]:
pd.DataFrame(grid_search_tree.cv_results_).sort_values(by='rank_test_score', ascending=True)[['params','mean_test_score']]

Unnamed: 0,params,mean_test_score
31,"{'class_weight': None, 'criterion': 'gini', 'm...",0.873389
30,"{'class_weight': None, 'criterion': 'gini', 'm...",0.873389
29,"{'class_weight': None, 'criterion': 'gini', 'm...",0.873389
28,"{'class_weight': None, 'criterion': 'gini', 'm...",0.873389
12,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872614
13,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872614
14,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872614
15,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872614
27,"{'class_weight': None, 'criterion': 'gini', 'm...",0.869437
11,"{'class_weight': 'balanced', 'criterion': 'gin...",0.868662


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [13]:
model_forest = RandomForestClassifier(random_state=21)

param_grid_forest = {
    'max_depth': [1,5,10,20,30,40,45,49],
    'n_estimators' : [5,10,50,100],
    'class_weight': ['balanced', None],
    'criterion':['entropy','gini']
}

In [14]:
grid_search_forest = GridSearchCV(estimator=model_forest, param_grid=param_grid_forest, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search_forest.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


0,1,2
,estimator,RandomForestC...ndom_state=21)
,param_grid,"{'class_weight': ['balanced', None], 'criterion': ['entropy', 'gini'], 'max_depth': [1, 5, ...], 'n_estimators': [5, 10, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,40
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
grid_search_forest.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 40,
 'n_estimators': 50}

In [16]:
pd.DataFrame(grid_search_forest.cv_results_).sort_values(by='rank_test_score', ascending=True)[['params','mean_test_score']]

Unnamed: 0,params,mean_test_score
62,"{'class_weight': 'balanced', 'criterion': 'gin...",0.902666
54,"{'class_weight': 'balanced', 'criterion': 'gin...",0.902666
58,"{'class_weight': 'balanced', 'criterion': 'gin...",0.902666
55,"{'class_weight': 'balanced', 'criterion': 'gin...",0.901082
59,"{'class_weight': 'balanced', 'criterion': 'gin...",0.901082
...,...,...
96,"{'class_weight': None, 'criterion': 'gini', 'm...",0.347312
64,"{'class_weight': None, 'criterion': 'entropy',...",0.345737
33,"{'class_weight': 'balanced', 'criterion': 'gin...",0.330705
0,"{'class_weight': 'balanced', 'criterion': 'ent...",0.249994


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [17]:
param_grid_forest

{'max_depth': [1, 5, 10, 20, 30, 40, 45, 49],
 'n_estimators': [5, 10, 50, 100],
 'class_weight': ['balanced', None],
 'criterion': ['entropy', 'gini']}

In [18]:
results = []
for n_estimators in tqdm(param_grid_forest['n_estimators'], desc="n_estimators"):
    for max_depth in param_grid_forest['max_depth']:
        for class_weight in param_grid_forest['class_weight']:
            for criterion in param_grid_forest['criterion']:
                
                model = RandomForestClassifier(
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    class_weight=class_weight,
                    random_state=21,
                    n_jobs=-1 
                )
                
                scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
                
                results.append({
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'class_weight': class_weight,
                    'min_samples_leaf': criterion,
                    'mean_accuracy': np.mean(scores),
                    'std_accuracy': np.std(scores)
                })

n_estimators: 100%|██████████| 4/4 [00:46<00:00, 11.65s/it]


In [19]:
results_df = pd.DataFrame(results)
results_df.sort_values(by='mean_accuracy', ascending=False)

Unnamed: 0,n_estimators,max_depth,class_weight,min_samples_leaf,mean_accuracy,std_accuracy
92,50,49,balanced,entropy,0.902666,0.021584
93,50,49,balanced,gini,0.902666,0.021584
85,50,40,balanced,gini,0.902666,0.021584
84,50,40,balanced,entropy,0.902666,0.021584
89,50,45,balanced,gini,0.902666,0.021584
...,...,...,...,...,...,...
3,5,1,,gini,0.347312,0.014034
33,10,1,balanced,gini,0.330705,0.048310
32,10,1,balanced,entropy,0.330705,0.048310
0,5,1,balanced,entropy,0.236564,0.016674


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [20]:
model_best = RandomForestClassifier(class_weight=None, criterion='gini', max_depth=40, n_estimators=100, random_state=21)
model_best.fit(X_test, y_test)
accuracy_score(model_best.predict(X_test),y_test)

1.0