# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from tqdm.notebook import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [30]:
df_tem = pd.read_csv("../data/day-of-week-not-scaled.csv")
df_tem.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [31]:
df = pd.read_csv("../data/dayofweek-not-scaled.csv")
df.drop(['dayofweek'], axis=1).equals(df_tem)

True

In [32]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [33]:
def gridSearch(X_train, y_train, model, param):
    # Set up GridSearchCV
    skf = StratifiedKFold(n_splits=2)
    grid_search = GridSearchCV(estimator=model, param_grid=param, scoring='accuracy', cv=skf, return_train_score=True,
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    # Get the best parameters and score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    results_df = pd.DataFrame(grid_search.cv_results_)

    df_sorted = results_df.sort_values(by='rank_test_score')

    return best_params, best_score, df_sorted

In [34]:
param_grid = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None],
    'probability': [True],
    'random_state': [21]
}
# Create an SVM model
svm = SVC()
best_params, best_score, df_sorted = gridSearch(X_train, y_train, svm, param_grid)

In [35]:
print(f'лучшие параметры: {best_params}')
print(f'лучший score: {best_score}')

лучшие параметры: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
лучший score: 0.8093471810089021


In [36]:
df_sorted.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,param_probability,param_random_state,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
70,0.936716,0.028144,0.163158,0.092329,10.0,,auto,rbf,True,21,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.805638,0.813056,0.809347,0.003709,1,0.94362,0.937685,0.940653,0.002967
64,0.332627,0.011625,0.083326,0.001984,10.0,balanced,auto,rbf,True,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.801187,0.807122,0.804154,0.002967,2,0.928783,0.937685,0.933234,0.004451
58,0.265072,0.005815,0.079665,0.003173,5.0,,auto,rbf,True,21,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.750742,0.744807,0.747774,0.002967,3,0.873887,0.866469,0.870178,0.003709
52,0.266382,0.006601,0.080214,0.001,5.0,balanced,auto,rbf,True,21,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.732938,0.74184,0.737389,0.004451,4,0.863501,0.87092,0.867211,0.003709
60,25.252386,0.761229,0.0081,0.00449,10.0,balanced,scale,linear,True,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.712166,0.70178,0.706973,0.005193,5,0.783383,0.795252,0.789318,0.005935


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [37]:
param_grid = {
    'max_depth': range(1, 50),
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy'],
    'random_state': [21]
}
dt = DecisionTreeClassifier()

best_params_dt, best_score_dt, df_sorted_dt = gridSearch(X_train, y_train, dt, param_grid)

In [38]:
print(f'лучшие параметры: {best_params_dt}')
print(f'лучший score: {best_score_dt}')

лучшие параметры: {'class_weight': None, 'criterion': 'gini', 'max_depth': 17, 'random_state': 21}
лучший score: 0.8293768545994065


In [39]:
df_sorted_dt.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_random_state,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
16,0.00485,0.000896,0.0015,0.0005,,gini,17,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.832344,0.826409,0.829377,0.002967,1,0.98368,0.98368,0.98368,0.0
24,0.005646,0.0014,0.001263,0.001263,,gini,25,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.830861,0.826409,0.828635,0.002226,2,1.0,1.0,1.0,0.0
26,0.007562,0.002008,0.000977,0.000977,,gini,27,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.830861,0.826409,0.828635,0.002226,2,1.0,1.0,1.0,0.0
25,0.003723,0.003723,0.003839,0.001715,,gini,26,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.830861,0.826409,0.828635,0.002226,2,1.0,1.0,1.0,0.0
28,0.006631,0.00195,0.0,0.0,,gini,29,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.830861,0.826409,0.828635,0.002226,2,1.0,1.0,1.0,0.0


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [40]:
param_grid_rfc = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': range(1, 50),
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy'],
    'random_state': [21]
}
rfc = RandomForestClassifier()

best_params_rfc, best_score_rfc, df_sorted_rfc = gridSearch(X_train, y_train, rfc, param_grid_rfc)

In [41]:
print(f'лучшие параметры: {best_params_rfc}')
print(f'лучший score: {best_score_rfc}')

лучшие параметры: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 26, 'n_estimators': 100, 'random_state': 21}
лучший score: 0.8768545994065282


In [42]:
df_sorted_rfc.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
691,0.303668,0.026631,0.022032,0.006,balanced,entropy,26,100,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.885757,0.867953,0.876855,0.008902,1,1.0,1.0,1.0,0.0
327,0.272512,0.003492,0.019763,0.001092,,entropy,33,100,21,"{'class_weight': None, 'criterion': 'entropy',...",0.884273,0.864985,0.874629,0.009644,2,1.0,1.0,1.0,0.0
307,0.258247,0.015835,0.016147,0.000217,,entropy,28,100,21,"{'class_weight': None, 'criterion': 'entropy',...",0.884273,0.864985,0.874629,0.009644,2,1.0,1.0,1.0,0.0
311,0.236726,0.004545,0.017399,0.00325,,entropy,29,100,21,"{'class_weight': None, 'criterion': 'entropy',...",0.884273,0.864985,0.874629,0.009644,2,1.0,1.0,1.0,0.0
315,0.252776,0.00673,0.019837,0.000324,,entropy,30,100,21,"{'class_weight': None, 'criterion': 'entropy',...",0.882789,0.864985,0.873887,0.008902,5,1.0,1.0,1.0,0.0


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [43]:
grid = list(ParameterGrid(param_grid_rfc))
data = []

for params in tqdm(grid):
    d = {}
    estimator = RandomForestClassifier(**params)
    sc = cross_val_score(estimator, X_train, y_train, cv=5, n_jobs=1)
    d = {**params, 'mean_accuracy': np.mean(sc), 'std_accuracy': np.std(sc)}
    data.append(d)

  0%|          | 0/784 [00:00<?, ?it/s]

In [44]:
result = pd.DataFrame(data)
result = result.sort_values('mean_accuracy', ascending=False)
result

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,random_state,mean_accuracy,std_accuracy
110,,gini,28,50,21,0.904290,0.010961
123,,gini,31,100,21,0.903547,0.014380
510,balanced,gini,30,50,21,0.902817,0.013554
526,balanced,gini,34,50,21,0.902809,0.013010
114,,gini,29,50,21,0.902806,0.011698
...,...,...,...,...,...,...,...
196,,entropy,1,5,21,0.353832,0.016467
592,balanced,entropy,2,5,21,0.353110,0.021165
396,balanced,gini,2,5,21,0.346419,0.029749
392,balanced,gini,1,5,21,0.283390,0.011062


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [45]:
model = RandomForestClassifier(
                max_depth=28,
                n_estimators=50,
                class_weight=None,
                criterion='entropy',
                random_state=21,
                n_jobs=-1
            )
model.fit(X_train, y_train)
predict = model.predict(X_test)
accuracy_score(y_test, predict)

0.9349112426035503