# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tqdm.notebook import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore). Don't forget to enrich the table with the 'dayofweek' column from the previous day's .csv-file.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
data_scaled = pd.read_csv("../data/dayofweek.csv")
df["dayofweek"] = data_scaled["dayofweek"]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1686 entries, 0 to 1685
Data columns (total 44 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   numTrials         1686 non-null   int64  
 1   hour              1686 non-null   int64  
 2   uid_user_0        1686 non-null   float64
 3   uid_user_1        1686 non-null   float64
 4   uid_user_10       1686 non-null   float64
 5   uid_user_11       1686 non-null   float64
 6   uid_user_12       1686 non-null   float64
 7   uid_user_13       1686 non-null   float64
 8   uid_user_14       1686 non-null   float64
 9   uid_user_15       1686 non-null   float64
 10  uid_user_16       1686 non-null   float64
 11  uid_user_17       1686 non-null   float64
 12  uid_user_18       1686 non-null   float64
 13  uid_user_19       1686 non-null   float64
 14  uid_user_2        1686 non-null   float64
 15  uid_user_20       1686 non-null   float64
 16  uid_user_21       1686 non-null   float64


In [3]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21, stratify=y, test_size=0.2)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [4]:
svc = SVC(random_state=21, probability=True)

param_grid = {
    'C' : [0.01, 0.1, 1, 1.5, 5, 10],
    'kernel' : ['linear', 'rbf', 'sigmoid'],
    'gamma' : ['scale', 'auto'],
    'class_weight' : ['balanced', None],
    'random_state': [21],
    'probability': [True]
}
gcv = GridSearchCV(svc, param_grid, cv=2, scoring='accuracy', n_jobs=-1, verbose=2)
gcv.fit(X_train, y_train)

print(f'лучшие параметры: {gcv.best_params_}')
print(f'лучший score: {gcv.best_score_}')

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   51.3s finished


лучшие параметры: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
лучший score: 0.8093471810089021


In [5]:
results = pd.DataFrame(gcv.cv_results_)
results = results.sort_values('rank_test_score', ascending=True)
key_columns = [
    'rank_test_score', 'mean_test_score', 'std_test_score',
    'param_kernel', 'param_C', 'param_gamma', 'param_class_weight'
]
results[key_columns].head()

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,param_kernel,param_C,param_gamma,param_class_weight
70,1,0.809347,0.003709,rbf,10,auto,
64,2,0.804154,0.002967,rbf,10,auto,balanced
58,3,0.747774,0.002967,rbf,5,auto,
52,4,0.737389,0.004451,rbf,5,auto,balanced
63,5,0.706973,0.005193,linear,10,auto,balanced


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [6]:
dt = DecisionTreeClassifier(random_state=21, max_features=None)

In [7]:
param_grid = {'criterion': ['gini','entropy'],
              'max_depth': list(range(1, 50)),
              'class_weight': ['balanced', None],
              'random_state': [21]
            }

gs = GridSearchCV(dt, param_grid, scoring='accuracy', n_jobs=-1, verbose=2)
gs.fit(X_train, y_train)

print(f'лучшие параметры: {gs.best_params_}')
print(f'лучший score: {gs.best_score_}')

Fitting 5 folds for each of 196 candidates, totalling 980 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.0s


лучшие параметры: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
лучший score: 0.873864794162192


[Parallel(n_jobs=-1)]: Done 837 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 980 out of 980 | elapsed:    1.0s finished


In [8]:
results = pd.DataFrame(gs.cv_results_)
results = results.sort_values('rank_test_score', ascending=True)
columns_to_show = [
    'rank_test_score', 'mean_test_score', 'std_test_score',
    'param_max_depth', 'param_class_weight', 'param_criterion'
]
results[columns_to_show].head()

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,param_max_depth,param_class_weight,param_criterion
20,1,0.873865,0.025066,21,balanced,gini
24,2,0.873854,0.025018,25,balanced,gini
21,3,0.872378,0.025263,22,balanced,gini
30,4,0.872372,0.025179,31,balanced,gini
28,4,0.872372,0.025179,29,balanced,gini


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [9]:
frt = RandomForestClassifier(random_state=21)

In [10]:
param_grid = {'n_estimators': [5, 10, 50, 100],
              'criterion': ['gini','entropy'],
              'max_depth': list(range(1, 50)),
              'class_weight': ['balanced', None],
              'random_state': [21]}

gs = GridSearchCV(frt, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
gs.fit(X_train, y_train)

print(f'лучшие параметры: {gs.best_params_}')
print(f'лучший score: {gs.best_score_}')

Fitting 5 folds for each of 784 candidates, totalling 3920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 1244 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 2376 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done 3836 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 3920 out of 3920 | elapsed:   38.7s finished


лучшие параметры: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 24, 'n_estimators': 100, 'random_state': 21}
лучший score: 0.9042929918766351


In [11]:
results = pd.DataFrame(gs.cv_results_)
results = results.sort_values('rank_test_score', ascending=True)
columns_to_show = [
    'rank_test_score', 'mean_test_score', 'std_test_score',
    'param_max_depth', 'param_class_weight', 'param_criterion'
]
results[columns_to_show].head()

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,param_max_depth,param_class_weight,param_criterion
291,1,0.904293,0.012361,24,balanced,entropy
311,2,0.90429,0.012156,29,balanced,entropy
502,2,0.90429,0.010961,28,,gini
118,4,0.903549,0.012056,30,balanced,gini
515,5,0.903547,0.01438,31,,gini


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [12]:
grid = list(ParameterGrid(param_grid))
print(f'Количество комбинаций суперпараметров: {len(grid)}')

Количество комбинаций суперпараметров: 784


In [13]:
data = []

for params in tqdm(grid, desc='GridSearch Progress'):
    estimator = RandomForestClassifier(**params)
    sc = cross_val_score(estimator, X_train, y_train, cv=5, n_jobs=-1)
    d = {**params, 'mean_accuracy': np.mean(sc), 'std_accuracy': np.std(sc)}
    data.append(d)

GridSearch Progress:   0%|          | 0/784 [00:00<?, ?it/s]

In [14]:
result = pd.DataFrame(data)
result = result.sort_values('mean_accuracy', ascending=False)
result

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,random_state,mean_accuracy,std_accuracy
291,balanced,entropy,24,100,21,0.904293,0.012361
311,balanced,entropy,29,100,21,0.904290,0.012156
502,,gini,28,50,21,0.904290,0.010961
118,balanced,gini,30,50,21,0.903549,0.012056
515,,gini,31,100,21,0.903547,0.014380
...,...,...,...,...,...,...,...
588,,entropy,1,5,21,0.353832,0.016467
200,balanced,entropy,2,5,21,0.353110,0.021165
4,balanced,gini,2,5,21,0.346419,0.029749
0,balanced,gini,1,5,21,0.283390,0.011062


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [15]:
rfc = RandomForestClassifier(n_estimators=100, max_depth=24, criterion='entropy', class_weight="balanced", random_state=21)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [16]:
accuracy_score(y_test, y_pred)

0.9260355029585798