# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [3]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## 1. Preprocessing

1. Прочитайте файл [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). Он похож на файл из предыдущего упражнения, но на этот раз мы не масштабировали непрерывные признаки (мы больше не будем использовать logreg).
2. Используя `train_test_split` с параметрами `test_size=0.2`, `random_state=21`, получаем `X_train`, `y_train`, `X_test`, `y_test`. Используйте дополнительный параметр `stratify`.

In [5]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df2 = pd.read_csv('../data/dayofweek.csv')
df['dayofweek'] = df2['dayofweek']

x = df.drop(['dayofweek'], axis=1)
y = df['dayofweek']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21, stratify=y)

## 2. SVM gridsearch

1. Используя `GridSearchCV`, попробуйте различные параметры ядра (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`), используйте `random_state=21` и `probability=True` и получите наилучшую комбинацию из них с точки зрения точности.
2. Создайте фрейм данных из результатов gridsearch и отсортируйте его по возрастанию по `rank_test_score`. Проверьте, велика ли разница между различными комбинациями (иногда более простая модель может дать сопоставимый результат).

In [6]:
model = SVC(random_state=21, probability=True)

param_grid = {'kernel': ['linear', 'rbf', 'sigmoid'],
            'C': [0.01, 0.1, 1, 1.5, 5, 10],
            'gamma': ['scale', 'auto'],
            'class_weight': ['balanced', None]}

gscv = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    scoring='accuracy',
                    n_jobs=-1)
gscv.fit(x_train, y_train)

print(f'Best params {gscv.best_params_}')
print(f'Best score {gscv.best_score_}')

params = pd.DataFrame(gscv.cv_results_)
params = params.sort_values('rank_test_score', ascending=True)
params

Best params {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
Best score 0.8761090458488228


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
70,0.579127,0.004651,0.033142,0.000919,10.0,,auto,rbf,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.900000,0.848148,0.885185,0.884758,0.862454,0.876109,0.018419,1
64,0.582633,0.005475,0.032866,0.000299,10.0,balanced,auto,rbf,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.877778,0.851852,0.862963,0.873606,0.851301,0.863500,0.010870,2
58,0.571245,0.033564,0.036928,0.002434,5.0,,auto,rbf,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.825926,0.811111,0.818519,0.821561,0.802974,0.816018,0.008116,3
52,0.633290,0.053589,0.040646,0.003659,5.0,balanced,auto,rbf,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.844444,0.785185,0.792593,0.817844,0.802974,0.808608,0.021007,4
60,49.768416,4.199656,0.009930,0.000610,10.0,balanced,scale,linear,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.729630,0.700000,0.755556,0.754647,0.665428,0.721052,0.034438,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,0.665772,0.056007,0.020738,0.000825,5.0,balanced,auto,sigmoid,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.144444,0.148148,0.137037,0.126394,0.092937,0.129792,0.019869,68
65,0.583730,0.012254,0.019202,0.000540,10.0,balanced,auto,sigmoid,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.122222,0.140741,0.129630,0.100372,0.085502,0.115693,0.020052,69
41,0.693456,0.007474,0.019468,0.000254,1.5,balanced,auto,sigmoid,"{'C': 1.5, 'class_weight': 'balanced', 'gamma'...",0.066667,0.085185,0.081481,0.078067,0.085502,0.079380,0.006913,70
17,0.764722,0.052643,0.026423,0.010738,0.1,balanced,auto,sigmoid,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.062963,0.066667,0.062963,0.059480,0.059480,0.062310,0.002678,71


## 3. Decision tree

1. Используя `GridSearchCV`, попробуйте различные параметры `max_depth` (от `1` до `49`), `class_weight` (`balanced`, `None`) и `criterion` (`entropy` и `gini`) и получите наилучшую комбинацию из них с точки зрения точности. Используйте `random_state=21`.
2. Создайте кадр данных из результатов gridsearch и отсортируйте его по возрастанию по `rank_test_score`, проверьте, есть ли большая разница между различными комбинациями (иногда более простая модель может дать сопоставимый результат).

In [7]:
model = DecisionTreeClassifier(random_state=21)

param_grid = {'max_depth': np.arange(1, 49, 1),
                'class_weight': ['balanced', None],
                'criterion': ['entropy', 'gini']}

gscv = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    scoring='accuracy',
                    n_jobs=-1)
gscv.fit(x_train, y_train)

print(f'Best params {gscv.best_params_}')
print(f'Best score {gscv.best_score_}')

params = pd.DataFrame(gscv.cv_results_)
params = params.sort_values('rank_test_score', ascending=True)
params

Best params {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': np.int64(22)}
Best score 0.8731212997384002


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
69,0.007110,0.000568,0.002318,0.000117,balanced,gini,22,"{'class_weight': 'balanced', 'criterion': 'gin...",0.885185,0.862963,0.903704,0.881041,0.832714,0.873121,0.023998,1
68,0.007086,0.000714,0.002315,0.000173,balanced,gini,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.859259,0.903704,0.884758,0.828996,0.873121,0.026300,2
80,0.007002,0.000273,0.002473,0.000210,balanced,gini,33,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.832714,0.873116,0.023911,3
81,0.007280,0.000482,0.002313,0.000064,balanced,gini,34,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.832714,0.873116,0.023911,3
84,0.008235,0.002590,0.002442,0.000548,balanced,gini,37,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.832714,0.873116,0.023911,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,0.005102,0.000741,0.002922,0.000735,balanced,gini,3,"{'class_weight': 'balanced', 'criterion': 'gin...",0.388889,0.303704,0.403704,0.427509,0.345725,0.373906,0.044064,188
96,0.003809,0.000488,0.002414,0.000324,,entropy,1,"{'class_weight': None, 'criterion': 'entropy',...",0.370370,0.351852,0.359259,0.353160,0.342007,0.355330,0.009338,189
144,0.003579,0.000261,0.002408,0.000251,,gini,1,"{'class_weight': None, 'criterion': 'gini', 'm...",0.370370,0.351852,0.359259,0.353160,0.342007,0.355330,0.009338,189
0,0.005176,0.000555,0.002571,0.000561,balanced,entropy,1,"{'class_weight': 'balanced', 'criterion': 'ent...",0.262963,0.318519,0.266667,0.323420,0.260223,0.286358,0.028376,191


## 4. Random forest

1. Используя `GridSearchCV`, попробуйте различные параметры `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (от `1` до `49`), `class_weight` (`balanced`, `None`) и `criterion` (`entropy` и `gini`) и получите наилучшую комбинацию из них с точки зрения точности. Используйте random_state=21.
2. Создайте кадр данных из результатов gridsearch и отсортируйте его по возрастанию по `rank_test_score`, проверьте, есть ли большая разница между различными комбинациями (иногда более простая модель может дать сопоставимый результат).

In [8]:
model = RandomForestClassifier(random_state=21)

param_grid = {'n_estimators': [5, 10, 50, 100],
                'max_depth': np.arange(1, 49, 1),
                'class_weight': ['balanced', None],
                'criterion': ['entropy', 'gini']}

gscv = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    scoring='accuracy',
                    n_jobs=-1)
gscv.fit(x_train, y_train)

print(f'Best params {gscv.best_params_}')
print(f'Best score {gscv.best_score_}')

params = pd.DataFrame(gscv.cv_results_)
params = params.sort_values('rank_test_score', ascending=True)
params

Best params {'class_weight': None, 'criterion': 'gini', 'max_depth': np.int64(28), 'n_estimators': 50}
Best score 0.9042902381935839


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
686,0.139923,0.014801,0.010443,0.001515,,gini,28,50,"{'class_weight': None, 'criterion': 'gini', 'm...",0.922222,0.900000,0.907407,0.903346,0.888476,0.904290,0.010961,1
699,0.277447,0.019353,0.015236,0.000752,,gini,31,100,"{'class_weight': None, 'criterion': 'gini', 'm...",0.918519,0.911111,0.900000,0.910781,0.877323,0.903547,0.014380,2
310,0.127478,0.005379,0.008585,0.000095,balanced,gini,30,50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922222,0.907407,0.881481,0.907063,0.895911,0.902817,0.013554,3
326,0.129620,0.001969,0.009392,0.001230,balanced,gini,34,50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922222,0.907407,0.892593,0.907063,0.884758,0.902809,0.013010,4
727,0.301143,0.028789,0.016234,0.000348,,gini,38,100,"{'class_weight': None, 'criterion': 'gini', 'm...",0.914815,0.911111,0.900000,0.903346,0.884758,0.902806,0.010460,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,0.012225,0.001106,0.002936,0.000037,,entropy,1,5,"{'class_weight': None, 'criterion': 'entropy',...",0.355556,0.366667,0.374074,0.345725,0.327138,0.353832,0.016467,764
4,0.012045,0.000279,0.003001,0.000116,balanced,entropy,2,5,"{'class_weight': 'balanced', 'criterion': 'ent...",0.318519,0.366667,0.381481,0.353160,0.345725,0.353110,0.021165,765
196,0.012033,0.000258,0.002902,0.000013,balanced,gini,2,5,"{'class_weight': 'balanced', 'criterion': 'gin...",0.311111,0.377778,0.377778,0.353160,0.312268,0.346419,0.029749,766
192,0.013408,0.001598,0.003104,0.000182,balanced,gini,1,5,"{'class_weight': 'balanced', 'criterion': 'gin...",0.262963,0.292593,0.285185,0.282528,0.293680,0.283390,0.011062,767


## 5. Progress bar

Поиск решетки может быть довольно долгим процессом, и вы можете задаться вопросом, когда же он закончится.
1. Создайте ручной gridsearch для тех же значений параметров random forest, итерируя список возможных значений и вычисляя `cross_val_score` для каждой комбинации. Попробуйте увеличить `n_jobs`. Значение `cv` для `cross_val_score` равно 5.
2. Отслеживайте прогресс, используя библиотеку `tqdm.notebook`.
3. Создайте фрейм данных из результатов gridsearch со столбцами, соответствующими названиям параметров, а также `mean_accuracy` и `std_accuracy`.
4. Отсортируйте его по убыванию `mean_accuracy`, проверьте, есть ли большая разница между различными комбинациями (иногда более простая модель может дать сопоставимый результат).

In [9]:
def gridsearch(param_grid, model_class):
    grid = list(ParameterGrid(param_grid))
    result = pd.DataFrame()

    for params in tqdm(grid):
        row = {}
        model = model_class(**params)
        score = cross_val_score(model, x_train, y_train, cv=5, n_jobs=-1)
        row = {**params, 'mean_accuracy': np.mean(score), 'std_accuracy': np.std(score)}
        result = pd.concat([result, pd.DataFrame([row])], ignore_index=True)

    result = result.sort_values('mean_accuracy', ascending=False)
    return result

In [10]:
model = RandomForestClassifier
param_grid = {'n_estimators': [5, 10, 50, 100],
                'max_depth': np.arange(1, 49, 1),
                'class_weight': ['balanced', None],
                'criterion': ['entropy', 'gini'],
                'random_state': [21]}

result = gridsearch(param_grid, model)
result

  0%|          | 0/768 [00:00<?, ?it/s]

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,random_state,mean_accuracy,std_accuracy
686,,gini,28,50,21,0.904290,0.010961
699,,gini,31,100,21,0.903547,0.014380
310,balanced,gini,30,50,21,0.902817,0.013554
326,balanced,gini,34,50,21,0.902809,0.013010
751,,gini,44,100,21,0.902806,0.010460
...,...,...,...,...,...,...,...
384,,entropy,1,5,21,0.353832,0.016467
4,balanced,entropy,2,5,21,0.353110,0.021165
196,balanced,gini,2,5,21,0.346419,0.029749
192,balanced,gini,1,5,21,0.283390,0.011062


## 6. Predictions

1. Выберите лучшую модель и используйте ее для прогнозирования тестового набора данных.
2. Рассчитайте итоговую точность.

In [11]:
best_params = {'class_weight': None, 'criterion': 'gini', 'max_depth': np.int64(28), 'n_estimators': 50}
model = RandomForestClassifier(random_state=21, **best_params)
model.fit(x_train, y_train)
print(f'Accuracy = {accuracy_score(y_test, model.predict(x_test))}')

Accuracy = 0.9289940828402367
