#### Vary hyperparameters RandomizedSearchCV and GridSearchCV


In [2]:
import sys
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

from quant.Classification import Classification
from quant.factor import get_factors

%reload_ext autoreload
%autoreload 2
%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

np.set_printoptions(precision=4, suppress=True)

import warnings
warnings.filterwarnings('ignore')

In [3]:
klines_1h_file = 'data/futures_klines_1h_BTCUSDT_20200101_20220430.zip'
kline_1h_df = pd.read_csv(klines_1h_file, index_col=0, parse_dates=True)
kline_1h_factors = get_factors(kline_1h_df, sign_ratio=1)

In [4]:
X = kline_1h_factors.drop(['Price', 'Returns', 'CumReturns', 'Log_Returns', 'Returns_Ratio', 'Sign'], axis=1)
y = kline_1h_factors['Sign']
X.shape, y.shape

((20217, 18), (20217,))

In [5]:
dt_gs = Classification(X, y)
dt_gs.fit_predict(MinMaxScaler(), DecisionTreeClassifier(random_state=64))
mse, rmse, r2train, r2test = dt_gs.eval_metrics()
mse, rmse, r2train, r2test

(0.49455984174085066, 0.7032494875510757, 1.0, 0.5054401582591493)

In [6]:
# GridSearchCV
param_grid = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [1, 2, 3, 4, 6, 7, 8, 9, 10],
    'classifier__max_features': [1, 2, 3, 4, 6, 7, 8, 9, 10],
    'classifier__min_samples_leaf': [1, 2, 3, 4, 6, 7, 8, 9, 10],
    'classifier__min_samples_split': [1, 2, 3, 4, 6, 7, 8, 9, 10],
}
grid_search = GridSearchCV(dt_gs.pipe, param_grid, scoring='accuracy', n_jobs=-1, cv=5, verbose=1)
grid_search_best_model = grid_search.fit(X, y)

Fitting 5 folds for each of 13122 candidates, totalling 65610 fits


In [7]:
grid_search_best_model.best_params_

{'classifier__criterion': 'gini',
 'classifier__max_depth': 1,
 'classifier__max_features': 6,
 'classifier__min_samples_leaf': 1,
 'classifier__min_samples_split': 2}

In [14]:
grid_search_best_model.best_score_

0.5369744705347289

In [8]:
dt_gs_best = Classification(X, y)
dt_gs_best.fit_predict(
    MinMaxScaler(),
    DecisionTreeClassifier(criterion='gini',
                           max_depth=1,
                           max_features=6,
                           min_samples_leaf=1,
                           min_samples_split=2,
                           random_state=64))
gs_best_mse, gs_best_rmse, gs_best_r2train, gs_best_r2test = dt_gs_best.eval_metrics()
gs_best_mse, gs_best_rmse, gs_best_r2train, gs_best_r2test

(0.4710682492581602,
 0.6863441186884027,
 0.5397267050021641,
 0.5289317507418397)

In [9]:
# RandomizedSearchCV
dt_rs = Classification(X, y)
dt_rs.fit_predict(MinMaxScaler(), DecisionTreeClassifier(random_state=64))

param_grid = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [1, 2, 3, 4, 6, 7, 8, 9, 10],
    'classifier__max_features': [1, 2, 3, 4, 6, 7, 8, 9, 10],
    'classifier__min_samples_leaf': [1, 2, 3, 4, 6, 7, 8, 9, 10],
    'classifier__min_samples_split': [1, 2, 3, 4, 6, 7, 8, 9, 10],
}
rand_search = RandomizedSearchCV(dt_rs.pipe, param_grid, scoring='accuracy', n_jobs=-1, cv=5, verbose=1)
rand_search_best_model = rand_search.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [10]:
rand_search_best_model.best_params_

{'classifier__min_samples_split': 7,
 'classifier__min_samples_leaf': 6,
 'classifier__max_features': 7,
 'classifier__max_depth': 1,
 'classifier__criterion': 'gini'}

In [15]:
rand_search_best_model.best_score_

0.5369744705347289

In [11]:
dt_rs_best = Classification(X, y)
dt_rs_best.fit_predict(
    MinMaxScaler(),
    DecisionTreeClassifier(criterion='gini',
                           max_depth=1,
                           max_features=7,
                           min_samples_leaf=6,
                           min_samples_split=7,
                           random_state=64))
rs_best_mse, rs_best_rmse, rs_best_r2train, rs_best_r2test = dt_rs_best.eval_metrics()
rs_best_mse, rs_best_rmse, rs_best_r2train, rs_best_r2test

(0.4762611275964392, 0.690116749250762, 0.5354603351264453, 0.5237388724035609)

In [16]:
# Result

result = pd.DataFrame()
result = result.append([['criterion', 'gini', 'gini']])
result = result.append([['max_depth', '1', '1']])
result = result.append([['max_features', '6', '7']])
result = result.append([['min_samples_leaf', '1', '6']])
result = result.append([['min_samples_split', '2', '7']])
result = result.append([['best_score_', '0.536974', '0.536974']])

result.columns = ['Hyperparameter', 'GridSearchCV', 'RandomizedSearchCV']
result.reset_index(drop=True, inplace=True)
print(result.to_latex(index=False))
result

\begin{tabular}{lll}
\toprule
   hyperparameter & GridSearchCV & RandomizedSearchCV \\
\midrule
        criterion &         gini &               gini \\
        max\_depth &            1 &                  1 \\
     max\_features &            6 &                  7 \\
 min\_samples\_leaf &            1 &                  6 \\
min\_samples\_split &            2 &                  7 \\
       best score &     0.536974 &           0.536974 \\
\bottomrule
\end{tabular}



Unnamed: 0,hyperparameter,GridSearchCV,RandomizedSearchCV
0,criterion,gini,gini
1,max_depth,1,1
2,max_features,6,7
3,min_samples_leaf,1,6
4,min_samples_split,2,7
5,best score,0.536974,0.536974


In [13]:
result = pd.DataFrame()
result = result.append([['No Hyperparameters', mse, rmse, r2train, r2test]])
result = result.append([['GridSearchCV', gs_best_mse, gs_best_rmse, gs_best_r2train, gs_best_r2test]])
result = result.append([['RandomizedSearchCV', rs_best_mse, rs_best_rmse, rs_best_r2train, rs_best_r2test]])
result.columns = ['Search Type', 'MSE', 'RMSE', 'R2Train', 'R2Test']
result.reset_index(drop=True, inplace=True)
print(result.to_latex(index=False))
result

\begin{tabular}{lrrrr}
\toprule
       Search Type &    MSE &   RMSE &  R2Train &  R2Test \\
\midrule
No Hyperparameters & 0.4946 & 0.7032 &   1.0000 &  0.5054 \\
      GridSearchCV & 0.4711 & 0.6863 &   0.5397 &  0.5289 \\
RandomizedSearchCV & 0.4763 & 0.6901 &   0.5355 &  0.5237 \\
\bottomrule
\end{tabular}



Unnamed: 0,Search Type,MSE,RMSE,R2Train,R2Test
0,No Hyperparameters,0.4946,0.7032,1.0,0.5054
1,GridSearchCV,0.4711,0.6863,0.5397,0.5289
2,RandomizedSearchCV,0.4763,0.6901,0.5355,0.5237


#### Conclusion

From table, GridSearchCV and RandomizedSearchCV provide better parameters for the model. The difference between GridSearchCV and RandomizedSearchCV is very small. We notice that the cost time of GridSearchCV is 3m 50s, much longer than RandomizedSearchCV 1s. So, if the run time is too long,  RandomizedSearchCV is another better solusion.