In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint

# The non-tuned parameters dict used to initialize the model.
#   params ={'random_state': 2,
#            'class_weight': "balanced"}

# When isRandom is False (exhaustive grid search), the dictionary(ies) must associate lists of values to each hyperparameter.
# All combinations of these hyperparameter values are exhaustively tested in the cross validation process.
#   exhaustive_hyperparams = {'max_leaf_nodes': list(range(45, 60)),
#          'max_depth': [112, 114, 118, 119] }

# When isRandom is True (randomized search, the default), the dictionary(ies) must associate scipy.stats distributions or lists of values to each hyperparameter.
# These distributions/lists are sampled in the cross validation process.
#   hyperparams = {'max_leaf_nodes': randint(10,100),
#            'max_depth': randint(10, 100) }

def getValidHyperparams(X_train, y_train, modelConstructor, params, hyperparams, isRandom=True,
                        cv=4, scoring='f1_micro', n_jobs=1, n_iter=10, random_state=None):
  """Run a cross-validated hyperparameter search and return the best combination found.

  Args:
    X_train: training features, forwarded unchanged to the search's ``fit``.
    y_train: training targets, forwarded unchanged to the search's ``fit``.
    modelConstructor: callable that builds the estimator; invoked as
      ``modelConstructor(**params)``.
    params: dict of fixed (non-tuned) keyword arguments for the estimator.
    hyperparams: the search space. With ``isRandom=True`` it is handed to
      ``RandomizedSearchCV``; with ``isRandom=False`` it is handed to
      ``GridSearchCV``.
    isRandom: selects randomized search (True, default) or grid search (False).
    cv: cross-validation setting forwarded to the search (default 4). It may
      also be a CV splitter object for more control over the splits, see
      https://scikit-learn.org/stable/glossary.html#term-CV-splitter
    scoring: scoring setting forwarded to the search (default ``'f1_micro'``).
    n_jobs: parallelism setting forwarded to the search (default 1).
    n_iter: number of sampled candidates; only used when ``isRandom`` is True.
    random_state: seed for the candidate sampling; only used when ``isRandom``
      is True. Leave as None for the unseeded behaviour, or pass an int to
      make the randomized search reproducible.

  Returns:
    The ``best_params_`` attribute of the object returned by the search's
    ``fit`` call. The same value is also printed.
  """
  model = modelConstructor(**params)

  # Settings shared by both search strategies, kept in one place so the two
  # branches cannot drift apart.
  shared_settings = dict(cv=cv, scoring=scoring, n_jobs=n_jobs, return_train_score=True)

  if isRandom:
    # n_iter / random_state only make sense for sampling-based search.
    model_cv = RandomizedSearchCV(model, hyperparams, n_iter=n_iter,
                                  random_state=random_state, **shared_settings)
  else:
    model_cv = GridSearchCV(model, hyperparams, **shared_settings)

  fitted_search = model_cv.fit(X_train, y_train)
  best_params = fitted_search.best_params_

  print(best_params)
  return best_params

In [None]:
# Example of how it can be used. Copy this cell and the one above into the main ipynb to test. This can run for quite a while (20 min+).
from sklearn.ensemble import RandomForestClassifier 

# Fixed (non-tuned) constructor arguments, used both for the search estimator
# and for the final model below.
params = {'random_state': 2,
         'class_weight': "balanced"}
# Search space: integer distributions for the tree-size limits and a list of
# candidate split criteria.
hyperparams = {'max_leaf_nodes': randint(10,100),
              'max_depth': randint(10, 100),
              'criterion': ['gini', 'entropy']}


# The search space must be passed as `hyperparams` (the parameter name declared
# by getValidHyperparams); any other keyword raises a TypeError.
# X_train / y_train are expected to already exist in the notebook this cell is
# copied into.
val_hyperparams = getValidHyperparams(X_train, y_train, RandomForestClassifier, params, hyperparams=hyperparams)
# Build the final model from the selected hyperparameters plus the fixed
# parameters, then fit it on the training data.
RF = RandomForestClassifier(**val_hyperparams, **params)
RF.fit(X_train, y_train)
