# Hyperparameter Tuning

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import joblib
import os

In [15]:
# Load the data

mean1 = 55
std_dev = 10
num_samples = 500

column1_numbers = np.random.normal(mean1, std_dev, num_samples)
column1_numbers = np.clip(column1_numbers, 30, 120)
column1_numbers = np.round(column1_numbers).astype(int)

mean2 = 18
std_dev2 = 3

column2_numbers = np.random.normal(mean2, std_dev2, num_samples)
column2_numbers = np.clip(column2_numbers, 12, 26)
column2_numbers = np.round(column2_numbers).astype(int)

column3_numbers = np.random.randint(2, size=num_samples)
column3_numbers[column1_numbers > mean1] = 1

data = {
    'MilesPerWeek': column1_numbers,
    'FarthestRun': column2_numbers,
    'Qualified': column3_numbers
}

df = pd.DataFrame(data)
print(df.head())

   MilesPerWeek  FarthestRun  Qualified
0            44           18          0
1            65           22          1
2            54           20          0
3            73           24          1
4            61           20          1


In [16]:
# Split the data
X = df.drop('Qualified', axis=1)
y = df['Qualified']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# GridSearchCV vs RandomizedSearchCV

GridSearchCV and RandomizedSearchCV are both methods used for hyperparameter tuning in machine learning models, but they differ in how they search the hyperparameter space.  
> GridSearchCV
> - Exhaustive Search: GridSearchCV performs an exhaustive search over a specified parameter grid. It tries every combination of the provided hyperparameter values.
> - Time-Consuming: Because it evaluates all possible combinations, it can be very time-consuming, especially with a large number of hyperparameters and values.
> - Deterministic: The results are deterministic, meaning that running GridSearchCV multiple times with the same data and parameters will yield the same results.

> RandomizedSearchCV
> - Random Search: RandomizedSearchCV samples a fixed number of hyperparameter combinations from a specified distribution. It does not try all possible combinations.
> - Faster: It is generally faster than GridSearchCV because it evaluates only a subset of the possible combinations.
> - Stochastic: The results can vary between runs because it samples hyperparameter combinations randomly.

In [17]:
# create a model
MODEL_NAME = 'models/best_model.pkl'
model = None

if os.path.exists(MODEL_NAME):
    model = joblib.load(MODEL_NAME)

if not model:
    rf = RandomForestClassifier()
    # see documentation of RandomForestClassifier for more options https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    param_grid = [{ 
        'n_estimators': [500, 1000, 1500],
        # 'criterion': ['gini', 'entropy'],
        'min_samples_split': [5, 10, 15],
        'min_samples_leaf': [1, 2, 4],
        # 'max_depth': [10, 20, 30]
    }]
    
    gs = GridSearchCV(estimator=rf,
                      param_grid=param_grid,
                      cv=5,
                      n_jobs=-1,
                      scoring='accuracy',
                      verbose=1)
    gs.fit(X_train, y_train)
    
    params = gs.best_params_
    print('Best params: ', params)
    best_score = gs.best_score_
    print('Best score: ', best_score)
    best_model = gs.best_estimator_
    if not os.path.exists(MODEL_NAME):
        os.makedirs('models', exist_ok=True)
    joblib.dump(best_model, MODEL_NAME)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best params:  {'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 1000}
Best score:  0.7300000000000001


In [20]:
random_param_grid = [{ 
    'n_estimators': [500, 1000, 1500],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'max_depth': [10, 20, 30]
}]

rf = RandomForestClassifier()

rs = RandomizedSearchCV(estimator=rf, 
                        param_distributions=random_param_grid, 
                        cv=5, # 5-fold cross validation
                        n_iter=10, # try 10 random parameter settings
                        n_jobs=-1,
                        scoring='accuracy',
                        verbose=1)

rs.fit(X_train, y_train)

params = rs.best_params_
print('Best params: ', params)
best_score = rs.best_score_
print('Best score: ', best_score)
best_model = rs.best_estimator_
if not os.path.exists(MODEL_NAME):
    os.makedirs('models', exist_ok=True)
joblib.dump(best_model, MODEL_NAME)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best params:  {'n_estimators': 1000, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_depth': 20, 'criterion': 'gini'}
Best score:  0.7275
