# Spaceship. Part 5.
## Hyperparameters tuning 

We'll load our prepared in Part 4 data, as well as the scores DataFrame, and set random seed:

In [1]:
# Random seed for reproducibility
SEED = 123

import pandas as pd

train = pd.read_csv('04_train_prepared.csv', index_col=0)
test =  pd.read_csv('04_test_prepared.csv', index_col=0)
scores_df = pd.read_csv('04_scores_df.csv', index_col=0)

Next, we'll create train_evaluate function, that will return average cross-validation ROC AUC score for a given set of parameters:

In [2]:
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

def train_evaluate(params):
    '''
    This function takes a train set, as well as a classifier for cross validation and parameters for that classifier.
    
    It returns average cross-validated ROC AUC score.
    '''

    # Set parameters for the model
    model.set_params(**params)

    
    # Create a StratifiedKFold object (6 splits with equal proportion of positive target values)
    skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=SEED)
    
    # An empty list for collecting scores
    train_roc_auc_scores = []
    
    # Iterate through folds
    for train_index, cv_index in skf.split(train.drop('Transported', axis=1), train['Transported']):
        # Obtain training and testing folds
        cv_train, cv_test = train.iloc[train_index], train.iloc[cv_index]
        
        # Fit the model
        model.fit(cv_train.drop('Transported', axis=1), cv_train['Transported']) 
        
        # Calculate ROC AUC score and append to the scores lists
        train_pred_proba = model.predict_proba(cv_train.drop('Transported', axis=1))[:, 1]
        train_roc_auc_scores.append(roc_auc_score(cv_train['Transported'], train_pred_proba))
        
    return np.mean(train_roc_auc_scores)
        

We'll also define our classifier. We'll put n_estimators to 100 for speed. Greater number may increase performance.

In [3]:
# Prepare our best model for training
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=SEED,
                               n_estimators= 100,
                               n_jobs=-1
                               )

print(model)

RandomForestClassifier(n_jobs=-1, random_state=123)


We'll use Optuna for our tuning. For this, we'll need to create a study:

In [4]:
import optuna

study = optuna.create_study(study_name='04_RF', direction='maximize')

[I 2023-07-28 12:22:31,960] A new study created in memory with name: 04_RF


Next, we need to define objective function to optimize, which contains range of parameters for search. We'll need to redefine this function in order to change the ranges of search.

In [5]:
def objective(trial):
    params = {
        # 'n_estimators': optuna.distributions.IntDistribution(100, 1000),
        # 'criterion': optuna.distributions.CategoricalDistribution(['log_loss', 'entropy']),
        'criterion': trial.suggest_categorical('criterion', ['log_loss', 'gini']),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'max_features': trial.suggest_int('max_features', 1, 15),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20, 80),
        "min_impurity_decrease": trial.suggest_float("min_impurity_decrease", 1e-7, 5e-7, log=True),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 20),
        'ccp_alpha': trial.suggest_float('ccp_alpha', 0, 0.4),
        'max_samples': trial.suggest_float('max_samples', 0.5, 1)
             
         }
    return train_evaluate(train)

Now we can run our study. Let's run it for ~1 minute (60 seconds):

In [6]:
study.optimize(objective, timeout=60, n_jobs=-1)

[W 2023-07-28 12:22:31,987] Trial 0 failed with parameters: {'criterion': 'log_loss', 'max_depth': 14, 'max_features': 11, 'max_leaf_nodes': 27, 'min_impurity_decrease': 1.5048661553436468e-07, 'min_samples_leaf': 3, 'ccp_alpha': 0.285561258444964, 'max_samples': 0.6457625368080321} because of the following error: ValueError("Invalid parameter 'Age' for estimator RandomForestClassifier(n_jobs=-1, random_state=123). Valid parameters are: ['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].").
Traceback (most recent call last):
  File "C:\Users\mikej\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\mikej\AppData\Local\Temp\ipykernel_31152\189726306.py", line 15, in object

[W 2023-07-28 12:22:32,001] Trial 5 failed with parameters: {'criterion': 'log_loss', 'max_depth': 8, 'max_features': 1, 'max_leaf_nodes': 56, 'min_impurity_decrease': 1.1846632171004268e-07, 'min_samples_leaf': 15, 'ccp_alpha': 0.3983466969012843, 'max_samples': 0.8369088475717932} because of the following error: ValueError("Invalid parameter 'Age' for estimator RandomForestClassifier(n_jobs=-1, random_state=123). Valid parameters are: ['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].").
Traceback (most recent call last):
  File "C:\Users\mikej\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\mikej\AppData\Local\Temp\ipykernel_31152\189726306.py", line 15, in object

[W 2023-07-28 12:22:32,013] Trial 10 failed with parameters: {'criterion': 'log_loss', 'max_depth': 7, 'max_features': 14, 'max_leaf_nodes': 61, 'min_impurity_decrease': 3.615880523510054e-07, 'min_samples_leaf': 10, 'ccp_alpha': 0.15844399221018102, 'max_samples': 0.8984496320239985} because of the following error: ValueError("Invalid parameter 'Age' for estimator RandomForestClassifier(n_jobs=-1, random_state=123). Valid parameters are: ['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].").
Traceback (most recent call last):
  File "C:\Users\mikej\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\mikej\AppData\Local\Temp\ipykernel_31152\189726306.py", line 15, in obje

ValueError: Invalid parameter 'Age' for estimator RandomForestClassifier(n_jobs=-1, random_state=123). Valid parameters are: ['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].