# Spaceship. Part 5. (continued)
## Hyperparameters tuning 

Here we continue the hyperparameter search described in ['05_hyperparameters.ipynb'](05_hyperparameters.ipynb).

This notebook can be re-run over and over to continue searching.

Choose running time:

In [171]:
# Time budget for this tuning session, split into its components
HOURS = 0
MINUTES = 20
SECONDS = 0

# Total budget in seconds (used as the Optuna optimisation timeout)
RUNNING_TIME = (HOURS * 60 + MINUTES) * 60 + SECONDS

Let's load our data and our Optuna study, and define all the necessary functions.

To keep the search fast, every trial uses a fixed n_estimators of 100. Greater numbers may increase scores. For the scores table and submissions we'll use 500 estimators.

In [172]:
# Imports: each name is imported exactly once, grouped alphabetically
import joblib
import numpy as np
import optuna
import optuna.visualization as vis
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Random seed for reproducibility
SEED = 123

# Prepared train/test sets, the table of previous scores and the test ids
# (the ids are re-indexed from 0 so they can be concatenated with predictions)
train = pd.read_csv('04_train_prepared.csv', index_col=0)
test = pd.read_csv('04_test_prepared.csv', index_col=0)
scores_df = pd.read_csv('05_scores_df.csv', index_col=0)
test_Ids = pd.read_csv('test_Ids.csv', index_col=0).reset_index(drop=True)

# Encode the target as integers: any truthy value becomes 1, everything else 0
train['Transported'] = [1 if i else 0 for i in train['Transported']]

# Resume the persisted Optuna study and the accumulated running-time counter.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load study files that you created yourself.
study = joblib.load("05_RF.pkl")
total_seconds = pd.read_csv('05_total_seconds.csv', index_col=0)

# State of the search before this session adds new trials
print('Before current session: ')
print("Best trial:", study.best_trial.number)
print("Best average cross-validation ROC AUC:", study.best_trial.value)
print("Best hyperparameters:", study.best_params)


def train_evaluate(params):
    '''
    Cross-validate a RandomForestClassifier configured with `params`.

    Parameters
    ----------
    params : dict
        Hyperparameters applied to the classifier with ``set_params``. They are
        applied on top of the defaults set here (``random_state=SEED``,
        ``n_estimators=100``, ``n_jobs=-1``), so a key given in `params`
        replaces the corresponding default.

    Returns
    -------
    float
        Mean ROC AUC over the validation parts of 6 stratified folds.

    Notes
    -----
    Reads the module-level ``train`` DataFrame (its ``'Transported'`` column is
    the target, all other columns are features) and the module-level ``SEED``.
    '''
    model = RandomForestClassifier(random_state=SEED,
                                   n_estimators=100,
                                   n_jobs=-1)

    # Apply the hyperparameters under evaluation
    model.set_params(**params)

    # Split features and target once instead of re-dropping the column per fold
    features = train.drop('Transported', axis=1)
    target = train['Transported']

    # 6 shuffled splits that keep the proportion of positive targets
    skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=SEED)

    fold_scores = []
    for train_index, cv_index in skf.split(features, target):
        # Fit on the training part of the fold
        model.fit(features.iloc[train_index], target.iloc[train_index])

        # Score the held-out part using the probability of the positive class
        cv_pred_proba = model.predict_proba(features.iloc[cv_index])[:, 1]
        fold_scores.append(roc_auc_score(target.iloc[cv_index], cv_pred_proba))

    return np.mean(fold_scores)
        

Before current session: 
Best trial: 664
Best average cross-validation ROC AUC: 0.8864496687391946
Best hyperparameters: {'criterion': 'log_loss', 'max_depth': 12, 'max_features': 9, 'max_leaf_nodes': 173, 'min_impurity_decrease': 8.171925773743862e-08, 'min_samples_leaf': 2, 'ccp_alpha': 0.0008752962343279396, 'max_samples': 0.9227960316695011}


In [173]:
def objective(trial):
    '''
    Optuna objective: draw one set of random-forest hyperparameters from
    `trial` and return the value `train_evaluate` computes for them.

    n_estimators is not part of the search space here.
    '''
    params = {}

    # Split criterion
    params['criterion'] = trial.suggest_categorical('criterion', ['log_loss', 'gini'])

    # Tree size limits
    params['max_depth'] = trial.suggest_int('max_depth', 2, 50)
    params['max_features'] = trial.suggest_int('max_features', 1, 16)
    params['max_leaf_nodes'] = trial.suggest_int('max_leaf_nodes', 20, 500)

    # Regularisation; the two small-valued floats are sampled on a log scale
    params["min_impurity_decrease"] = trial.suggest_float("min_impurity_decrease", 1e-9, 1e-1, log=True)
    params['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 2, 30)
    params['ccp_alpha'] = trial.suggest_float('ccp_alpha', 1e-7, 4e-1, log=True)

    # Fraction of the training rows drawn for each tree
    params['max_samples'] = trial.suggest_float('max_samples', 0.3, 1)

    return train_evaluate(params)

In [174]:
def get_cv_scores(train, test, model, scores_df, comment = "", verbose=False, prepare_submission=False):
    '''
    Cross-validate `model` on `train`, append the averaged scores to `scores_df`
    and optionally build a submission DataFrame from predictions on `test`.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data; the 'Transported' column is the target, every other
        column is a feature.
    test : pandas.DataFrame
        Features to predict for the submission (only used when
        `prepare_submission` is True).
    model : estimator
        Object providing ``fit``, ``predict``, ``predict_proba`` and ``score``.
        It is fitted in place.
    scores_df : pandas.DataFrame
        Table of previous scores. It is modified in place: one row is appended
        holding, in this order, the comment, mean training ROC AUC, mean
        cross-validation ROC AUC, mean training accuracy, mean cross-validation
        accuracy and NaN (six values, so the table is expected to have six
        columns).
    comment : str, optional
        Text stored in the first column of the new row.
    verbose : bool, optional
        When True, print the updated `scores_df`.
    prepare_submission : bool, optional
        When True, build and return the submission DataFrame.

    Returns
    -------
    pandas.DataFrame or str
        If `prepare_submission` is True: the module-level ``test_Ids`` joined
        column-wise with a 'Transported' column of "True"/"False" strings.
        Otherwise the string "prepare_submission=False".
    '''

    # Split features and target once instead of re-dropping the column per fold
    features = train.drop('Transported', axis=1)
    target = train['Transported']

    # Create a StratifiedKFold object (6 splits with equal proportion of positive target values)
    skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=SEED)

    # Empty lists for collecting scores
    train_roc_auc_scores = []
    cv_roc_auc_scores = []
    train_accuracy_scores = []
    cv_accuracy_scores = []

    # Iterate through folds
    for train_index, cv_index in skf.split(features, target):
        # Obtain training and validation parts of the fold
        X_fit, y_fit = features.iloc[train_index], target.iloc[train_index]
        X_cv, y_cv = features.iloc[cv_index], target.iloc[cv_index]

        # Fit the model
        model.fit(X_fit, y_fit)

        # Calculate scores and append to the scores lists
        train_pred_proba = model.predict_proba(X_fit)[:, 1]
        train_roc_auc_scores.append(roc_auc_score(y_fit, train_pred_proba))
        cv_pred_proba = model.predict_proba(X_cv)[:, 1]
        cv_roc_auc_scores.append(roc_auc_score(y_cv, cv_pred_proba))
        train_accuracy_scores.append(model.score(X_fit, y_fit))
        cv_accuracy_scores.append(model.score(X_cv, y_cv))

    # Pick a row label that is not in use yet. With a 0..n-1 index this is
    # len(scores_df), as before; with any other index (e.g. one starting at 1)
    # it avoids silently overwriting an existing row.
    new_label = len(scores_df)
    while new_label in scores_df.index:
        new_label += 1

    # Update the scores DataFrame with average scores
    scores_df.loc[new_label] = [comment, np.mean(train_roc_auc_scores), np.mean(cv_roc_auc_scores),
                                np.mean(train_accuracy_scores), np.mean(cv_accuracy_scores), np.nan]

    # Print the updated scores DataFrame
    if verbose:
        print(scores_df)

    # Sentinel returned when no submission was requested
    submission = "prepare_submission=False"

    if prepare_submission:
        # After the loop the model is fitted on the last fold's training part
        # only; refit on the whole training set so the submission is predicted
        # by a model that has seen all labelled rows.
        model.fit(features, target)

        # Prepare the submission DataFrame
        test_pred = model.predict(test)
        test_pred = ["True" if i == 1 else "False" for i in test_pred]
        test_pred = pd.DataFrame(test_pred, columns=['Transported'])
        submission = pd.concat([test_Ids, test_pred], axis=1)

    return submission
                         

Now, let's optimize and observe results:

In [None]:
# Resume the search with RUNNING_TIME seconds as the timeout, using parallel jobs
study.optimize(
    objective,
    timeout=RUNNING_TIME,
    n_jobs=-1,
)

Save our study back to the file:

In [None]:
# Add this session's time budget to the accumulated counter, then persist
# both the study and the counter
total_seconds.iloc[0, 0] += RUNNING_TIME
joblib.dump(study, "05_RF.pkl")
total_seconds.to_csv('05_total_seconds.csv')

In [None]:
# Optimization history of the study
optimization_history_plot = vis.plot_optimization_history(
    study,
    error_bar=True,
)
optimization_history_plot.show()

In [None]:
# Hyperparameter importances for the study
param_importance_plot = vis.plot_param_importances(
    study,
)
param_importance_plot.show()

In [None]:
# Contour plot restricted to two of the searched parameters
contour_plot = vis.plot_contour(
    study,
    params=["max_depth", "min_samples_leaf"],
)
contour_plot.show()

In [None]:
# Accumulated tuning time in hours (also reused later as the score-table comment)
total_hours = round(total_seconds.iloc[0, 0] / 3600, 3)

# State of the search after this session
best_trial = study.best_trial
print('After current session: ')
print("Best trial:", best_trial.number)
print("Best average cross-validation ROC AUC:", best_trial.value)
print("Best hyperparameters:", study.best_params)
print("Total running time (hours):", total_hours)

Now, let's test our best model with a greater number of estimators, add its scores to the table with the comment "optuna_(number of hours)", and prepare a submission file:

In [None]:
%%time

# Random forest with the best hyperparameters found so far and 500 trees
model_for_tests = RandomForestClassifier(
    random_state=SEED,
    n_estimators=500,
    n_jobs=-1,
    **study.best_params,
)

print(model_for_tests)

# Cross-validate, append the scores to the table and build the submission
submission = get_cv_scores(
    train,
    test,
    model_for_tests,
    scores_df,
    comment="optuna_{}".format(total_hours),
    prepare_submission=True,
)

scores_df

In [None]:
# FOR SUBMISSION

#submission.to_csv('05_submission_NUMBER.csv', index=False)

#scores_df.loc[13, 'Test accuracy'] = 0.79845

#scores_df