<h1>Grid Search Decision Tree & Random Forest</h1>

<h3>A - Prepare for Grid Search</h3>

In [1]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
challenge_set_df = pd.read_csv('../data/processed_encoded_challenge_set_df.csv')

# Pick up features from dataset
features = challenge_set_df[['adep', 'country_code_adep', 'ades', 
                             'country_code_ades', 'aircraft_type', 
                             'wtc', 'airline', 'flight_duration', 
                             'taxiout_time', 'flown_distance']]

# Pick up target from dataset
target = challenge_set_df['tow']

# Function to score model using Root Mean Square Error
def rmse(y_true, y_pred):
    return np.sqrt(((y_true - y_pred) ** 2).mean())
    
# Create a scorer
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Make cross validation strategy
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=123)

# Function to print the best parameters and best RMSE score after tunning
def print_model_score(grid_search):
    print("Best Parameters:", grid_search.best_params_)
    print("Best Score (Negative Mean Squared Error):", grid_search.best_score_)

<h3>B - Grid Search Decision Tree</h3>

In [3]:
param_grid = {
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [1, 2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3],
    'max_features': [None, 'sqrt'],
    'criterion': ['squared_error', 'friedman_mse'],
    'ccp_alpha': [0, 0.001, 0.005]
}

# Set up GridSearchCV with custom scoring
grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=123), 
    param_grid=param_grid, 
    scoring=rmse_scorer, 
    cv=cv_strategy, 
    n_jobs=-1, 
    verbose=1
)

grid_search.fit(features, target)

print_model_score(grid_search)

Fitting 5 folds for each of 900 candidates, totalling 4500 fits
Best Parameters: {'ccp_alpha': 0, 'criterion': 'squared_error', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 8}
Best Score (Negative Mean Squared Error): -4461.064639174912


<h3>C - Grid Search Random Forest</h3>

In [5]:
param_grid = {
    'ccp_alpha': [0], 
    'criterion': ['squared_error'], 
    'max_depth': [10], 
    'max_features': [None], 
    'min_samples_leaf': [1], 
    'min_samples_split': [8],
    'n_estimators': [20, 40, 60, 80, 100],
    'bootstrap': [True, False]
}

# Set up GridSearchCV with custom scoring
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=123), 
    param_grid=param_grid, 
    scoring=rmse_scorer, 
    cv=cv_strategy, 
    n_jobs=-1, 
    verbose=1
)

grid_search.fit(features, target)

print_model_score(grid_search)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'bootstrap': True, 'ccp_alpha': 0, 'criterion': 'squared_error', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 100}
Best Score (Negative Mean Squared Error): -4360.1402562306375
