<h1>Preprocess</h1>

In [1]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

<h1>Prepare for Grid Search</h1>

In [2]:
# Regression target: the `tow` column of the challenge set.
# NOTE(review): `challenge_set_df` is not defined in the visible part of this
# notebook (the recorded run of this cell failed with NameError) — a
# data-loading cell has to run before this one.
target = challenge_set_df['tow']

# Model inputs: the seven columns listed below, taken from the same frame.
# NOTE(review): presumably these columns are already numerically encoded before
# being passed to the tree models — confirm.
features = challenge_set_df[
    [
        'adep',
        'country_code_adep',
        'ades',
        'country_code_ades',
        'aircraft_type',
        'wtc',
        'airline',
    ]
]

# Function to score model using Root Mean Square Error
def rmse(y_true, y_pred):
    """Return the root mean squared error between two equally sized sequences.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    # Compare values by position. Converting to NumPy arrays first means plain
    # lists are accepted and two pandas Series with different indexes are not
    # aligned by label (which would produce NaN instead of the error).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(((y_true - y_pred) ** 2).mean())
    
# Cross-validation splitter passed to the grid searches below:
# 5 shuffled folds with a fixed seed so the splits repeat across runs.
cv_strategy = KFold(
    n_splits=5,
    shuffle=True,
    random_state=123,
)

# scikit-learn scorer built from rmse(). greater_is_better=False tells
# scikit-learn that smaller values are better, so reported scores are negated.
rmse_scorer = make_scorer(
    rmse,
    greater_is_better=False,
)

# Function to print the best parameters and best RMSE score after tuning
def print_model_score(grid_search):
    """Print the best parameter set and best score of a fitted search object.

    Parameters
    ----------
    grid_search : object
        Any object exposing ``best_params_`` and ``best_score_``.

    Notes
    -----
    The searches in this notebook score with ``rmse_scorer``, which is built
    with ``greater_is_better=False``; the reported best score is therefore the
    negated root mean squared error, not a mean squared error.
    """
    print("Best Parameters:", grid_search.best_params_)
    print("Best Score (Negative Root Mean Squared Error):", grid_search.best_score_)

NameError: name 'challenge_set_df' is not defined

<h1>Grid Search using DecisionTreeRegressor()</h1>

In [None]:
# Hyper-parameter grid for the decision tree search.
# min_samples_split starts at 2: scikit-learn only accepts integers >= 2 (or a
# float fraction) for this parameter, so a value of 1 makes every fit for that
# candidate fail and score NaN — silently here, because warnings are suppressed
# at the top of the notebook.
param_grid = {
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3],
    'max_features': [None, 'sqrt'],
    'criterion': ['squared_error', 'friedman_mse'],
    'ccp_alpha': [0, 0.001, 0.005]
}

# Set up GridSearchCV with custom scoring (negated RMSE) and the shared
# cross-validation splitter; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=123),
    param_grid=param_grid,
    scoring=rmse_scorer,
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)

# Exhaustively fit every parameter combination on each fold
grid_search.fit(features, target)

print_model_score(grid_search)

<h1>Grid Search using RandomForestRegressor()</h1>

In [None]:
# Grid for the random forest: only the ensemble-level settings vary
# (number of trees, bootstrap on/off); every tree-level setting is pinned to a
# single value.
# NOTE(review): the pinned values are presumably the best ones found by the
# decision tree search — confirm against that cell's output.
param_grid = dict(
    n_estimators=[20, 40, 60, 80, 100],
    bootstrap=[True, False],
    max_depth=[30],
    max_features=[None],
    min_samples_split=[8],
    min_samples_leaf=[2],
    criterion=['squared_error'],
    ccp_alpha=[0.001],
)

# Same search configuration as before: custom negated-RMSE scorer, shared
# cross-validation splitter, all cores, progress messages enabled.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=123),
    param_grid,
    scoring=rmse_scorer,
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(features, target)

print_model_score(grid_search)

<h1>Predicting using the tuned parameters of RandomForestRegressor()</h1>

In [None]:
# Function to plot an example prediction with the tuned parameters
def plot_rmse(observed, predicted, show_error=False):
    """Plot observed vs. predicted values and annotate the figure with the RMSE.

    Each data point is drawn twice (observed and predicted) and the pair is
    joined by a dashed red line representing the prediction error.

    Parameters
    ----------
    observed : array-like
        Observed values.
    predicted : array-like
        Predicted values, in the same order and of the same length as
        ``observed``.
    show_error : bool, default False
        When True, the signed error (observed - predicted) is written next to
        each error line.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    # Work on positional NumPy arrays. Indexing a pandas Series with an
    # integer is label-based, so a Series with a non-default index would raise
    # KeyError in the loop below (and two Series would be subtracted by label).
    observed = np.asarray(observed, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()
    if observed.shape != predicted.shape:
        raise ValueError(
            f"observed and predicted must have the same length, "
            f"got {observed.size} and {predicted.size}"
        )
    if observed.size == 0:
        raise ValueError("observed and predicted must not be empty")

    # Calculate errors; the local is not called `rmse` to avoid shadowing the
    # module-level rmse() function.
    errors = observed - predicted
    rmse_value = np.sqrt(np.mean(errors ** 2))

    # Create a plot
    plt.figure(figsize=(10, 6))

    # Plot the observed and predicted values
    plt.plot(observed, 'o', label='Observed Values')
    plt.plot(predicted, 'o', label='Predicted Values')

    # Plot the errors as vertical dashed segments between each pair of points
    for i, (obs, pred, err) in enumerate(zip(observed, predicted, errors)):
        plt.plot([i, i], [obs, pred], 'r--')
        if show_error:
            plt.text(i, (obs + pred) / 2, f'{err:.1f}', ha='right', color='red')

    # Adding labels and title
    plt.xlabel('Data Points')
    plt.ylabel('Values')
    plt.title('Visualizing RMSE Calculation')
    plt.legend()
    plt.grid(True)

    # Show RMSE in the plot, just below the largest observed value
    plt.text(1.5, observed.max() - 1, f'RMSE = {rmse_value:.3f}', fontsize=12, color='blue')

    # The x axis is only a point counter, so its ticks are hidden
    plt.xticks([])
    plt.show()

In [None]:
# Hold out 20% of the rows for evaluation; shuffled with a fixed seed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=123,
)

# Random forest with fixed hyper-parameters, trained on the training split.
# fit() returns the estimator itself, so it can be chained onto construction.
# NOTE(review): these values presumably come from the grid search cells above —
# confirm against their output.
model = RandomForestRegressor(
    n_estimators=100,
    bootstrap=True,
    max_depth=30,
    max_features=None,
    min_samples_split=8,
    min_samples_leaf=2,
    criterion='squared_error',
    ccp_alpha=0.001,
    random_state=123,
).fit(X_train, y_train)

# Predictions for the whole held-out split
y_pred = model.predict(X_test)

In [None]:
# Draw up to 100 distinct test rows for the overview plot.
# A seeded generator keeps the sample (and therefore the figure) identical
# across reruns, consistent with the fixed seed 123 used for the CV splitter,
# the models and the train/test split. The size is capped at the number of
# available rows because sampling without replacement cannot exceed it.
random_indices = np.random.default_rng(123).choice(
    X_test.index.to_numpy(),
    size=min(100, len(X_test)),
    replace=False,
)

# Select the same rows from the features and the target
X_test_sampled = X_test.loc[random_indices]
y_test_sampled = y_test.loc[random_indices]

y_pred_sampled = model.predict(X_test_sampled)

# Too many points for readable labels, so per-point errors are not printed
plot_rmse(observed=np.array(y_test_sampled), predicted=np.array(y_pred_sampled), show_error=False)

In [None]:
# Draw up to 10 distinct test rows for the detailed plot.
# A seeded generator keeps the sample (and therefore the figure) identical
# across reruns, consistent with the fixed seed 123 used for the CV splitter,
# the models and the train/test split. The size is capped at the number of
# available rows because sampling without replacement cannot exceed it.
random_indices = np.random.default_rng(123).choice(
    X_test.index.to_numpy(),
    size=min(10, len(X_test)),
    replace=False,
)

# Select the same rows from the features and the target
X_test_sampled = X_test.loc[random_indices]
y_test_sampled = y_test.loc[random_indices]

y_pred_sampled = model.predict(X_test_sampled)

# Few enough points that the signed error can be printed next to each one
plot_rmse(observed=np.array(y_test_sampled), predicted=np.array(y_pred_sampled), show_error=True)