<a href="https://colab.research.google.com/github/JONICK277/ML/blob/main/model_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Evaluation with Nested Cross-Validation

This notebook performs model evaluation using nested cross-validation for hyperparameter tuning.
It then evaluates each model on a held-out validation set and generates predictions on an external test set,
saving the results as Excel files for each model.

In [1]:
import os
import pickle
from getpass import getpass
from math import sqrt
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from xgboost import XGBRegressor

# --- Data Download ---
# Execute this if you are running the notebook in Google Colab
!git clone https://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@github.com/JONICK277/ML.git

# Load the cleaned data
train_cleaned = pd.read_pickle("ML/data/cleaned/train/train_cleaned.pkl")
test_cleaned  = pd.read_pickle("ML/data/cleaned/test/test_cleaned.pkl")
# Alternatively, if running locally, uncomment the following:
# train_cleaned = pd.read_pickle("../../data/cleaned/train/train_cleaned.pkl")
# test_cleaned  = pd.read_pickle("../../data/cleaned/test/test_cleaned.pkl")

# --- Preparation ---
target = "LAID_UP_TIME"
X = train_cleaned.drop(columns=[target])
y = train_cleaned[target]

# Split the cleaned training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def display_scores(scores):
    """Print a collection of scores with their mean and standard deviation.

    Parameters
    ----------
    scores : array-like
        A NumPy array, or any object exposing ``.mean()`` and ``.std()``, is
        used as-is. Plain lists and tuples (for example the list returned by
        ``nested_cv``) have no such methods, so they are converted to a NumPy
        array first.
    """
    if isinstance(scores, (list, tuple)):
        scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

Cloning into 'ML'...
remote: Enumerating objects: 104, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 104 (delta 13), reused 29 (delta 6), pack-reused 50 (from 1)[K
Receiving objects: 100% (104/104), 113.56 MiB | 11.35 MiB/s, done.
Resolving deltas: 100% (14/14), done.
Updating files: 100% (22/22), done.


## Nested Cross-Validation Function

This function performs nested cross-validation: the outer loop estimates the generalization error,
and the inner loop tunes hyperparameters using GridSearchCV.

In [2]:
def nested_cv(model, param_grid, X, y, cv_outer=5, cv_inner=3):
    """
    Run nested cross-validation and report one RMSE per outer fold.

    The outer KFold (shuffled, seed 42) holds out one fold at a time. On the
    remaining rows a GridSearchCV over ``param_grid`` with an inner shuffled
    KFold (seed 42) picks the hyperparameters, and the refit best estimator is
    scored on the held-out fold.

    X and y are indexed positionally with ``.iloc``, so they must be pandas
    objects. Returns the list of outer-fold RMSE values, in fold order.
    """
    fold_scores = []
    outer_splitter = KFold(n_splits=cv_outer, shuffle=True, random_state=42)

    for fit_rows, holdout_rows in outer_splitter.split(X):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        # Hyperparameters are tuned on the outer-training rows only.
        search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=KFold(n_splits=cv_inner, shuffle=True, random_state=42),
            scoring='neg_mean_squared_error',
            n_jobs=-1,
        )
        search.fit(X_fit, y_fit)

        # Score the tuned model on rows it has never seen.
        holdout_pred = search.best_estimator_.predict(X_holdout)
        fold_rmse = sqrt(mean_squared_error(y_holdout, holdout_pred))
        fold_scores.append(fold_rmse)
        print(f"Outer fold RMSE: {fold_rmse:.4f}")

    return fold_scores

## Model 1: Random Forest Regressor

In [3]:
# Define hyperparameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [500, 700, 800, 1000],
    'max_features': ['sqrt'],
    'max_depth': [10, 20, 30],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [2, 5, 10],
    'bootstrap': [True, False]
}
pprint(param_grid_rf)

# Initialize the Random Forest model
rf_model = RandomForestRegressor(random_state=42)
print("\n--- Random Forest Nested CV ---")
# Nested CV only estimates the generalization error; it returns RMSE scores, not a model.
rf_rmse_scores = nested_cv(rf_model, param_grid_rf, X_train, y_train, cv_outer=5, cv_inner=3)
print("Random Forest Nested CV RMSE scores:", rf_rmse_scores)
print("Mean RF Nested RMSE:", np.mean(rf_rmse_scores))

# A bare RandomForestRegressor has no `best_estimator_` attribute, so the tuned
# model is obtained from a grid search fitted on the training split only. The
# validation split stays untouched for the final evaluation cell.
# NOTE: this is one more full pass over the grid and is expensive.
rf_val_search = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf,
                             cv=KFold(n_splits=3, shuffle=True, random_state=42),
                             scoring='neg_mean_squared_error', n_jobs=-1)
rf_val_search.fit(X_train, y_train)
best_rf = rf_val_search.best_estimator_

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30],
 'max_features': ['sqrt'],
 'min_samples_leaf': [2, 5, 10],
 'min_samples_split': [5, 10, 20],
 'n_estimators': [500, 700, 800, 1000]}

--- Random Forest Nested CV ---


KeyboardInterrupt: 

In [None]:
### Random Forest Final Evaluation

# Score the tuned Random Forest on the held-out validation split.
y_val_pred_rf = best_rf.predict(X_val)
rf_val_mse = mean_squared_error(y_val, y_val_pred_rf)
val_rmse_rf = sqrt(rf_val_mse)
print("\nFinal Validation RMSE (Random Forest):", val_rmse_rf)

## Tuning on the whole dataset for prediction

In [None]:
# Final tuning on the entire training set and saving the best model for Random Forest.
# The search runs on all of X / y (training and validation rows together), and the
# estimator refit with the winning parameters is the one written to disk.
rf_random = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
best_model_rf = rf_random.fit(X, y).best_estimator_

# Persist the tuned model; the prediction cell reloads it from this file.
with open('best_model_forest.pkl', 'wb') as model_file:
    pickle.dump(best_model_rf, model_file)

## Model 2: XGBoost

In [None]:
# Define hyperparameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [3, 5, 7, 8, 9],
    'learning_rate': [0.01, 0.1, 0.2, 0.4],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
pprint(param_grid_xgb)

# Initialize the XGBoost model
xgb_model = XGBRegressor(tree_method='hist', device="cuda", random_state=42)
print("\n--- XGBoost Nested CV ---")
# Nested CV only estimates the generalization error; it returns RMSE scores, not a model.
xgb_rmse_scores = nested_cv(xgb_model, param_grid_xgb, X_train, y_train, cv_outer=5, cv_inner=3)
print("XGBoost Nested CV RMSE scores:", xgb_rmse_scores)
print("Mean XGBoost Nested RMSE:", np.mean(xgb_rmse_scores))

# `xgb_random` is only created in a later cell (and is fitted on the full data,
# validation rows included), so referencing it here raised a NameError on a fresh
# kernel. The tuned model is instead obtained from a grid search fitted on the
# training split only, keeping the validation split untouched for the next cell.
# NOTE: this is one more full pass over the grid and is expensive.
xgb_val_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb,
                              cv=KFold(n_splits=3, shuffle=True, random_state=42),
                              scoring='neg_mean_squared_error', n_jobs=-1)
xgb_val_search.fit(X_train, y_train)
best_xgb = xgb_val_search.best_estimator_


In [None]:
### XGBoost Final Evaluation
# Score the tuned XGBoost model on the held-out validation split.
y_val_pred_xgb = best_xgb.predict(X_val)
xgb_val_mse = mean_squared_error(y_val, y_val_pred_xgb)
val_rmse_xgb = sqrt(xgb_val_mse)
print("\nFinal Validation RMSE (XGBoost):", val_rmse_xgb)

## Tuning on the whole dataset for prediction

In [None]:
# Final tuning on the entire training set and saving the best model for XGBoost.
# The search runs on all of X / y (training and validation rows together), and the
# estimator refit with the winning parameters is the one written to disk.
xgb_random = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
best_model_xgb = xgb_random.fit(X, y).best_estimator_

# Persist the tuned model; the prediction cell reloads it from this file.
with open('best_model_xgboost.pkl', 'wb') as model_file:
    pickle.dump(best_model_xgb, model_file)

## Model 3: Gradient Boost (using XGBoost for GPU training)

In [None]:
# Define hyperparameter grid for Gradient Boost
param_grid_grad = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900],
    'max_depth': [3, 5, 7, 8, 9, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.4],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5, 6, 7, 8]
}
pprint(param_grid_grad)

# Initialize the Gradient Boost model
grad_model = XGBRegressor(tree_method='hist', device="cuda", random_state=42)
print("\n--- Gradient Boost Nested CV ---")
# Nested CV only estimates the generalization error; it returns RMSE scores, not a model.
grad_rmse_scores = nested_cv(grad_model, param_grid_grad, X_train, y_train, cv_outer=5, cv_inner=3)
print("Gradient Boost Nested CV RMSE scores:", grad_rmse_scores)
print("Mean Gradient Boost Nested RMSE:", np.mean(grad_rmse_scores))

# A bare XGBRegressor has no `best_estimator_` attribute, so the tuned model is
# obtained from a grid search fitted on the training split only. The validation
# split stays untouched for the final evaluation cell.
# NOTE: this is one more full pass over the grid and is expensive.
grad_val_search = GridSearchCV(estimator=grad_model, param_grid=param_grid_grad,
                               cv=KFold(n_splits=3, shuffle=True, random_state=42),
                               scoring='neg_mean_squared_error', n_jobs=-1)
grad_val_search.fit(X_train, y_train)
best_grad = grad_val_search.best_estimator_


In [None]:
### Gradient Boost Final Evaluation
# Score the tuned Gradient Boost model on the held-out validation split.
y_val_pred_grad = best_grad.predict(X_val)
grad_val_mse = mean_squared_error(y_val, y_val_pred_grad)
val_rmse_grad = sqrt(grad_val_mse)
print("\nFinal Validation RMSE (Gradient Boost):", val_rmse_grad)

## Tuning on the whole dataset for prediction

In [None]:
# Final tuning on the entire training set and saving the best model for Gradient Boost.
# The search runs on all of X / y (training and validation rows together), and the
# estimator refit with the winning parameters is the one written to disk.
grad_random = GridSearchCV(
    estimator=grad_model,
    param_grid=param_grid_grad,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
best_model_grad = grad_random.fit(X, y).best_estimator_

# Persist the tuned model; the prediction cell reloads it from this file.
with open('best_model_grad_boost_small_test_set.pkl', 'wb') as model_file:
    pickle.dump(best_model_grad, model_file)

## Predictions on the External Test Set

For each model, we load the saved best model and predict on the external test set.
(The validation-set RMSE is computed in each model's "Final Evaluation" cell above.)
The results are saved to Excel files.

In [None]:
# Predict on external test set for Random Forest and save results
# NOTE: pickle.load can execute arbitrary code; only load model files that this
# notebook produced itself.
with open('best_model_forest.pkl', 'rb') as f:
    best_model_rf = pickle.load(f)

# Keep the identifier column for the output, then drop it and the target column
# so that only feature columns are passed to the model.
test_cleaned_copy = test_cleaned.copy()
chassis_number_rf = test_cleaned_copy['CHASSIS_NUMBER']
test_cleaned_copy = test_cleaned_copy.drop(columns=['CHASSIS_NUMBER', 'LAID_UP_TIME'])
y_test_pred_rf = best_model_rf.predict(test_cleaned_copy)
result_rf = pd.DataFrame({
    'CHASSIS_NUMBER': chassis_number_rf,
    'LAID_UP_TIME': y_test_pred_rf
})

# Write to whichever results directory exists (local layout and/or Colab clone).
# Previously an existing file raised an uncaught FileExistsError and a missing
# directory made to_excel fail, so the cell could not complete in either layout.
# Existing files are still never overwritten; they are skipped with a message.
for results_path in ("../../results/teamB-model1_RF.xlsx", "ML/results/teamB-model1_RF.xlsx"):
    results_dir = os.path.dirname(results_path)
    if not os.path.isdir(results_dir):
        print(f"Skipping {results_path}: directory {results_dir} does not exist.")
    elif os.path.exists(results_path):
        print(f"Skipping {results_path}: file already exists and is not overwritten.")
    else:
        result_rf.to_excel(results_path, index=False)
        print(f"Saved predictions to {results_path}")

In [None]:
# Predict on external test set for XGBoost and save results
# NOTE: pickle.load can execute arbitrary code; only load model files that this
# notebook produced itself.
with open('best_model_xgboost.pkl', 'rb') as f:
    best_model_xgb = pickle.load(f)

# Keep the identifier column for the output, then drop it and the target column
# so that only feature columns are passed to the model.
test_cleaned_copy = test_cleaned.copy()
chassis_number_xgb = test_cleaned_copy['CHASSIS_NUMBER']
test_cleaned_copy = test_cleaned_copy.drop(columns=['CHASSIS_NUMBER', 'LAID_UP_TIME'])
y_test_pred_xgb = best_model_xgb.predict(test_cleaned_copy)
result_xgb = pd.DataFrame({
    'CHASSIS_NUMBER': chassis_number_xgb,
    'LAID_UP_TIME': y_test_pred_xgb
})

# Write to whichever results directory exists (local layout and/or Colab clone).
# Previously an existing file raised an uncaught FileExistsError and a missing
# directory made to_excel fail, so the cell could not complete in either layout.
# Existing files are still never overwritten; they are skipped with a message.
for results_path in ("../../results/teamB-model2_XGB.xlsx", "ML/results/teamB-model2_XGB.xlsx"):
    results_dir = os.path.dirname(results_path)
    if not os.path.isdir(results_dir):
        print(f"Skipping {results_path}: directory {results_dir} does not exist.")
    elif os.path.exists(results_path):
        print(f"Skipping {results_path}: file already exists and is not overwritten.")
    else:
        result_xgb.to_excel(results_path, index=False)
        print(f"Saved predictions to {results_path}")


In [None]:
# Predict on external test set for Gradient Boost and save results
# NOTE: pickle.load can execute arbitrary code; only load model files that this
# notebook produced itself.
with open('best_model_grad_boost_small_test_set.pkl', 'rb') as f:
    best_model_grad = pickle.load(f)

# Keep the identifier column for the output, then drop it and the target column
# so that only feature columns are passed to the model.
test_cleaned_copy = test_cleaned.copy()
chassis_number_grad = test_cleaned_copy['CHASSIS_NUMBER']
test_cleaned_copy = test_cleaned_copy.drop(columns=['CHASSIS_NUMBER', 'LAID_UP_TIME'])
y_test_pred_grad = best_model_grad.predict(test_cleaned_copy)
result_grad = pd.DataFrame({
    'CHASSIS_NUMBER': chassis_number_grad,
    'LAID_UP_TIME': y_test_pred_grad
})

# Write to whichever results directory exists (local layout and/or Colab clone).
# Previously an existing file raised an uncaught FileExistsError and a missing
# directory made to_excel fail, so the cell could not complete in either layout.
# Existing files are still never overwritten; they are skipped with a message.
for results_path in ("../../results/teamB-model3_Grad.xlsx", "ML/results/teamB-model3_Grad.xlsx"):
    results_dir = os.path.dirname(results_path)
    if not os.path.isdir(results_dir):
        print(f"Skipping {results_path}: directory {results_dir} does not exist.")
    elif os.path.exists(results_path):
        print(f"Skipping {results_path}: file already exists and is not overwritten.")
    else:
        result_grad.to_excel(results_path, index=False)
        print(f"Saved predictions to {results_path}")