In [3]:
import sys
import os
sys.path.append(os.path.abspath('..'))
from model_codes.add_preprocessing import load_and_prepare_data, create_preprocessor

# === IMPORTS ===
import joblib
from sklearn.model_selection import train_test_split

# === LOAD & PREPARE DATA ===
# Read the raw CSV through the project helper, which hands back the two
# objects that are split below (features first, target second).
X, y = load_and_prepare_data("../Motor_vehicle_insurance_data.csv")
# The returned preprocessor is deliberately discarded; the call is kept only
# because it was part of the original pipeline.
_ = create_preprocessor(X)  # Optional: build it if needed for consistency

# === SPLIT DATA ===
# Fixed test_size / random_state make the hold-out set reproducible.
# NOTE(review): presumably these must match the split used when the models
# were trained, otherwise test rows leak into training -- confirm before
# changing either value.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === EXPORT TEST SET ===
TEST_DIR = "../testing"
X_TEST_PATH = f"{TEST_DIR}/X_test.pkl"
Y_TEST_PATH = f"{TEST_DIR}/y_test.pkl"

# joblib.dump does not create missing parent folders, so make sure the
# target directory exists (no-op when it is already there).
os.makedirs(TEST_DIR, exist_ok=True)
joblib.dump(X_test, X_TEST_PATH)
joblib.dump(y_test, Y_TEST_PATH)

print(f"✅ Test set saved to '{X_TEST_PATH}' and '{Y_TEST_PATH}'")

  df = pd.read_csv(filepath, delimiter=delimiter)


✅ Test set saved to '../testing/X_test.pkl' and '../testing/y_test.pkl'


In [4]:
# === IMPORTS ===
import joblib
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

# === LOAD TEST SET ===
# Read the hold-out set from the same location the export cell writes to
# ('../testing/'), so this cell does not depend on the notebook's own folder.
# SECURITY: joblib.load unpickles arbitrary objects -- only load files that
# were produced by this project.
TEST_DIR = "../testing"
X_test = joblib.load(f"{TEST_DIR}/X_test.pkl")
y_test = joblib.load(f"{TEST_DIR}/y_test.pkl")
print("✅ Loaded preprocessed test set.")

# === MODEL FILES (one directory up, inside 'models/') ===
# Display name -> path of the pickled estimator.
models_info = {
    "Random Forest": "../models/random_forest_model.pkl",
    "Linear Regression": "../models/linear_regression_model.pkl",
    "Gradient Boosting": "../models/gradient_boost_model.pkl",
    "Gradient Boosting Hyperparamaterized": "../models/gradient_boost_model_hyperparameter.pkl",
}

# Cut-off used to turn the continuous output of a regression model into 0/1 labels.
DECISION_THRESHOLD = 0.5

# === COLLECT METRICS FOR SIDE-BY-SIDE COMPARISON ===
# Column name -> list of values, one entry per evaluated model (same order
# as models_info).
metrics = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": [],
    "ROC AUC": []
}

# === EVALUATE EACH MODEL ===
for name, file in models_info.items():
    model = joblib.load(file)
    y_raw = model.predict(X_test)

    # Regression models are recognised by their display name only; an entry
    # whose name starts with "linear" is treated as producing continuous scores.
    if name.lower().startswith("linear"):
        y_pred = (y_raw >= DECISION_THRESHOLD).astype(int)
        y_prob = y_raw
    else:
        y_pred = y_raw
        # ROC AUC needs a ranking score, not hard labels. Prefer class
        # probabilities, then decision_function margins; hard labels are a
        # last resort and understate AUC, so say so when that happens.
        try:
            y_prob = model.predict_proba(X_test)[:, 1]
        except AttributeError:
            try:
                y_prob = model.decision_function(X_test)
            except AttributeError:
                print(f"⚠️ {name}: no predict_proba/decision_function; ROC AUC is computed from hard labels.")
                y_prob = y_raw

    # Collect metrics
    metrics["Model"].append(name)
    metrics["Accuracy"].append(accuracy_score(y_test, y_pred))
    metrics["Precision"].append(precision_score(y_test, y_pred))
    metrics["Recall"].append(recall_score(y_test, y_pred))
    metrics["F1 Score"].append(f1_score(y_test, y_pred))
    metrics["ROC AUC"].append(roc_auc_score(y_test, y_prob))

# === DISPLAY RESULTS ===
comparison_df = pd.DataFrame(metrics)
print("\n📊 Model Evaluation Summary:")
print(comparison_df.to_string(index=False))

✅ Loaded preprocessed test set.

📊 Model Evaluation Summary:
                               Model  Accuracy  Precision   Recall  F1 Score  ROC AUC
                       Random Forest  0.827199   0.844789 0.958269  0.897958 0.838664
                   Linear Regression  0.794609   0.799797 0.988597  0.884231 0.708554
                   Gradient Boosting  0.870352   0.871599 0.981134  0.923129 0.892817
Gradient Boosting Hyperparamaterized  0.904173   0.902399 0.985851  0.942281 0.949984
