In [5]:
# === IMPORTS ===
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib  # needed for exporting .pkl files

# === LOAD DATA ===
# low_memory=False parses the file in a single pass so every column gets one
# inferred dtype; the default chunked inference can leave mixed-type columns
# and emit a DtypeWarning.
df = pd.read_csv("../Motor_vehicle_insurance_data.csv", delimiter=';', low_memory=False)

# === CONVERT DATE COLUMNS ===
date_cols = ['Date_start_contract', 'Date_last_renewal', 'Date_next_renewal',
             'Date_birth', 'Date_driving_licence', 'Date_lapse']
# Dates are parsed day-first; anything unparseable becomes NaT (errors='coerce').
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)

# === CREATE Renewed TARGET VARIABLE ===
# Renewed is 1 where Lapse == 0 and 0 for every other value (including missing).
# Vectorized comparison instead of a per-row Python lambda.
df['Renewed'] = (df['Lapse'] == 0).astype('int64')

# === FEATURE ENGINEERING WITH FIXED DATE ===
# A fixed reference date keeps the derived ages reproducible across runs.
reference_date = pd.to_datetime("2018-12-31")
# Whole years approximated as 365-day blocks (leap days are ignored).
df['Customer_age'] = (reference_date - df['Date_birth']).dt.days // 365
df['Driving_experience'] = (reference_date - df['Date_driving_licence']).dt.days // 365
# Contract duration is measured in days, from contract start to next renewal.
df['Contract_duration'] = (df['Date_next_renewal'] - df['Date_start_contract']).dt.days

# === REMOVE DATE-BASED OUTLIERS ===
# between() is inclusive on both ends; rows whose age or experience is missing
# (NaT birth/licence date) compare False and are dropped as well.
valid_age = df['Customer_age'].between(18, 100)
valid_experience = df['Driving_experience'].between(0, 80)
df = df[valid_age & valid_experience]

# === DROP UNUSED COLUMNS ===
# The raw date columns have been replaced by the engineered features above,
# and Lapse is removed because Renewed is derived directly from it.
df = df.drop(columns=['ID', *date_cols, 'Lapse'])

# === DEFINE FEATURES & TARGET ===
# Renewed is the label; every remaining column is a candidate feature.
y = df['Renewed']
X = df.drop(columns=['Renewed'])

# === IDENTIFY COLUMN TYPES ===
# Column groups are derived from dtypes: 64-bit numerics vs. object columns.
numerical_cols = list(X.select_dtypes(include=['float64', 'int64']).columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)
# Force every categorical column to plain strings so the encoder sees one type.
# NOTE(review): astype(str) turns missing values into the literal string 'nan',
# so the most_frequent imputer below has nothing left to fill in these columns.
X = X.astype({col: str for col in categorical_cols})

# === PREPROCESSING PIPELINES ===
# Numeric features: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
# Categorical features: fill gaps with the mode, then one-hot encode;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# === TRAIN-TEST SPLIT ===
# Hold out 20% of the rows; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# === EXPORT TEST SET TO TESTING FOLDER ===
# Persist the held-out features and labels so evaluation can reload them.
for artifact, destination in (
    (X_test, "../testing/X_test.pkl"),
    (y_test, "../testing/y_test.pkl"),
):
    joblib.dump(artifact, destination)
print("✅ Test set saved to '../testing/X_test.pkl' and 'y_test.pkl'")

  df = pd.read_csv("../Motor_vehicle_insurance_data.csv", delimiter=';')


✅ Test set saved to '../testing/X_test.pkl' and 'y_test.pkl'


In [6]:
# === IMPORTS ===
import joblib
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

# === LOAD TEST SET (in the same folder as this notebook) ===
# Both pickles are read from the current working directory, features first.
X_test, y_test = (joblib.load(pkl_name) for pkl_name in ("X_test.pkl", "y_test.pkl"))
print("✅ Loaded preprocessed test set.")

# === MODEL FILES (one directory up, inside 'models/') ===
# Display label -> path of the serialized model; insertion order is the
# order in which the models are evaluated and reported.
models_info = dict([
    ("Random Forest", "../models/random_forest_model.pkl"),
    ("Linear Regression", "../models/linear_regression_model.pkl"),
    ("Gradient Boosting", "../models/gradient_boosting_model.pkl"),
])

# === COLLECT METRICS FOR SIDE-BY-SIDE COMPARISON ===
# One list per output column; each evaluated model appends one value to each.
metrics = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": [],
    "ROC AUC": []
}


def predict_labels_and_scores(model, X, threshold=0.5):
    """Return ``(labels, scores)`` for a fitted binary model.

    The model's capabilities, not its display name, decide how the
    ranking scores used for ROC AUC are obtained:

    * ``predict_proba`` available: labels come from ``predict`` and scores
      are the second column of ``predict_proba``.
    * otherwise ``decision_function`` available: labels come from
      ``predict`` and scores are the raw decision values.
    * neither available (e.g. a regressor): the output of ``predict`` is
      used as the score and thresholded at ``threshold`` to get 0/1 labels.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix passed unchanged to the model.
    threshold : cut-off applied to continuous predictions (default 0.5).

    Returns
    -------
    tuple of (labels, scores).
    """
    y_raw = model.predict(X)
    if hasattr(model, "predict_proba"):
        return y_raw, model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        return y_raw, model.decision_function(X)
    # No probabilistic interface: treat predictions as continuous scores.
    return (y_raw >= threshold).astype(int), y_raw


# === EVALUATE EACH MODEL ===
for name, model_path in models_info.items():
    # NOTE(review): joblib.load unpickles arbitrary objects; only load model
    # files from a trusted source.
    model = joblib.load(model_path)
    y_pred, y_prob = predict_labels_and_scores(model, X_test)

    # Collect metrics: threshold-based ones use the hard labels,
    # ROC AUC uses the continuous scores.
    row = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_prob),
    }
    for column, value in row.items():
        metrics[column].append(value)

# === DISPLAY RESULTS ===
comparison_df = pd.DataFrame(metrics)
print("\n📊 Model Evaluation Summary:")
print(comparison_df.to_string(index=False))

✅ Loaded preprocessed test set.

📊 Model Evaluation Summary:
            Model  Accuracy  Precision   Recall  F1 Score  ROC AUC
    Random Forest  0.832504   0.847590 0.961851  0.901113 0.845410
Linear Regression  0.795131   0.799874 0.989313  0.884565 0.707457
Gradient Boosting  0.845104   0.845393 0.984896  0.909828 0.837767
