In [7]:
# === IMPORTS ===
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import joblib

# === LOAD DATA ===
df = pd.read_csv("Motor_vehicle_insurance_data.csv", delimiter=';')

# === CONVERT DATE COLUMNS ===
date_cols = ['Date_start_contract', 'Date_last_renewal', 'Date_next_renewal', 
             'Date_birth', 'Date_driving_licence', 'Date_lapse']
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)

# === FIX LABEL COLUMN & FILTER BAD LABELS ===
df = df[df['Lapse'].isin([0, 1])]
df['Renewed'] = 1 - df['Lapse']

# === FEATURE ENGINEERING WITH FIXED DATE ===
reference_date = pd.to_datetime("2018-12-31")
df['Customer_age'] = (reference_date - df['Date_birth']).dt.days // 365
df['Driving_experience'] = (reference_date - df['Date_driving_licence']).dt.days // 365
df['Contract_duration'] = (df['Date_next_renewal'] - df['Date_start_contract']).dt.days

# === REMOVE DATE-BASED OUTLIERS ===
df = df[(df['Customer_age'] >= 18) & (df['Customer_age'] <= 100)]
df = df[(df['Driving_experience'] >= 0) & (df['Driving_experience'] <= 80)]

# === DROP UNUSED COLUMNS ===
df = df.drop(columns=['ID', 'Date_start_contract', 'Date_last_renewal', 'Date_next_renewal', 
                      'Date_birth', 'Date_driving_licence', 'Date_lapse', 'Lapse'])

# === DEFINE FEATURES & TARGET ===
X = df.drop(columns=['Renewed'])
y = df['Renewed']

# === IDENTIFY COLUMN TYPES ===
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
for col in categorical_cols:
    X[col] = X[col].astype(str)

# === PREPROCESSING PIPELINES ===
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# === MODEL PIPELINE WITH RANDOM FOREST ===
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
])

# === TRAIN-TEST SPLIT & MODEL TRAINING ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# === EXPORT TRAINED MODEL ===
joblib.dump(model, "randomForest_insurance_renewal_rf_model.pkl")
print("✅ Random Forest model trained and saved as 'insurance_renewal_rf_model.pkl'")

  df = pd.read_csv("Motor_vehicle_insurance_data.csv", delimiter=';')


✅ Random Forest model trained and saved as 'insurance_renewal_rf_model.pkl'


In [8]:
# === IMPORTS ===
import joblib
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# === LOAD SAVED RANDOM FOREST MODEL ===
model = joblib.load("randomForest_insurance_renewal_rf_model.pkl")

# === PREDICT ON TEST SET ===
y_pred = model.predict(X_test)

# === EVALUATE MODEL PERFORMANCE ===
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8444455126664423

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.32      0.44      3977
           1       0.86      0.97      0.91     16826

    accuracy                           0.84     20803
   macro avg       0.78      0.64      0.68     20803
weighted avg       0.83      0.84      0.82     20803


Confusion Matrix:
 [[ 1279  2698]
 [  538 16288]]
