In [None]:
# === IMPORTS ===
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import joblib

# === LOAD DATA ===
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed Python types.
# NOTE(review): presumably this is the call behind the warning fragment left in
# this cell's recorded output — confirm on a fresh run.
df = pd.read_csv("Motor_vehicle_insurance_data.csv", delimiter=';', low_memory=False)

# === CONVERT DATE COLUMNS ===
# Unparseable values become NaT (errors='coerce'); ambiguous dates are read day-first.
date_cols = ['Date_start_contract', 'Date_last_renewal', 'Date_next_renewal',
             'Date_birth', 'Date_driving_licence', 'Date_lapse']
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)

# === FIX LABEL COLUMN & FILTER BAD LABELS ===
# Keep only rows whose label is 0 or 1. The explicit .copy() makes the filtered
# frame independent, so the column assignment below writes to a real frame
# rather than to a slice of the original (avoids chained-assignment warnings).
df = df[df['Lapse'].isin([0, 1])].copy()
df['Renewed'] = 1 - df['Lapse']  # target is the complement of Lapse

# === FEATURE ENGINEERING WITH FIXED DATE ===
# A fixed reference date keeps the derived ages reproducible between runs.
reference_date = pd.to_datetime("2018-12-31")
# Whole years approximated as days // 365 (leap days are ignored).
df['Customer_age'] = (reference_date - df['Date_birth']).dt.days // 365
df['Driving_experience'] = (reference_date - df['Date_driving_licence']).dt.days // 365
# Days between contract start and the next renewal date.
df['Contract_duration'] = (df['Date_next_renewal'] - df['Date_start_contract']).dt.days

# === REMOVE DATE-BASED OUTLIERS ===
# Comparisons against NaN are False, so rows whose birth or licence date failed
# to parse are dropped here as well.
df = df[(df['Customer_age'] >= 18) & (df['Customer_age'] <= 100)]
df = df[(df['Driving_experience'] >= 0) & (df['Driving_experience'] <= 80)]

# === DROP UNUSED COLUMNS ===
# Raw dates are replaced by the engineered features above; 'Lapse' is removed
# because the target 'Renewed' is derived directly from it.
df = df.drop(columns=['ID', 'Date_start_contract', 'Date_last_renewal', 'Date_next_renewal',
                      'Date_birth', 'Date_driving_licence', 'Date_lapse', 'Lapse'])

# === DEFINE FEATURES & TARGET ===
# X holds every remaining column except the target; y is the binary target.
X = df.drop(columns=['Renewed'])
y = df['Renewed']

# === IDENTIFY COLUMN TYPES ===
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
# 'number' matches every numeric dtype, not only float64/int64, so a numeric
# column stored with another width is not silently left out of the model.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
for col in categorical_cols:
    # Normalise present values to str so each categorical column has a single
    # value type, but keep missing entries as NaN. A plain astype(str) would turn
    # NaN into the literal string 'nan', which hides it from the categorical
    # imputer and makes 'nan' a category of its own.
    X[col] = X[col].where(X[col].isna(), X[col].astype(str))

# === PREPROCESSING PIPELINES ===
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' keeps transform from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# === MODEL PIPELINE ===
# Preprocessing and classifier are bundled together so the fitted object can be
# applied directly to raw feature frames.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# === TRAIN-TEST SPLIT & MODEL TRAINING ===
# Hold out 20% of the rows for evaluation. stratify=y keeps the proportion of
# renewed vs. lapsed policies the same in both partitions, which matters because
# the classes are imbalanced; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
model.fit(X_train, y_train)

# === EXPORT TRAINED MODEL ===
# The file name (including its spelling) is kept unchanged because the
# evaluation cell loads the model by this exact name.
joblib.dump(model, "regresssion_insurance_renewal_model.pkl")
print("✅ Model trained and saved as 'regresssion_insurance_renewal_model.pkl'")

  df = pd.read_csv("Motor_vehicle_insurance_data.csv", delimiter=';')


✅ Model trained and saved as 'regresssion_insurance_renewal_model.pkl'


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Reload the fitted pipeline from disk so the evaluation runs against the saved
# artefact rather than whatever object is still in the kernel.
model = joblib.load("regresssion_insurance_renewal_model.pkl")

# Score the held-out rows produced by the training cell.
y_pred = model.predict(X_test)

# Compute all metrics first, then report them in the same order as before.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)

Accuracy: 0.8119501994904581

Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.07      0.12      3977
           1       0.82      0.99      0.89     16826

    accuracy                           0.81     20803
   macro avg       0.69      0.53      0.51     20803
weighted avg       0.77      0.81      0.75     20803


Confusion Matrix:
 [[  264  3713]
 [  199 16627]]
