In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
from scipy.stats import randint, uniform
import joblib

# Load and prepare data

def load_and_prepare_data(filepath, delimiter=';', reference_date="2018-12-31"):
    """Load the policy CSV and build the feature matrix and renewal target.

    Args:
        filepath: Path or file-like object passed to ``pd.read_csv``.
        delimiter: Field separator used in the CSV file.
        reference_date: Date against which customer age and driving
            experience are measured; anything ``pd.to_datetime`` accepts.

    Returns:
        A tuple ``(X, y)``. ``X`` holds the remaining raw columns plus the
        engineered features ``Customer_age``, ``Driving_experience``,
        ``Contract_duration`` and ``Experience_ratio``. ``y`` is the
        ``Renewed`` column: 1 where ``Lapse == 0``, otherwise 0.

    Raises:
        KeyError: If an expected column is missing from the file.
    """
    # Parse each column in a single pass. The default chunked parsing can
    # infer a different type per chunk for the same column (raising a
    # DtypeWarning and leaving mixed int/str values in one column).
    df = pd.read_csv(filepath, delimiter=delimiter, low_memory=False)

    # Convert date columns; unparseable values become NaT instead of raising.
    date_cols = ['Date_start_contract', 'Date_last_renewal', 'Date_next_renewal',
                 'Date_birth', 'Date_driving_licence', 'Date_lapse']
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)

    # Create target: vectorised equivalent of "1 if Lapse == 0 else 0"
    # (a missing Lapse compares unequal to 0 and therefore maps to 0).
    df['Renewed'] = (df['Lapse'] == 0).astype('int64')

    # Feature engineering. Years are approximated as 365-day blocks.
    reference_date = pd.to_datetime(reference_date)
    df['Customer_age'] = (reference_date - df['Date_birth']).dt.days // 365
    df['Driving_experience'] = (reference_date - df['Date_driving_licence']).dt.days // 365
    df['Contract_duration'] = (df['Date_next_renewal'] - df['Date_start_contract']).dt.days
    # +1 keeps the denominator non-zero.
    df['Experience_ratio'] = df['Driving_experience'] / (df['Customer_age'] + 1)

    # Filter outliers. ``between`` is inclusive on both ends, and rows whose
    # age or experience is NaN (unparseable date) fail the test and are dropped.
    plausible = (
        df['Customer_age'].between(18, 100)
        & df['Driving_experience'].between(0, 80)
    )
    df = df[plausible]

    # Drop identifiers, raw dates and the column the target was derived from.
    df = df.drop(columns=['ID', 'Date_start_contract', 'Date_last_renewal', 'Date_next_renewal',
                          'Date_birth', 'Date_driving_licence', 'Date_lapse', 'Lapse'])

    # Define features and target
    X = df.drop(columns=['Renewed'])
    y = df['Renewed']

    return X, y

def create_preprocessor(X):
    """Build an unfitted ColumnTransformer for the columns of ``X``.

    Numeric columns (``float64``/``int64``) are mean-imputed and
    standardised; ``object`` columns are imputed with the most frequent
    value and one-hot encoded. Columns of any other dtype are not listed in
    either branch.

    Note:
        ``X`` is modified in place: non-missing values of every ``object``
        column are cast to ``str`` so each column holds uniform string
        categories. Missing values are left as missing.

    Args:
        X: Feature DataFrame whose dtypes determine the column lists.

    Returns:
        The unfitted ``ColumnTransformer``.
    """
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

    # Ensure all categoricals are strings, but keep missing values missing.
    # A plain ``astype(str)`` would turn NaN into the literal "nan", which
    # the most_frequent imputer would never treat as missing and the encoder
    # would learn as a category of its own.
    for col in categorical_cols:
        column = X[col]
        X[col] = column.where(column.isna(), column.astype(str))

    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories unseen during fit are encoded as all zeros, not an error.
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    return preprocessor

# Load data
X, y = load_and_prepare_data("Motor_vehicle_insurance_data.csv")
# Only the column lists are derived here; the transformers themselves are
# fitted later, inside the pipeline, on training data only.
preprocessor = create_preprocessor(X)

# Stratified 80/20 split: the splitter yields exactly one (train, test)
# pair of positional indices, so take it directly instead of looping.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(sss.split(X, y))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Build base pipeline
base_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

# Define hyperparameter distributions.
# scipy conventions: randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws from [loc, loc + scale].
param_dist = {
    'classifier__n_estimators': randint(200, 600),
    'classifier__learning_rate': uniform(0.01, 0.09),   # 0.01 .. 0.10
    'classifier__max_depth': randint(3, 10),
    'classifier__min_samples_split': randint(100, 500),
    'classifier__min_samples_leaf': randint(20, 100),
    'classifier__subsample': uniform(0.7, 0.3)          # 0.7 .. 1.0
}

# Randomized Search
random_search = RandomizedSearchCV(
    base_pipeline,
    param_distributions=param_dist,
    n_iter=50,  # Number of random settings to try
    scoring='roc_auc',  # Maximize ROC-AUC
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all CPUs
)

# Run tuning
print("Starting hyperparameter search...")
random_search.fit(X_train, y_train)
print("Hyperparameter search complete!")

best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# Evaluate the best pipeline on the held-out test split.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # probability of class 1

print("\nModel Evaluation")
print("-----------------------------")
print("Accuracy:", best_model.score(X_test, y_test))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Persist the whole pipeline (preprocessing + classifier). The path is kept
# in one variable so the confirmation message reports where the file was
# actually written.
model_path = "gradient_boosting_best_model.pkl"
joblib.dump(best_model, model_path)
print(f"\nBest Gradient Boosting model saved to '{model_path}'")

  df = pd.read_csv(filepath, delimiter=delimiter)


🔍 Starting hyperparameter search...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END classifier__learning_rate=0.043708610696262626, classifier__max_depth=7, classifier__min_samples_leaf=34, classifier__min_samples_split=206, classifier__n_estimators=271, classifier__subsample=0.8795975452591109; total time= 1.5min
[CV] END classifier__learning_rate=0.043708610696262626, classifier__max_depth=7, classifier__min_samples_leaf=34, classifier__min_samples_split=206, classifier__n_estimators=271, classifier__subsample=0.8795975452591109; total time= 1.5min
[CV] END classifier__learning_rate=0.043708610696262626, classifier__max_depth=7, classifier__min_samples_leaf=34, classifier__min_samples_split=206, classifier__n_estimators=271, classifier__subsample=0.8795975452591109; total time= 1.5min
[CV] END classifier__learning_rate=0.024041677639819285, classifier__max_depth=5, classifier__min_samples_leaf=94, classifier__min_samples_split=187, classifier__n_estimators=572,