In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
# For model interpretability
import shap
# Make sure you've installed these: pip install pandas numpy scikit-learn matplotlib seaborn xgboost shap

In [None]:
# Columns that must be numeric; anything that fails to parse becomes NaN.
numeric_cols = ['TotalPremium', 'TotalClaims', 'CustomValueEstimate', 'CalculatedPremiumPerTerm', 'SumInsured', 'Kilowatts', 'Cubiccapacity']

# Read the raw policy data (adjust the path if needed, e.g. to
# processed_insurance_v1.csv if that is the file tracked in DVC).
# PostalCode is read as text rather than as a number.
df = pd.read_csv(
    "../data/insurance.csv",
    dtype={'PostalCode': 'str'},
    parse_dates=['TransactionMonth'],
)

# Explicit datetime conversion on top of parse_dates.
df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'])

# Coerce every expected-numeric column in one column-wise pass.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Numeric columns: replace missing values with the column median
# (swap .median() for .mean() if a mean fill is preferred).
for name in df.select_dtypes(include=np.number):
    values = df[name]
    if values.isnull().any():
        df[name] = values.fillna(values.median())

# Object (text) columns: replace missing values with the literal 'Missing'.
for name in df.select_dtypes(include='object'):
    values = df[name]
    if values.isnull().any():
        df[name] = values.fillna('Missing')

In [None]:
# Vehicle age: calendar year of the transaction minus the registration year.
df['VehicleAge'] = df['TransactionMonth'].dt.year - df['RegistrationYear']

# Binary indicators: 1 where the source column is exactly 'Y', 0 for anything
# else (including missing values).
# NOTE(review): assumes these columns are coded 'Y'/'N'. If the raw data uses
# other labels, every row becomes 0 -- confirm with value_counts().
df['HasAlarmImmobiliser_Binary'] = df['AlarmImmobiliser'].eq('Y').astype('int64')
df['HasTrackingDevice_Binary'] = df['TrackingDevice'].eq('Y').astype('int64')
df['IsNewVehicle_Binary'] = df['NewVehicle'].eq('Y').astype('int64')

# Claims per vehicle in the fleet, computed column-wise instead of with a
# row-by-row df.apply(axis=1), which runs a Python lambda for every row.
# Rows whose fleet size is zero, negative or missing get 0, exactly as before;
# the result is always float64.
fleet_size = df['NumberOfVehiclesInFleet']
df['ClaimsPerVehicleInFleet'] = (df['TotalClaims'] / fleet_size).where(fleet_size > 0, 0)

def _yes_no_to_int(flag):
    """Return ``flag`` encoded as an integer 0/1 Series.

    Text columns are mapped 'Y' -> 1 and 'N' -> 0; any other value
    (including missing) defaults to 0, i.e. 'N' is the assumed default.

    Columns that are already numeric or boolean are passed through (missing
    -> 0) instead of being re-mapped. Without this, running the cell a second
    time would look up the already-converted 0/1 values in the 'Y'/'N'
    mapping, find nothing, and silently reset every flag to 0.
    """
    if pd.api.types.is_numeric_dtype(flag):
        return flag.fillna(0).astype(int)
    return flag.map({'Y': 1, 'N': 0}).fillna(0).astype(int)


# Convert the Y/N flag columns to integer 0/1 in place (safe to re-run).
for flag_col in ['IsVATRegistered', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder']:
    df[flag_col] = _yes_no_to_int(df[flag_col])

# Indicator for the gender hypothesis: 1 where Gender is exactly 'Female',
# 0 otherwise (make sure 'Gender' has been cleaned as in Task 3).
df['Gender_Female'] = (df['Gender'] == 'Female').astype('int64')