# Vehicle Policy Lapse Prediction

In [1]:
import pandas as pd, numpy as np , seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def warn(*args, **kwargs): 
    pass 
import warnings 
warnings.warn = warn

# Ignore divide by zero, overflow, and invalid value warnings
np.seterr(divide='ignore', over='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [3]:
from pathlib import Path
base_dir = Path.cwd().parent
data_path = base_dir / "data" / "eudirectlapse.csv"

eudirectlapse_data= pd.read_csv(data_path)

In [4]:
eudirectlapse_data.head()

Unnamed: 0,lapse,polholder_age,polholder_BMCevol,polholder_diffdriver,polholder_gender,polholder_job,policy_age,policy_caruse,policy_nbcontract,prem_final,prem_freqperyear,prem_last,prem_market,prem_pure,vehicl_age,vehicl_agepurchase,vehicl_garage,vehicl_powerkw,vehicl_region
0,0,38,stable,only partner,Male,normal,1,private or freelance work,1,232.46,4 per year,232.47,221.56,243.59,9,8,private garage,225 kW,Reg7
1,1,35,stable,same,Male,normal,1,private or freelance work,1,208.53,4 per year,208.54,247.56,208.54,15,7,private garage,100 kW,Reg4
2,1,29,stable,same,Male,normal,0,private or freelance work,1,277.34,1 per year,277.35,293.32,277.35,14,6,underground garage,100 kW,Reg7
3,0,33,down,same,Female,medical,2,private or freelance work,1,239.51,4 per year,244.4,310.91,219.95,17,10,street,75 kW,Reg5
4,0,50,stable,same,Male,normal,8,unknown,1,554.54,4 per year,554.55,365.46,519.5,16,8,street,75 kW,Reg14


## Feature Engineering
### Numerical Feature 


In [5]:
data = eudirectlapse_data.copy()

In [6]:
data['policy_age_log'] = np.log1p(data['policy_age'])
data['prem_final_log'] = np.log1p(data['prem_final'])

In [7]:
numerical_columns = [
                "polholder_age",
                "policy_age_log",
                "vehicl_age",
                "vehicl_agepurchase",
                "prem_final_log",]

### Ordinal Feature

In [8]:
data['policy_nbcontract'].value_counts()

policy_nbcontract
1     18259
2      3541
3       793
4       270
5        87
6        39
7        31
10       11
8         9
9         6
15        6
11        4
13        2
12        1
14        1
Name: count, dtype: int64

**policy_nbcontract_grp:** Most policies have between 1 and 5 contracts, while higher numbers are rare. To make the feature more stable and avoid overfitting on these rare values, all contracts above 5 are grouped into a single category 6. This keeps the feature meaningful for the model while preserving most of the information.

In [9]:
data['policy_nbcontract_grp'] = data['policy_nbcontract'].apply(lambda x: x if x <= 5 else 6)

In [10]:
data['prem_freqperyear'].value_counts()

prem_freqperyear
1 per year     11680
4 per year      6114
2 per year      3090
12 per year     2176
Name: count, dtype: int64


**prem_freqperyear_ord:** This feature represents how often premiums are paid per year. The values are converted to numbers to reflect their natural order (1 per year = 1, 2 per year = 2, 4 per year = 3, 12 per year = 4). This keeps the information meaningful for the model while simplifying the feature.


In [11]:
freq_map = {
   '1 per year': 1,
   '4 per year' : 2,
   '2 per year' : 3,
   '12 per year': 4
}

data['prem_freqperyear_ord'] = data['prem_freqperyear'].map(freq_map)

In [12]:
data['vehicl_powerkw'].value_counts()

vehicl_powerkw
75 kW         10339
100 kW         5116
25-50 kW       4968
125-300 kW     1720
150 kW          580
175 kW          206
225 kW           77
200 kW           32
250 kW           16
275 kW            4
300 kW            2
Name: count, dtype: int64

In [13]:
def group_powerkw(x):
    if x in ['75 kW','100 kW' ,'25-50 kW']:
        return x
    else:
        return '125+ kW'
data['vehicl_powerkw_ord'] = data['vehicl_powerkw'].apply(group_powerkw)

power_map = {
    '25-50 kW': 1,
    '75 kW': 2,
    '100 kW': 3,
    '125+ kW': 4
}
data['vehicl_powerkw_ord'] = data['vehicl_powerkw_ord'].map(power_map)



**vehicl_powerkw:** represents vehicle engine power and has a natural order. Most vehicles fall into a few common power ranges, while higher power values occur rarely. To reduce sparsity and keep the feature meaningful, all power values above 100 kW are grouped into a single category (125+ kW). The grouped values are then converted into ordered numeric levels so the model can learn patterns related to increasing engine power.

In [14]:
data['polholder_BMCevol'].value_counts() 

polholder_BMCevol
stable    12036
down      10155
up          869
Name: count, dtype: int64

In [15]:
bmc_map = {
    'down': 0,
    'stable': 1,
    'up': 2
}

data['polholder_BMCevol_ord'] = data['polholder_BMCevol'].map(bmc_map)

**polholder_BMCevol**: This feature shows the evolution of the policyholderâ€™s bonus malus class. The values have a natural order (down < stable < up), so I convert them to numeric codes (0, 1, 2) to reflect this order. This helps the model understand the direction of change while keeping the feature simple and interpretable.

In [16]:
ordinal_columns = [
                "policy_nbcontract_grp",
                "prem_freqperyear_ord",
                "vehicl_powerkw_ord",
                "polholder_BMCevol_ord",]

cols_to_drop = [
                "prem_last",
                "prem_market",
                "prem_pure",
                "policy_age",
                "prem_final",
                "policy_nbcontract",
                "prem_freqperyear",
                "vehicl_powerkw",
                "polholder_BMCevol",
            ]

data = data.drop(cols_to_drop, axis=1)

### Categorical Features

In [17]:
data.head()

Unnamed: 0,lapse,polholder_age,polholder_diffdriver,polholder_gender,polholder_job,policy_caruse,vehicl_age,vehicl_agepurchase,vehicl_garage,vehicl_region,policy_age_log,prem_final_log,policy_nbcontract_grp,prem_freqperyear_ord,vehicl_powerkw_ord,polholder_BMCevol_ord
0,0,38,only partner,Male,normal,private or freelance work,9,8,private garage,Reg7,0.693147,5.453011,1,2,4,1
1,1,35,same,Male,normal,private or freelance work,15,7,private garage,Reg4,0.693147,5.344867,1,2,3,1
2,1,29,same,Male,normal,private or freelance work,14,6,underground garage,Reg7,0.0,5.628843,1,1,3,1
3,0,33,same,Female,medical,private or freelance work,17,10,street,Reg5,1.098612,5.482762,1,2,2,0
4,0,50,same,Male,normal,unknown,16,8,street,Reg14,2.197225,6.319941,1,2,2,1


In [18]:
categorical_columns = [
                "polholder_diffdriver",
                "polholder_gender",
                "polholder_job",
                "policy_caruse",
                "vehicl_garage",
                "vehicl_region",]

These features *polholder_diffdriver, polholder_gender, polholder_job, policy_caruse, vehicl_garage, and vehicl_region*  are categorical with no natural order. We turn them into separate binary columns using one-hot encoding so the model can use them effectively. 

- Numerical features are standardized to control scale differences introduced by log based feature engineering; normalization was considered but not used, as it would compress meaningful relative variation.
- Categorical variables are one-hot encoded with unknown categories ignored to ensure robustness to unseen values; dense output is used to enable conversion to DataFrames, feature inspection, and downstream analysis.



In [19]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [20]:
num_pipeline = Pipeline(
                steps=[ ("scaler", StandardScaler()), 
                            ])

cat_pipeline = Pipeline(
                steps=[
                    (
                        "one_hot_encoder", OneHotEncoder( handle_unknown="ignore", drop="first", sparse_output=False, ),
                    )])

ord_pipeline = Pipeline(
                steps=[("scaler", StandardScaler())]
                    )

preprocessor = ColumnTransformer(
                    transformers=[
                        ("num_pipeline", num_pipeline, numerical_columns),
                        ("cat_pipeline", cat_pipeline, categorical_columns),
                        ("ord_pipeline", ord_pipeline, ordinal_columns),
                    ]
                )

In [21]:
X = data.drop(columns=["lapse"])
y = data["lapse"]

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2025,
    stratify=y
)

In [23]:
X_train_arr = preprocessor.fit_transform(X_train)
X_test_arr = preprocessor.transform(X_test)

In [24]:
num_features = numerical_columns

cat_features = (
    preprocessor
    .named_transformers_["cat_pipeline"]
    .get_feature_names_out(categorical_columns)
)

ord_features = ordinal_columns

all_features = list(num_features) + list(cat_features) + list(ord_features)

In [25]:
X_train_df = pd.DataFrame(X_train_arr, columns=all_features)
X_test_df = pd.DataFrame(X_test_arr, columns=all_features)


In [26]:
import joblib

joblib.dump(
    {
        "X_train_df": X_train_df,
        "X_test_df": X_test_df,
        "y_train": y_train,
        "y_test": y_test,
        "feature_names": X_train_df.columns.tolist()
    },
    "../data/pre_processed_data.joblib"
)


['../data/pre_processed_data.joblib']

---