In [4]:
%pip install -e ..
%load_ext autoreload
%autoreload 2

Obtaining file:///C:/Users/USER/Desktop/projects/Health%20Insurance%20Model
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: insurance
  Building editable for insurance (pyproject.toml): started
  Building editable for insurance (pyproject.toml): finished with status 'done'
  Created wheel for insurance: filename=insurance-0.1.0-0.editable-py3-none-any.whl size=1307 sha256=35cf79e7064ee6cd532795f5d339ecc7ea4a0e2f987eaf44c1fbd53a0b7bbd08
  Stored in directory: C:\Users\USER\AppData\Local\Temp\pip-ephem-wheel-


[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score

from insurance.data.load import load_raw
from insurance.features.engineering import prepare_data
from insurance.features.risk_score import add_normalized_risk_score
from insurance.features.preprocessing import make_preprocessor




### Building Dataset

In [None]:
# Load the raw data, then run the project's preparation and risk-score steps.
df = load_raw()
df = prepare_data(df)
df = add_normalized_risk_score(df)

# Ordinal-encode the insurance plan tier (Bronze=1, Silver=2, Gold=3).
plan_codes = {"Bronze": 1, "Silver": 2, "Gold": 3}
df["insurance_plan"] = df["insurance_plan"].map(plan_codes)
# Series.map yields NaN for any label missing from the dict; fail loudly
# here instead of passing silent NaNs on to the model.
assert df["insurance_plan"].notna().all(), (
    "insurance_plan contains values outside Bronze/Silver/Gold"
)

# income_level is dropped due to too high VIF, so it is not encoded first
# (encoding a column that is removed on the next line was dead work).
df = df.drop(columns=["income_level"])

df.head()

Unnamed: 0,age,gender,region,marital_status,number_of_dependants,bmi_category,smoking_status,employment_status,income_lakhs,medical_history,insurance_plan,annual_premium_amount,disease1,disease2,total_risk_score,normalized_risk_score
0,26,Male,Northwest,Unmarried,0,Normal,No Smoking,Salaried,6,Diabetes,1,9053,diabetes,none,6,0.428571
1,29,Female,Southeast,Married,2,Obesity,Regular,Salaried,6,Diabetes,1,16339,diabetes,none,6,0.428571
2,49,Female,Northeast,Married,2,Normal,No Smoking,Self-Employed,20,High blood pressure,2,18164,high blood pressure,none,6,0.428571
3,30,Female,Southeast,Married,3,Normal,No Smoking,Salaried,77,No Disease,3,20303,no disease,none,0,0.0
4,18,Male,Northeast,Unmarried,0,Overweight,Regular,Self-Employed,99,High blood pressure,2,13365,high blood pressure,none,6,0.428571


### Model Training

In [10]:
# Separate the regression target from the feature columns.
target_col = "annual_premium_amount"
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Two-stage pipeline: the project's preprocessor ("prep") feeding a Ridge
# regressor ("model"). The step names are what later cells address, e.g.
# "model__alpha" in the grid search and named_steps["prep"].
prep_step = ("prep", make_preprocessor())
model_step = ("model", Ridge())
pipe: Pipeline = Pipeline(steps=[prep_step, model_step])

In [12]:
# Bare last expression: the notebook displays the (still unfitted) pipeline.
pipe

In [15]:
# Candidate regularisation strengths for the "model" (Ridge) step.
alpha_grid = {"model__alpha": [0.1, 1.0, 10.0]}

# 5-fold cross-validated search scored by negated MAE (higher is better),
# using all available cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=alpha_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

In [18]:
# Take the best pipeline found by the search and predict on the held-out rows.
best_pipe: Pipeline = grid.best_estimator_
preds = best_pipe.predict(X_test)

# best_score_ is a negated MAE (see the scoring option), so flip its sign for display.
report_lines = [
    f"Best CV MAE: {-grid.best_score_:.2f}",
    f"Test  MAE : {mean_absolute_error(y_test, preds):.2f}",
    f"Test  R²  : {r2_score(y_test, preds):.3f}",
]
print("\n".join(report_lines))

Best CV MAE: 1741.84
Test  MAE : 1751.15
Test  R²  : 0.927


In [None]:
# Pull the fitted preprocessing step out of the best pipeline and apply it
# to the test features to inspect what the model actually receives.
fitted_prep = best_pipe.named_steps["prep"]
X_test_trans = fitted_prep.transform(X_test)
feature_names = fitted_prep.get_feature_names_out()
print(X_test_trans.shape)

# Label the transformed matrix with the generated feature names and preview it.
pd.DataFrame(X_test_trans, columns=feature_names).head()


(9991, 17)


Unnamed: 0,num__age,num__number_of_dependants,num__income_lakhs,num__insurance_plan,num__normalized_risk_score,cat__gender_Male,cat__region_Northwest,cat__region_Southeast,cat__region_Southwest,cat__marital_status_Unmarried,cat__bmi_category_Obesity,cat__bmi_category_Overweight,cat__bmi_category_Underweight,cat__smoking_status_Occasional,cat__smoking_status_Regular,cat__employment_status_Salaried,cat__employment_status_Self-Employed
0,0.462963,0.4,0.727273,1.0,0.357143,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.907407,0.2,0.010101,0.5,0.357143,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.092593,0.2,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.759259,0.6,0.050505,0.5,0.428571,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.018519,0.0,0.363636,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
