In [1]:
%pip install -e ..
%load_ext autoreload
%autoreload 2

Obtaining file:///C:/Users/USER/Desktop/projects/Health%20Insurance%20Model
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: insurance
  Building editable for insurance (pyproject.toml): started
  Building editable for insurance (pyproject.toml): finished with status 'done'
  Created wheel for insurance: filename=insurance-0.1.0-0.editable-py3-none-any.whl size=1307 sha256=d7c665b1b1bd93377b9c7ad42261a056994be0577249cab0623d5ad49ab08514
  Stored in directory: C:\Users\USER\AppData\Local\Temp\pip-ephem-wheel-


[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import sklearn, sys
assert sklearn.__version__.startswith("1.4"), \
    f"Wrong sklearn version: {sklearn.__version__}"

In [6]:

import pandas as pd, numpy as np, joblib, pathlib, datetime as dt
import matplotlib.pyplot as plt, seaborn as sns

from insurance.data.load import load_raw
from insurance.features.engineering import prepare_data
from insurance.features.risk_score import add_normalized_risk_score
from insurance.features.preprocessing import make_preprocessor
from insurance.data.split_age_segments import segment_by_age

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor


### Dataset building and splitting

In [7]:
def build_dataset() -> pd.DataFrame:
    df = add_normalized_risk_score(prepare_data(load_raw()))
    df["insurance_plan"] = df["insurance_plan"].map({"Bronze":1,"Silver":2,"Gold":3})
    return df.drop(columns=["income_level"])   # high VIF

df = build_dataset()

Main dataset not found (NDA). Loading sample: premiums_sample.xlsx


In [6]:
preprocess = make_preprocessor()

In [7]:
df_young, df_rest = segment_by_age(df=df, age_cutoff=25)
df_young.head(2)


✔ Saved 20107 rows → premiums_young.xlsx
✔ Saved 29846 rows → premiums_adult.xlsx


Unnamed: 0,age,gender,region,marital_status,number_of_dependants,bmi_category,smoking_status,employment_status,income_lakhs,medical_history,insurance_plan,annual_premium_amount,disease1,disease2,total_risk_score,normalized_risk_score
4,18,Male,Northeast,Unmarried,0,Overweight,Regular,Self-Employed,99,High blood pressure,2,13365,high blood pressure,none,6,0.428571
9,22,Female,Northwest,Unmarried,0,Underweight,No Smoking,Freelancer,3,No Disease,2,11050,no disease,none,0,0.0


In [8]:
df_young.shape, df_rest.shape

((20107, 16), (29846, 16))

### Models Training

In [9]:
X_young = df_young.drop(columns=["annual_premium_amount"])
y_young = df_young["annual_premium_amount"]

X_train_young, X_test_young, y_train_young, y_test_young = train_test_split(X_young, y_young, test_size=0.2, random_state=42)

X_rest = df_rest.drop(columns=["annual_premium_amount"])
y_rest = df_rest["annual_premium_amount"]

X_train_rest, X_test_rest, y_train_rest, y_test_rest = train_test_split(X_rest, y_rest, test_size=0.2, random_state=42)

In [10]:
models: dict[str, tuple[object, dict]] = {
    "RandomForest": (
        RandomForestRegressor(random_state=42),
        {
            "model__n_estimators": [300],
            "model__max_depth": [None, 6, 10],
            "model__min_samples_leaf": [1, 3],
        },
    ),
    "GBR": (
        GradientBoostingRegressor(random_state=42),
        {
            "model__n_estimators": [300],
            "model__learning_rate": [0.05, 0.1],
            "model__max_depth": [2, 3],
        },
    ),
    "XGB": (
        XGBRegressor(tree_method="hist", random_state=42, n_estimators=400),
        {
            "model__learning_rate": [0.05, 0.1],
            "model__max_depth": [3, 4, 5],
            "model__subsample": [0.9, 1.0],
        },
    ),
}

In [11]:
rows = []

for segment, (X_tr, X_te, y_tr, y_te) in {
    "young": (X_train_young, X_test_young, y_train_young, y_test_young),
    "rest":  (X_train_rest,  X_test_rest,  y_train_rest,  y_test_rest),}.items():
    print(f"\nSegment: {segment.upper()}")
    
    
    for name, (estimator, param_grid) in models.items():
        print(f"→ Training {name}...")
        pipe = Pipeline([("prep", preprocess), ("model", estimator)])
        gs = GridSearchCV(pipe, param_grid, cv=5, scoring="neg_mean_absolute_error",n_jobs=-1, verbose=0)
        gs.fit(X_tr, y_tr)
        y_pred = gs.best_estimator_.predict(X_te)

        pct_err   = np.abs((y_te - y_pred) / y_te)
        pct10     = (pct_err < 0.10).mean()          # business KPI
        cv_std_mae = -gs.cv_results_["std_test_score"][gs.best_index_]
    
        rows.append({
            "segment" : segment,
            "model": name,
            "cv_MAE": -gs.best_score_,
            "cv_std": cv_std_mae,
            "test_MAE": mean_absolute_error(y_te, y_pred),
            "test_RMSE": np.sqrt(mean_squared_error(y_te, y_pred)),
            "pct10": pct10,
            "test_R2": r2_score(y_te, y_pred),
            "best_params": gs.best_params_,
            "best_estimator": gs.best_estimator_,
        })
print("Training Done! ✔")


Segment: YOUNG
→ Training RandomForest...
→ Training GBR...
→ Training XGB...

Segment: REST
→ Training RandomForest...
→ Training GBR...
→ Training XGB...
Training Done! ✔


In [12]:
results = pd.DataFrame(rows).sort_values("pct10", ascending=False)
display(results[['segment','model','cv_MAE','cv_std','test_MAE','test_RMSE','pct10','test_R2']])


Unnamed: 0,segment,model,cv_MAE,cv_std,test_MAE,test_RMSE,pct10,test_R2
4,rest,GBR,258.060831,-2.436734,261.379589,316.828771,0.99933,0.997914
5,rest,XGB,256.43577,-1.590521,260.239136,315.706032,0.99933,0.997928
3,rest,RandomForest,269.384532,-3.105622,268.389081,332.221423,0.998827,0.997706
2,young,XGB,1510.836402,-6.360342,1507.331346,1740.876859,0.277971,0.604609
0,young,RandomForest,1509.846888,-6.338978,1509.713567,1740.735819,0.273247,0.604673
1,young,GBR,1506.160658,-6.837265,1505.596789,1737.080977,0.272501,0.606332


### Conclusion:
Clearly, there is a huge disproportion between how every model operates, where it comes to age group distribution. To try fixing this problem, I'll try to either collect more data, which can be helpful, or try different feature engineering techniques.

In [13]:
MODEL_DIR = pathlib.Path.cwd().parent / "src" / "insurance" / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

for row in results.itertuples():
    segment = row.segment               
    name  = row.model                           
    pipe  = row.best_estimator                 

    stamp = dt.datetime.now().strftime("%Y%m%d")
    fname = MODEL_DIR / f"premium_{segment}_{name}_{stamp}.joblib"

    joblib.dump(pipe, fname)
    print("✔ saved →", fname)

✔ saved → c:\Users\USER\Desktop\projects\Health Insurance Model\src\insurance\models\premium_rest_GBR_20250530.joblib
✔ saved → c:\Users\USER\Desktop\projects\Health Insurance Model\src\insurance\models\premium_rest_XGB_20250530.joblib
✔ saved → c:\Users\USER\Desktop\projects\Health Insurance Model\src\insurance\models\premium_rest_RandomForest_20250530.joblib
✔ saved → c:\Users\USER\Desktop\projects\Health Insurance Model\src\insurance\models\premium_young_XGB_20250530.joblib
✔ saved → c:\Users\USER\Desktop\projects\Health Insurance Model\src\insurance\models\premium_young_RandomForest_20250530.joblib
✔ saved → c:\Users\USER\Desktop\projects\Health Insurance Model\src\insurance\models\premium_young_GBR_20250530.joblib
