In [None]:
# Install the project in the parent directory in editable mode, using %pip so
# the install targets this kernel's environment.
%pip install -e ..
# Enable autoreload in mode 2 so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

Obtaining file:///C:/Users/USER/Desktop/projects/Health%20Insurance%20Model
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: insurance
  Building editable for insurance (pyproject.toml): started
  Building editable for insurance (pyproject.toml): finished with status 'done'
  Created wheel for insurance: filename=insurance-0.1.0-0.editable-py3-none-any.whl size=1307 sha256=f0145ce96d5a17704ea06879e40b22c86047c7f8b93d377401ddb99b989c8203
  Stored in directory: C:\Users\USER\AppData\Local\Temp\pip-ephem-wheel-


[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:

import pandas as pd, numpy as np, joblib, pathlib, datetime as dt
import matplotlib.pyplot as plt, seaborn as sns

from insurance.data.load import load_raw
from insurance.features.engineering import prepare_data
from insurance.features.risk_score import add_normalized_risk_score
from insurance.features.preprocessing import make_preprocessor
from insurance.data.split_age_segments import segment_by_age

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor


In [2]:
def build_dataset() -> pd.DataFrame:
    """Assemble the modelling frame from the raw data.

    The raw data is loaded, passed through ``prepare_data`` and
    ``add_normalized_risk_score``, and the ``insurance_plan`` labels are
    replaced by ordinal codes (Bronze=1, Silver=2, Gold=3). Labels missing
    from that mapping become NaN, as with any ``Series.map`` on a dict.

    Returns:
        The engineered frame without the ``income_level`` column.
    """
    plan_codes = {"Bronze": 1, "Silver": 2, "Gold": 3}

    raw = load_raw()
    prepared = prepare_data(raw)
    scored = add_normalized_risk_score(prepared)

    scored["insurance_plan"] = scored["insurance_plan"].map(plan_codes)

    # income_level is removed because of its high VIF.
    return scored.drop(columns=["income_level"])

# Build the modelling frame once; the age segmentation further down reads it.
df = build_dataset()

In [5]:
# Preprocessing object used as the "prep" stage of each model pipeline.
preprocess = make_preprocessor()

In [15]:
# Split the frame into two age segments using a cutoff of 25.
# NOTE(review): the recorded cell output shows segment_by_age also saving each
# segment to an .xlsx file — confirm that side effect is wanted on every run.
df_young, df_rest = segment_by_age(df=df, age_cutoff=25)
# Preview the first two rows of the younger segment.
df_young.head(2)


✔ Saved 20107 rows → premiums_young.xlsx
✔ Saved 29846 rows → premiums_adult.xlsx


Unnamed: 0,age,gender,region,marital_status,number_of_dependants,bmi_category,smoking_status,employment_status,income_lakhs,medical_history,insurance_plan,annual_premium_amount,disease1,disease2,total_risk_score,normalized_risk_score
4,18,Male,Northeast,Unmarried,0,Overweight,Regular,Self-Employed,99,High blood pressure,2,13365,high blood pressure,none,6,0.428571
9,22,Female,Northwest,Unmarried,0,Underweight,No Smoking,Freelancer,3,No Disease,2,11050,no disease,none,0,0.0


In [16]:
# Sanity check: (rows, columns) of each age segment.
df_young.shape, df_rest.shape

((20107, 16), (29846, 16))

In [4]:
# Candidate regressors keyed by display name. Each value is a pair of
# (unfitted estimator, hyper-parameter grid); grid keys carry the "model__"
# prefix so they address the "model" step of the pipeline they are searched in.
models: dict[str, tuple[object, dict]] = dict(
    RandomForest=(
        RandomForestRegressor(random_state=42),
        dict(
            model__n_estimators=[300],
            model__max_depth=[None, 6, 10],
            model__min_samples_leaf=[1, 3],
        ),
    ),
    GBR=(
        GradientBoostingRegressor(random_state=42),
        dict(
            model__n_estimators=[300],
            model__learning_rate=[0.05, 0.1],
            model__max_depth=[2, 3],
        ),
    ),
    XGB=(
        # n_estimators is fixed on the estimator here rather than searched.
        XGBRegressor(tree_method="hist", random_state=42, n_estimators=400),
        dict(
            model__learning_rate=[0.05, 0.1],
            model__max_depth=[3, 4, 5],
            model__subsample=[0.9, 1.0],
        ),
    ),
)

In [None]:
# Tune every candidate model with a 5-fold grid search scored on (negated)
# MAE, then evaluate the refit best pipeline on the held-out test split.
# One summary dict per model is collected in `rows`.
# NOTE(review): X_train, y_train, X_test and y_test are not defined in any
# cell visible in this notebook (train_test_split is imported but never
# called) — confirm the split cell exists and runs before this one.
rows = []

for name, (estimator, param_grid) in models.items():
    pipe = Pipeline([("prep", preprocess), ("model", estimator)])
    gs   = GridSearchCV(pipe, param_grid, cv=5, scoring="neg_mean_absolute_error",
                        n_jobs=-1, verbose=0)

    gs.fit(X_train, y_train)
    y_pred = gs.best_estimator_.predict(X_test)

    # Share of test rows whose prediction is within 10% of the true value.
    pct_err   = np.abs((y_test - y_pred) / y_test)
    pct10     = (pct_err < 0.10).mean()          # business KPI

    # std_test_score is a standard deviation and is already non-negative;
    # only the mean score (a negated MAE) needs its sign flipped. Negating
    # the std as well would report a negative spread.
    cv_std_mae = gs.cv_results_["std_test_score"][gs.best_index_]

    rows.append({
        "model": name,
        "cv_MAE": -gs.best_score_,   # undo the sign of neg_mean_absolute_error
        "cv_std": cv_std_mae,
        "test_MAE": mean_absolute_error(y_test, y_pred),
        "test_RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "pct10": pct10,
        "test_R2": r2_score(y_test, y_pred),
        "best_params": gs.best_params_,
        "best_estimator": gs.best_estimator_,
    })

In [None]:
# Rank the models by the business KPI (share of predictions within 10%),
# best first, and show only the scalar metrics; the best_params and
# best_estimator columns stay in `results` but are not displayed.
results = pd.DataFrame(rows).sort_values(by="pct10", ascending=False)
display(
    results.loc[
        :,
        ["model", "cv_MAE", "cv_std", "test_MAE", "test_RMSE", "pct10", "test_R2"],
    ]
)