In [1]:
%pip install -e ..
%load_ext autoreload
%autoreload 2


[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Obtaining file:///C:/Users/USER/Desktop/projects/Health%20Insurance%20Model
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: insurance
  Building editable for insurance (pyproject.toml): started
  Building editable for insurance (pyproject.toml): finished with status 'done'
  Created wheel for insurance: filename=insurance-0.1.0-0.editable-py3-none-any.whl size=1307 sha256=ed701ab722f164ffaf59fa88b60eef5363b8ba8898ec93e7d3eb834ca9a54efd
  Stored in directory: C:\Users\USER\AppData\Local\Temp\pip-ephem-wheel-

In [13]:
import pandas as pd, numpy as np, joblib, pathlib, datetime as dt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns

from insurance.data.load import load_raw, AGE_PATH
from insurance.features.engineering import prepare_data, label_encode_categoricals
from insurance.features.risk_score import add_normalized_risk_score
from insurance.features.preprocessing import make_preprocessor_young

### Dataset building for Young Segment - including new feature: Genetical_Risk

In [7]:
df = load_raw(AGE_PATH)
df.sample(1)

Unnamed: 0,Age,Gender,Region,Marital_status,Number Of Dependants,BMI_Category,Smoking_Status,Employment_Status,Income_Level,Income_Lakhs,Medical History,Insurance_Plan,Annual_Premium_Amount,Genetical_Risk
6429,24,Male,Northwest,Unmarried,0,Normal,No Smoking,Freelancer,25L - 40L,31,Diabetes,Bronze,4650,0


In [8]:
df = prepare_data(df)
df = add_normalized_risk_score(df)
df.sample(1)

Unnamed: 0,age,gender,region,marital_status,number_of_dependants,bmi_category,smoking_status,employment_status,income_level,income_lakhs,medical_history,insurance_plan,annual_premium_amount,genetical_risk,disease1,disease2,total_risk_score,normalized_risk_score
2149,20,Male,Northeast,Unmarried,0,Normal,Occasional,Freelancer,<10L,3,No Disease,Bronze,5395,1,no disease,none,0,0.0


In [11]:
df = label_encode_categoricals(df)
df.sample(1)

Unnamed: 0,age,gender,region,marital_status,number_of_dependants,bmi_category,smoking_status,employment_status,income_lakhs,medical_history,insurance_plan,annual_premium_amount,genetical_risk,disease1,disease2,total_risk_score,normalized_risk_score
3721,21,Male,Southwest,Unmarried,2,Normal,Occasional,Salaried,16,No Disease,0,4276,0,no disease,none,0,0.0


### Young Model Training

In [14]:
X = df.drop(columns=["annual_premium_amount"])
y = df["annual_premium_amount"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocess = make_preprocessor_young()

In [15]:
models: dict[str, tuple[object, dict]] = {
    "RandomForest": (
        RandomForestRegressor(random_state=42),
        {
            "model__n_estimators": [300],
            "model__max_depth": [None, 6, 10],
            "model__min_samples_leaf": [1, 3],
        },
    ),
    "GBR": (
        GradientBoostingRegressor(random_state=42),
        {
            "model__n_estimators": [300],
            "model__learning_rate": [0.05, 0.1],
            "model__max_depth": [2, 3],
        },
    ),
    "XGB": (
        XGBRegressor(tree_method="hist", random_state=42, n_estimators=400),
        {
            "model__learning_rate": [0.05, 0.1],
            "model__max_depth": [3, 4, 5],
            "model__subsample": [0.9, 1.0],
        },
    ),
}

### Model Evaluation

In [17]:
rows = []

for name, (estimator, param_grid) in models.items():
    print(f"→ Training {name}...")
    pipe = Pipeline([("prep", preprocess), ("model", estimator)])
    gs   = GridSearchCV(pipe, param_grid, cv=5, scoring="neg_mean_absolute_error",
                        n_jobs=-1, verbose=0)

    gs.fit(X_train, y_train)
    y_pred = gs.best_estimator_.predict(X_test)

    pct_err   = np.abs((y_test - y_pred) / y_test)
    pct10     = (pct_err < 0.10).mean()          # business KPI
    cv_std_mae = -gs.cv_results_["std_test_score"][gs.best_index_]

    rows.append({
        "model": name,
        "cv_MAE": -gs.best_score_,
        "cv_std": cv_std_mae,
        "test_MAE": mean_absolute_error(y_test, y_pred),
        "test_RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "pct10": pct10,
        "test_R2": r2_score(y_test, y_pred),
        "best_params": gs.best_params_,
        "best_estimator": gs.best_estimator_,
    })
print("Training Done! ✔")

→ Training RandomForest...
→ Training GBR...
→ Training XGB...
Training Done! ✔


In [18]:
results = pd.DataFrame(rows).sort_values("pct10", ascending=False)
display(results[['model','cv_MAE','cv_std','test_MAE','test_RMSE','pct10','test_R2']])

Unnamed: 0,model,cv_MAE,cv_std,test_MAE,test_RMSE,pct10,test_R2
2,XGB,250.381219,-2.260567,255.745728,294.403966,0.980587,0.988594
1,GBR,249.949419,-2.262933,255.786821,294.460495,0.98009,0.988589
0,RandomForest,259.769159,-2.808782,264.330142,309.049151,0.975859,0.987431


### Conclusion:
Thanks to the acquisition of a new dataset focused on the 18–25 age group and the inclusion of a new feature — genetical_risk — I was able to build dedicated models tailored specifically to this segment and fulfill all the business requirements:

**Achieve high accuracy**  - ≥ **97 %** overall predictive accuracy.  
- For at least **95 %** of predictions, the percentage error must be **< 10 %**.

In [19]:
MODEL_DIR = pathlib.Path.cwd().parent / "src" / "insurance" / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

for row in results.itertuples():              
    name  = row.model                           
    pipe  = row.best_estimator                 

    stamp = dt.datetime.now().strftime("%Y%m%d")
    fname = MODEL_DIR / f"premium_YOUNG_{name}_{stamp}.joblib"

    joblib.dump(pipe, fname)
    print("✔ saved →", fname)

✔ saved → c:\Users\USER\Desktop\projects\Health Insurance Model\src\insurance\models\premium_YOUNG_XGB_20250529.joblib
✔ saved → c:\Users\USER\Desktop\projects\Health Insurance Model\src\insurance\models\premium_YOUNG_GBR_20250529.joblib
✔ saved → c:\Users\USER\Desktop\projects\Health Insurance Model\src\insurance\models\premium_YOUNG_RandomForest_20250529.joblib
