In [53]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

In [54]:
# Load the insurance dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./dataset/insurance.csv')

In [55]:
# Peek at five randomly chosen rows. No random_state is passed, so the rows shown
# differ from run to run.
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
94,43,88.7,1.81,14.841,False,Noida,freelancer,Low
33,45,82.4,1.65,1.597,False,Belgaum,retired,Medium
91,35,66.0,1.58,3.086,True,Mysore,unemployed,Medium
65,53,68.7,1.83,28.023,True,Mumbai,freelancer,High
45,26,66.2,1.62,23.361,False,Jalandhar,freelancer,Low


In [56]:
# List the distinct values found in the occupation column.
df['occupation'].unique()

array(['private_job', 'freelancer', 'student', 'retired',
       'business_owner', 'government_job', 'unemployed'], dtype=object)

In [57]:
# Work on a copy so that the engineered columns added to df_feat leave df untouched.
df_feat = df.copy()

In [58]:
# Feature 1: BMI = weight divided by height squared
# (assumes weight in kg and height in metres; the sample rows show heights near 1.6-1.8. TODO confirm)
df_feat["bmi"] = df_feat["weight"].div(df_feat["height"].pow(2))

In [59]:
# Feature 2: Age Group
def age_group(age):
    """Bucket an age into a coarse life-stage label.

    Returns "young" below 25, "adult" below 45, "middle_aged" below 60,
    and "senior" for everything else.
    """
    # Exclusive upper bounds, checked in ascending order; first match wins.
    brackets = ((25, "young"), (45, "adult"), (60, "middle_aged"))
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return "senior"

In [60]:
# Label every record with its age bucket (element-wise over the age column).
df_feat["age_group"] = df_feat["age"].map(age_group)

In [61]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    """Classify a record's lifestyle risk from its "smoker" and "bmi" entries.

    Smokers are "high" when bmi exceeds 30 and "medium" otherwise;
    non-smokers are "medium" when bmi exceeds 27 and "low" otherwise.
    """
    smokes = row["smoker"]
    bmi = row["bmi"]
    if smokes:
        return "high" if bmi > 30 else "medium"
    return "medium" if bmi > 27 else "low"

In [62]:
# lifestyle_risk reads both the "smoker" and "bmi" entries of a record, so it is
# applied row by row (axis=1); the "bmi" column must already exist on df_feat.
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [63]:
# Cities counted as tier 1 by city_tier().
tier_1_cities = [
    "Mumbai", "Delhi", "Bangalore", "Chennai",
    "Kolkata", "Hyderabad", "Pune",
]

# Cities counted as tier 2 by city_tier(); a city in neither list is treated as tier 3.
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi",
    "Visakhapatnam", "Coimbatore", "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi", "Agra",
    "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum",
    "Salem", "Vijayawada", "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal", "Kolhapur",
    "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri",
]

In [64]:
# Feature 4: City Tier
def city_tier(city):
    """Return the tier number of a city.

    1 if the name is in tier_1_cities, 2 if it is in tier_2_cities, and 3 for
    any other value. Matching is an exact membership test on the given value.
    """
    if city in tier_1_cities:
        return 1
    return 2 if city in tier_2_cities else 3

In [65]:
# Translate every city name into its tier number (element-wise over the city column).
df_feat["city_tier"] = df_feat["city"].map(city_tier)

In [66]:
# Preview five random rows of the engineered frame, limited to the modelling columns
# and the target. Selecting the columns by name already excludes every other column,
# so no separate drop of the raw inputs is needed.
df_feat[
    [
        'income_lpa', 'occupation', 'bmi', 'age_group',
        'lifestyle_risk', 'city_tier', 'insurance_premium_category',
    ]
].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
79,34.818,retired,28.632812,middle_aged,medium,1,Medium
43,32.237,business_owner,23.163254,young,low,2,Medium
52,9.061,retired,21.201693,adult,low,2,Low
19,4.672,unemployed,29.752066,adult,medium,1,Medium
20,13.145,student,25.233726,senior,medium,2,Low


In [67]:
# Select features and target
# X: the four engineered columns (bmi, age_group, lifestyle_risk, city_tier) plus
# income_lpa and occupation taken unchanged from the loaded data.
X = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
# y: the premium category label the classifier is trained to predict.
y = df_feat["insurance_premium_category"]

In [68]:
# Display the feature frame (the rendered output is truncated for long frames).
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,17.440166,middle_aged,medium,2,27.851,private_job
1,16.732017,senior,low,2,3.270,freelancer
2,36.294741,middle_aged,medium,1,36.693,student
3,20.467412,adult,medium,2,18.252,retired
4,25.175510,senior,low,2,10.352,business_owner
...,...,...,...,...,...,...
95,27.390919,adult,medium,2,38.315,unemployed
96,35.050473,senior,high,2,27.394,student
97,32.870535,middle_aged,medium,2,19.818,student
98,27.178763,middle_aged,medium,2,20.228,private_job


In [69]:
# Display the target labels.
y

0       High
1     Medium
2     Medium
3        Low
4        Low
       ...  
95       Low
96      High
97       Low
98       Low
99    Medium
Name: insurance_premium_category, Length: 100, dtype: object

In [70]:
# Columns handed to the one-hot encoder. city_tier holds integers (1, 2, 3) but is
# treated as a category rather than a magnitude.
categorical_features = [
    "age_group",
    "lifestyle_risk",
    "occupation",
    "city_tier",
]

# Columns passed to the model without transformation.
numeric_features = [
    "bmi",
    "income_lpa",
]

In [71]:
# Create column transformer: one-hot encode the categorical columns and pass the
# numeric columns through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore" encodes a category that was not present when the
        # encoder was fitted as all zeros for that feature instead of raising at
        # transform/predict time. With only 100 rows and a random 80/20 split, a rare
        # occupation or age group can be missing from the training fold, and a saved
        # pipeline may later be given values it never saw.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

In [72]:
# Chain the column preprocessing with a seeded random forest so that fitting and
# predicting always run through the same transformations.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [73]:
# Split data and train model
# Hold out 20% of the rows for evaluation; random_state=1 makes the split repeatable.
# NOTE(review): the split is not stratified on y. With 100 rows in total, the class
# mix of the 20-row test set can differ noticeably from the training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Fit the preprocessing step and the classifier on the training rows only.
pipeline.fit(X_train, y_train)


In [74]:
# Predict and evaluate
# Predict a premium category for each held-out row.
y_pred = pipeline.predict(X_test)
# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

0.65

In [75]:
# Show five random held-out rows (no random_state, so the selection changes per run).
X_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
17,32.276224,adult,medium,2,14.333,freelancer
78,28.476331,adult,medium,2,23.641,retired
51,44.609375,senior,high,2,26.408,unemployed
44,27.666632,adult,medium,1,27.41,business_owner
80,29.975695,adult,medium,2,22.913,student


In [78]:
import os
import pickle

# Save the trained pipeline (preprocessing + classifier) using pickle
pickle_model_path = "./models/model.pkl"

# open(..., "wb") does not create missing parent directories, so make sure ./models
# exists; exist_ok=True keeps re-running this cell harmless.
os.makedirs(os.path.dirname(pickle_model_path), exist_ok=True)

# NOTE(review): a pickled scikit-learn estimator should be loaded with the same
# scikit-learn version that wrote it, and pickle files must only be loaded from
# trusted sources.
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)