In [1]:
# Sanity check: report whether PyTorch can see a CUDA device in this runtime.
import torch
print(torch.cuda.is_available())

True


In [2]:
import pandas as pd     # type:ignore
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

### For local execution: resolve the project root path

In [3]:
import os
from pathlib import Path

# Resolve the parent of the kernel's current working directory and show it
# (the recorded run printed "/").
workspace_rootpath: Path = Path.cwd().parent
print(workspace_rootpath)

/


In [None]:
# Cell: Upload the CSV directly (works only in a browser session; Chrome with the Colab web UI is the most reliable combination)
# from google.colab import files

# uploaded = files.upload()  # Click "Choose Files" and select health_insurance.csv

In [4]:
# Mount Google Drive at /content/drive so files stored there can be read and written below.
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [6]:
import pandas as pd
from pandas import DataFrame

# Load the raw insurance dataset from the mounted Drive folder.
df: DataFrame = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/health_insurance.csv')

In [7]:
# Show five randomly chosen rows; no random_state is set, so the rows differ between runs.
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
71,38,54.1,1.81,20.25,False,Chandigarh,unemployed,Low
14,49,89.3,1.65,13.505166,False,Kota,government_job,Medium
91,38,119.8,1.76,28.467885,False,Bangalore,government_job,Low
55,47,75.7,1.73,24.93,False,Delhi,unemployed,Low
36,61,58.4,1.64,0.53,False,Hyderabad,retired,Medium


In [11]:
# List the distinct occupation labels present in the data.
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [12]:
# Copy the loaded frame so the feature columns added below do not modify `df`.
df_feat = df.copy()

In [None]:
# Feature 1: BMI = weight / height^2
# NOTE(review): assumes weight is in kilograms and height in metres (the sampled rows show
# values such as 54.1 and 1.81) — confirm against the data source.
df_feat['bmi'] = df_feat['weight'] / (df_feat['height'] ** 2)

In [None]:
# Feature 2: Age Group
def age_group(age):
    """Return the age-bracket label for ``age``.

    Brackets use exclusive upper bounds: below 25 -> 'young', below 45 -> 'adult',
    below 60 -> 'middle_aged'; every other value -> 'senior'.
    """
    brackets = ((25, 'young'), (45, 'adult'), (60, 'middle_aged'))
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return 'senior'

In [None]:
# Bucket every row's age into its age-group label using age_group.
df_feat['age_group'] = df_feat['age'].apply(age_group)

In [None]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    """Classify one record's lifestyle risk from its 'smoker' and 'bmi' entries.

    Returns 'high' for a smoker with BMI above 30, 'medium' for any other smoker
    or for a BMI above 27, and 'low' otherwise.
    """
    smokes = row['smoker']
    bmi = row['bmi']
    if smokes and bmi > 30:
        return 'high'
    if smokes or bmi > 27:
        return 'medium'
    return 'low'

In [None]:
# Evaluate lifestyle_risk once per row (axis=1); it reads the 'smoker' and 'bmi' columns,
# so the BMI feature must already exist on df_feat.
df_feat['lifestyle_risk'] = df_feat.apply(lifestyle_risk, axis=1)

In [None]:
# Cities that city_tier maps to tier 1.
tier_1_cities = ['Mumbai', 'Delhi', 'Bangalore', 'Chennai', 'Kolkata', 'Hyderabad', 'Pune']

# Cities that city_tier maps to tier 2; any city in neither list falls through to tier 3.
tier_2_cities = [
    'Jaipur', 'Chandigarh', 'Indore', 'Lucknow', 'Patna', 'Ranchi', 'Visakhapatnam', 'Coimbatore',
    'Bhopal', 'Nagpur', 'Vadodara', 'Surat', 'Rajkot', 'Jodhpur', 'Raipur', 'Amritsar', 'Varanasi',
    'Agra', 'Dehradun', 'Mysore', 'Jabalpur', 'Guwahati', 'Thiruvananthapuram', 'Ludhiana', 'Nashik',
    'Allahabad', 'Udaipur', 'Aurangabad', 'Hubli', 'Belgaum', 'Salem', 'Vijayawada', 'Tiruchirappalli',
    'Bhavnagar', 'Gwalior', 'Dhanbad', 'Bareilly', 'Aligarh', 'Gaya', 'Kozhikode', 'Warangal',
    'Kolhapur', 'Bilaspur', 'Jalandhar', 'Noida', 'Guntur', 'Asansol', 'Siliguri'
]

In [21]:
# Feature 4: City Tier
def city_tier(city):
    """Return the tier number for ``city``.

    1 if the name is in ``tier_1_cities``, 2 if it is in ``tier_2_cities``,
    and 3 for any other value. Matching is exact (case- and whitespace-sensitive).
    """
    for tier, members in ((1, tier_1_cities), (2, tier_2_cities)):
        if city in members:
            return tier
    return 3

In [None]:
# Map each city name to its tier number (1, 2, or 3) via city_tier.
df_feat['city_tier'] = df_feat['city'].apply(city_tier)

In [23]:
# Preview the modelling columns (engineered features + target) on five random rows.
# Selecting the wanted columns directly is sufficient; dropping the raw columns first
# was redundant because the explicit selection already excludes them.
df_feat[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
70,0.57,retired,36.694215,senior,medium,2,High
3,3.34,student,45.5359,young,high,1,Medium
44,50.0,private_job,30.078125,middle_aged,high,2,Medium
87,25.59837,government_job,32.03125,adult,medium,1,Low
95,19.64,business_owner,21.420747,adult,low,2,Low


In [None]:
# Select features and target:
# X holds the four engineered features plus the raw income and occupation columns;
# y is the premium-category label the classifier is trained to predict.
X = df_feat[['bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'income_lpa', 'occupation']]
y = df_feat['insurance_premium_category']

In [25]:
# Print the feature frame and the target series as plain text to inspect them.
# NOTE(review): in a notebook, X.head() / y.head() as a cell's last expression gives a
# richer, shorter display than print() on the whole frame.
print(X)
print(y)

          bmi    age_group lifestyle_risk  city_tier  income_lpa  \
0   49.227482       senior         medium          2     2.92000   
1   30.189017        adult         medium          1    34.28000   
2   21.118382        adult            low          2    36.64000   
3   45.535900        young           high          1     3.34000   
4   24.296875       senior         medium          2     3.94000   
..        ...          ...            ...        ...         ...   
95  21.420747        adult            low          2    19.64000   
96  47.984483        adult         medium          1    34.01000   
97  18.765432  middle_aged            low          1    44.86000   
98  30.521676        adult         medium          1    28.30000   
99  27.688778        adult         medium          1    28.16664   

        occupation  
0          retired  
1       freelancer  
2       freelancer  
3          student  
4          retired  
..             ...  
95  business_owner  
96     private_

In [None]:
# Define categorical and numeric features.
# city_tier holds the integers 1-3 but is listed as categorical, so it is one-hot encoded
# rather than treated as a number.
categorical_features = ['age_group', 'lifestyle_risk', 'occupation', 'city_tier']
numeric_features = ['bmi', 'income_lpa']

In [None]:
# Create the column transformer: one-hot encode the categorical columns and pass the
# numeric columns through unchanged.
# handle_unknown='ignore' encodes a category that was not seen during fit as all zeros
# instead of raising at predict time (the encoder's default is 'error'). This matters
# here because the training split is small and may not contain every category value.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features)
    ]
)

In [None]:
# Create a pipeline with preprocessing and random forest classifier.
# Chaining the two steps applies the same encoding at fit and at predict time;
# random_state=42 fixes the forest's randomness so repeated fits on the same data match.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [39]:
# Split data and train model: hold out 20% of the rows for testing (seeded split), then
# fit the whole pipeline (encoder + forest) on the remaining 80%.
# NOTE(review): the split is not stratified on y — consider stratify=y if class balance
# between train and test matters for this small dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
pipeline.fit(X_train, y_train)


In [40]:
# Predict and evaluate: overall accuracy on the held-out test split.
# NOTE(review): classification_report is imported but not used in the cells shown here;
# accuracy alone does not reveal per-class performance.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.9

In [41]:
# Show one random row of the test features, i.e. an example of the input the pipeline expects.
X_test.sample()

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
83,24.338934,senior,medium,3,2.16,retired


In [None]:
import joblib as jb

# Save the trained pipeline (preprocessing + classifier) with joblib so it can be reloaded later.
model_path = '/content/drive/MyDrive/Colab Notebooks/insurance_premium_prediction_model.joblib'

# joblib.dump accepts a path and manages the file handle itself, so no explicit open() is
# needed; the trailing semicolon keeps the returned filename list out of the cell output.
jb.dump(pipeline, model_path);
