In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score,classification_report
import numpy as np

In [4]:
# Read the raw insurance records from the CSV that sits next to the notebook.
df = pd.read_csv('./insurance.csv')
# Peek at five randomly chosen rows (no seed is set, so the rows vary per run).
df.sample(n=5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
31,39,51.1,1.83,11.77,True,Lucknow,private_job,Medium
15,61,61.7,1.68,2.99,True,Delhi,retired,Medium
48,36,94.8,1.66,32.69,True,Chennai,unemployed,Medium
8,73,58.0,1.58,1.78,False,Chandigarh,retired,Medium
55,47,75.7,1.73,24.93,False,Delhi,unemployed,Low


In [5]:
# List the distinct occupation labels present in the raw data.
pd.unique(df['occupation'])

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [6]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched
# while engineered columns are added below.
df_feat = df.copy(deep=True)

In [7]:
# Feature engineering
# Feature 1: BMI, computed as weight / height ** 2.
# (Presumably weight is in kg and height in metres, judging by the sample
# rows -- TODO confirm against the data source.)
# Both operands are read from the working copy `df_feat` so the division can
# never be index-misaligned if `df_feat` is later filtered or re-indexed
# independently of the raw frame `df`.
df_feat['bmi'] = df_feat['weight'] / (df_feat['height'] ** 2)

In [8]:
# Feature 2: age group
def age_group(age):
  """Bucket a numeric age into one of four labelled groups.

  Returns:
    'young'        when age < 25
    'adult'        when 25 <= age < 45
    'middle-aged'  when 45 <= age < 60
    'senior'       otherwise
  """
  # Exclusive upper bounds, checked in ascending order; the first bound the
  # age falls under decides the label.
  upper_bounds = (
    (25, 'young'),
    (45, 'adult'),
    (60, 'middle-aged'),
  )
  for bound, label in upper_bounds:
    if age < bound:
      return label
  return 'senior'

In [9]:
# Label every row with its age bucket (element-wise call of age_group).
df_feat['age_group'] = df_feat['age'].map(age_group)

In [10]:
# Feature 3: lifestyle risk
def lifestyle_risk(row):
  """Derive a coarse lifestyle-risk label from smoking status and BMI.

  `row` is indexed by the keys 'smoker' and 'bmi' (e.g. a DataFrame row).

  Returns:
    'high'    when the person smokes and bmi > 30
    'medium'  when the person smokes, or bmi > 27
    'low'     otherwise
  """
  smokes = row['smoker']
  bmi = row['bmi']
  if smokes:
    # A smoker is at least 'medium'; a BMI above 30 raises it to 'high'.
    return 'high' if bmi > 30 else 'medium'
  # A non-smoker only reaches 'medium' through a BMI above 27.
  return 'medium' if bmi > 27 else 'low'

In [11]:
# Row-wise call: lifestyle_risk needs both 'smoker' and 'bmi' from each row.
df_feat['lifestyle_risk'] = df_feat.apply(lifestyle_risk, axis='columns')

In [12]:

# City names used by city_tier() to assign tier 1.
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
# City names used by city_tier() to assign tier 2; any city found in neither
# list is treated as tier 3 there.
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [13]:
# Feature 4: city tier
def city_tier(city):
  """Map a city name to its tier number.

  Returns 1 when `city` is listed in tier_1_cities, 2 when it is listed in
  tier_2_cities, and 3 for anything else. Membership is an exact match
  against the list entries.
  """
  if city in tier_1_cities:
    return 1
  return 2 if city in tier_2_cities else 3

In [14]:
# Replace the free-text city with its numeric tier (1, 2 or 3).
df_feat["city_tier"] = df_feat["city"].map(city_tier)

In [15]:
# Preview the engineered features next to the target. Selecting the wanted
# columns explicitly already leaves out the raw inputs (age, weight, height,
# smoker, city), so a separate drop() beforehand is unnecessary.
df_feat[
    [
        'income_lpa',
        'occupation',
        'bmi',
        'age_group',
        'lifestyle_risk',
        'city_tier',
        'insurance_premium_category',
    ]
].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
77,0.61,retired,37.818734,senior,high,1,High
55,24.93,unemployed,25.293194,middle-aged,low,1,Low
75,45.07,unemployed,20.577355,middle-aged,low,1,Low
58,3.31,retired,29.930402,senior,medium,2,High
14,13.505166,government_job,32.800735,middle-aged,medium,3,Medium


In [16]:
# Model inputs: the four engineered features plus income and occupation.
X = df_feat.loc[:, ["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
# Target: the premium category label to predict.
y = df_feat.loc[:, "insurance_premium_category"]

In [17]:
# Display the feature frame for a visual check (bare last expression renders it).
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,medium,2,2.92000,retired
1,30.189017,adult,medium,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,high,1,3.34000,student
4,24.296875,senior,medium,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,medium,1,34.01000,private_job
97,18.765432,middle-aged,low,1,44.86000,freelancer
98,30.521676,adult,medium,1,28.30000,business_owner


In [18]:
# Display the target series for a visual check (bare last expression renders it).
y

0       High
1        Low
2        Low
3     Medium
4       High
       ...  
95       Low
96       Low
97       Low
98       Low
99       Low
Name: insurance_premium_category, Length: 100, dtype: object

In [19]:
# Columns that get one-hot encoded. city_tier holds the integer codes 1/2/3
# but is deliberately listed here, i.e. treated as a category rather than a
# numeric magnitude.
categorical_features = [
    "age_group",
    "lifestyle_risk",
    "occupation",
    "city_tier",
]
# Columns handed to the model as plain numbers.
numeric_features = [
    "bmi",
    "income_lpa",
]

In [20]:
# Preprocessing step: one-hot encode the categorical columns and pass the
# numeric columns through unchanged.
#
# handle_unknown="ignore" makes the encoder output an all-zero group for a
# category it did not see while being fitted, instead of raising at
# transform/predict time. The encoder is fitted on the training split only,
# so with a small dataset a rare occupation (or tier) can first show up in
# the test split or in later inputs to the saved model.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)


In [21]:
# Chain preprocessing and the random-forest classifier into a single
# estimator, so fit/predict apply the same column handling every time.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [22]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split),
# then fit the whole pipeline on the remaining rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
pipeline.fit(X_train, y_train)

In [23]:
# Predict on the held-out rows; the bare last expression displays the
# fraction of labels predicted correctly.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9

In [24]:
# Peek at five random held-out rows, e.g. as example inputs for the model.
X_test.sample(n=5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
32,31.495845,middle-aged,medium,2,50.0,private_job
92,18.319942,adult,medium,2,30.0,government_job
78,27.932798,middle-aged,medium,2,14.74,freelancer
51,38.827923,middle-aged,high,2,28.95,private_job
82,17.874812,adult,low,1,12.96,unemployed


In [25]:
import pickle

# Persist the fitted pipeline (preprocessing + classifier) so it can be
# reloaded later without retraining.
# NOTE: only unpickle this file from a trusted location -- loading a pickle
# can execute arbitrary code.
pickle_model_path = "model.pkl"
with open(pickle_model_path, mode="wb") as model_file:
    pickle.dump(pipeline, model_file)