In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw insurance dataset; the path is resolved relative to the working directory.
df = pd.read_csv('Insurance_data.csv')

In [3]:
# Eyeball five randomly chosen rows of the raw data (no seed is passed, so the rows differ per run).
df.sample(n=5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
82,35,56.0,1.77,12.96,False,Delhi,unemployed,Low
85,33,51.4,1.86,34.66,False,Chennai,private_job,Low
66,18,63.9,1.59,3.23,False,Indore,student,Low
43,72,85.7,1.71,1.56,False,Chennai,retired,Medium
57,72,76.8,1.69,1.36,True,Jalandhar,retired,High


In [4]:
# Work on an independent copy so the raw frame `df` is not modified by the
# feature engineering below, then preview five random rows of the copy.
df_new = df.copy(deep=True)
df_new.sample(n=5)


Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
65,46,106.3,1.68,38.07,True,Jaipur,unemployed,High
47,55,116.4,1.87,8.34,False,Chandigarh,private_job,Medium
66,18,63.9,1.59,3.23,False,Indore,student,Low
54,75,54.5,1.61,3.32,True,Lucknow,retired,High
12,42,95.2,1.78,17.58,True,Chandigarh,freelancer,High


In [5]:
# Body-mass index = weight / height^2, stored as a new `bmi` column.
# The sampled rows show heights around 1.6-1.9 and weights around 50-115, so the
# inputs are presumably metres and kilograms - TODO confirm against the data source.
df_new['bmi'] = df_new['weight'] / df_new['height'] ** 2

In [6]:
#Age group
def age_grp(age):
    """Bucket a numeric age into a coarse life-stage label.

    Returns 'young' below 25, 'adult' below 45, 'middle aged' below 60 and
    'senior' for every other value (including anything that fails all three
    comparisons).
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    brackets = ((25, 'young'), (45, 'adult'), (60, 'middle aged'))
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return 'senior'

In [7]:
# Derive the age bucket for every row from the numeric `age` column.
df_new['age_group'] = df_new['age'].map(age_grp)

In [8]:
# lifestyle risk
def lifestyle_risk(row):
    """Classify one record's lifestyle risk from its smoking status and BMI.

    `row` is anything indexable by the keys 'smoker' and 'bmi' (for example
    a DataFrame row). Non-smokers are always 'low' and their BMI is never
    read. Smokers are 'high' when BMI is above 30, 'medium' when it is
    above 27, and 'low' otherwise.
    """
    if not row['smoker']:
        return 'low'

    bmi = row['bmi']
    if bmi > 30:
        return 'high'
    if bmi > 27:
        return 'medium'
    return 'low'

In [9]:
# The risk label needs both `smoker` and `bmi`, so the function is applied row by row.
df_new['lifestyle_risk'] = df_new.apply(lifestyle_risk, axis='columns')

In [10]:
# Lookup lists used by city_tier(): membership in the first list means tier 1,
# membership in the second means tier 2. Matching is by exact string.
tier_1_cities = [
    "Mumbai", "Delhi", "Bangalore", "Chennai",
    "Kolkata", "Hyderabad", "Pune",
]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi",
    "Visakhapatnam", "Coimbatore", "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi", "Agra",
    "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum",
    "Salem", "Vijayawada", "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal", "Kolhapur",
    "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri",
]

In [11]:
# City Tier
def city_tier(city):
    """Map a city name to its tier number.

    Returns 1 when `city` is in the module-level `tier_1_cities` list, 2 when
    it is in `tier_2_cities`, and 3 for any other value. Membership is an
    exact match against the list entries.
    """
    for tier, members in ((1, tier_1_cities), (2, tier_2_cities)):
        if city in members:
            return tier
    return 3

In [12]:
# Add the tier (1, 2 or 3) of each row's city as a new column.
df_new['city_tier'] = df_new["city"].map(city_tier)

In [13]:
# Spot-check five random rows now that bmi, age_group, lifestyle_risk and city_tier exist.
df_new.sample(n=5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
46,42,83.0,1.57,25.57,True,Kolkata,unemployed,High,33.672766,adult,high,1
17,65,90.1,1.7,2.23,False,Delhi,retired,Medium,31.176471,senior,low,1
87,30,82.0,1.6,25.59837,False,Hyderabad,government_job,Low,32.03125,adult,low,1
21,69,92.7,1.84,2.91,False,Jalandhar,retired,High,27.380671,senior,low,2
86,35,66.0,1.89,37.38,False,Hyderabad,freelancer,Low,18.476526,adult,low,1


In [14]:
# The raw inputs have been folded into the engineered features
# (age -> age_group, weight/height -> bmi, city -> city_tier,
# smoker + bmi -> lifestyle_risk), so remove them before modelling.
# Rebinding the name instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op instead of a KeyError on the already-removed columns.
df_new = df_new.drop(
    columns=['age', 'weight', 'height', 'city', 'smoker'],
    errors='ignore',
)

In [15]:
# Confirm the raw columns are gone by previewing five random rows.
df_new.sample(n=5)

Unnamed: 0,income_lpa,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
22,30.0,government_job,Low,31.771627,middle aged,low,2
8,1.78,retired,Medium,23.233456,senior,low,2
75,45.07,unemployed,Low,20.577355,middle aged,low,1
64,1.02,retired,High,37.179649,senior,low,2
69,6.034487,government_job,Low,21.942857,middle aged,low,2


In [16]:
# Separate the label (premium category) from the feature matrix (every other column).
Y = df_new['insurance_premium_category']
X = df_new.drop(columns='insurance_premium_category')

In [17]:
# Preview five random rows of the feature matrix.
X.sample(n=5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier
15,2.99,retired,21.860828,senior,low,1
12,17.58,freelancer,30.046711,adult,high,2
36,0.53,retired,21.713266,senior,low,1
37,8.09,freelancer,17.852127,adult,low,2
93,1.28,student,23.199416,young,low,2


In [18]:
# Preview five random labels from the target series.
Y.sample(n=5)

62       Low
29      High
15    Medium
43    Medium
70      High
Name: insurance_premium_category, dtype: object

In [19]:
# Column groups consumed by the ColumnTransformer: the first group is one-hot
# encoded, the second is passed through as-is. Note that `city_tier` holds
# integers but sits in the categorical group, so it is one-hot encoded too.
categorical_features = [
    "age_group",
    "lifestyle_risk",
    "occupation",
    "city_tier",
]
numeric_features = [
    "bmi",
    "income_lpa",
]

In [20]:
# Column transform: one-hot encode the categorical columns and pass the
# numeric ones through unchanged. Columns listed in neither group are dropped
# (ColumnTransformer's default `remainder`).
# handle_unknown='ignore' makes a category that was absent from the training
# split encode as an all-zero block instead of raising at predict time. That
# matters both for the held-out split of this small dataset and for the
# pickled pipeline when it later scores unseen records.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features),
    ]
)

In [21]:
# Chain preprocessing and the random forest into a single estimator so the
# same encoding is applied at fit and at predict time, and so one object can
# be saved and reloaded as a unit.
# Pipeline and RandomForestClassifier are already imported in the first cell,
# so they are not re-imported here.
pipeline_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

In [22]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split),
# then fit the whole pipeline - encoder and classifier - on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)
pipeline_model.fit(X_train, y_train)

In [23]:

# Score the fitted pipeline on the held-out split; the bare last expression
# is displayed as the cell output (fraction of correctly predicted labels).
y_pred = pipeline_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.75

In [24]:
# Persist the fitted pipeline (preprocessing + classifier) so it can be
# reloaded for inference without retraining. `pickle` is imported with the
# other modules in the first cell.
# NOTE: unpickling executes arbitrary code, so only load this file from a
# trusted location, and load it with the same scikit-learn version that
# produced it.
pickle_model_path = "model1.pkl"
with open(pickle_model_path, "wb") as model_file:
    pickle.dump(pipeline_model, model_file)