In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

In [2]:
df = pd.read_csv("insurance.csv")
df

Unnamed: 0,Age,Weight,Height,Income_LPA,Smoker,City,Occupation,Insurance_Premium_Category
0,58,64,167,15.26,No,Mumbai,Teacher,Low
1,48,92,151,6.85,Yes,Mumbai,Clerk,Medium
2,34,78,184,14.15,Yes,Bangalore,Lawyer,Low
3,62,85,165,13.16,No,Delhi,Engineer,High
4,27,62,185,19.24,No,Delhi,Engineer,Low
...,...,...,...,...,...,...,...,...
145,49,74,161,3.27,No,Delhi,Engineer,Medium
146,56,84,165,9.87,Yes,Mumbai,Clerk,Medium
147,42,74,175,28.61,Yes,Delhi,Teacher,Medium
148,58,78,175,26.93,No,Bangalore,Lawyer,Low


In [32]:
df['Occupation'].unique()

array(['Teacher', 'Clerk', 'Lawyer', 'Engineer', 'Doctor', 'Manager'],
      dtype=object)

In [3]:
df.head()

Unnamed: 0,Age,Weight,Height,Income_LPA,Smoker,City,Occupation,Insurance_Premium_Category
0,58,64,167,15.26,No,Mumbai,Teacher,Low
1,48,92,151,6.85,Yes,Mumbai,Clerk,Medium
2,34,78,184,14.15,Yes,Bangalore,Lawyer,Low
3,62,85,165,13.16,No,Delhi,Engineer,High
4,27,62,185,19.24,No,Delhi,Engineer,Low


In [4]:
df_feat = df.copy()

In [18]:
# Feature 1: BMI
df_feat['BMI'] = df_feat['Weight'] / (df_feat['Height'] / 100) ** 2

In [19]:
# Feature 2: Age Group
def Age_Group(Age):
    if Age < 25:
        return 'Young'
    elif 25 <= Age < 45:
        return 'Adult'
    elif 45 <= Age < 60:
        return 'Middle-aged'
    else:
        return 'Senior'

In [7]:
df_feat['Age_Group'] = df_feat['Age'].apply(Age_Group)

In [8]:
# Feature 3: Lifestyle Risk
def Lifestyle_Risk(row):
    if row['Smoker'] and row['BMI'] > 30:
        return 'High'
    elif row['Smoker'] and row['BMI'] > 27:
        return 'Medium'
    else:
        return 'Low'

In [9]:
df_feat['Lifestyle_Risk'] = df_feat.apply(Lifestyle_Risk, axis=1)

In [10]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [11]:
# Feature 4: City Tier
def City_Tier(City):
    if City in tier_1_cities:
        return 1
    elif City in tier_2_cities:
        return 2
    else:
        return 3

In [12]:
df_feat['City_Tier'] = df_feat['City'].apply(City_Tier)

In [20]:
df_feat.drop(columns=['Age', 'Weight', 'Height', 'Smoker', 'City'])[['Income_LPA', 'Occupation', 'BMI', 'Age_Group', 'Lifestyle_Risk', 'City_Tier', 'Insurance_Premium_Category']]

Unnamed: 0,Income_LPA,Occupation,BMI,Age_Group,Lifestyle_Risk,City_Tier,Insurance_Premium_Category
0,15.26,Teacher,22.948116,Middle-aged,Low,1,Low
1,6.85,Clerk,40.349107,Middle-aged,Low,1,Medium
2,14.15,Lawyer,23.038752,Adult,Low,1,Low
3,13.16,Engineer,31.221304,Senior,Low,1,High
4,19.24,Engineer,18.115413,Adult,Low,1,Low
...,...,...,...,...,...,...,...
145,3.27,Engineer,28.548281,Middle-aged,Low,1,Medium
146,9.87,Clerk,30.853994,Middle-aged,Low,1,Medium
147,28.61,Teacher,24.163265,Adult,Low,1,Medium
148,26.93,Lawyer,25.469388,Middle-aged,Low,1,Low


In [21]:
# Select features and target
X = df_feat[['Income_LPA', 'Occupation', 'BMI', 'Age_Group', 'Lifestyle_Risk', 'City_Tier']]
y = df_feat['Insurance_Premium_Category']

In [22]:
X

Unnamed: 0,Income_LPA,Occupation,BMI,Age_Group,Lifestyle_Risk,City_Tier
0,15.26,Teacher,22.948116,Middle-aged,Low,1
1,6.85,Clerk,40.349107,Middle-aged,Low,1
2,14.15,Lawyer,23.038752,Adult,Low,1
3,13.16,Engineer,31.221304,Senior,Low,1
4,19.24,Engineer,18.115413,Adult,Low,1
...,...,...,...,...,...,...
145,3.27,Engineer,28.548281,Middle-aged,Low,1
146,9.87,Clerk,30.853994,Middle-aged,Low,1
147,28.61,Teacher,24.163265,Adult,Low,1
148,26.93,Lawyer,25.469388,Middle-aged,Low,1


In [23]:
y

0         Low
1      Medium
2         Low
3        High
4         Low
        ...  
145    Medium
146    Medium
147    Medium
148       Low
149      High
Name: Insurance_Premium_Category, Length: 150, dtype: object

In [24]:
# Define categorical and numerical features
categorical_features = ['Occupation', 'Age_Group', 'Lifestyle_Risk', 'City_Tier']
numerical_features = ['Income_LPA', 'BMI']

In [25]:
# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numerical_features)
    ]
)

In [26]:
# Create pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier())
])

In [27]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [28]:
# Train the model
pipeline.fit(X_train, y_train)

In [29]:
# Predict on test data
y_pred = pipeline.predict(X_test)

In [30]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.53


In [31]:
# Save the model
import pickle

with open("model.pkl", "wb") as f:
    pickle.dump(pipeline, f)