In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [2]:
df = pd.read_csv("/content/pakistan_insurance_dataset_600.csv")

In [3]:
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,26,65,187,49.32,Yes,Karachi,Doctor,High
1,44,62,154,2.18,Yes,Islamabad,Driver,Medium
2,32,67,171,33.85,No,Rawalpindi,Doctor,High
3,26,86,155,34.19,No,Faisalabad,Farmer,High
4,37,80,161,45.55,Yes,Karachi,Farmer,High


In [4]:
df.columns

Index(['age', 'weight', 'height', 'income_lpa', 'smoker', 'city', 'occupation',
       'insurance_premium_category'],
      dtype='object')

In [5]:
df['occupation'].unique()

array(['Doctor', 'Driver', 'Farmer', 'Businessman', 'Software Developer',
       'Teacher', 'Engineer', 'Shopkeeper'], dtype=object)

In [6]:
df_feat = df.copy()

In [7]:
# Feature 1: BMI
df_feat["bmi"] = df_feat["weight"] / (df_feat["height"] ** 2)

In [8]:
# Feature 2: Age Group
def age_group(age):
    if age < 25:
        return "young"
    elif age < 45:
        return "adult"
    elif age < 60:
        return "middle_aged"
    return "senior"

In [9]:
df_feat["age_group"] = df_feat["age"].apply(age_group)

In [10]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    if row["smoker"] and row["bmi"] > 30:
        return "high"
    elif row["smoker"] or row["bmi"] > 27:
        return "medium"
    else:
        return "low"


In [11]:
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [12]:
# Tier 1 Cities (Major Metros)
tier1_cities = ["Karachi", "Lahore", "Islamabad", "Rawalpindi"]

# Tier 2 Cities (Secondary Urban Centers)
tier2_cities = [
    "Faisalabad",
    "Multan",
    "Peshawar",
    "Quetta",
    "Gujranwala",
    "Sialkot",
    "Hyderabad"
]


In [13]:
# Feature 4: City Tier
def city_tier(city):
    if city in tier1_cities:
        return 1
    elif city in tier2_cities:
        return 2
    else:
        return 3

In [14]:
df_feat["city_tier"] = df_feat["city"].apply(city_tier)

In [15]:
df_feat.drop(columns=['age', 'weight', 'height', 'smoker', 'city'])[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']].sample(5)


Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
453,36.31,Software Developer,0.001929,young,medium,1,Medium
231,19.39,Driver,0.002078,adult,medium,1,Medium
496,23.26,Driver,0.001947,adult,medium,1,High
428,39.2,Doctor,0.003074,adult,medium,2,High
385,45.24,Engineer,0.003048,middle_aged,medium,2,Low


In [16]:
df_feat["lifestyle_risk"].unique()

array(['medium'], dtype=object)

In [17]:
# Select features and target
X = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
y = df_feat["insurance_premium_category"]


In [18]:

X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,0.001859,adult,medium,1,49.32,Doctor
1,0.002614,adult,medium,1,2.18,Driver
2,0.002291,adult,medium,1,33.85,Doctor
3,0.003580,adult,medium,2,34.19,Farmer
4,0.003086,adult,medium,1,45.55,Farmer
...,...,...,...,...,...,...
595,0.002311,adult,medium,1,26.32,Engineer
596,0.002905,adult,medium,1,36.51,Doctor
597,0.002589,middle_aged,medium,2,5.02,Engineer
598,0.002840,adult,medium,1,8.03,Farmer


In [19]:
y

Unnamed: 0,insurance_premium_category
0,High
1,Medium
2,High
3,High
4,High
...,...
595,Low
596,Low
597,Low
598,Low


In [20]:
# Define categorical and numeric features
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
numeric_features = ["bmi", "income_lpa"]

In [21]:

# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

In [22]:

# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])


In [23]:
# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)


In [24]:
ohe = pipeline.named_steps["preprocessor"].named_transformers_["cat"]
print(ohe.categories_)

[array(['adult', 'middle_aged', 'senior', 'young'], dtype=object), array(['medium'], dtype=object), array(['Businessman', 'Doctor', 'Driver', 'Engineer', 'Farmer',
       'Shopkeeper', 'Software Developer', 'Teacher'], dtype=object), array([1, 2])]


In [25]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.2916666666666667

In [26]:
X_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
257,0.00251,senior,medium,1,14.21,Software Developer
3,0.00358,adult,medium,2,34.19,Farmer
90,0.002599,middle_aged,medium,2,39.86,Teacher
499,0.00216,young,medium,2,49.18,Engineer
293,0.002189,adult,medium,2,37.79,Engineer


In [27]:
import pickle

# Save the trained pipeline using pickle
pickle_model_path = "model.pkl1"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)