In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import  accuracy_score

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
13,55,89.6,1.73,32.0,True,Karachi,retired,High
25,26,61.9,1.7,5.5,False,Peshawar,unemployed,Low
12,28,62.1,1.68,7.5,False,Mardan,freelancer,Low
3,52,81.3,1.72,22.0,True,Islamabad,government_job,High
9,61,95.5,1.65,30.0,True,Faisalabad,retired,High


In [4]:
df_feat = df.copy()

### Feature Engineering|

In [5]:
# Feature 1: BMI
df_feat["bmi"] = df_feat["weight"] / (df_feat["height"] ** 2)

In [7]:
# Feature 2: Age Group
def age_group(age):
    if age < 25:
        return "young"
    elif age < 45:
        return "adult"
    elif age < 60:
        return "middle_aged"
    return "senior"

In [8]:
df_feat["age_group"] = df_feat["age"].apply(age_group)

In [10]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    if row["smoker"] and row["bmi"] > 30:
        return "high"
    elif row["smoker"] and row["bmi"] > 27:
        return "medium"
    else:
        return "low"

In [11]:
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [12]:
tier_1_cities = ["Lahore", "Islamabad", "Peshawar", "Karachi"]
tier_2_cities = ["Mardan", "Quetta", "Faisalabad","Rawalpindi","Multan","Sialkot","Gujranwala","Hyderabad","Sukkur","Bahawalpur",
    "Sargodha","Abbottabad","Haripur","Swat","Kohat","Dera Ghazi Khan","Okara","Sheikhupura","Rahim Yar Khan","Chiniot"
]


In [13]:
# Feature 4: City Tier
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3

In [14]:
df_feat["city_tier"] = df_feat["city"].apply(city_tier)

In [15]:
df_feat.drop(columns=['age', 'weight', 'height', 'smoker', 'city'])[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier']]

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier
0,50.0,private_job,24.98615,middle_aged,low,2
1,18.0,freelancer,23.673469,adult,low,1
2,12.0,private_job,23.843703,adult,low,1
3,22.0,government_job,27.481071,middle_aged,medium,1
4,35.0,private_job,24.586542,adult,low,1
5,28.0,retired,33.131342,middle_aged,high,2
6,5.0,student,22.145329,young,low,2
7,40.0,government_job,26.778167,middle_aged,low,2
8,15.0,freelancer,23.384859,adult,low,2
9,30.0,retired,35.078053,senior,high,2


In [19]:
# Select features and target
X = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
y = df_feat["insurance_premium_category"]

In [21]:
X, y

(          bmi    age_group lifestyle_risk  city_tier  income_lpa  \
 0   24.986150  middle_aged            low          2        50.0   
 1   23.673469        adult            low          1        18.0   
 2   23.843703        adult            low          1        12.0   
 3   27.481071  middle_aged         medium          1        22.0   
 4   24.586542        adult            low          1        35.0   
 5   33.131342  middle_aged           high          2        28.0   
 6   22.145329        young            low          2         5.0   
 7   26.778167  middle_aged            low          2        40.0   
 8   23.384859        adult            low          2        15.0   
 9   35.078053       senior           high          2        30.0   
 10  23.562806        adult            low          1        14.0   
 11  27.182335  middle_aged         medium          1        26.0   
 12  22.002551        adult            low          2         7.5   
 13  29.937519  middle_aged       

In [22]:
# Define categorical and numeric features
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
numeric_features = ["bmi", "income_lpa"]

In [27]:
# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

In [28]:
# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor), 
    ("classifier", RandomForestClassifier(random_state=42))
])

### Train Test Split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

### Evaluation

In [30]:
# Predict and Evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.9

In [31]:
X_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
31,25.05736,middle_aged,low,1,21.0,private_job
23,22.951595,adult,low,2,13.5,freelancer
27,23.875659,adult,low,1,19.0,government_job
2,23.843703,adult,low,1,12.0,private_job
33,21.28777,adult,low,2,5.0,unemployed


### Export Model

In [32]:
import pickle

pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)

---