Importing necessary libraries

In [61]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score,classification_report
import numpy as np


Loading the dataset

In [62]:

# Read the insurance records; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer='insurance.csv')
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium
4,69,62.2,1.60,3.94000,True,Indore,retired,High
...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low


**Working with dataframe:**

Feature_01 BMI

In [63]:
# Derive the body-mass index (weight divided by height squared) on a fresh
# frame, so the frame loaded into `df` is left untouched.
new_df = df.assign(bmi=df["weight"] / df["height"] ** 2)

Feature_02 Age group

In [64]:
def age_group(age):
    """Map an age to one of four coarse age-band labels.

    The bands are checked in ascending order and the first upper bound that
    ``age`` is strictly below decides the label:

    * below 25  -> "Young"
    * below 45  -> "Adult"
    * below 60  -> "Middle_Aged"
    * otherwise -> "Senior"
    """
    upper_bounds = ((25, "Young"), (45, "Adult"), (60, "Middle_Aged"))
    for limit, label in upper_bounds:
        if age < limit:
            return label
    # Nothing matched: the age is 60 or more (or not comparable, e.g. NaN).
    return "Senior"
# Label every row with its age band by running `age_group` element-wise.
new_df["age_group"] = new_df["age"].map(age_group)

Feature_03 Lifestyle risk

In [65]:
def lifestyle_risk(row):
    """Classify one record's lifestyle risk from its smoking flag and BMI.

    ``row`` is indexed by the keys "smoker" and "bmi".

    Returns:
        "high"   when the record is a smoker with bmi above 30,
        "medium" when the record is a smoker with bmi above 27 (up to 30),
        "low"    in every other case.
    """
    # NOTE(review): every non-smoker is rated "low" regardless of BMI, and so
    # is a smoker with BMI <= 27 -- confirm this is the intended rule.
    if not row["smoker"]:
        return "low"
    bmi = row["bmi"]
    if bmi > 30:
        return "high"
    return "medium" if bmi > 27 else "low"
# Score each record row-wise, then display the enriched frame.
new_df["lifestyle_risk"] = new_df.apply(lifestyle_risk, axis="columns")
new_df

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,Senior,low
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,Adult,low
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,Adult,low
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,Young,high
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,Senior,low
...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,Adult,low
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,Adult,low
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,Middle_Aged,low
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,Adult,low


Feature_04 City Tier

In [66]:
# City names that `city_tier` maps to tier 1.
tier_1 = [
    "Mumbai",
    "Delhi",
    "Bangalore",
    "Chennai",
    "Kolkata",
    "Hyderabad",
    "Pune",
]

# City names that `city_tier` maps to tier 2 (any city in neither list is tier 3).
tier_2 = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow",
    "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar",
    "Varanasi", "Agra", "Dehradun", "Mysore",
    "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad",
    "Hubli", "Belgaum", "Salem", "Vijayawada",
    "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode",
    "Warangal", "Kolhapur", "Bilaspur", "Jalandhar",
    "Noida", "Guntur", "Asansol", "Siliguri",
]
def city_tier(city):
    """Return the tier number for ``city``.

    The result is 1 when the name appears in ``tier_1``, 2 when it appears in
    ``tier_2``, and 3 for anything else. Matching is plain membership, so a
    name spelled or cased differently from the list entries falls through
    to 3.
    """
    for tier_number, members in ((1, tier_1), (2, tier_2)):
        if city in members:
            return tier_number
    return 3


# Attach the tier of each record's city, then display the updated frame.
new_df["city_tier"] = new_df["city"].map(city_tier)
new_df

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,Senior,low,2
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,Adult,low,1
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,Adult,low,2
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,Young,high,1
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,Senior,low,2
...,...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,Adult,low,2
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,Adult,low,1
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,Middle_Aged,low,1
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,Adult,low,1


Creating a new dataframe to train the model

In [67]:
# Keep only the model inputs plus the target. The target column is listed
# last because the feature/target split below is positional.
final_df = new_df.loc[
    :,
    [
        "income_lpa",
        "occupation",
        "bmi",
        "age_group",
        "lifestyle_risk",
        "city_tier",
        "insurance_premium_category",
    ],
]
final_df

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
0,2.92000,retired,49.227482,Senior,low,2,High
1,34.28000,freelancer,30.189017,Adult,low,1,Low
2,36.64000,freelancer,21.118382,Adult,low,2,Low
3,3.34000,student,45.535900,Young,high,1,Medium
4,3.94000,retired,24.296875,Senior,low,2,High
...,...,...,...,...,...,...,...
95,19.64000,business_owner,21.420747,Adult,low,2,Low
96,34.01000,private_job,47.984483,Adult,low,1,Low
97,44.86000,freelancer,18.765432,Middle_Aged,low,1,Low
98,28.30000,business_owner,30.521676,Adult,low,1,Low


Selecting Features and Target for model Training

In [68]:
# Positional split: every column except the last is a feature,
# and the last column is the target.
x = final_df.iloc[:, :-1]
y = final_df.iloc[:, -1]

Distinguishing Categorical and numerical Features

In [69]:
# Columns handed to the one-hot encoder. `city_tier` holds the integer codes
# 1/2/3 but is listed here, so it is encoded as a category rather than
# treated as a magnitude.
categorical_features = [
    "age_group",
    "lifestyle_risk",
    "occupation",
    "city_tier",
]

# Columns passed to the model as plain numbers.
numerical_features = ["bmi", "income_lpa"]

**Machine learning Model Building**

Creating a ColumnTransformer for one-hot encoding (OHE)

In [70]:
# One-hot encode the categorical columns and forward the numeric columns
# unchanged.
#
# handle_unknown="ignore": with the default ("error") the encoder raises as
# soon as it meets a category it did not see during fit. On a dataset this
# small a rare value can easily end up only in the held-out split (or arrive
# later at prediction time), so unseen categories are encoded as an all-zero
# group instead of aborting the prediction.
prepocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features),
    ]
)

Creating a pipeline with preprocessing and random forest classifier

In [71]:
# Chain the column preprocessing with a seeded random forest so that both
# are fitted and applied together as a single estimator.
pipeline = Pipeline(
    [
        ("prepocessor", prepocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

Split the data and train the machine learning model

In [72]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)
# Fit preprocessing and classifier on the training portion only.
pipeline.fit(X=x_train, y=y_train)

Predicting and evaluating

In [73]:
# Predict the held-out rows and report the fraction classified correctly.
y_pred = pipeline.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.75