Importing necessary libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score,classification_report
import numpy as np


Loading the dataset

In [2]:

# Read the raw insurance records from the CSV file in the working directory.
# The bare name on the last line makes the notebook render the frame.
df = pd.read_csv(filepath_or_buffer='insurance.csv')
df

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium
4,69,62.2,1.60,3.94000,True,Indore,retired,High
...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low


Working with the DataFrame (feature engineering)

In [3]:
# Feature 01: BMI = weight / height**2.
# NOTE(review): presumably weight is in kg and height in metres -- confirm against the data source.
# `assign` returns a new frame, so the originally loaded `df` is left untouched.
new_df = df.assign(bmi=df["weight"] / (df["height"] ** 2))

In [None]:
#Feature_02 Age group
def age_group(age):
    """Bucket an age into a coarse, human-readable age band.

    Parameters
    ----------
    age : number
        Age to classify. Only ``<`` comparisons are performed on it.

    Returns
    -------
    str
        ``"Young"`` for ages below 25, ``"Adult"`` below 45,
        ``"Middle_Aged"`` below 60, and ``"Senior"`` for everything else
        (including values that compare False against every bound, e.g. NaN).
    """
    # Exclusive upper bounds, checked in ascending order; first match wins.
    bands = ((25, "Young"), (45, "Adult"), (60, "Middle_Aged"))
    for upper_bound, label in bands:
        if age < upper_bound:
            return label
    return "Senior"
# Derive the categorical age band for every row from the numeric "age" column.
new_df["age_group"] = new_df["age"].apply(age_group)
# End the cell with the frame so the table shown beneath it is reproduced on a
# fresh Restart & Run All (the other feature cells end the same way).
new_df

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,Senior
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,Adult
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,Adult
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,Young
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,Senior
...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,Adult
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,Adult
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,Middle_Aged
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,Adult


In [7]:
#Feature_03 Lifestyle risk
def lifestyle_risk(row):
    """Classify one record's lifestyle risk from its smoker flag and BMI.

    Parameters
    ----------
    row : mapping-like
        Must support ``row["smoker"]`` (truthy/falsy) and, when the smoker
        flag is truthy, ``row["bmi"]`` (comparable with numbers).

    Returns
    -------
    str
        ``"high"`` for a smoker with BMI above 30, ``"medium"`` for a smoker
        with BMI above 27 (and at most 30), ``"low"`` otherwise.

    NOTE(review): with these rules every non-smoker is "low" regardless of
    BMI, and a smoker with BMI <= 27 is also "low" -- confirm that both
    conditions are really meant to be combined with ``and``.
    """
    # Non-smokers never reach the BMI checks.
    if not row["smoker"]:
        return "low"

    bmi = row["bmi"]
    if bmi > 30:
        return "high"
    if bmi > 27:
        return "medium"
    return "low"
# Score every record: axis="columns" hands each row to the function one at a time.
new_df["lifestyle_risk"] = new_df.apply(lifestyle_risk, axis="columns")
new_df

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,Senior,low
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,Adult,low
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,Adult,low
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,Young,high
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,Senior,low
...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,Adult,low
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,Adult,low
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,Middle_Aged,low
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,Adult,low


In [None]:
# City Tier
# Cities that city_tier() maps to tier 1.
tier_1 = [
    "Mumbai",
    "Delhi",
    "Bangalore",
    "Chennai",
    "Kolkata",
    "Hyderabad",
    "Pune",
]

# Cities that city_tier() maps to tier 2; any city in neither list becomes tier 3.
tier_2 = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow",
    "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar",
    "Varanasi", "Agra", "Dehradun", "Mysore",
    "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad",
    "Hubli", "Belgaum", "Salem", "Vijayawada",
    "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode",
    "Warangal", "Kolhapur", "Bilaspur", "Jalandhar",
    "Noida", "Guntur", "Asansol", "Siliguri",
]

In [9]:
#Tier of The City Feature
def city_tier(city):
    """Return the tier number (1, 2 or 3) for a city name.

    The lookup is an exact membership test against the module-level
    ``tier_1`` and ``tier_2`` lists; anything found in neither list is
    reported as tier 3.
    """
    if city in tier_1:
        return 1
    return 2 if city in tier_2 else 3
# Translate each city name into its numeric tier, element by element.
new_df["city_tier"] = new_df["city"].map(city_tier)
new_df

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,Senior,low,2
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,Adult,low,1
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,Adult,low,2
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,Young,high,1
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,Senior,low,2
...,...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,Adult,low,2
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,Adult,low,1
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,Middle_Aged,low,1
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,Adult,low,1
