In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report,accuracy_score


In [2]:
data= pd.read_csv('insurance.csv')


In [3]:
df = data.copy()

In [4]:
df['charges'].min()

1121.8739

In [5]:
df['charges'].median()

9382.033

In [6]:
df['charges'].max()

63770.42801

In [7]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [8]:
df['premium'] = np.where(
    (df['smoker'] == 'yes') & (df['charges'] > 20000), "High",
    np.where(
        (df['age'] > 50) | (df['charges'] > 12000), "Medium",
        "Low"
    )
)                     

In [9]:

cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune",
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [10]:
df['City'] = np.random.choice(cities, size=len(df))


In [11]:
def assign_occupation(age):
    if age < 20:
        return 'student'
    elif age > 60:
        return np.random.choice(['business_owner', 'retired'])
    else:
        return np.random.choice(['freelancer', 'government_job', 'business_owner', 'unemployed', 'private_job'])

df['occupation'] = df['age'].apply(assign_occupation)

In [12]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,premium,City,occupation
0,19,female,27.9,0,yes,southwest,16884.924,Medium,Vijayawada,student
1,18,male,33.77,1,no,southeast,1725.5523,Low,Visakhapatnam,student
2,28,male,33.0,3,no,southeast,4449.462,Low,Mysore,unemployed
3,33,male,22.705,0,no,northwest,21984.47061,Medium,Surat,freelancer
4,32,male,28.88,0,no,northwest,3866.8552,Low,Guwahati,government_job


In [13]:
def age_group(age):
    if age < 25:
        return "young"
    elif age < 45:
        return "adult"
    elif age < 60:
        return "middle_aged"
    return "senior"

In [14]:
df["age_group"] = df["age"].apply(age_group)

In [15]:
def lifestyle_risk(row):
    if row["smoker"] == 'yes' and row["bmi"] > 30:
        return "high"
    elif row["smoker"]  == 'yes' or row["bmi"] > 27:
        return "medium"
    else:
        return "low"


df["lifestyle_risk"] = df.apply(lifestyle_risk, axis=1)

In [16]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,premium,City,occupation,age_group,lifestyle_risk
0,19,female,27.9,0,yes,southwest,16884.924,Medium,Vijayawada,student,young,medium
1,18,male,33.77,1,no,southeast,1725.5523,Low,Visakhapatnam,student,young,medium
2,28,male,33.0,3,no,southeast,4449.462,Low,Mysore,unemployed,adult,medium
3,33,male,22.705,0,no,northwest,21984.47061,Medium,Surat,freelancer,adult,low
4,32,male,28.88,0,no,northwest,3866.8552,Low,Guwahati,government_job,adult,medium


In [17]:
def get_income(job):
    if job == 'student' or job == 'unemployed':
        return 0  # No income
    elif job == 'business_owner':
        # Random between 10 LPA and 50 LPA
        return round(np.random.uniform(10, 50), 2)
    elif job == 'government_job':
        # Random between 4 LPA and 18 LPA
        return round(np.random.uniform(4, 18), 2)
    elif job == 'private_job':
        # Random between 3 LPA and 25 LPA
        return round(np.random.uniform(3, 25), 2)
    elif job == 'freelancer':
        # Random between 2 LPA and 20 LPA
        return round(np.random.uniform(2, 20), 2)
    elif job == 'retired':
        # Pension: Random between 3 LPA and 10 LPA
        return round(np.random.uniform(3, 10), 2)
    else:
        return round(np.random.uniform(2, 5), 2)

df['income_lpa'] = df['occupation'].apply(get_income)

In [18]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,premium,City,occupation,age_group,lifestyle_risk,income_lpa
0,19,female,27.9,0,yes,southwest,16884.924,Medium,Vijayawada,student,young,medium,0.0
1,18,male,33.77,1,no,southeast,1725.5523,Low,Visakhapatnam,student,young,medium,0.0
2,28,male,33.0,3,no,southeast,4449.462,Low,Mysore,unemployed,adult,medium,0.0
3,33,male,22.705,0,no,northwest,21984.47061,Medium,Surat,freelancer,adult,low,9.18
4,32,male,28.88,0,no,northwest,3866.8552,Low,Guwahati,government_job,adult,medium,8.61


In [19]:

tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [20]:
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3
df["city_tier"] = df["City"].apply(city_tier)

In [21]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,premium,City,occupation,age_group,lifestyle_risk,income_lpa,city_tier
0,19,female,27.9,0,yes,southwest,16884.924,Medium,Vijayawada,student,young,medium,0.0,2
1,18,male,33.77,1,no,southeast,1725.5523,Low,Visakhapatnam,student,young,medium,0.0,2
2,28,male,33.0,3,no,southeast,4449.462,Low,Mysore,unemployed,adult,medium,0.0,2
3,33,male,22.705,0,no,northwest,21984.47061,Medium,Surat,freelancer,adult,low,9.18,2
4,32,male,28.88,0,no,northwest,3866.8552,Low,Guwahati,government_job,adult,medium,8.61,2


In [22]:
df_new = df[['income_lpa','bmi','age_group','occupation','city_tier','lifestyle_risk','premium']]

In [23]:
df_new.head(15
            )

Unnamed: 0,income_lpa,bmi,age_group,occupation,city_tier,lifestyle_risk,premium
0,0.0,27.9,young,student,2,medium,Medium
1,0.0,33.77,young,student,2,medium,Low
2,0.0,33.0,adult,unemployed,2,medium,Low
3,9.18,22.705,adult,freelancer,2,low,Medium
4,8.61,28.88,adult,government_job,2,medium,Low
5,8.92,25.74,adult,government_job,2,low,Low
6,12.68,33.44,middle_aged,government_job,2,medium,Low
7,11.21,27.74,adult,freelancer,2,medium,Low
8,7.8,29.83,adult,private_job,2,medium,Low
9,17.79,25.84,senior,freelancer,2,low,Medium


In [24]:
X = df_new[['income_lpa','bmi','age_group','occupation','city_tier','lifestyle_risk']]
y = df_new['premium']

In [25]:
categorical_feat = ['age_group','occupation','lifestyle_risk']
numerical_feat = ["bmi", "income_lpa",'city_tier']


In [26]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(),categorical_feat),
        ('num','passthrough',numerical_feat)
    ]
)

In [27]:
pipeline = Pipeline(steps = [
    ('preprocessor' , preprocessor),
    ('classifier',RandomForestClassifier(random_state=42))
])

In [28]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42)
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
y_pred = pipeline.predict(X_test)
accuracy_score(y_test,y_pred)

0.8022388059701493

In [30]:
import pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)