In [3]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Read the training CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is an absolute location on a local drive, so this
# cell only runs on a machine that has the file at exactly this place.
train_csv_path = 'F:\\Guvi Projects\\Smart_Premium\\playground-series-s4e12\\train.csv'
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [5]:
# Remove the row identifier and the policy start timestamp from the frame.
# The result is reassigned (no in-place mutation) and errors='ignore' is set
# so the cell can be re-run on the same kernel without raising KeyError once
# the two columns are already gone.
data = data.drop(columns=['id', 'Policy Start Date'], errors='ignore')

In [6]:
# Split the column labels by dtype: int64/float64 columns are treated as
# numerical, object-dtype columns as categorical.
numeric_dtypes = ['int64', 'float64']
numerical_features = data.select_dtypes(include=numeric_dtypes).columns
categorical_features = data.select_dtypes(include='object').columns

In [7]:
# Impute missing values column by column: numerical columns receive their
# mean, categorical columns receive their most frequent value (first mode).
#
# Each filled column is assigned back to `data` instead of calling
# fillna(inplace=True) on data[col]. The chained in-place form acts on an
# intermediate object: pandas 2.x emits a FutureWarning for it (silenced in
# this notebook by the global warnings filter) and it leaves `data`
# unchanged when Copy-on-Write is enabled.
for col in numerical_features:
    data[col] = data[col].fillna(data[col].mean())

for col in categorical_features:
    modes = data[col].mode()
    # A column with no non-missing values has no mode; leave it untouched
    # rather than failing on an empty lookup.
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])

In [8]:
# Show the first five rows of the frame as it stands at this point.
data.head(n=5)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,Poor,No,Weekly,House,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Employed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Average,Yes,Monthly,House,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,592.92435,3.0,Good,Yes,Weekly,House,567.0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,Employed,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,Poor,Yes,Daily,Apartment,765.0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,Poor,Yes,Weekly,House,2022.0


# **Exploratory Data Analysis**

### Annual Income Distribution by Gender

In [9]:
# Mean annual income for each gender, as a flat two-column frame
# (Gender, Annual Income).
gender_code = (
    data.groupby('Gender')['Annual Income']
    .mean()
    .reset_index()
)
gender_code

Unnamed: 0,Gender,Annual Income
0,Female,32774.818487
1,Male,32715.869662


In [10]:
# Bar chart of the mean annual income per gender.
# Gender is mapped to the x axis explicitly: when only x='Annual Income' is
# supplied, Plotly Express falls back to the row index of `gender_code` for
# the other axis, which leaves the bars without a meaningful category label.
# With a single bar per gender no grouping mode is needed.
fig = px.bar(gender_code, x='Gender', y='Annual Income', color='Gender',
             title="Annual Income Distribution by Gender",
             labels={'Annual Income': 'Mean Annual Income'})

fig.show()

### Smokers Count

In [11]:
# Number of rows for every (Gender, Smoking Status) combination.
smoker_sizes = data.groupby(['Gender', 'Smoking Status']).size()
smoker_count = smoker_sizes.rename('Count').reset_index()
smoker_count

Unnamed: 0,Gender,Smoking Status,Count
0,Female,No,298738
1,Female,Yes,298691
2,Male,No,299389
3,Male,Yes,303182


In [30]:


# Grouped bars: one cluster per gender, one bar per smoking status,
# coloured with the qualitative Set2 palette.
smoker_bar_options = dict(
    x='Gender',
    y='Count',
    color='Smoking Status',
    barmode='group',
    title="Smoking Status Distribution by Gender",
    color_discrete_sequence=px.colors.qualitative.Set2,
)
fig = px.bar(smoker_count, **smoker_bar_options)

fig.show()



### Premium Distribution by Education Level

In [13]:
# Sum of the premium amount within each education level.
premium_by_education = data.groupby('Education Level')['Premium Amount'].sum()
education_count = premium_by_education.reset_index()
education_count

Unnamed: 0,Education Level,Premium Amount
0,Bachelor's,334375658.0
1,High School,319770796.0
2,Master's,334842068.0
3,PhD,334065264.0


In [14]:
# Donut chart (hole=0.4): each slice is an education level, sized by its
# summed premium amount.
fig = px.pie(
    education_count,
    values='Premium Amount',
    names='Education Level',
    hole=0.4,
    title="Premium Amount Distribution by Education Level",
)

fig.show()

### Occupation by Location

In [15]:
# Number of rows for every (Location, Occupation) combination.
location_group = (
    data
    .groupby(['Location', 'Occupation'])
    .size()
    .rename('Count')
    .reset_index()
)
location_group

Unnamed: 0,Location,Occupation,Count
0,Rural,Employed,214380
1,Rural,Self-Employed,94391
2,Rural,Unemployed,92176
3,Suburban,Employed,214558
4,Suburban,Self-Employed,94376
5,Suburban,Unemployed,92608
6,Urban,Employed,211887
7,Urban,Self-Employed,93878
8,Urban,Unemployed,91746


In [16]:
# Two-level sunburst: locations on the inner ring, occupations on the outer
# ring. Segment size and colour (turbo scale) both come from the Count column.
occupation_hierarchy = ['Location', 'Occupation']
fig = px.sunburst(
    location_group,
    path=occupation_hierarchy,
    values='Count',
    color='Count',
    color_continuous_scale='turbo',
    title="Occupation by Location",
)

fig.show()

### Policy Breakdown by Gender

In [17]:
# Number of rows for every (Gender, Policy Type) combination.
policy_sizes = data.groupby(['Gender', 'Policy Type']).size()
policy_group = policy_sizes.reset_index(name='Count')
policy_group

Unnamed: 0,Gender,Policy Type,Count
0,Female,Basic,198064
1,Female,Comprehensive,198908
2,Female,Premium,200457
3,Male,Basic,200490
4,Male,Comprehensive,200692
5,Male,Premium,201389


In [18]:
# Two-level sunburst: genders on the inner ring, policy types on the outer
# ring. Segment size and colour (Blues scale) both come from the Count column.
policy_sunburst_options = {
    'path': ['Gender', 'Policy Type'],
    'values': 'Count',
    'title': "Policy Type Breakdown by Gender",
    'color': 'Count',
    'color_continuous_scale': 'Blues',
}
fig = px.sunburst(policy_group, **policy_sunburst_options)

fig.show()

In [19]:
# Number of rows for each customer feedback value.
feedback = (
    data.groupby('Customer Feedback')
    .size()
    .rename('Count')
    .reset_index()
)
feedback

Unnamed: 0,Customer Feedback,Count
0,Average,455729
1,Good,368753
2,Poor,375518


In [20]:
# Pie chart: each slice is a feedback value, sized by its row count.
fig = px.pie(
    feedback,
    values='Count',
    names='Customer Feedback',
    title="Customer Feedback Breakdown",
)

fig.show()

### Gender-wise Customer Feedback Distribution

In [21]:
# Number of rows for every (Gender, Customer Feedback) combination.
feedback_sizes = data.groupby(['Gender', 'Customer Feedback']).size()
gender_wise = feedback_sizes.reset_index(name='Count')
gender_wise

Unnamed: 0,Gender,Customer Feedback,Count
0,Female,Average,227236
1,Female,Good,183156
2,Female,Poor,187037
3,Male,Average,228493
4,Male,Good,185597
5,Male,Poor,188481


In [22]:
# Grouped bars: one cluster per gender, one bar per feedback value.
fig = px.bar(
    gender_wise,
    x='Gender',
    y='Count',
    color='Customer Feedback',
    barmode='group',
    title="Customer Feedback by Gender",
)
fig.show()

### Total Premium Amount by Property Type

In [23]:
# Sum of the premium amount within each property type.
property_premium = (
    data.groupby('Property Type')['Premium Amount']
    .sum()
    .reset_index()
)
property_premium

Unnamed: 0,Property Type,Premium Amount
0,Apartment,441635947.0
1,Condo,440408722.0
2,House,441009117.0


In [24]:
# Line chart of the summed premium per property type: markers on each point,
# spline-smoothed connecting line, drawn in purple.
property_line_options = dict(
    x='Property Type',
    y='Premium Amount',
    markers=True,
    line_shape='spline',
    color_discrete_sequence=['Purple'],
    title="Total Premium Amount by Property Type",
)
fig = px.line(property_premium, **property_line_options)

fig.show()