# Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from  scipy.stats import norm

# Importing data:

In [None]:
df_train = pd.read_csv('/kaggle/input/insurance-claim-analysis-demographic-and-health/insurance_data.csv')
df_train

# About each feature

PatientID (Index): Category - Identifier, Type - Categorical

An identifier for each row.
age: Category - Demographic, Type - Numerical

The age of the patient.
gender: Category - Demographic, Type - Categorical

The gender of the patient (e.g., male).
bmi: Category - Health, Type - Numerical

The Body Mass Index (BMI) of the patient.
bloodpressure: Category - Health, Type - Numerical

The blood pressure of the patient.
diabetic: Category - Health, Type - Categorical

Indicates whether the patient is diabetic (e.g., Yes or No).
children: Category - Demographic, Type - Numerical

The number of children the patient has.
smoker: Category - Lifestyle, Type - Categorical

Indicates whether the patient is a smoker (e.g., Yes or No).
region: Category - Demographic, Type - Categorical

The region where the patient is located (e.g., southeast or northwest).
claim: Category - Financial, Type - Numerical

The insurance claim amount associated with the patient.


# Data cleaning

In [None]:
# Summarise missing data per column: absolute count and percentage of rows.
total = df_train.isnull().sum()
percent = (total / len(df_train)) * 100
missing_values = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
print(missing_values)

# Drop the 'index' column. errors='ignore' keeps this cell safe to re-run:
# without it a second execution raises KeyError because the column is
# already gone.
df_train = df_train.drop(columns='index', errors='ignore')

In [None]:
# Mean of the known ages (Series.mean skips NaN by default).
mean_age = df_train['age'].mean()
# Fill null values in the 'age' column with the mean age.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the column selection: that chained in-place form emits a FutureWarning in
# pandas 2.x and does not update df_train when Copy-on-Write is enabled.
df_train['age'] = df_train['age'].fillna(mean_age)

# Check if there are any missing values in the 'age' column after filling
missing_values = df_train['age'].isnull().sum()
print("Missing values in 'age' column after filling:", missing_values)


In [None]:
# Drop every row whose 'region' value is missing.
df_train.dropna(subset=['region'], inplace=True)

# Check if there are any missing values in the 'region' column after dropping
missing_values = df_train['region'].isnull().sum()
# The message says "dropping" because the rows are removed, not filled.
print("Missing values in 'region' column after dropping:", missing_values)

In [None]:
df_train.describe()

# Column types

## Numerical : Patient ID, age,bmi,bloodpressure,children,claim

## Categorical : gender,diabetic,smoker,region


In [None]:
df_train.sample(5)

# Univariate analysis

### Age

Conclusion:

-> Age has 5 missing values

-> data has two peaks and a skewness of 0.11, so it is distributed bimodally: two age groups account for most insurance claims, so a new column can be added to identify these age groups

-> there are no outliers

-> max age is recorded 60 and mean age is 38 which shows very old people have not been insured

In [None]:
df_train['age'].describe()

In [None]:
sns.displot(df_train['age'],kde=True)

In [None]:
df_train['age'].plot(kind='kde')

In [None]:
print('skewness in age is ' ,df_train['age'].skew())

In [None]:
df_train['age'].plot(kind='box')

### Feature engineering on age:
#### Categorizing age:

In [None]:
def age_categorizer(age):
    """Map an age in years to a coarse age-group label.

    Buckets are contiguous half-open ranges, so fractional ages (for example
    a mean-imputed age) always land in exactly one group:

    * 18 <= age < 24 -> "Young adult"
    * 24 <= age < 40 -> "Adults"
    * age >= 40      -> "Senior adults"

    Returns None for ages below 18 and for NaN.
    """
    # `not age >= 18` is also True for NaN, so missing ages yield None too.
    if not age >= 18:
        return None
    if age < 24:
        return "Young adult"
    if age < 40:
        return "Adults"
    return "Senior adults"
df_train['age_category']=df_train['age'].apply(age_categorizer)

analyzing age categories:

finding: -> 46% of persons are senior adults (age 40 and above) and 44.2% of persons are adults, which causes the two peaks in the data


In [None]:
# Pie chart of how many records fall into each age category.
age_count = df_train['age_category'].value_counts()
plt.pie(
    x=age_count.values,
    labels=age_count.index,
    autopct='%1.1f%%',
)
plt.show()

### bmi

Conclusion :

->distribution is perfectly normal

->  skewness is very minimal

-> boxplot indicates some serious outliers; a bmi above 50 means the person is very obese, which is potentially fatal

In [None]:
# sns.distplot is deprecated; a density-scaled histplot with a KDE overlay
# draws the equivalent figure.
sns.histplot(df_train['bmi'], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
df_train['bmi'].describe()

In [None]:
df_train['bmi'].plot(kind='kde')

In [None]:
df_train['bmi'].skew()

In [None]:
df_train['bmi'].plot(kind='box')

In [None]:
df_train[df_train['bmi']>35].count()

### Feature engineering on bmi (categorizing)

In [None]:
def bmi_categorizer(bmi):
    """Map a Body Mass Index value to a weight-category label.

    Buckets are contiguous half-open ranges, so values such as 24.9 or 29.9
    no longer fall through to the last branch:

    * bmi < 18.5       -> "Underweight"
    * 18.5 <= bmi < 25 -> "Healthy"
    * 25 <= bmi < 30   -> "Overweight"
    * 30 <= bmi < 40   -> "Obese"
    * bmi >= 40        -> "Very Obese"

    Returns None for NaN instead of labelling a missing value "Very Obese".
    """
    # NaN is the only value that is not equal to itself.
    if bmi != bmi:
        return None
    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Healthy"
    if bmi < 30:
        return "Overweight"
    if bmi < 40:
        return "Obese"
    return "Very Obese"

# Assuming you have a 'bmi' column in your DataFrame 'df_train'
df_train['bmi_category'] = df_train['bmi'].apply(bmi_categorizer)


### Analysis on bmi category feature

finding:                                                                                          
     -> Most of people almost 46% are obese i.e 615 persons                                                        
     -> 8.1% i.e 92 person were severely obese causing health problems


In [None]:
#plotting pie chart
bmi_count = df_train['bmi_category'].value_counts()
plt.pie(bmi_count,labels=bmi_count.index,autopct='%1.1f%%')
plt.show()

In [None]:
df_train[df_train['bmi']>40].count()

In [None]:
df_train[df_train['bmi_category']=='Healthy'].count()

In [None]:
df_train[df_train['bmi_category']=='Obese'].count()

### bloodpressure

conclusion:

-> data is highly positively skewed with skewness of 1.48

-> boxplot shows some serious outliers which is cause of positive skewness

-> 55 persons were counted with bp higher than 120, i.e. 55 persons with a blood-pressure spike

In [None]:
# sns.distplot is deprecated; a density-scaled histplot with a KDE overlay
# draws the equivalent figure.
sns.histplot(df_train['bloodpressure'], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
df_train['bloodpressure'].skew()

In [None]:
df_train['bloodpressure'].plot(kind='kde')

In [None]:
col=df_train[df_train['bloodpressure']>130]
col['bloodpressure'].plot(kind='hist')

In [None]:
df_train['bloodpressure'].describe()

In [None]:
df_train['bloodpressure'].plot(kind='box')

In [None]:
df_train[df_train['bloodpressure']<120]

In [None]:
df_train[df_train['bloodpressure']<120].count()

## Feature engineering on bloodpressure (categorizing):


In [None]:
df_train[df_train['bloodpressure']<80].count()

no one has low bloodpressure

In [None]:
def bp_categorizer(bp):
    """Map a blood-pressure reading to a severity label.

    Buckets are contiguous half-open ranges, so every reading lands in
    exactly one category:

    * bp < 80         -> "normal"
    * 80 <= bp < 90   -> "elevated"
    * 90 <= bp < 100  -> "High"
    * 100 <= bp < 120 -> "extremely high"
    * bp >= 120       -> "fatal"

    The previous ranges left gaps that fell through to "fatal": a reading of
    89 and any reading below 80 were both labelled "fatal". The old
    "very high" branch (99 < bp < 100) could never match an integer reading
    and is folded into "High".

    Returns None for NaN instead of labelling a missing value "fatal".
    """
    # NaN is the only value that is not equal to itself.
    if bp != bp:
        return None
    if bp < 80:
        return "normal"
    if bp < 90:
        return "elevated"
    if bp < 100:
        return "High"
    if bp < 120:
        return "extremely high"
    return "fatal"

# Derive the 'bp_category' column from the 'bloodpressure' column of 'df_train'
df_train['bp_category'] = df_train['bloodpressure'].apply(bp_categorizer)


### Let's find how many people have high bp and whether high bp might be their cause of death

### finding:                                         
        -> There is a high chance that many people have died due to high bloodpressure or are patients of hypertension
        -> 37.2% of people were suffering from stage 1 hypertension with bp higher than 90 (diastolic)
        -> 19.1% have stage 2 hypertension; everyone in this category has more than 100 mmHg (diastolic)
        -> 8.7% have higher than 120 and some also have higher than 140
        bloodpressure, which is fatal, and we can say a bp spike is the cause of
        death of these 117 persons
        -> Approx. 813 persons in total were suffering from hypertension, out of
        which 498 were on stage 1, 315 on stage 2, and 117 probably died due to
        it because they had bp higher than 120, and 55 persons had higher than 145

In [None]:
category_counts = df_train['bp_category'].value_counts()
plt.pie(category_counts,labels= category_counts.index,  autopct='%1.1f%%')
plt.show()

In [None]:
df_train[df_train['bloodpressure']>=90].count()

In [None]:
df_train[df_train['bloodpressure']>=100].count()

In [None]:
df_train[df_train['bp_category']=='fatal'].count()

### Claim:

conclusion:

-> highly skewed data with positive skewness of 1.5

-> there are more than 150 outliers which shows some people have been given more claim. there could be multiple reasons we will find in further analysis



In [None]:
sns.displot(df_train['claim'],kde=True)
plt.show()

In [None]:
df_train['claim'].plot(kind='kde')

In [None]:
df_train['claim'].skew()

In [None]:
df_train['claim'].describe()

In [None]:
df_train['claim'].plot(kind='box')

In [None]:
df_train[df_train['claim']>30000].count()

In [None]:
df_train[df_train['claim']<30000].count()

In [None]:
stats.iqr(df_train['claim'])

### Gender

Conclusion:

->  male and female are almost in same quantity

In [None]:
df_train['gender'].value_counts()

In [None]:
df_train['gender'].value_counts().plot(kind='bar')

In [None]:
df_train['gender'].value_counts().plot(kind='pie',autopct='%0.1f%%')

In [None]:
df_train['gender'].isnull().sum()

In [None]:
df_train.head()

### diabetic

conclusion:
-> most patients are non-diabetic, but the difference is small

In [None]:
df_train['diabetic'].value_counts()

In [None]:
df_train['diabetic'].value_counts().plot(kind='bar')

In [None]:
df_train['diabetic'].value_counts().plot(kind='pie',autopct='%0.1f%%')

### Children:

conclusion : most people are childless, and the number of children ranges up to 5

In [None]:
df_train['children'].value_counts()

In [None]:
df_train['children'].value_counts().plot(kind='bar')

In [None]:
df_train['children'].value_counts().plot(kind='pie',autopct='%0.1f%%')

### smoker

conclusion:
 80% percent people are not smokers

In [None]:
df_train['smoker'].value_counts()

In [None]:
df_train['smoker'].value_counts().plot(kind='bar')

In [None]:
df_train['smoker'].value_counts().plot(kind='pie',autopct='%0.1f%%')

### region

conclusion : most people are from southeast, then northwest, then southwest, then northeast

In [None]:
df_train['region'].value_counts()

In [None]:
df_train['region'].value_counts().plot(kind='bar')

In [None]:
df_train['region'].value_counts().plot(kind='pie',autopct='%0.1f%%')

In [None]:
df_train.sample(5)

# BiVARIATE ANALYSIS:

## Numerical vs Numerical

In [None]:
plt.scatter(df_train['age'],df_train['bmi'])
plt.title('age VS bmi relation')
plt.show()

### bloodpressure vs claim:

correlation between bloodpressure and claim is 0.53 so they are most related with respect to heatmap

finding:


      -> people having hypertension on stage 1 have low claim and they're in majority
      -> some people having fatal hypertension have high claim, so high bloodpressure can cause high claim

In [None]:
plt.scatter(df_train['bloodpressure'],df_train['claim'])
plt.title('bloodpressure vs claim')
plt.show()

### claim vs bp_category:

### finding:
->persons having fatal bp i.e severe hypertension have high claim so high bp caused high claim

In [None]:
sns.barplot(x=df_train['bp_category'],y=df_train['claim'])
plt.xlabel('bp_category')
plt.ylabel('claim')
plt.show()

### age and claim

no relation as corr is only 0.2 AND graphs are constant too

In [None]:
plt.scatter(df_train['age'],df_train['claim'])
plt.show()

In [None]:
correlation = df_train['claim'].corr(df_train['age'])
correlation

### Gender vs bloodpressure:

In [None]:
## plotting a barplot to find which gender had the fatal hypertension
sns.barplot(x=df_train['gender'],y=df_train['bloodpressure'])
plt.xlabel('gender')
plt.ylabel('bloodpressure')
plt.title('gender vs bloodpressure')
plt.show()

### gender vs bmi_category

In [None]:
table = pd.crosstab(df_train['gender'],df_train['bmi_category'],normalize=True)
print(table)

### finding:
 there is no much difference in male and female bmi_categories which means almost each categories has same amount of distribution of male and female

## smoker vs bp_category:

In [None]:

smoker_count= df_train['smoker'].value_counts()
smoker_count

In [None]:
pd.crosstab(df_train['smoker'],df_train['bp_category'],normalize=True)

## diabetic vs claim:

 Finding:

 Both diabetic and non-diabetic individuals show an equal distribution of claims, suggesting that diabetes does not appear to be a significant factor in determining claim frequency.

In [None]:

diabetic_count= df_train['diabetic'].value_counts()
diabetic_count

In [None]:
sns.barplot(x=df_train['diabetic'],y=df_train['claim'])

### bmi vs diabetic

### finding: diabetes is not affecting bmi

In [None]:
sns.barplot(x=df_train['diabetic'],y=df_train['bmi'])
plt.xlabel('diabetic')
plt.ylabel('bmi')
plt.title('bmi vs diabetic')
plt.show()

In [None]:
pd.crosstab(df_train['diabetic'],df_train['bmi_category'],normalize=True)

In [None]:
df_train.sample(5)

### Children vs claim:

### finding:
        
         -> so people having less children have more claim so we can say relation is inverse somehow

In [None]:
plt.scatter(df_train['children'],df_train['claim'])
plt.show()

## Smoker vs Claim:

finding:

-> smoker persons have high claim

In [None]:
sns.barplot(x=df_train['smoker'],y=df_train['claim'])
plt.xlabel('Smoker')
plt.ylabel('claim')
plt.title('Smoker vs claim')
plt.show()

## region vs claim:




### finding:
     -> people from northeast have more claim  however only 17.3% people belong from northeast region

In [None]:
sns.barplot(x=df_train['region'],y=df_train['claim'])
plt.xlabel('Region')
plt.ylabel('claim')
plt.title('Region vs claim')
plt.show()

# Multivariate analysis:

## Smoker vs gender vs claim

### finding:
       ->  both male and female smokers have  higher claims than non smokers

In [None]:
sns.barplot(x=df_train['gender'],y=df_train['claim'],hue=df_train['smoker'])
plt.legend()
plt.show()

## bp_category vs smoker vs claim

## finding:
       -> smoker persons having fatal hypertension have claimed the most

In [None]:
sns.barplot(x=df_train['bp_category'],y=df_train['claim'],hue=df_train['smoker'])
plt.legend()
plt.show()

In [None]:
df_train.sample(4)

## diabetic vs smoker vs claim

### finding:
     -> diabetic and non diabetic smokers both have high claim and diabetic or  
      non diabetic non smokers have low claim so diabetes didnt affect claim

In [None]:
sns.barplot(x=df_train['diabetic'],y=df_train['claim'],hue=df_train['smoker'])
plt.legend()
plt.show()

## bmi_category vs smoker vs claim

## finding:
        -> obese and very obese smoker have claimed high

In [None]:
sns.barplot(x=df_train['bmi_category'],y=df_train['claim'],hue=df_train['smoker'])
plt.legend()
plt.show()

## region vs smoker vs claim

### finding:
           -> southeast smokers have claimed the most

In [None]:
sns.barplot(x=df_train['region'],y=df_train['claim'],hue=df_train['smoker'])
plt.legend()
plt.show()

## bp_category vs bmi_category vs claim

## finding :           
          ->obese person with fatal hypertension and
            very obese persons with stage 2 hypertension have claimed the most

In [None]:
sns.barplot(x=df_train['bp_category'],y=df_train['claim'],hue=df_train['bmi_category'])
plt.legend()
plt.show()

### age_category vs smoker vs claim:

## finding:
       -> all smokers of any age category in dataset have claimed high

In [None]:
sns.barplot(x=df_train['age_category'],y=df_train['claim'],hue=df_train['smoker'])
plt.legend()
plt.show()

### region vs bp_category vs claim:

## finding:
      ->Northeastern people and southwestern people with fatal hypertension  have claimed high

In [None]:
sns.barplot(x=df_train['region'],y=df_train['claim'],hue=df_train['bp_category'])
plt.legend()
plt.show()

## children vs smoker vs claim:

## finding :     
       -> smokers with 2 and 3 children have claimed highest
       -> people with 4,5 children have claimed low whether smokers or non smokers

In [None]:
sns.barplot(x=df_train['children'],y=df_train['claim'],hue=df_train['smoker'])
plt.legend()
plt.show()

## children vs region vs claim:

## finding:
        -> northeastern  people with 3 children have claimed high

In [None]:
sns.barplot(x=df_train['region'],y=df_train['claim'],hue=df_train['children'])
plt.legend()
plt.show()

## children vs bp_category vs claim:

### finding:
   -> people having 2,3 children with stage 2 or fatal hypertension i.e bp above 100 and 120 respectively (diastolic) have claimed high.

In [None]:
sns.barplot(x=df_train['children'],y=df_train['claim'],hue=df_train['bp_category'])
plt.legend()
plt.show()

### children vs bmi_category vs claim:

## finding:
    -> people with 2,3 children who are obese or very obese i.e bmi above 30 have claimed high

In [None]:
sns.barplot(x=df_train['children'],y=df_train['claim'],hue=df_train['bmi_category'])
plt.legend()
plt.show()

# Final conclusion:

### northeastern and southeastern smoker persons suffering from hypertension at stage 2 or fatal (bp>100) and are obese or very obese(bmi>30) have claimed the highest