In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file under the Kaggle input directory,
# then print one path per line.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/insurance/insurance.csv


# Problem Statement
Understand how factors such as BMI, number of children, and smoking status relate to hospitalization charges, and predict those charges from the patterns found in the other parameters.

# Understanding the data

- ****age**** : age of primary beneficiary
- ****sex**** : insurance contractor gender, female, male
- ****bmi**** : Body mass index, an objective index of body weight relative to height, computed as weight divided by height squared (kg / m ^ 2); it indicates whether weight is relatively high or low for a given height. Ideally 18.5 to 24.9
- ****children**** : Number of children covered by health insurance / Number of dependents
- ****smoker**** : Smoking
- ****region**** : the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.
- ****charges**** : Individual medical costs billed by health insurance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import levene

In [None]:
# Read the insurance records from the Kaggle input directory.
INSURANCE_CSV = '../input/insurance/insurance.csv'
data = pd.read_csv(INSURANCE_CSV)

# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

****There are 1338 medical records here.****

In [None]:
# Per-column dtype and non-null count summary.
data.info()

****Data has numerical as well as categorical data.****

In [None]:
# Summary statistics for every column (numeric and categorical),
# transposed so each original column becomes a row.
summary_stats = data.describe(include='all')
summary_stats.transpose()

### From the above table, we can say - 
- Mean age is around 39
- Mean BMI is 30.66
- Max number of children is 5
- Mean charges are about 12k but the median (50%) is about 9k, so the distribution is right-skewed and there is a hint of outliers

# Non Graphical Analysis

In [None]:
# Ten most frequent ages, most common first.
age_counts = data['age'].value_counts().sort_values(ascending=False)
age_counts.head(10)

****Most common ages that we can see is 18-20 and 45-52.****

In [None]:
# Number of records for each value of `sex`.
data['sex'].value_counts()

****Both sex shares almost same number of records.****

In [None]:
# Number of records for each value of `smoker`.
data['smoker'].value_counts()

****Number of smokers are less than non-smokers.****

In [None]:
# Number of records for each value of `region`.
data['region'].value_counts()

****Almost all the regions have same number of records.****

In [None]:
# Mean age within each sex; reset_index() shows `sex` as an ordinary column.
sex = data.groupby(['sex'])[['age']].agg(['mean'])
sex.reset_index()

****Mean age of both sexes are almost same.****

In [None]:
# Mean age for smokers vs non-smokers; reset_index() shows `smoker` as an ordinary column.
smoker = data.groupby(['smoker'])[['age']].agg(['mean'])
smoker.reset_index()

****Mean age of smokers and non-smokers are almost same.****

In [None]:
# Mean age within each region. The result gets its own name: the original
# stored it in `smoker`, which mislabelled the table and overwrote the
# smoker summary computed in the previous cell.
region_age = data.groupby(['region']).agg({'age':['mean']})
region_age.reset_index()

****Mean age across regions is almost same.****

# Univariate Analysis

In [None]:
# Histogram of ages (10 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['age'], bins=10, kde=True, ax=ax)
ax.set_title('Age')
plt.show()

****Age distribution is almost normal. People under 23 are more numerous than any other age group.****

In [None]:
# Bar chart of record counts for each `smoker` value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='smoker', data=data, ax=ax)
ax.set_title('Smoker v/s Non Smokers')
plt.show()

****As we saw earlier smokers data is less than non-smokers.****

In [None]:
# Bar chart of record counts for each `sex` value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='sex', data=data, ax=ax)
ax.set_title('Male v/s Female')
plt.show()

****Both sex have almost equal number of data.****

In [None]:
# BMI before outlier treatment: a thin box plot stacked above a histogram
# with KDE, both sharing one x-axis.
sns.set(style="darkgrid")
fig, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='bmi', data=data, ax=ax_box)
sns.histplot(x="bmi", data=data, kde=True, ax=ax_hist)
# The shared axis is labelled on the lower panel only.
ax_box.set_xlabel('')
plt.show()

## Treating Outliers

In [None]:
# Fences for BMI outliers: 1.5 * IQR below the first quartile and
# 1.5 * IQR above the third quartile.
Q1, Q3 = data['bmi'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, upper = Q1 - (1.5 * IQR), Q3 + (1.5 * IQR)

In [None]:
# Keep only rows whose BMI lies strictly between the lower and upper fences.
# `.copy()` makes the result an independent frame, so the later in-place
# assignment to `data['charges']` writes to a real frame rather than to a
# slice of the original (the situation the chained-assignment warning is about).
data = data[(data['bmi']>lower) & (data['bmi']<upper)].copy()

In [None]:
# BMI after outlier removal: a thin box plot stacked above a histogram
# with KDE, both sharing one x-axis.
sns.set(style="darkgrid")
fig, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='bmi', data=data, ax=ax_box)
sns.histplot(x="bmi", data=data, kde=True, ax=ax_hist)
# The shared axis is labelled on the lower panel only.
ax_box.set_xlabel('')
plt.show()

### From the above graph, we can say - 
- Avg BMI is around 30
- 75% of BMI is under 33
- 25% of BMI is less than 26

In [None]:
# Bar chart of record counts for each number of children.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='children', data=data, ax=ax)
ax.set_title('Number of children')
plt.show()

### From the above graphs, we can say - 
- Number of children 0,1,2,3 are more common
- Number of children 4 and 5 are almost equal and rare.

In [None]:
# Percentage of records per region, in value_counts() order (most frequent first).
type_value_count = data['region'].value_counts(normalize=True)*100

# Derive the wedge labels from the index of the counts being plotted.
# A hard-coded label list silently mislabels wedges whenever the frequency
# order differs from the assumed one (e.g. after rows are dropped, or when
# two regions have the same count).
region_names = {
    'southeast': 'South East',
    'northwest': 'North West',
    'southwest': 'South West',
    'northeast': 'North East',
}
pie_labels = [region_names.get(name, name) for name in type_value_count.index]

plt.figure(figsize=(10,6))
plt.pie(type_value_count,labels=pie_labels,autopct='%1.2f%%')
plt.title('Regions')
plt.show()

In [None]:
# Charges: a thin box plot stacked above a histogram with KDE,
# both sharing one x-axis.
sns.set(style="darkgrid")
fig, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='charges', data=data, ax=ax_box)
sns.histplot(x="charges", data=data, kde=True, ax=ax_hist)
# The shared axis is labelled on the lower panel only.
ax_box.set_xlabel('')
plt.show()

## Data is highly right skewed. We transform data to make it normal.

In [None]:
# Histogram (10 bins, with KDE) of the natural log of charges.
fig, ax = plt.subplots(figsize=(10, 6))
log_charges = np.log(data['charges'])
sns.histplot(log_charges, bins=10, kde=True, ax=ax)
ax.set_title('Charges')
plt.show()

****Performing log transformation we get almost normal.****

****Checking Normality using Q-Q Plot.****

In [None]:
# Q-Q plot of log(charges) against a normal distribution.
fig, ax = plt.subplots(figsize=(10, 6))
stats.probplot(np.log(data['charges']), dist="norm", plot=ax)
# Replace probplot's default title with our own.
ax.set_title('Q-Q plot for charges')
plt.show()

****As seen from the Q-Q plot the data is almost normal. Will perform Shapiro Wilk test once to confirm hypothesis.****

## Performing log transformation

In [None]:
# Overwrite `charges` with its natural logarithm; every later cell reads
# `data['charges']` on the log scale.
# NOTE: this cell is not idempotent - running it twice applies the log twice.
data['charges'] = np.log(data['charges'])

In [None]:
# Charges after the transformation above: a thin box plot stacked above a
# histogram with KDE, both sharing one x-axis.
sns.set(style="darkgrid")
fig, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='charges', data=data, ax=ax_box)
sns.histplot(x="charges", data=data, kde=True, ax=ax_hist)
# The shared axis is labelled on the lower panel only.
ax_box.set_xlabel('')
plt.show()

### From the above graph, we can say - 
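- Charges are now on a natural-log scale, so the x-axis shows log(charges) rather than amounts in thousands
- The distribution is centred at roughly 9 on the log scale, which corresponds to the median of about 9k seen earlier
- The long right tail of the raw charges is compressed by the transformation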

# Bivariate Analysis

In [None]:
# BMI against age, one line per sex.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=data, x='age', y='bmi', hue='sex', ax=ax)
ax.set_title('bmi across ages')
plt.show()

### From the above graph, we can say - 
- Males aged between 26 and 32 have significantly higher BMI than females in the same age range
- More or less, both sexes share the same trend in BMI throughout the various ages.

In [None]:
# Scatter of BMI (y) against charges (x).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='charges', y='bmi', ax=ax)
ax.set_title('Charges v/s BMI')
plt.show()

### From the above graph we can say - 
- Charge is almost uniformly distributed 
- Higher BMI have only high charge

In [None]:
# Box plot of BMI for each sex.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='sex', y='bmi', ax=ax)
ax.set_title('Gender v/s BMI')
plt.show()

****Almost both the sexes have same BMI with males having a slightly higher average BMI.****

In [None]:
# Box plot of age for each sex.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='sex', y='age', ax=ax)
ax.set_title('Gender v/s Age')
plt.show()

****Females mean age is more than male, but the age group spread is more on males.****

In [None]:
# Box plot of BMI for smokers and non-smokers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='smoker', y='bmi', ax=ax)
ax.set_title('Smoker v/s BMI')
plt.show()

****Average BMI is slightly more for smokers than non-smokers.****

In [None]:
# Box plot of charges for each number of children.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='children', y='charges', ax=ax)
plt.show()

### From the above graph, we can say
- Almost all the number of children have same average of charge.
- Number of children 0,1,2 may be low but the charges are still high.

In [None]:
# Within each smoker group, the percentage of records having each
# number of children, drawn as grouped bars.
fig, ax = plt.subplots(figsize=(10, 6))
children_share = (
    data.groupby(['smoker'])['children']
    .value_counts(normalize=True)
    .mul(100)
    .rename('Percent')
    .reset_index()
)
sns.barplot(data=children_share, x="smoker", y="Percent", hue='children', ax=ax)
ax.set_title('Number of children for Smokers')
plt.show()

****The Number of children across smoker and non smoker are almost same**** 

In [None]:
# Within each region, the percentage of records having each
# number of children, drawn as grouped bars.
fig, ax = plt.subplots(figsize=(10, 6))
children_share = (
    data.groupby(['region'])['children']
    .value_counts(normalize=True)
    .mul(100)
    .rename('Percent')
    .reset_index()
)
sns.barplot(data=children_share, x="region", y="Percent", hue='children', ax=ax)
ax.set_title('Number of children per Region')
plt.show()

****All the regions seem to have the same share of each number of children.****

In [None]:
# Annotated correlation heatmap over the numeric columns only.
# `data` also holds string columns (sex, smoker, region); calling
# DataFrame.corr() on the whole frame raises on pandas >= 2.0, so the
# numeric columns are selected explicitly (works on older pandas too).
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(),annot=True)
plt.show()

****We can see good correlation between the age and charges.****

In [None]:
# One row of pairwise plots with charges on the y-axis, coloured by smoker.
pair_grid = sns.pairplot(data=data, y_vars=['charges'], hue='smoker')
plt.show()

# Hypothesis Testing

****Setting up a function to return result on the basis of the significance value(0.05).****

In [None]:
def htResult(p_value, significance_level=0.05):
    """Print the hypothesis-test decision for a p-value.

    Parameters
    ----------
    p_value : float
        P-value produced by a statistical test.
    significance_level : float, optional
        Threshold at or below which the null hypothesis is rejected.
        Defaults to 0.05, the level used throughout this notebook.

    Raises
    ------
    ValueError
        If ``p_value`` is NaN. A NaN means the test could not be computed,
        and the comparison below would otherwise silently report
        "Fail to Reject".
    """
    # NaN is the only value that is not equal to itself.
    if p_value != p_value:
        raise ValueError('p_value is NaN; the test result is undefined')
    if p_value <= significance_level:
        print('Reject NULL HYPOTHESIS')
    else:
        print('Fail to Reject NULL HYPOTHESIS')

## Question 1 - 
To prove that the charges of people who smoke are greater than those of people who don't.

### Performing Right Tailed T-test

Null Hypothesis ( H0 ) - Charges are same for Smokers and Non-Smokers

Alternate Hypothesis ( HA ) - Charges are greater for smokers.

#### First Step - 
Checking for basic assumptions for the hypothesis

#### Second step- 
Performing Right tailed t-test

#### Third step - 
Checking for hypothesis result


## Checking for basic assumptions
- Normality check
- Homogeneity of Variances

### For normality check, using Q-Q Plot

In [None]:
# Q-Q plot of the `charges` column against a normal distribution.
fig, ax = plt.subplots(figsize=(10, 6))
stats.probplot(data['charges'], dist="norm", plot=ax)
# Replace probplot's default title with our own.
ax.set_title('Q-Q plot for charges')
plt.show()

****From the above graph we can say that the data points roughly fall along a straight diagonal line in the Q-Q plot, so the dataset likely follows a normal distribution.****

### Homogeneity of Variances using Levene's test

Null Hypothesis(H0) - Homogenous Variance

Alternate Hypothesis(HA) - Non Homogenous variance

In [None]:
# Split charges into smoker / non-smoker samples, then run Levene's test
# for equality of variances between the two.
is_smoker = data['smoker'] == 'yes'
is_non_smoker = data['smoker'] == 'no'
smokers = data.loc[is_smoker, 'charges']
non_smokers = data.loc[is_non_smoker, 'charges']
stat, p = levene(smokers, non_smokers)

In [None]:
# Show the p-value of the most recent test stored in `p`.
print('P-value :',p)

In [None]:
# Print the reject / fail-to-reject decision for `p`.
htResult(p)

****We reject null hypothesis, which means variance is not equal.****

## Performing Right tailed t-test

In [None]:
# Levene's test above rejected equal variances, so use Welch's t-test
# (equal_var=False) rather than the pooled-variance Student t-test.
# `p` stays the two-sided p-value because the next cell halves it again.
# NOTE: halving gives the right-tailed p-value only when `st` is positive
# (smokers' mean above non-smokers').
st,p = stats.ttest_ind(smokers,non_smokers,equal_var=False)
print('P-value :',(p/2))

In [None]:
# Decision for the one-tailed test: the two-sided p-value is halved first.
htResult(p/2)

## Answer
****We reject null hypothesis, which means the hospitalization charge for smokers is more than non-smoker.****

## Question 2 - 
To prove the BMI of females is different from that of males .

### Performing Two Tailed T-test

Null Hypothesis ( H0 ) - BMI for male and females are same.

Alternate Hypothesis ( HA ) - BMI for males and females are different.

#### First Step - 
Checking for basic assumptions for the hypothesis

#### Second step- 
Performing Two tailed t-test

#### Third step - 
Checking for hypothesis result

## Checking for basic assumptions
- Normality check
- Homogeneity of Variances

### For normality check, using Q-Q Plot

In [None]:
# Q-Q plot of BMI against a normal distribution.
fig, ax = plt.subplots(figsize=(10, 6))
stats.probplot(data['bmi'], dist="norm", plot=ax)
# Replace probplot's default title with our own.
ax.set_title('Q-Q plot for BMI')
plt.show()

****From the above graph we can say that the data points roughly fall along a straight diagonal line in the Q-Q plot, so the dataset likely follows a normal distribution.****

### Homogeneity of Variances using Levene's test

Null Hypothesis(H0) - Homogenous Variance

Alternate Hypothesis(HA) - Non Homogenous variance

In [None]:
# Split BMI into male / female samples, then run Levene's test
# for equality of variances between the two.
is_male = data['sex'] == 'male'
is_female = data['sex'] == 'female'
males = data.loc[is_male, 'bmi']
females = data.loc[is_female, 'bmi']
stat, p = levene(males, females)

In [None]:
# Show the p-value of the most recent test stored in `p`.
print('P-value :',p)

In [None]:
# Print the reject / fail-to-reject decision for `p`.
htResult(p)

****We fail to reject null hypothesis, which means the variance is same.****

## Performing two tailed t-test

In [None]:
# Two-tailed independent-samples t-test on male vs female BMI, using the
# pooled-variance form (SciPy's default, spelled out here).
st, p = stats.ttest_ind(males, females, equal_var=True)
print('P-value :', p)

In [None]:
# Print the reject / fail-to-reject decision for `p`.
htResult(p)

## Answer
****We fail to reject null hypothesis, which means the bmi for females is not different from males.****

## Question 3 - 
To check if the proportion of smoking is significantly different across different regions.

### Performing Chi-Square test

Null Hypothesis ( H0 ) - Proportion of smoking is equal across different regions.

Alternate Hypothesis ( HA ) - Proportion of smoking is different across different regions.


#### First step- 
Performing chi-square test

#### Second step - 
Checking for hypothesis result

In [None]:
# Contingency table of smoker status (rows) by region (columns).
cont = pd.crosstab(data['smoker'],data['region'])
# Use the table's full array of counts. The original rebuilt it row by row
# with a hard-coded `0:4` column slice, which would silently drop data if
# the table ever had more than two rows or four columns.
value = cont.to_numpy()

### Performing chi-square test

In [None]:
# Chi-square test of independence on the smoker-by-region counts.
chi2_result = stats.chi2_contingency(value)
c, p, dof, expected = chi2_result
print('P-value :', p)

In [None]:
# Print the reject / fail-to-reject decision for `p`.
htResult(p)

## Answer
****We fail to reject null hypothesis, which means the proportion of smoking is same across different regions.****

## Question 4 - 
To check if the mean BMI of women with 0 children, 1 child, and 2 children is the same.

### Performing One-way Anova

Null Hypothesis ( H0 ) - Mean BMI for females of children 0,1,2 is same.

Alternate Hypothesis ( HA ) - Mean BMI for females of children 0,1,2 is different.

#### First Step - 
Checking for basic assumptions for the hypothesis

#### Second step- 
Performing One-way Anova

#### Third step - 
Checking for hypothesis result


## Checking for basic assumptions
- Normality check
- Homogeneity of Variances

****Normality check for BMI is done and found out to be normal.****

### Homogeneity of Variances with Levene's test

Null Hypothesis(H0) - Homogenous Variance

Alternate Hypothesis(HA) - Non Homogenous variance

In [None]:
# BMI of female beneficiaries, one sample per number of children (0-5).
female = data.loc[data['sex'] == 'female']
fem_ch_0 = female.loc[female['children'] == 0, 'bmi']
fem_ch_1 = female.loc[female['children'] == 1, 'bmi']
fem_ch_2 = female.loc[female['children'] == 2, 'bmi']
fem_ch_3 = female.loc[female['children'] == 3, 'bmi']
fem_ch_4 = female.loc[female['children'] == 4, 'bmi']
fem_ch_5 = female.loc[female['children'] == 5, 'bmi']

In [None]:
# Levene's test for equal BMI variance across the three groups named in
# Question 4 (women with 0, 1 and 2 children). The original also passed the
# 3-, 4- and 5-child groups, so it checked the assumption for a different
# comparison than the one being asked.
stat,p = levene(fem_ch_0,fem_ch_1,fem_ch_2)
print('P-value :',p)

In [None]:
# Print the reject / fail-to-reject decision for `p`.
htResult(p)

****We fail to reject null hypothesis, which means the variance is same across the groups.****

## Performing One-Way Anova

In [None]:
# One-way ANOVA on BMI across the three groups named in Question 4 (women
# with 0, 1 and 2 children). The original also included the 3-, 4- and
# 5-child groups, which tests a different hypothesis from the one stated.
stat,p = stats.f_oneway(fem_ch_0,fem_ch_1,fem_ch_2)
print('P-value :',p)

In [None]:
# Print the reject / fail-to-reject decision for `p`.
htResult(p)

## Answer
****We fail to reject null hypothesis, which means the mean BMI for females with 0, 1, or 2 children is the same.****

# Recommendations -
- As we can observe the smokers in general have a higher charges so we can create awareness around to stop smoking as it is not at all pocket friendly.
- Women with any number of children have almost same BMI as observed by hypothesis testing, basic awareness around family planning can be provided to keep them from facing financial issues.
- With increasing age the charges too increase, so we can promote a healthy living in the middle ages to avoid these charges in the later stage of life.