# U.S. Medical Insurance Costs

In [125]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# explore the data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


We found that this data contains 1338 rows with 7 cols.
There's no null value in this dataset.

In [20]:
# explore the decriptive statistic of the dataset

df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [11]:
# Now, let's calculate the mean of bmi, age and charges of this dataset

avg_bmi = df['bmi'].mean()
avg_age = df['age'].mean()
avg_charge = df['charges'].mean()

print(f'The average ages are {avg_age} years, the average bmis are {avg_bmi} and the average charges are {avg_charge} dollars')

The average ages are 39.20702541106129 years, the average bmis are 30.663396860986538 and the average charges are 13270.422265141257 dollars


In [17]:
# let's find the number of smokers and non-smokers
df['smoker'].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

Smoker people equal to 20.47% in this dataset

In [18]:
# let's find the number of each gender
df['sex'].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [23]:
# let's compare the average charges between smoker and non-smoker

smoke_charges = df.groupby('smoker')['charges'].mean().reset_index()
sex_charges = df.groupby('sex')['charges'].mean().reset_index()
smoke_charges

Unnamed: 0,smoker,charges
0,no,8434.268298
1,yes,32050.231832


In [60]:
avg_smoker_charges = df[df['smoker'] == 'yes']['charges'].mean()
compare_smoker_charges = ((avg_smoker_charges - avg_charge)/avg_charge)*100
print(f'The charges for smokers are higher than average charges about {compare_smoker_charges} %')

The charges for smokers are higher than average charges about 141.51629233172477 %


In [24]:
sex_charges

Unnamed: 0,sex,charges
0,female,12569.578844
1,male,13956.751178


The average charges of male are higher than than the average charges!!

In [32]:
smoke_sex_charges = df.groupby(['sex', 'smoker'])['charges'].mean().reset_index()
smoke_sex_charges

Unnamed: 0,sex,smoker,charges
0,female,no,8762.2973
1,female,yes,30678.996276
2,male,no,8087.204731
3,male,yes,33042.005975


From the data above it shows that
> The charges of the smokers are higher than the average charges both male and female.
> The average charges of the non-smoker: the female's charges are a bit higher than the male's charges.
Why? may be the number of the children?

Let's find out!!

In [71]:
# create the new col to see that there's childre or not

df['have_children'] = df['children'].apply(lambda x: 'yes' if x > 0 else 'no')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,have_children
0,19,female,27.9,0,yes,southwest,16884.924,no
1,18,male,33.77,1,no,southeast,1725.5523,yes
2,28,male,33.0,3,no,southeast,4449.462,yes
3,33,male,22.705,0,no,northwest,21984.47061,no
4,32,male,28.88,0,no,northwest,3866.8552,no


In [79]:
# let's answer our question before...

nonsmoke_child = df[df['smoker'] == 'no']
nonsmoke_child.groupby(['have_children'])['charges'].mean().reset_index()

Unnamed: 0,have_children,charges
0,no,7611.793335
1,yes,9058.2617


In [80]:
nonsmoke_child.groupby(['have_children', 'sex'])['charges'].mean().reset_index()

Unnamed: 0,have_children,sex,charges
0,no,female,7688.318863
1,no,male,7530.806677
2,yes,female,9577.277721
3,yes,male,8509.234548


Bingo!! The average charges of the people who have children are higher than the one who not.
And the charges for non-smoker female who have children are higher than male, too.

In [91]:
# take a look at each region

df['region'].value_counts()

southeast    364
northwest    325
southwest    325
northeast    324
Name: region, dtype: int64

In [92]:
df.groupby(['sex', 'region'])['sex'].count()

sex     region   
female  northeast    161
        northwest    164
        southeast    175
        southwest    162
male    northeast    163
        northwest    161
        southeast    189
        southwest    163
Name: sex, dtype: int64

In [93]:
df.groupby('region')['charges'].mean().reset_index()

Unnamed: 0,region,charges
0,northeast,13406.384516
1,northwest,12417.575374
2,southeast,14735.411438
3,southwest,12346.937377


Most of the data are from the southeast region!
And in the Southeast region, they've the most number of male and female.
And the average charges are the highest in the southeast region, too.

But! The northeast, which have the least number of people, is in the 2nd rank of the average charges.

In [99]:
df.groupby(['region', 'smoker'])['smoker'].count()

region     smoker
northeast  no        257
           yes        67
northwest  no        267
           yes        58
southeast  no        273
           yes        91
southwest  no        267
           yes        58
Name: smoker, dtype: int64

In [102]:
df.groupby(['region', 'have_children'])['have_children'].count()

region     have_children
northeast  no               147
           yes              177
northwest  no               132
           yes              193
southeast  no               157
           yes              207
southwest  no               138
           yes              187
Name: have_children, dtype: int64

Most of the smokers are live in the southeast region
and most of the people that have children are live in the southeast region, too.

In [114]:
number_children = df.groupby('region')['children'].sum().sort_values(ascending=False).reset_index()
number_children

Unnamed: 0,region,children
0,southeast,382
1,northwest,373
2,southwest,371
3,northeast,339


In [115]:
region_children = df[df['have_children'] == 'yes'].groupby('region')['have_children'].count().reset_index()
region_children

Unnamed: 0,region,have_children
0,northeast,177
1,northwest,193
2,southeast,207
3,southwest,187


In [116]:
ratio_children = pd.merge(region_children, number_children, how='inner', on='region')
ratio_children['children per person'] = ratio_children['have_children']/ratio_children['children']
ratio_children

Unnamed: 0,region,have_children,children,children per person
0,northeast,177,339,0.522124
1,northwest,193,373,0.517426
2,southeast,207,382,0.541885
3,southwest,187,371,0.504043


We have many children in the southeast region. There're 382 childrens from 207 people in the southeast region.
And the most ratio for child per person.

In [117]:
df.groupby('sex')['bmi'].mean()

sex
female    30.377749
male      30.943129
Name: bmi, dtype: float64

In [120]:
female_bmi = df[df['sex'] == 'female']['bmi'].mean()
male_bmi = df[df['sex'] == 'male']['bmi'].mean()

pct_female_bmi = ((avg_bmi - female_bmi)/avg_bmi)*100
pct_male_bmi = ((avg_bmi - male_bmi)/avg_bmi)*100

print(f'The average female bmi is {pct_female_bmi}% from the average bmi')
print(f'The average male bmi is {pct_male_bmi}% from the average bmi')

The average female bmi is 0.931558944915683% from the average bmi
The average male bmi is -0.9122663040447463% from the average bmi
