# Basic Statistics Concepts applied to an insurance company analysis

In [1]:
import pandas as pd
import matplotlib as plt

## Loading the data

In [2]:
# Read the insurance dataset from a CSV file located via a relative path.
df_insurance = pd.read_csv("../../data-csv/basic-statistics-concepts/insurance.csv")
# Preview the first 20 rows of the loaded frame.
df_insurance.head(20)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [3]:
# Print the column names, non-null counts and dtypes of the dataset.
df_insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## Summary statistics for each variable

In [4]:
# Overall average of the "charges" column, taken over every row of the dataset
mean = df_insurance.loc[:, "charges"].mean()
print(f"The mean of the charges for all classes is {mean:.2f} USDs.")


The mean of the charges for all classes is 13270.42 USDs.


In [5]:
# Average charges for smokers vs. non-smokers.
# Selecting with double brackets after grouping keeps the result a one-column DataFrame.
df_insurance.groupby("smoker")[["charges"]].mean()

Unnamed: 0_level_0,charges
smoker,Unnamed: 1_level_1
no,8434.268298
yes,32050.231832


In [6]:
# How much more, on average, are smokers charged compared with non-smokers?
smokers_means = df_insurance.groupby("smoker")[["charges"]].mean()
smoker_mean = smokers_means.at["yes", "charges"]
non_smoker_mean = smokers_means.at["no", "charges"]

# Relative difference, expressed as a fraction of the non-smoker mean
percent = (smoker_mean - non_smoker_mean)/non_smoker_mean
print("Smokers pay {:.2f}% more than non smokers on average.".format(percent*100))

Smokers pay 280.00% more than non smokers on average.


In [7]:
# Which gender smokes the most?
# Keep only the rows where smoker == "yes" and take the mode (most frequent value)
# of the "sex" column.
df_insurance.loc[df_insurance["smoker"] == "yes", "sex"].mode()

0    male
Name: sex, dtype: object

In [8]:
# How do the charges of male smokers compare with those of female smokers?

# Mean charges of the smokers, grouped by sex, stored in a DataFrame.
smokers_charges = (
    df_insurance.loc[df_insurance["smoker"] == "yes", ["sex", "charges"]]
    .groupby("sex")
    .mean()
)
smokers_charges

Unnamed: 0_level_0,charges
sex,Unnamed: 1_level_1
female,30678.996276
male,33042.005975


In [9]:
# Mean charges for each sex among the smokers
male_mean_charges = smokers_charges.at["male", "charges"]
female_mean_charges = smokers_charges.at["female", "charges"]

# Relative gap between male and female smokers, as a fraction of the female mean
percent = (male_mean_charges - female_mean_charges) / female_mean_charges
print(f"Male smokers pay {percent*100:.2f}% more than female smokers on average.")

Male smokers pay 7.70% more than female smokers on average.


In [10]:
# Range (max - min) of the charges paid by smokers.

# Select the smokers' charges once instead of re-filtering the frame per statistic
smoker_charges = df_insurance.loc[df_insurance["smoker"] == "yes", "charges"]

# Smallest and largest charge paid by a smoker
min_smokers = smoker_charges.min()
max_smokers = smoker_charges.max()

# Stored as `charges_range` rather than `range`: assigning to `range` would shadow
# the built-in for the rest of the kernel session and break any later `range(...)` call.
charges_range = max_smokers - min_smokers

print(f"Range: ${charges_range:.2f} ----> [${min_smokers:.2f}, ${max_smokers:.2f}]")

Range: $50940.97 ----> [$12829.46, $63770.43]


In [11]:
# Interquartile range (IQR) of the charges paid by smokers

# First quartile (25th percentile) of the smokers' charges
iqr_min_smokers = df_insurance.loc[df_insurance["smoker"] == "yes", "charges"].quantile(q=0.25)

# Third quartile (75th percentile) of the smokers' charges
iqr_max_smokers = df_insurance.loc[df_insurance["smoker"] == "yes", "charges"].quantile(q=0.75)

# IQR = Q3 - Q1
iqr_range = iqr_max_smokers - iqr_min_smokers

print(f"Inter Quantile Range (iqr): ${iqr_range:.2f} ----> [${iqr_min_smokers:.2f}, ${iqr_max_smokers:.2f}]")

Inter Quantile Range (iqr): $20192.96 ----> [$20826.24, $41019.21]


In [12]:
# Group the charges by region.
charges_region = df_insurance[["region", "charges"]].groupby("region")
# Descriptive statistics of the charges within each region.
charges_region.describe()

Unnamed: 0_level_0,charges,charges,charges,charges,charges,charges,charges,charges
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
northeast,324.0,13406.384516,11255.803066,1694.7964,5194.322288,10057.652025,16687.3641,58571.07448
northwest,325.0,12417.575374,11072.276928,1621.3402,4719.73655,8965.79575,14711.7438,60021.39897
southeast,364.0,14735.411438,13971.098589,1121.8739,4440.8862,9294.13195,19526.2869,63770.42801
southwest,325.0,12346.937377,11557.179101,1241.565,4751.07,8798.593,13462.52,52590.82939


In [13]:
# Disabled alternative check (counts of smokers vs. non-smokers):
# df_insurance["smoker"].value_counts()
# Show the first rows of the dataset again.
df_insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [14]:
# Number of smokers per region: keep only the smoker rows, then group them by region
df_smokers_per_region = (
    df_insurance.loc[df_insurance["smoker"] == "yes", ["smoker", "region"]]
    .groupby("region")
    .count()
)

# Total number of smokers in the dataset (denominator of the percentages below)
total_number_smokers = len(df_insurance.loc[df_insurance["smoker"] == "yes", ["smoker", "region"]])

# Share of all smokers found in each region, in percent
df_smokers_per_region["Percentage of smokers (%)"] = 100*df_smokers_per_region["smoker"] / total_number_smokers
df_smokers_per_region

Unnamed: 0_level_0,smoker,Percentage of smokers (%)
region,Unnamed: 1_level_1,Unnamed: 2_level_1
northeast,67,24.452555
northwest,58,21.167883
southeast,91,33.211679
southwest,58,21.167883


In [15]:
# Probability that a person picked from the southeast (the region with the most
# smokers) is a smoker.
se_smokers = df_insurance.loc[df_insurance["region"] == "southeast", "smoker"].value_counts()

# NOTE: despite its name, this is the sum of both value counts, i.e. the total number
# of people in the southeast (smokers and non-smokers).
number_smokers_se = se_smokers.sum()
print(number_smokers_se)

prob = se_smokers.loc["yes"] / number_smokers_se
print(f"Probability of choosing a smoker person in the southest region is {100*prob:.2f}%")

364
Probability of choosing a smoker person in the southest region is 25.00%


In [16]:
# Probability that a person picked from the whole dataset (all regions) is a smoker
# from the southeast.
total_number_persons = len(df_insurance)
print(total_number_persons)

prob = se_smokers.loc["yes"] / total_number_persons
print(f"Probability of chooseing a smoker person from southeast from all the persons in all regions is {100*prob:.2f}%")

1338
Probability of chooseing a smoker person from southeast from all the persons in all regions is 6.80%


## Correlation Matrix

In [17]:
def sex_int_function(sex_value):
    """Encode a sex label as an integer.

    Returns 1 for "male", 0 for "female" and -1 for any other value.
    """
    # "male" is tested first so that it takes precedence, as in the sequential checks
    if sex_value == "male":
        return 1
    if sex_value == "female":
        return 0
    return -1

In [18]:
# Keep only the smokers (copied so later edits do not touch df_insurance)
df_smokers = df_insurance.loc[df_insurance["smoker"] == "yes"].copy()

# Restrict to the smokers whose charges lie between the 25th and 75th percentiles
# (both bounds included) to reduce the effect of outlier values
central_smokers = df_smokers[
    df_smokers["charges"].between(
        df_smokers["charges"].quantile(q=0.25),
        df_smokers["charges"].quantile(q=0.75),
    )
].copy()

# Encode the gender as a number (0 -> female, 1 -> male) so it is included in the
# correlation matrix
central_smokers["sex_int"] = central_smokers["sex"].apply(sex_int_function)

# Pairwise correlations of the numeric columns, rendered as a color-graded table
corr = central_smokers.corr(numeric_only=True)
corr.style.background_gradient(cmap = "coolwarm").format(precision=2)

Unnamed: 0,age,bmi,children,charges,sex_int
age,1.0,-0.65,0.1,-0.53,-0.12
bmi,-0.65,1.0,-0.08,0.78,0.16
children,0.1,-0.08,1.0,-0.04,0.04
charges,-0.53,0.78,-0.04,1.0,0.12
sex_int,-0.12,0.16,0.04,0.12,1.0


In [19]:
# Show the first rows of the filtered smokers frame, including the sex_int column.
central_smokers.head()
# central_smokers["sex_int"].value_counts()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_int
11,62,female,26.29,0,yes,southeast,27808.7251,0
14,27,male,42.13,0,yes,southeast,39611.7577,1
19,30,male,35.3,0,yes,southwest,36837.467,1
23,34,female,31.92,1,yes,northeast,37701.8768,0
29,31,male,36.3,2,yes,southwest,38711.0,1
