# U.S. Medical Insurance Costs

In [8]:
import pandas as pd
import csv
import statistics as stats
from collections import Counter

In [3]:
# Read the dataset into a DataFrame and print its first five rows as a quick look at the columns.
df = pd.read_csv("insurance.csv")
print(df.head(n=5))

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


First, we will save the columns from insurance.csv in Python lists.

In [14]:
# Column-wise lists: index i across all seven lists describes the same CSV row.
ages = []
sexes = []
bmis = []
num_children = []
smoker_status = []
regions = []
charges = []

# newline="" is the csv module's recommended way to open a file handed to a reader.
with open("insurance.csv", newline="") as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # DictReader yields every field as a string, so numeric columns are parsed here.
        ages.append(int(row["age"]))
        sexes.append(row["sex"])
        bmis.append(float(row["bmi"]))
        # The number of children is a count, so parse it as an int like age.
        num_children.append(int(row["children"]))
        smoker_status.append(row["smoker"])
        regions.append(row["region"])
        charges.append(float(row["charges"]))

# Confirm data in lists
print("First 5 ages:", ages[:5])
print("First 5 BMI values:", bmis[:5])
print("First 5 charges:", charges[:5])

First 5 ages: [19, 18, 28, 33, 32]
First 5 BMI values: [27.9, 33.77, 33.0, 22.705, 28.88]
First 5 charges: [16884.924, 1725.5523, 4449.462, 21984.47061, 3866.8552]


Next we define a class which will contain methods to carry out analyses.

In [16]:
class InsuranceAnalysis:
    """Descriptive statistics over column-wise lists of insurance records.

    Each constructor argument is stored unchanged on the instance under the
    attribute of the same name; nothing is copied or validated.
    """

    def __init__(self, ages, sexes, bmis, num_children, smoker_statuses, regions, charges):
        """Store the seven column lists on the instance."""
        self.ages = ages
        self.sexes = sexes
        self.bmis = bmis
        self.num_children = num_children
        self.smoker_statuses = smoker_statuses
        self.regions = regions
        self.charges = charges

    @staticmethod
    def _numeric_summary(values):
        """Return summary statistics for a sequence of numbers.

        Shared by age_summary() and charges_summary() so both report the same
        set of statistics, computed the same way.

        Returns a dict with the keys "mean", "median", "stdev", "min", "max",
        "range", "Q1", "Q3" and "IQR". Mean, median, stdev and the quartile
        figures are rounded to 2 decimal places; min, max and range are
        returned unrounded.

        Raises statistics.StatisticsError when the statistics functions
        cannot be computed (for example, stdev needs at least two values).
        """
        # statistics.quantiles orders the data itself, so no pre-sorting is needed.
        # With n=4 it returns the three cut points Q1, median, Q3.
        q1, _, q3 = stats.quantiles(values, n=4, method='inclusive')
        lowest = min(values)
        highest = max(values)

        return {
            "mean": round(stats.mean(values), 2),
            "median": round(stats.median(values), 2),
            "stdev": round(stats.stdev(values), 2),
            "min": lowest,
            "max": highest,
            "range": highest - lowest,
            "Q1": round(q1, 2),
            "Q3": round(q3, 2),
            "IQR": round(q3 - q1, 2)
        }

    def age_summary(self):
        """Return the summary-statistics dict (see _numeric_summary) for self.ages."""
        return self._numeric_summary(self.ages)

    def count_sexes(self):
        """Return a Counter mapping each value in self.sexes to its number of occurrences."""
        return Counter(self.sexes)

    def average_bmi(self):
        """Return the mean of self.bmis, rounded to 2 decimal places."""
        return round(stats.mean(self.bmis), 2)

    def count_regions(self):
        """Return a Counter mapping each value in self.regions to its number of occurrences."""
        return Counter(self.regions)

    def charges_summary(self):
        """Return the summary-statistics dict (see _numeric_summary) for self.charges."""
        return self._numeric_summary(self.charges)

Let's create an instance of the `InsuranceAnalysis` class.

In [17]:
# Keyword arguments make the column-to-parameter mapping explicit
# (the smoker_status list feeds the smoker_statuses parameter).
analysis = InsuranceAnalysis(
    ages=ages,
    sexes=sexes,
    bmis=bmis,
    num_children=num_children,
    smoker_statuses=smoker_status,
    regions=regions,
    charges=charges,
)

Now we can view the summary statistics for the ages.

In [18]:
# Print the dict of age statistics (mean, median, stdev, min, max, range, Q1, Q3, IQR).
print(analysis.age_summary())

{'mean': 39.21, 'median': 39.0, 'stdev': 14.05, 'min': 18, 'max': 64, 'range': 46, 'Q1': 27.0, 'Q3': 51.0, 'IQR': 24.0}


The mean age (39.21) and median age (39.0) are almost identical, suggesting that the distribution of ages is roughly symmetric.
The middle 50% of people are between 27 and 51 years old (the interquartile range), with ages ranging from 18 to 64 overall.
This indicates that the dataset primarily consists of working-age adults.

In [19]:
# Print a Counter of how many records carry each value of the sex column.
print(analysis.count_sexes())

Counter({'male': 676, 'female': 662})


The two sexes are almost equally represented (676 male, 662 female).

In [20]:
# Print a Counter of how many records fall in each region.
print(analysis.count_regions())

Counter({'southeast': 364, 'southwest': 325, 'northwest': 325, 'northeast': 324})


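The dataset is roughly evenly distributed across the four U.S. regions, although the southeast has slightly more records (364, versus 324–325 for each of the others).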

In [21]:
# Print the dict of charges statistics (mean, median, stdev, min, max, range, Q1, Q3, IQR).
print(analysis.charges_summary())

{'mean': 13270.42, 'median': 9382.03, 'stdev': 12110.01, 'min': 1121.8739, 'max': 63770.42801, 'range': 62648.554110000005, 'Q1': 4740.29, 'Q3': 16639.91, 'IQR': 11899.63}


The standard deviation (\$12,110) is high, suggesting that charges vary a lot between people. The mean (\$13,270) is considerably higher than the median (\$9,382), suggesting that a small number of people pay very high charges, which pulls the mean up. This implies the charges are right-skewed.  
The middle 50% of people pay between about \$4,700 and \$16,600, but the maximum charge is much higher at \$63,770, so there may be some outliers.