# U.S. Medical Insurance Costs

In [2]:
import csv

insurance_array = list()

# Pull every record of insurance.csv into memory. DictReader yields one dict
# per data row, keyed by the header names, with all values left as strings.
with open("insurance.csv", newline='') as insurance_csv:
  insurance_data = csv.DictReader(insurance_csv)
  insurance_array.extend(insurance_data)

# Quick sanity check: the record count plus two sample records.
print(len(insurance_array), "\n", insurance_array[0], "\n", insurance_array[1200])
    


1338 
 {'age': '19', 'sex': 'female', 'bmi': '27.9', 'children': '0', 'smoker': 'yes', 'region': 'southwest', 'charges': '16884.924'} 
 {'age': '37', 'sex': 'male', 'bmi': '24.32', 'children': '2', 'smoker': 'no', 'region': 'northwest', 'charges': '6198.7518'}


### Make sorted dictionaries (sex, region, age, smoker, children)

In [12]:
def the_sort_machine(dataset, category):
    """Group the records of ``dataset`` by their value under ``category``.

    Returns a dict that maps each distinct ``record[category]`` value to the
    list of records carrying that value, in the order they were encountered.
    """
    grouped = {}
    for record in dataset:
        # setdefault creates the bucket the first time a value is seen
        grouped.setdefault(record[category], []).append(record)
    return grouped

# One grouping per column of interest. The group keys are the raw CSV strings,
# e.g. sort_by_sex['male'], sort_by_region['southeast'], sort_by_age['19'],
# sort_by_smoker['yes'], sort_by_children['4'].
sort_by_sex, sort_by_region, sort_by_age, sort_by_smoker, sort_by_children = (
    the_sort_machine(insurance_array, category)
    for category in ('sex', 'region', 'age', 'smoker', 'children')
)

### Average BMI of Female and Male persons in dataset

In [4]:
#Find average BMI for males and females
def average_bmi(dataset):
    """Return the mean ``'bmi'`` of the records in ``dataset``, rounded to 4 places.

    Each record's ``'bmi'`` value is converted with ``float`` before it is
    added to the running total. An empty ``dataset`` raises ZeroDivisionError.
    """
    record_count = len(dataset)

    bmi_sum = 0
    for record in dataset:
        bmi_sum += float(record['bmi'])

    return round(bmi_sum / record_count, 4)

# Mean BMI of each sex, taken from the per-sex grouping.
avg_female_bmi, avg_male_bmi = (
    average_bmi(sort_by_sex[sex]) for sex in ('female', 'male')
)

print(f"Average Female BMI: {avg_female_bmi}\nAverage Male BMI: {avg_male_bmi}")


Average Female BMI: 30.3777
Average Male BMI: 30.9431


### Average Cost: Sex, Region, Age, # of Children, Smoker Status

In [5]:
def average_cost_by_category(dataset):
    """Average the ``'charges'`` of the records in each group of ``dataset``.

    Args:
        dataset: Mapping of group label -> list of record dicts. Every
            record's ``'charges'`` value must be convertible with ``float``.

    Returns:
        dict mapping every group label to its mean charge rounded to 2
        decimals, or ``None`` for a group that holds no records.
    """
    averages_dictionary = dict()

    for label, records in dataset.items():
        if not records:
            # An average over zero records is undefined. Reporting None keeps
            # the other groups' results instead of raising ZeroDivisionError.
            averages_dictionary[label] = None
            continue

        total_cost = 0
        for record in records:
            total_cost += float(record['charges'])

        averages_dictionary[label] = round(total_cost / len(records), 2)

    return averages_dictionary

# Mean charges for each of the groupings built earlier.
cost_by_sex = average_cost_by_category(sort_by_sex)
cost_by_region = average_cost_by_category(sort_by_region)
cost_by_age = average_cost_by_category(sort_by_age)
cost_by_children = average_cost_by_category(sort_by_children)
cost_by_smoker = average_cost_by_category(sort_by_smoker)

# Show the results; for age only a single sample age is printed.
print(cost_by_sex)
print(cost_by_region)
print(cost_by_age['22'])
print(cost_by_children)
print(cost_by_smoker)

{'female': 12569.58, 'male': 13956.75}
{'southwest': 12346.94, 'southeast': 14735.41, 'northwest': 12417.58, 'northeast': 13406.38}
10012.93
{'0': 12365.98, '1': 12731.17, '3': 15355.32, '2': 15073.56, '5': 8786.04, '4': 13850.66}
{'yes': 32050.23, 'no': 8434.27}


Here we can see that:
  1. Females average about $1,400 less in insurance charges than males
  2. The Southeast region has the highest average insurance charges
  3. People aged 22 average $10,012.93 in insurance charges
  4. Those who smoke have almost 4x the insurance charges of those who do not

## Sort into BMI Groups
Using information from the CDC[^1], there are four ranges they use to determine weight status when looking at BMIs. They are as follows:
  1. Underweight range: BMI < 18.5
  2. Healthy Weight range: BMI >= 18.5 and BMI <= 24.9
  3. Overweight range: BMI >= 25.0 and BMI <= 29.9
  4. Obese range: BMI >= 30.0

These are the four categories the data is sorted into below; each number corresponds to one weight range.

[^1]:https://www.cdc.gov/healthyweight/assessing/index.html

In [6]:
# Cut points for the four weight ranges, compared against the BMI after it
# has been rounded to one decimal place:
#   1 -> lower bound of "healthy weight" (anything below is "underweight")
#   2 -> upper bound of "healthy weight"
#   3 -> upper bound of "overweight"
#   4 -> lower bound of "obese"
bmi_classification = {
    1: 18.5,
    2: 24.9,
    3: 29.9,
    4: 30.0
}

# Records bucketed by weight status
bmi_groups = {"underweight":[], "healthy weight":[], "overweight":[], "obese":[]}


for row in insurance_array:
    bmi = round(float(row['bmi']), 1)

    # Pick the bucket first, then append once. A rounded value matching none
    # of the ranges is left out of every group.
    if bmi < bmi_classification[1]:
        weight_status = "underweight"
    elif bmi_classification[1] <= bmi <= bmi_classification[2]:
        weight_status = "healthy weight"
    elif bmi_classification[2] <= bmi <= bmi_classification[3]:
        weight_status = "overweight"
    elif bmi >= bmi_classification[4]:
        weight_status = "obese"
    else:
        weight_status = None

    if weight_status is not None:
        bmi_groups[weight_status].append(row)

average_cost_by_bmi_class = average_cost_by_category(bmi_groups)
print(average_cost_by_bmi_class) 

# Group sizes, for context next to the averages
for g, members in bmi_groups.items():
    num = len(members)

    print(f"# of {g} in dataset: {num}")

{'underweight': 8852.2, 'healthy weight': 10379.5, 'overweight': 11000.08, 'obese': 15552.34}
# of underweight in dataset: 20
# of healthy weight in dataset: 222
# of overweight in dataset: 389
# of obese in dataset: 707


Now let's take a deeper look into these numbers. While we already know smoking correlates to higher insurance charges, I want to see how smokers compare to non-smokers in each BMI group.

In [7]:
smoker_dict = dict()

# For every BMI group, average the charges of smokers and of non-smokers
# separately.
for g, members in bmi_groups.items():
    charge_totals = {'yes': 0, 'no': 0}
    head_counts = {'yes': 0, 'no': 0}

    for row in members:
        status = row['smoker']
        # Only rows marked 'yes' or 'no' are counted; anything else is ignored
        if status in head_counts:
            head_counts[status] += 1
            charge_totals[status] += float(row['charges'])

    average_non_smoker = round((charge_totals['no'] / head_counts['no']), 2)
    average_smoker = round((charge_totals['yes'] / head_counts['yes']), 2)
    smoker_dict[g] = {'yes': average_smoker, 'no': average_non_smoker}

print("Comparing smoker status and BMI grouping to average insurance charges:")
for r, averages in smoker_dict.items():
    smoking_cost = averages['yes']
    non_smoking_cost = averages['no']

    print(r.capitalize(), f"\nSmoker: {smoking_cost} | Non-smoker: {non_smoking_cost}")
    
    


Comparing smoker status and BMI grouping to average insurance charges:
Underweight 
Smoker: 18809.82 | Non-smoker: 5532.99
Healthy weight 
Smoker: 19942.22 | Non-smoker: 7599.64
Overweight 
Smoker: 22495.87 | Non-smoker: 8299.48
Obese 
Smoker: 41557.99 | Non-smoker: 8842.69


### Average Smoking vs Non-Smoking costs by Category
I want to turn the BMI-group comparison above into a function, so that average smoker and non-smoker costs can also be checked against age, sex, region, and child count.

In [8]:
def average_smoking_cost_by_category(dataset):
    """Average charges of smokers and of non-smokers within each group.

    Args:
        dataset: Mapping of group label -> list of record dicts. Each record
            needs a ``'smoker'`` value; when that value is ``'yes'`` or
            ``'no'`` its ``'charges'`` value must be convertible with
            ``float``. Records with any other ``'smoker'`` value are ignored.

    Returns:
        dict of ``{label: {'yes': average, 'no': average}}`` with averages
        rounded to 2 decimals. A side with no matching records in a group is
        reported as ``None``.
    """
    smoker_dict = dict()

    for label, records in dataset.items():
        totals = {'yes': 0, 'no': 0}
        counts = {'yes': 0, 'no': 0}

        for row in records:
            status = row['smoker']
            if status in counts:
                counts[status] += 1
                totals[status] += float(row['charges'])

        # Small groups (e.g. a single age) may have no smokers or no
        # non-smokers. Report None for that side rather than dividing by zero.
        smoker_dict[label] = {
            status: round(totals[status] / counts[status], 2) if counts[status] else None
            for status in ('yes', 'no')
        }

    return smoker_dict

# Smoker vs. non-smoker averages for each grouping built earlier.
smoker_sex = average_smoking_cost_by_category(sort_by_sex)
smoker_region = average_smoking_cost_by_category(sort_by_region)
smoker_children = average_smoking_cost_by_category(sort_by_children)
smoker_age = average_smoking_cost_by_category(sort_by_age)

# Region and age are shown for a single sample key only.
print(smoker_sex)
print(smoker_region['southeast'])
print(smoker_children)
print(smoker_age['21'])

{'female': {'yes': 30679.0, 'no': 8762.3}, 'male': {'yes': 33042.01, 'no': 8087.2}}
{'yes': 34845.0, 'no': 8032.22}
{'0': {'yes': 31341.36, 'no': 7611.79}, '1': {'yes': 31822.65, 'no': 8303.11}, '3': {'yes': 32724.92, 'no': 9614.52}, '2': {'yes': 33844.24, 'no': 9493.09}, '5': {'yes': 19023.26, 'no': 8183.85}, '4': {'yes': 26532.28, 'no': 12121.34}}
{'yes': 16650.61, 'no': 3813.53}
