# U.S. Medical Insurance Costs

 Goals:
1. Determine whether the collected data is balanced. (sex, age, region)
2. Compare insurance costs across different metrics. (age, sex, bmi, smoker, region)
3. Find average age for:
    - males
    - females
    - someone with at least one child
    - someone with no children
    - smokers
    - non-smokers
    - people in each region
4. Find average BMI per region
5. Find average insurance cost for 12 BMI ranges (<20, 20-22, 22-24, 24-26, 26-28, 28-30, 30-32, 32-34, 34-36, 36-38, 38-40, >40)

In [1]:
import csv

insurance_data_dicts = []
fieldnames = []

# Load every row of the insurance CSV into memory as a list of dicts keyed by
# the header names. newline='' is what the csv module documents for files
# handed to a reader, so quoted fields with embedded newlines parse correctly.
with open('insurance.csv', newline='') as insurance_csv:
    reader = csv.DictReader(insurance_csv)
    insurance_data_dicts = list(reader)
    # Take the header from the reader itself rather than from the first row,
    # so a file with a header but no data rows does not raise IndexError.
    fieldnames = list(reader.fieldnames or [])

### Analyzing Age Data
1. Find average age across dataset
2. Find percentage of people in each decade bucket
3. Find average age for each sex
4. Find average age for someone with at least one child
5. Find average age for someone with no children
6. Find average age for smokers and non-smokers
7. Find average age for people in each region

In [2]:
def get_mean(data_list):
    """Return the arithmetic mean of the numbers in data_list.

    Raises ZeroDivisionError when data_list is empty.
    """
    total = sum(data_list)
    count = len(data_list)
    return total / count

def get_median(data_list):
    """Return the median of the numbers in data_list.

    For an odd number of values the single middle value of the sorted data is
    returned unchanged; for an even number the two middle values are averaged,
    so the result is a float.

    Raises IndexError when data_list is empty.
    """
    sorted_data = sorted(data_list)
    n = len(sorted_data)
    midpoint = n // 2
    if n % 2:
        # Odd count: exactly one element sits in the middle.
        return sorted_data[midpoint]
    # Even count: the median lies halfway between the two central elements.
    return (sorted_data[midpoint - 1] + sorted_data[midpoint]) / 2

def get_max_min(data_list):
    """Return a (largest, smallest) tuple for the values in data_list."""
    largest = max(data_list)
    smallest = min(data_list)
    return largest, smallest

# Summary statistics for the age column; the values are converted with int()
# before any arithmetic is done on them.
ages = [int(record['age']) for record in insurance_data_dicts]
avg_age = get_mean(ages)
median_age = get_median(ages)
max_age, min_age = get_max_min(ages)
print(
    f"Average age: {avg_age}\n"
    f"Median Age: {median_age}\n"
    f"Max Age: {max_age}\n"
    f"Min Age: {min_age}"
)

Average age: 39.20702541106129
Median Age: 39
Max Age: 64
Min Age: 18


In [3]:
def get_percentage_each_decade(age_list):
    """Return the share of ages falling in each decade, as percentages.

    Keys are the decade's lower bound (e.g. 20 for ages 20-29), in order of
    first appearance in age_list; values are percentages of len(age_list).
    An empty age_list yields an empty dict.
    """
    total = len(age_list)
    counts = {}
    for age in age_list:
        decade = (age // 10) * 10
        if decade in counts:
            counts[decade] += 1
        else:
            counts[decade] = 1

    return {decade: count * 100 / total for decade, count in counts.items()}

# Report the decades in ascending order. The dict itself is keyed in order of
# first appearance in the data, which would otherwise print the decades out of
# sequence (e.g. the 60s before the 50s).
age_buckets = get_percentage_each_decade(ages)
for bucket in sorted(age_buckets):
    print(f"{age_buckets[bucket]:.2f}% of data collected is from people in their {bucket}s")

10.24% of data collected is from people in their 10s
20.93% of data collected is from people in their 20s
19.21% of data collected is from people in their 30s
20.85% of data collected is from people in their 40s
8.52% of data collected is from people in their 60s
20.25% of data collected is from people in their 50s


In [5]:
# Partition the rows by sex; any row not labelled 'male' is treated as female.
male_data = [row for row in insurance_data_dicts if row['sex'] == 'male']
female_data = [row for row in insurance_data_dicts if row['sex'] != 'male']

male_ages = [int(row['age']) for row in male_data]
female_ages = [int(row['age']) for row in female_data]
male_avg_age = get_mean(male_ages)
female_avg_age = get_mean(female_ages)

print(f"The average age for males in the dataset is {male_avg_age} in a sample of {len(male_data)} males")
print(f"The average age for females in the dataset is {female_avg_age} in a sample of {len(female_data)} females\n")

# Collect ages separately for people with at least one child and for people
# with exactly zero children, then average each group.
ages_with_children = []
ages_without_children = []
for record in insurance_data_dicts:
    num_children = int(record['children'])
    if num_children > 0:
        ages_with_children.append(int(record['age']))
    elif num_children == 0:
        ages_without_children.append(int(record['age']))

has_child_avg_age = get_mean(ages_with_children)
has_no_child_avg_age = get_mean(ages_without_children)
print(f"The average age for people with children is {has_child_avg_age}")
print(f"The average age for people without children is {has_no_child_avg_age}\n")

# Partition the rows by smoking status; any row whose 'smoker' value is not
# 'yes' is treated as a non-smoker.
smoker_data = [row for row in insurance_data_dicts if row['smoker'] == 'yes']
non_smoker_data = [row for row in insurance_data_dicts if row['smoker'] != 'yes']

smoker_ages = [int(row['age']) for row in smoker_data]
non_smoker_ages = [int(row['age']) for row in non_smoker_data]
smoker_avg_age = get_mean(smoker_ages)
non_smoker_avg_age = get_mean(non_smoker_ages)
print(f"The average age for smokers in the dataset is {smoker_avg_age}")
print(f"The average age for non-smokers in the dataset is {non_smoker_avg_age}")

The average age for males in the dataset is 38.917159763313606 in a sample of 676 males
The average age for females in the dataset is 39.503021148036254 in a sample of 662 females

The average age for people with children is 39.78010471204188
The average age for people without children is 38.444250871080136

The average age for smokers in the dataset is 38.51459854014598
The average age for non-smokers in the dataset is 39.38533834586466


In [7]:
def sample_size_by_feature(data):
    """Count how many times each value occurs in data.

    Returns a dict mapping each distinct value to its number of occurrences,
    keyed in order of first appearance.
    """
    counts = {}
    for value in data:
        if value in counts:
            counts[value] += 1
        else:
            counts[value] = 1

    return counts

def avg_metric_by_feature(data, metric, feature):
    """Return the mean of one column for each distinct value of another.

    Args:
        data: iterable of row mappings; it is traversed exactly once.
        metric: key of the column to average; each value is passed to float().
        feature: key of the column whose distinct values define the groups.

    Returns:
        Dict mapping each feature value to the mean of metric over the rows
        with that value, keyed in order of first appearance. Empty data gives
        an empty dict.
    """
    totals = {}
    counts = {}
    # Accumulate the running sum and the row count per group in one pass.
    for row in data:
        group = row[feature]
        totals[group] = totals.get(group, 0) + float(row[metric])
        counts[group] = counts.get(group, 0) + 1

    # Using a separate name for the group key leaves the `feature` parameter
    # untouched instead of rebinding it as a loop variable.
    return {group: totals[group] / counts[group] for group in totals}

# Number of rows surveyed in each region.
for region, sample in sample_size_by_feature([row['region'] for row in insurance_data_dicts]).items():
    print(f"The sample size from the {region} surveyed {sample} people.")

print()

# Mean age per region. The loop variable is deliberately not named avg_age so
# that it does not overwrite the dataset-wide avg_age computed earlier.
for region, region_avg_age in avg_metric_by_feature(insurance_data_dicts, 'age', 'region').items():
    print(f"The average age for people in the {region} region is {region_avg_age}")

The sample size from the southwest surveyed 325 people.
The sample size from the southeast surveyed 364 people.
The sample size from the northwest surveyed 325 people.
The sample size from the northeast surveyed 324 people.

The average age for people in the southwest region is 39.45538461538462
The average age for people in the southeast region is 38.93956043956044
The average age for people in the northwest region is 39.19692307692308
The average age for people in the northeast region is 39.26851851851852


### Analyzing BMI Data
1. Find average BMI in dataset
2. Find average BMI per region
3. Find average BMI for male and female populations

In [8]:
# Dataset-wide mean BMI.
bmi_values = [float(record['bmi']) for record in insurance_data_dicts]
avg_bmi = get_mean(bmi_values)
print(f"The average BMI for the dataset is {avg_bmi}")

The average BMI for the dataset is 30.663396860986538


In [10]:
# Mean BMI per region. The loop variable is deliberately not named avg_bmi so
# that it does not overwrite the dataset-wide avg_bmi.
for region, region_avg_bmi in avg_metric_by_feature(insurance_data_dicts, 'bmi', 'region').items():
    print(f"The average bmi for people in the {region} region is {region_avg_bmi}")

The average bmi for people in the southwest region is 30.59661538461538
The average bmi for people in the southeast region is 33.35598901098903
The average bmi for people in the northwest region is 29.199784615384626
The average bmi for people in the northeast region is 29.17350308641976


In [12]:
# Mean BMI for each sex across the whole dataset.
avg_male_bmi = get_mean([float(row['bmi']) for row in male_data])
avg_female_bmi = get_mean([float(row['bmi']) for row in female_data])
print(f"The average bmi for males in the dataset is {avg_male_bmi}.")
print(f"The average bmi for females in the dataset is {avg_female_bmi}.\n")

# Mean BMI per region within each sex. The loop variables are deliberately not
# named avg_bmi so that they do not overwrite the dataset-wide avg_bmi.
for region, male_region_bmi in avg_metric_by_feature(male_data, 'bmi', 'region').items():
    print(f"The average bmi for males in the {region} region is {male_region_bmi}.")

for region, female_region_bmi in avg_metric_by_feature(female_data, 'bmi', 'region').items():
    print(f"The average bmi for females in the {region} region is {female_region_bmi}")

The average bmi for males in the dataset is 30.943128698224832.
The average bmi for females in the dataset is 30.377749244713023.

The average bmi for males in the southeast region is 33.99.
The average bmi for males in the northwest region is 29.120155279503102.
The average bmi for males in the northeast region is 29.024539877300615.
The average bmi for males in the southwest region is 31.129447852760737.
The average bmi for females in the southwest region is 30.060493827160496
The average bmi for females in the southeast region is 32.67125714285712
The average bmi for females in the northwest region is 29.27795731707316
The average bmi for females in the northeast region is 29.324316770186336


### Analyzing Insurance Data and its relations
1. Find average charges in the dataset
2. Find average charges by sex
3. Find average charges by region
4. Find average charges for smokers and non-smokers
5. Find average charges for people with different numbers of children

In [13]:
# Dataset-wide mean insurance charge.
all_charges = [float(record['charges']) for record in insurance_data_dicts]
avg_charges = get_mean(all_charges)
print(f"The average insurance cost in the dataset is {avg_charges}")

The average insurance cost in the dataset is 13270.422265141257


In [14]:
# Mean insurance charge for each sex, and the gap between the two.
male_charges = [float(record['charges']) for record in male_data]
female_charges = [float(record['charges']) for record in female_data]
avg_male_charges = get_mean(male_charges)
avg_female_charges = get_mean(female_charges)
print(f"The average insurance cost for males in the dataset is {avg_male_charges}.")
print(f"The average insurance cost for females in the dataset is {avg_female_charges}.\n")
print(f"On average, males in the dataset pay {avg_male_charges-avg_female_charges} more for insurance.")

The average insurance cost for males in the dataset is 13956.751177721886.
The average insurance cost for females in the dataset is 12569.57884383534.

On average, males in the dataset pay 1387.1723338865468 more for insurance.


In [16]:
# Mean insurance charge per region. The loop variable is deliberately not
# named avg_charges so that it does not overwrite the dataset-wide avg_charges.
charges_by_region = avg_metric_by_feature(insurance_data_dicts, 'charges', 'region')
for region, region_avg_charges in charges_by_region.items():
    print(f"The average insurance cost for people in the {region} region is {region_avg_charges}")

# max() over the dict iterates its keys; key=.get ranks them by mean charge.
highest_cost_region = max(charges_by_region, key=charges_by_region.get)
print(f"The region with the highest insurance costs on average is the {highest_cost_region} region.")

The average insurance cost for people in the southwest region is 12346.93737729231
The average insurance cost for people in the southeast region is 14735.411437609895
The average insurance cost for people in the northwest region is 12417.575373969228
The average insurance cost for people in the northeast region is 13406.3845163858
The region with the highest insurance costs on average is the southeast region.


In [33]:
# Mean insurance charge keyed by the raw 'smoker' column value ('yes' / 'no').
smoker_dict = avg_metric_by_feature(
    data=insurance_data_dicts,
    metric='charges',
    feature='smoker',
)

print(f"The average insurance cost for smokers in the dataset is {smoker_dict['yes']}")
print(f"The average insurance cost for non-smokers in the dataset is {smoker_dict['no']}")
print(f"On average, smokers in the dataset pay ${smoker_dict['yes']-smoker_dict['no']} more on insurance than non-smokers.")

The average insurance cost for smokers in the dataset is 32050.23183153285
The average insurance cost for non-smokers in the dataset is 8434.268297856199
On average, smokers in the dataset pay $23615.96353367665 more on insurance than non-smokers.


In [29]:
# How many people have each number of children; keys are ints, so sorted()
# orders them numerically.
people_by_num_children = sample_size_by_feature([int(row['children']) for row in insurance_data_dicts])
for num_children, people in sorted(people_by_num_children.items()):
    print(f"{people} people had {num_children} children")

print()

# Mean insurance charge per number of children. These keys are the raw column
# values rather than ints, so sort by their integer value; a plain sort would
# order them as text and put '10' before '2'.
charges_by_num_children = avg_metric_by_feature(insurance_data_dicts, 'charges','children')
for num_children, charges in sorted(charges_by_num_children.items(), key=lambda item: int(item[0])):
    print(f"People with {num_children} children have an average insurance cost of {charges}.")

574 people had 0 children
324 people had 1 children
240 people had 2 children
157 people had 3 children
25 people had 4 children
18 people had 5 children

People with 0 children have an average insurance cost of 12365.975601635882.
People with 1 children have an average insurance cost of 12731.171831635793.
People with 2 children have an average insurance cost of 15073.563733958328.
People with 3 children have an average insurance cost of 15355.31836681528.
People with 4 children have an average insurance cost of 13850.656311199999.
People with 5 children have an average insurance cost of 8786.035247222222.
