# U.S. Medical Insurance Costs Analysis

##### First, we import 'DictReader' from the 'csv' module to read the dataset.

In [3]:
from collections import Counter
from csv import DictReader

##### Function `open_csv()` reads the CSV file and returns a list of dictionaries, where each dictionary holds one person's data (stored in `insurance_list`).

In [4]:
def open_csv(filename):
    """Read a CSV file into a list of row dictionaries.

    Args:
        filename: Path of the CSV file; its first row supplies the column names.

    Returns:
        list[dict]: One dictionary per data row, keyed by column name.
        Every value is a string, exactly as stored in the file.
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are read back unchanged (see the csv docs).
    with open(filename, 'r', newline='') as file:
        return list(DictReader(file))
        
# Load every row of the dataset; the relative path is resolved against the
# current working directory.
insurance_list = open_csv('insurance.csv')

#### All values in 'insurance_list' are strings by default.
##### Function `fix_data_types()` converts 'age' and 'children' to integers, and 'bmi' and 'charges' to floats.
##### The resulting list (fixed_insurance_list) stores data with corrected types for further analysis.

In [5]:
def fix_data_types(data_list):
    """Return copies of the records with the numeric columns converted.

    'age' and 'children' become int, 'bmi' and 'charges' become float; all
    other keys are copied through untouched. The input list and its
    dictionaries are not modified, so the raw data stays available and the
    function can be re-run safely.

    Args:
        data_list: Iterable of dictionaries that each contain the keys
            'age', 'children', 'bmi' and 'charges'.

    Returns:
        list[dict]: New dictionaries with the converted values.

    Raises:
        KeyError: If a record lacks one of the four numeric keys.
        ValueError: If a value cannot be parsed as the target type.
    """
    conversions = {'age': int, 'children': int, 'bmi': float, 'charges': float}
    fixed_records = []
    for record in data_list:
        converted = dict(record)  # shallow copy keeps the caller's record intact
        for key, cast in conversions.items():
            converted[key] = cast(record[key])
        fixed_records.append(converted)
    return fixed_records
    
# Convert the numeric columns so the following cells can do arithmetic on them.
fixed_insurance_list = fix_data_types(insurance_list)

##### Function `list_for_column` is used to get all values for a specific key from the dataset, such as `age`, `sex`, or `bmi`.

In [7]:
def list_for_column(data, key):
    """Collect the value stored under `key` from every record in `data`.

    Args:
        data: Iterable of dictionaries.
        key: Key to look up in each dictionary.

    Returns:
        list: The looked-up values, in the order of `data`.
    """
    column = []
    for record in data:
        column.append(record[key])
    return column

# Column names, in the same order as the variables unpacked below.
keys = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']

# One list per column, unpacked into a separate variable for each.
(
    ages_list,
    sexes_list,
    bmis_list,
    children_list,
    smokers_list,
    region_list,
    charges_list,
) = (list_for_column(fixed_insurance_list, key) for key in keys)

##### Function `numeric_average()` calculates the mean of a numeric list and returns it as a float value.

In [8]:
def numeric_average(data):
    """Return the arithmetic mean of a sequence of numbers as a float.

    Args:
        data: Sized collection of numbers (it must support `len()`).

    Returns:
        float: `sum(data) / len(data)`, or 0.0 when `data` is empty.
    """
    if not data:
        # Float zero keeps the return type the same for empty and non-empty
        # input, so rounded and printed results stay consistent.
        return 0.0
    return sum(data) / len(data)

# Quick sanity check on a small hand-made list: 41 / 9 rounds to 4.56.
print(round(numeric_average([2, 3, 1, 5, 7, 4, 2, 8, 9]), 2))

4.56


##### Here we calculate average age, BMI, number of children, and medical charges across all records.

In [9]:
# round() without a digit count returns an integer, so the age and children
# averages are reported as whole numbers; BMI keeps 1 decimal, charges keep 2.
avg_age = round(numeric_average(ages_list))
avg_bmi = round(numeric_average(bmis_list), 1)
avg_children = round(numeric_average(children_list))
avg_charge = round(numeric_average(charges_list), 2)

print(f"Average age: {avg_age} years \nAverage bmi: {avg_bmi} \nAverage children count: {avg_children} \nAverage charge: {avg_charge}")

Average age: 39 years 
Average bmi: 30.7 
Average children count: 1 
Average charge: 13270.42


##### Function `category_percentage()` calculates what percentage each category represents within a list (e.g., smokers vs. non-smokers).

In [7]:
def category_percentage(data):
    """Compute the share of each distinct value in `data`, in percent.

    Args:
        data: Sized collection of hashable values (for example 'yes'/'no').

    Returns:
        dict: Maps every distinct value to its percentage of all entries,
        rounded to 2 decimals. An empty input yields an empty dict.
    """
    total = len(data)
    # Counter tallies all categories in one pass instead of rescanning the
    # whole list with .count() for every category.
    return {
        option: round(count / total * 100, 2)
        for option, count in Counter(data).items()
    }

##### We calculate and display the percentage of smokers, regional distribution, and gender balance in the dataset.

In [8]:
# Percentage breakdown of each categorical column.
smoker_percentage = category_percentage(smokers_list)
region_percentage = category_percentage(region_list)
sexes_percentage = category_percentage(sexes_list)
# The lookups below use fixed category names, so a category that is missing
# from the data raises KeyError.
print(f"Smokers: {smoker_percentage['yes']}% Non-smokers: {smoker_percentage['no']}%")
print(f"Southeast: {region_percentage['southeast']}% Northeast: {region_percentage['northeast']}% Northwest: {region_percentage['northwest']}% Southwest: {region_percentage['southwest']}%")
print(f"Males: {sexes_percentage['male']}%, Females: {sexes_percentage['female']}%")

Smokers: 20.48% Non-smokers: 79.52%
Southeast: 27.2% Northeast: 24.22% Northwest: 24.29% Southwest: 24.29%
Males: 50.52%, Females: 49.48%


##### Function `filter_by_condition()` filters data by a given condition (like 'male' or 'yes') and returns the matching values.

In [9]:
def filter_by_condition(values, condition):
    """Keep the rows whose first element equals `condition`, minus that element.

    Args:
        values: Iterable of sequences whose first item is the group label.
        condition: Label to match against the first item of each row.

    Returns:
        list: For two-item rows the bare second item; for any other length
        the remainder of the row after the label (a slice of the row).
    """
    return [
        row[1] if len(row) == 2 else row[1:]
        for row in values
        if row[0] == condition
    ]

##### Function `compare_groups()` compares two groups (e.g., smokers vs. non-smokers) and returns their average metric values.

In [10]:
def compare_groups(values, group1, group2):
    """Average the metric of two groups.

    Args:
        values: Collection of (group, value) pairs; it is scanned once per group.
        group1: Label of the first group.
        group2: Label of the second group.

    Returns:
        tuple: (average for group1, average for group2), each rounded to
        2 decimals.
    """
    def group_average(wanted):
        # Gather this group's values, then average them.
        members = [value for label, value in values if label == wanted]
        return round(numeric_average(members), 2)

    return group_average(group1), group_average(group2)

##### Function `print_group_comparison()` prints average data for two groups (e.g., smokers vs. non-smokers).

In [11]:
def print_group_comparison(zipped, group1, group2, metric, label_map=None):
    """Print the average of `metric` for two groups on one line.

    Args:
        zipped: Collection of (group, value) pairs passed to `compare_groups()`.
        group1: Label of the first group.
        group2: Label of the second group.
        metric: Metric name shown in the printed line.
        label_map: Optional mapping from group label to display name; labels
            missing from it (or all labels when it is empty/None) are printed
            as-is.
    """
    labels = label_map if label_map else {}
    shown1 = labels.get(group1, group1)
    shown2 = labels.get(group2, group2)

    avg1, avg2 = compare_groups(zipped, group1, group2)
    print(f"Average {metric}: {shown1} - {avg1} | {shown2} - {avg2}")

##### Here we compare smokers and non-smokers in terms of their average charges, BMI, and age.

In [12]:
# Display names for the raw 'yes' / 'no' values of the smoker column.
smoker_labels = {'yes': 'smoker', 'no': 'non-smoker'}

# Same smoker vs. non-smoker comparison for each metric, in a fixed order.
for metric_name, metric_values in (('charge', charges_list), ('bmi', bmis_list), ('age', ages_list)):
    print_group_comparison(list(zip(smokers_list, metric_values)), 'yes', 'no', metric_name, smoker_labels)

Average charge: smoker - 32050.23 | non-smoker - 8434.27
Average bmi: smoker - 30.71 | non-smoker - 30.65
Average age: smoker - 38.51 | non-smoker - 39.39


##### We repeat the comparison for males and females to explore how gender affects these average values.

In [13]:
# No label map here: the raw group names 'male' / 'female' are printed as-is.
for metric_name, metric_values in (('charge', charges_list), ('bmi', bmis_list), ('age', ages_list)):
    print_group_comparison(list(zip(sexes_list, metric_values)), 'male', 'female', metric_name)

Average charge: male - 13956.75 | female - 12569.58
Average bmi: male - 30.94 | female - 30.38
Average age: male - 38.92 | female - 39.5


##### Function `compare_charge_percentage()` calculates how much of the total insurance charges come from each group.

In [14]:
def compare_charge_percentage(values_1, values_2, charges):
    """Return each group's share of the total charges, in percent.

    Args:
        values_1: Charges belonging to the first group.
        values_2: Charges belonging to the second group.
        charges: All charges; their sum is the 100% reference.

    Returns:
        tuple: (share of values_1, share of values_2), each rounded to
        2 decimals. Both are 0.0 when the total is zero (for example when
        `charges` is empty), instead of raising ZeroDivisionError.
    """
    total = sum(charges)  # computed once and reused for both shares
    if not total:
        return 0.0, 0.0
    percentage_1 = round(sum(values_1) / total * 100, 2)
    percentage_2 = round(sum(values_2) / total * 100, 2)
    return percentage_1, percentage_2

##### Function `compare_charge_percentage_by_group()` compares how much groups (e.g. males and females) contribute to total charges.

In [15]:
def compare_charge_percentage_by_group(values, charges, group1, group2, label, label_map=None):
    """Print how much of the total charges each of two groups accounts for.

    Args:
        values: Collection of (group, charge) pairs.
        charges: All charges; their sum is the 100% reference.
        group1: Label of the first group.
        group2: Label of the second group.
        label: Accepted for the call signature but not used in the body.
        label_map: Optional mapping from group label to display name; labels
            missing from it (or all labels when it is empty/None) are printed
            as-is.
    """
    labels = label_map if label_map else {}

    # Split the charges by group, then express each group's sum as a share
    # of the overall total.
    p1, p2 = compare_charge_percentage(
        filter_by_condition(values, group1),
        filter_by_condition(values, group2),
        charges,
    )

    shown1 = labels.get(group1, group1)
    shown2 = labels.get(group2, group2)
    print(f"Percentage of total charges: {shown1} - {p1}% | {shown2} - {p2}%")

#####  Calling the function to print charge contribution comparison between smokers and non-smokers.

In [16]:
# Share of total charges for smokers vs. non-smokers; smoker_labels turns the
# raw 'yes' / 'no' values into readable names.
compare_charge_percentage_by_group(list(zip(smokers_list, charges_list)), charges_list, 'yes', 'no', 'smokers', smoker_labels)

Percentage of total charges: smoker - 49.46% | non-smoker - 50.54%


##### Calling the function to print charge contribution comparison between males and females.

In [17]:
# Share of total charges for males vs. females; without a label map the raw
# group names are printed.
compare_charge_percentage_by_group(list(zip(sexes_list, charges_list)), charges_list, 'male', 'female', 'sexes')

Percentage of total charges: male - 53.14% | female - 46.86%


#####  Function `charges_for_group()` returns all charges that belong to a given age range.

In [18]:
def charges_for_group(data, age_range):
    """Return the charges of every record whose age lies in `age_range`.

    Args:
        data: Iterable of (age, charge) pairs.
        age_range: Container supporting `in`, for example `range(18, 25)`.

    Returns:
        list: Matching charges, in the order of `data`.
    """
    selected = []
    for age, charge in data:
        if age in age_range:
            selected.append(charge)
    return selected

##### Function `avg_charge_by_age_group()` prints the average charge for every defined age range.

In [19]:
def avg_charge_by_age_group(age_group_charges, age_ranges_group):
    """Print the average charge of each age bracket.

    Args:
        age_group_charges: Sequence of charge lists, one per bracket.
        age_ranges_group: Sequence of (start, end) pairs aligned by index with
            `age_group_charges`; `end` is exclusive, so the printed upper
            bound is `end - 1`.
    """
    for index, group_charges in enumerate(age_group_charges):
        mean_charge = round(numeric_average(group_charges), 2)
        start, stop = age_ranges_group[index]
        print(f"Average charge for age ({start}-{stop-1}): {mean_charge}")

##### Function `calculate_avg_charge_by_age()` creates several age ranges and calculates average charges for each of them.

In [20]:
def calculate_avg_charge_by_age(zipped_list, age_ranges=None):
    """Print the average charge for each age bracket.

    Args:
        zipped_list: Collection of (age, charge) pairs. It is scanned once per
            bracket, so pass a list rather than a one-shot iterator.
        age_ranges: Optional sequence of (start, end) pairs with `end`
            exclusive. Defaults to five brackets covering ages 18 to 64.
            Records whose age falls in no bracket are left out.

    Returns:
        The result of `avg_charge_by_age_group()`, which prints one line per
        bracket.
    """
    if age_ranges is None:
        age_ranges = [(18, 25), (25, 35), (35, 45), (45, 55), (55, 65)]
    age_group_charges_list = [
        charges_for_group(zipped_list, range(start, end))
        for start, end in age_ranges
    ]
    return avg_charge_by_age_group(age_group_charges_list, age_ranges)

# Average charge per age bracket across all records.
calculate_avg_charge_by_age(list(zip(ages_list, charges_list)))

Average charge for age (18-24): 9011.34
Average charge for age (25-34): 10352.39
Average charge for age (35-44): 13134.17
Average charge for age (45-54): 15853.93
Average charge for age (55-64): 18513.28


##### Function `calculate_and_print_by_group()` prints a header naming the selected group and then runs the given analysis function on that group's data.

In [21]:
def calculate_and_print_by_group(zipped_data, label, function):
    """Print a header for a group, then run an analysis on that group's data.

    Args:
        zipped_data: Data handed unchanged to `function`.
        label: Group name; it is upper-cased and followed by 'S-DATA' in the
            header line.
        function: Callable taking `zipped_data` as its only argument.
    """
    header = f'---------------{label.upper()}S-DATA---------------'
    print(header)
    function(zipped_data)

#####  Here we split data by gender and calculate average insurance charges for each age range.

In [22]:
# Attach each record's sex to its (age, charge) pair, then split by sex.
sex_age_zipped = list(zip(sexes_list, ages_list, charges_list))
male_age_zipped, female_age_zipped = (
    filter_by_condition(sex_age_zipped, sex) for sex in ('male', 'female')
)

# Age-bracket averages for each sex, males first.
for sex_label, sex_rows in (('male', male_age_zipped), ('female', female_age_zipped)):
    calculate_and_print_by_group(sex_rows, sex_label, calculate_avg_charge_by_age)

---------------MALES-DATA---------------
Average charge for age (18-24): 9366.23
Average charge for age (25-34): 11386.59
Average charge for age (35-44): 13812.91
Average charge for age (45-54): 16820.86
Average charge for age (55-64): 19230.4
---------------FEMALES-DATA---------------
Average charge for age (18-24): 8629.97
Average charge for age (25-34): 9263.35
Average charge for age (35-44): 12444.91
Average charge for age (45-54): 14893.71
Average charge for age (55-64): 17819.48


##### Now we perform the same analysis, but comparing smokers and non-smokers across age groups.

In [23]:
# Attach each record's smoker flag to its (age, charge) pair, then split by flag.
is_smoker_age_zipped = list(zip(smokers_list, ages_list, charges_list))
smokers_age_zipped, non_smokers_age_zipped = (
    filter_by_condition(is_smoker_age_zipped, flag) for flag in ('yes', 'no')
)

# Age-bracket averages for each group, smokers first.
for smoker_label, smoker_rows in (('smoker', smokers_age_zipped), ('non-smoker', non_smokers_age_zipped)):
    calculate_and_print_by_group(smoker_rows, smoker_label, calculate_avg_charge_by_age)

---------------SMOKERS-DATA---------------
Average charge for age (18-24): 27796.54
Average charge for age (25-34): 28416.48
Average charge for age (35-44): 31366.05
Average charge for age (45-54): 35310.4
Average charge for age (55-64): 39696.37
---------------NON-SMOKERS-DATA---------------
Average charge for age (18-24): 3841.1
Average charge for age (25-34): 5647.33
Average charge for age (35-44): 7545.5
Average charge for age (45-54): 11241.4
Average charge for age (55-64): 14064.83


##### Function `charge_by_bmi()` groups insurance charges by BMI category (underweight, healthy, overweight, obese).

In [24]:
def charge_by_bmi(zipped_list, bmi_ranges=None):
    """Group charges by BMI category.

    Args:
        zipped_list: Collection of (bmi, charge) pairs. It is scanned once per
            category, so pass a list rather than a one-shot iterator.
        bmi_ranges: Optional mapping of category name to (low, high) bounds,
            where a BMI belongs to the category when `low <= bmi < high`.
            Defaults to underweight / healthy / overweight / obese with cut
            points at 18.5, 25.0 and 30.0.

    Returns:
        dict: Category name -> list of charges, in the order of `bmi_ranges`.
        A BMI that falls in no range (for example a negative value with the
        defaults) is left out.
    """
    if bmi_ranges is None:
        bmi_ranges = {
            "underweight": (0.0, 18.5),
            "healthy": (18.5, 25.0),
            "overweight": (25.0, 30.0),
            "obese": (30.0, float("inf")),
        }
    return {
        category: [charge for bmi, charge in zipped_list if low <= bmi < high]
        for category, (low, high) in bmi_ranges.items()
    }

##### Function `calculate_charge_by_bmi()` calculates the average insurance charge for each BMI category.

In [25]:
def calculate_charge_by_bmi(zipped_list):
    """Print the average charge of every BMI category.

    Args:
        zipped_list: Collection of (bmi, charge) pairs, grouped via
            `charge_by_bmi()`. Averages are rounded to 3 decimals.
    """
    for category, category_charges in charge_by_bmi(zipped_list).items():
        average = round(numeric_average(category_charges), 3)
        print(f"Average charge ({category}): {average}")

#####  Here we calculate and display the average charge for every BMI group in the full dataset.

In [50]:
# Pair every BMI with the charge of the same record.
charge_by_bmi_zipped = list(zip(bmis_list, charges_list))

# Average charge per BMI category across all records.
calculate_charge_by_bmi(charge_by_bmi_zipped)

Average charge (underweight): 8852.201
Average charge (healthy): 10409.338
Average charge (overweight): 10987.51
Average charge (obese): 15552.335


##### We now compare BMI-related average charges between males and females.

In [52]:
# Attach each record's sex to its (bmi, charge) pair, then split by sex.
charge_by_bmi_sex = list(zip(sexes_list, bmis_list, charges_list))
male_charge_by_bmi, female_charge_by_bmi = (
    filter_by_condition(charge_by_bmi_sex, sex) for sex in ('male', 'female')
)

# BMI-category averages for each sex, males first.
for sex_label, sex_rows in (('male', male_charge_by_bmi), ('female', female_charge_by_bmi)):
    calculate_and_print_by_group(sex_rows, sex_label, calculate_charge_by_bmi)

---------------MALES-DATA---------------
Average charge (underweight): 5611.706
Average charge (healthy): 9868.02
Average charge (overweight): 11381.954
Average charge (obese): 16610.452
---------------FEMALES-DATA---------------
Average charge (underweight): 11012.53
Average charge (healthy): 10909.016
Average charge (overweight): 10616.851
Average charge (obese): 14370.667


##### This section repeats the same analysis for smokers and non-smokers.

In [29]:
# Attach each record's smoker flag to its (bmi, charge) pair, then split by flag.
charge_by_bmi_is_smoker = list(zip(smokers_list, bmis_list, charges_list))
smokers_charge_by_bmi, non_smokers_charge_by_bmi = (
    filter_by_condition(charge_by_bmi_is_smoker, flag) for flag in ('yes', 'no')
)

# BMI-category averages for each group, smokers first.
for smoker_label, smoker_rows in (('smoker', smokers_charge_by_bmi), ('non-smoker', non_smokers_charge_by_bmi)):
    calculate_and_print_by_group(smoker_rows, smoker_label, calculate_charge_by_bmi)

---------------SMOKERS-DATA---------------
Average charge (underweight): 18809.825
Average charge (healthy): 19942.224
Average charge (overweight): 22495.874
Average charge (obese): 41557.99
---------------NON-SMOKERS-DATA---------------
Average charge (underweight): 5532.992
Average charge (healthy): 7685.656
Average charge (overweight): 8257.962
Average charge (obese): 8842.692


##### Function `avg_charge_by_num_of_children()` calculates how the number of children affects average insurance charges.

In [30]:
def avg_charge_by_num_of_children(child_num_charges, child_counts=(0, 1, 2, 3, 4, 5)):
    """Print the average charge for each number of children.

    Args:
        child_num_charges: Collection of (number of children, charge) pairs.
            It is scanned once per count, so pass a list rather than a
            one-shot iterator.
        child_counts: Child counts to report, in print order. Defaults to
            0 through 5; pass another iterable to cover other counts.
    """
    for num in child_counts:
        charges = [charge for num_children, charge in child_num_charges if num_children == num]
        print(f"{num} children average charge: {round(numeric_average(charges), 3)}")

##### We now analyze how the number of children impacts insurance charges across the whole dataset.

In [31]:
# Average charge per number of children across all records.
avg_charge_by_num_of_children(list(zip(children_list, charges_list)))

0 children average charge: 12365.976
1 children average charge: 12731.172
2 children average charge: 15073.564
3 children average charge: 15355.318
4 children average charge: 13850.656
5 children average charge: 8786.035


##### We compare how this relationship differs between males and females.

In [32]:
# Attach each record's sex to its (children, charge) pair, then split by sex.
children_charge_by_sex = list(zip(sexes_list, children_list, charges_list))
male_children_charge, female_children_charge = (
    filter_by_condition(children_charge_by_sex, sex) for sex in ('male', 'female')
)

# Per-child-count averages for each sex, males first.
for sex_label, sex_rows in (('male', male_children_charge), ('female', female_children_charge)):
    calculate_and_print_by_group(sex_rows, sex_label, avg_charge_by_num_of_children)

---------------MALES-DATA---------------
0 children average charge: 12832.697
1 children average charge: 13273.522
2 children average charge: 16187.095
3 children average charge: 16789.167
4 children average charge: 13782.285
5 children average charge: 7931.658
---------------FEMALES-DATA---------------
0 children average charge: 11905.714
1 children average charge: 12161.36
2 children average charge: 13941.317
3 children average charge: 13865.605
4 children average charge: 13937.675
5 children average charge: 9854.006


##### Finally, we examine how smoking status affects the relationship between number of children and insurance charges.

In [33]:
# Attach each record's smoker flag to its (children, charge) pair, then split by flag.
children_charge_by_is_smoker = list(zip(smokers_list, children_list, charges_list))
smokers_children_charge, non_smokers_children_charge = (
    filter_by_condition(children_charge_by_is_smoker, flag) for flag in ('yes', 'no')
)

# Per-child-count averages for each group, smokers first.
for smoker_label, smoker_rows in (('smoker', smokers_children_charge), ('non-smoker', non_smokers_children_charge)):
    calculate_and_print_by_group(smoker_rows, smoker_label, avg_charge_by_num_of_children)

---------------SMOKERS-DATA---------------
0 children average charge: 31341.364
1 children average charge: 31822.654
2 children average charge: 33844.236
3 children average charge: 32724.915
4 children average charge: 26532.277
5 children average charge: 19023.26
---------------NON-SMOKERS-DATA---------------
0 children average charge: 7611.793
1 children average charge: 8303.109
2 children average charge: 9493.094
3 children average charge: 9614.519
4 children average charge: 12121.344
5 children average charge: 8183.846


##### SUMMARY

- **Smokers** pay on average **over 3x more** than non-smokers, making smoking the strongest cost factor.
- **The highest average costs** are observed for **obese smokers** (i.e. people in the "obese" BMI category who also smoke).
- **Higher BMI** categories (especially obese) show a clear correlation with higher insurance charges.
- **Age** increases the average cost steadily (roughly doubling from the 18-24 group to the 55-64 group), an effect comparable in size to that of BMI but far smaller than that of smoking.
- **Number of children** slightly affects average charges: people with 1-4 children pay a bit more on average than those with none, while the average drops for people with 5 children.
- **Gender** differences in costs are **minimal**, showing that sex has little impact on insurance prices.