# U.S. Medical Insurance 
### **Business Intelligence Data Analysis Project**

---
### **Description**

- This notebook is part of a project designed to develop and demonstrate skills in business intelligence and data analysis. The focus is to perform data analysis using Python by implementing key programming and analytical techniques.
---
### **Project Objectives**

#### 1. **Work locally on your own computer**
- All tasks are performed locally, ensuring familiarity with file management and local development tools.

#### 2. **Import a dataset into your program**
- Load data from files (e.g., CSV, Excel, etc.) and prepare it for analysis by addressing any data loading issues.

#### 3. **Analyze a dataset by building out functions or class methods**
- Write reusable functions or classes to clean, process, and extract insights from the data.

#### 4. **Use libraries to assist in your analysis**
- Leverage Python libraries such as `pandas`, `NumPy`, and `matplotlib` for data manipulation, computation, and visualization.
---
### **Optional Objectives**

#### 5. **Document and organize your findings**
- Summarize insights, observations, and results to ensure clarity and potential sharing.

#### 6. **Make predictions about a dataset’s features based on your findings**
- Apply predictive modeling techniques (optional) to make informed projections about dataset trends.
---


In [91]:
import csv  # Importing the CSV module

# Setting Variables
file_path = 'insurance.csv'  # Replace with the path to your CSV file

#################################################################################################################

# Function to load data from a CSV file
def load_csv_data():
    """
    Reads data from a CSV file and returns a list of dictionaries.
    """
    with open(file_path, mode='r') as file:  # Open the file in read mode
        reader = csv.DictReader(file)  # Read the file as a dictionary
        return [row for row in reader]  # Convert the rows to a list of dictionaries


# Function to convert string values in a dictionary to numbers where applicable
def str_to_num(row):
    """
    Converts numeric strings in a dictionary row to int or float.
    Leaves non-numeric values unchanged.
    """
    for key, value in row.items():
        if value is None or value == '':
            row[key] = 0  # Replace missing values with 0
        try:
            if isinstance(value, str) and '.' in value:
                row[key] = float(value)
            elif isinstance(value, str) and value.isdigit():
                row[key] = int(value)
        except ValueError:
            pass  # Leave non-convertible values unchanged
    return row


# Function to extract values from a specific column
def get_column_values(column_name):
    """
    Extracts all values from a specific column in the CSV file.
    """
    data = load_csv_data()  # Load data
    cleaned_data = [str_to_num(row) for row in data]  # Clean data
    return [row[column_name] for row in cleaned_data]


# Function to count group instances
def count_group_instances(group_dict, column_data):
    """
    Counts the number of instances in each group based on ranges in `group_dict`.
    """
    group_counts = {description: {"label": range_key, "value": 0} for range_key, description in group_dict.items()}

    for value in column_data:
        if not isinstance(value, (int, float)):  # Skip invalid entries
            continue
        for range_key, description in group_dict.items():
            if "-" in range_key:  # Closed range
                low, high = map(int, range_key.split("-"))
                if low <= value <= high:
                    group_counts[description]["value"] += 1
                    break
            elif "+" in range_key:  # Open-ended range (e.g., "65+")
                low = int(range_key.strip("+"))
                if value >= low:
                    group_counts[description]["value"] += 1
                    break

    return group_counts


# Function to format distribution
def format_distribution(data_dict, description_template="- {key} ({label}): {value}", indent=8, include_label=True):
    """
    Formats a dictionary into a structured, indented string with customizable templates.
    """
    indent_space = " " * indent
    formatted_lines = [
        f"{indent_space}{description_template.format(key=key, label=value['label'], value=value['value'])}"
        for key, value in data_dict.items()
    ]
    return "\n".join(formatted_lines)

def calculate_percentage_by_group(group_column, condition_column, condition_value):
    """
    Calculates the percentage of occurrences of `condition_value` in `condition_column`, grouped by `group_column`.

    Parameters:
    - group_column: List of group identifiers (e.g., gender, region).
    - condition_column: List of values to check the condition against (e.g., smoker status).
    - condition_value: The value in `condition_column` to count (e.g., 'yes').

    Returns:
    - A dictionary where keys are unique group values and values are percentages.
    """
    # Count occurrences of each group
    group_counts = {group: group_column.count(group) for group in set(group_column)}

    # Calculate percentage of condition occurrences for each group
    percentages = {
        group: sum([1 for g, cond in zip(group_column, condition_column) 
        if g == group and cond == condition_value]) / group_counts[group]
        for group in group_counts
    }

    return percentages


# Correlation calculation
def calculate_correlation(x, y):
    n = len(x)
    mean_x, mean_y = sum(x) / n, sum(y) / n
    covariance = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
    std_dev_x = (sum((xi - mean_x) ** 2 for xi in x) / n) ** 0.5
    std_dev_y = (sum((yi - mean_y) ** 2 for yi in y) / n) ** 0.5
    return covariance / (std_dev_x * std_dev_y) if std_dev_x and std_dev_y else 0


###############################################################################################################

# Process data
data = load_csv_data()  # Load the raw data
cleaned_data = [str_to_num(row) for row in data]  # Clean the data using str_to_num

# Extract information
customer_qty = len(cleaned_data)
ages_lst = get_column_values('age')
genders_lst = get_column_values('sex')
charges_lst = get_column_values('charges')
smokers_lst = get_column_values('smoker')
regions_lst = get_column_values('region')
bmi_lst = get_column_values('bmi')
children_lst = get_column_values('children')



In [92]:

# Average age calculation
avg_age = sum(ages_lst) / customer_qty

# Define age groups for categorization
age_groups = {
    "0-17": "children/minors",
    "18-24": "young adults",
    "25-34": "early career",
    "35-44": "mid-career adults",
    "45-54": "pre-senior adults",
    "55-64": "early retirees",
    "65+": "Medicare-eligible seniors",
}

# Calculate age distribution
age_distribution = count_group_instances(age_groups, ages_lst)

# Find oldest and youngest ages
oldest_age = max(ages_lst)
youngest_age = min(ages_lst)

# Format age distribution
formatted_age_distribution = format_distribution(
    age_distribution,
    description_template="- {key} ({label}): {value}",
    indent=8,
    include_label=True
)

# Print findings
print(f"""Age-related findings:

    - Average Age: The mean age of individuals in the dataset is {avg_age:.0f} years.
    - Oldest Age: The oldest age in the dataset is {oldest_age} years.
    - Youngest Age: The youngest age in the dataset is {youngest_age} years.

    - Age Distribution by Group:
{formatted_age_distribution}
""")

Age-related findings:

    - Average Age: The mean age of individuals in the dataset is 39 years.
    - Oldest Age: The oldest age in the dataset is 64 years.
    - Youngest Age: The youngest age in the dataset is 18 years.

    - Age Distribution by Group:
        - children/minors (0-17): 0
        - young adults (18-24): 278
        - early career (25-34): 271
        - mid-career adults (35-44): 260
        - pre-senior adults (45-54): 287
        - early retirees (55-64): 242
        - Medicare-eligible seniors (65+): 0



In [93]:
# Gender Distribution
gender_dist = {gender: genders_lst.count(gender) for gender in set(genders_lst)}

# Average Charges by Gender
avg_charges_by_gender = {
    gender: sum(charge for g, charge in zip(genders_lst, charges_lst) if g == gender) /
            sum(1 for g in genders_lst if g == gender)
    for gender in set(genders_lst)
}
# Smoker Percentage by Gender
smoker_percentage_by_gender = calculate_percentage_by_group(genders_lst, smokers_lst, 'yes')



#Print Findings
print(f""" Sex-related findings:

    - Gender Distribution:
        Male: {gender_dist.get('male', 0)}
        Female: {gender_dist.get('female', 0)}

    - Average Charges by Gender:
        Male: ${avg_charges_by_gender.get('male', 0):.2f}
        Female: ${avg_charges_by_gender.get('female', 0):.2f}

    - Smoker Proportion by Gender:
        Male: {smoker_percentage_by_gender.get('male', 0):.2%}
        Female: {smoker_percentage_by_gender.get('female', 0):.2%}
""")




 Sex-related findings:

    - Gender Distribution:
        Male: 676
        Female: 662

    - Average Charges by Gender:
        Male: $13956.75
        Female: $12569.58

    - Smoker Proportion by Gender:
        Male: 23.52%
        Female: 17.37%



In [94]:
# Define BMI categories
bmi_categories = {
    "Underweight": lambda bmi: bmi < 18.5,
    "Normal weight": lambda bmi: 18.5 <= bmi < 25,
    "Overweight": lambda bmi: 25 <= bmi < 30,
    "Obesity": lambda bmi: bmi >= 30,
}

# Calculate average BMI
avg_bmi = sum(bmi_lst) / len(bmi_lst)

# Count individuals in each BMI category
bmi_category_counts = {category: sum(1 for bmi in bmi_lst if condition(bmi)) for category, condition in bmi_categories.items()}

# Print findings
print(f"""BMI (Body Mass Index)-related findings:

    - Average BMI: {avg_bmi:.2f}

    - BMI Categories:
""")
for category, count in bmi_category_counts.items():
    print(f"        {category}: {count}")

BMI (Body Mass Index)-related findings:

    - Average BMI: 30.66

    - BMI Categories:

        Underweight: 20
        Normal weight: 225
        Overweight: 386
        Obesity: 707


In [95]:
# Average number of children
avg_children = sum(children_lst) / len(children_lst)

# Proportion of individuals with no children
no_children_proportion = sum(1 for children in children_lst if children == 0) / len(children_lst)

# Average charges by number of children
avg_charges_by_children = {
    num_children: sum(charge for c, charge in zip(children_lst, charges_lst) if c == num_children) /
                  sum(1 for c in children_lst if c == num_children)
    for num_children in set(children_lst)
}

# Print findings
print(f"""Children-related findings:

    - Average Number of Children: {avg_children:.2f}
    - Proportion with No Children: {no_children_proportion:.2%}

    - Average Charges by Number of Children:
""")
for num_children, avg_charge in sorted(avg_charges_by_children.items()):
    print(f"        {num_children} Children: ${avg_charge:.2f}")



Children-related findings:

    - Average Number of Children: 1.09
    - Proportion with No Children: 42.90%

    - Average Charges by Number of Children:

        0 Children: $12365.98
        1 Children: $12731.17
        2 Children: $15073.56
        3 Children: $15355.32
        4 Children: $13850.66
        5 Children: $8786.04


In [96]:
# Smoker vs. Non-Smoker Proportion
smoker_proportion = {
    "Smokers": sum(1 for s in smokers_lst if s == "yes") / len(smokers_lst),
    "Non-Smokers": sum(1 for s in smokers_lst if s == "no") / len(smokers_lst),
}

# Average Charges for Smokers vs. Non-Smokers
avg_charges_by_smoker_status = {
    "Smokers": sum(charge for s, charge in zip(smokers_lst, charges_lst) if s == "yes") /
               sum(1 for s in smokers_lst if s == "yes"),
    "Non-Smokers": sum(charge for s, charge in zip(smokers_lst, charges_lst) if s == "no") /
                   sum(1 for s in smokers_lst if s == "no"),
}

# Correlation Between Smoking and Region
smoker_by_region = {
    region: sum(1 for s, r in zip(smokers_lst, regions_lst) if s == "yes" and r == region) /
            sum(1 for r in regions_lst if r == region)
    for region in set(regions_lst)
}

# Print findings
print(f""" Smoker related findings:

    - Smoker vs. Non-Smoker Proportion:
        Smokers: {smoker_proportion['Smokers']:.2%}
        Non-Smokers: {smoker_proportion['Non-Smokers']:.2%}

    - Average Charges for Smokers vs. Non-Smokers:
        Smokers: ${avg_charges_by_smoker_status['Smokers']:.2f}
        Non-Smokers: ${avg_charges_by_smoker_status['Non-Smokers']:.2f}

    - Correlation Between Smoking and Region:
""")
for region, proportion in smoker_by_region.items():
    print(f"        {region}: {proportion:.2%}")


 Smoker related findings:

    - Smoker vs. Non-Smoker Proportion:
        Smokers: 20.48%
        Non-Smokers: 79.52%

    - Average Charges for Smokers vs. Non-Smokers:
        Smokers: $32050.23
        Non-Smokers: $8434.27

    - Correlation Between Smoking and Region:

        northwest: 17.85%
        southwest: 17.85%
        southeast: 25.00%
        northeast: 20.68%


In [97]:
# Population distribution by region
population_by_region = {region: regions_lst.count(region) for region in set(regions_lst)}

# Average BMI by region
avg_bmi_by_region = {
    region: sum(bmi for bmi, r in zip(bmi_lst, regions_lst) if r == region) /
            sum(1 for r in regions_lst if r == region)
    for region in set(regions_lst)
}

# Average charges by region
avg_charges_by_region = {
    region: sum(charge for charge, r in zip(charges_lst, regions_lst) if r == region) /
            sum(1 for r in regions_lst if r == region)
    for region in set(regions_lst)
}

# Print findings
print(f""" Region-related findings:

    - Population Distribution by Region:
""")
for region, count in population_by_region.items():
    print(f"        {region.capitalize()}: {count}")

print(f"""

    - Average BMI by Region:
""")
for region, avg_bmi in avg_bmi_by_region.items():
    print(f"        {region.capitalize()}: {avg_bmi:.2f}")

print(f"""

    - Average Charges by Region:
""")
for region, avg_charge in avg_charges_by_region.items():
    print(f"        {region.capitalize()}: ${avg_charge:.2f}")


 Region-related findings:

    - Population Distribution by Region:

        Northwest: 325
        Southwest: 325
        Southeast: 364
        Northeast: 324


    - Average BMI by Region:

        Northwest: 29.20
        Southwest: 30.60
        Southeast: 33.36
        Northeast: 29.17


    - Average Charges by Region:

        Northwest: $12417.58
        Southwest: $12346.94
        Southeast: $14735.41
        Northeast: $13406.38


In [142]:
# Average charges
avg_charges = sum(charges_lst) / len(charges_lst)

# Median charges
sorted_charges = sorted(charges_lst)
n = len(sorted_charges)
median_charges = sorted_charges[n // 2] if n % 2 == 1 else (sorted_charges[n // 2 - 1] + sorted_charges[n // 2]) / 2

# Highest and lowest charges
highest_charge = max(charges_lst)
lowest_charge = min(charges_lst)

# Charges distribution (manual binning)
bin_size = 10000
charge_bins = {}

for charge in charges_lst:
    # Define integer-based ranges
    lower_bound = int((charge // bin_size) * bin_size)
    upper_bound = int(((charge // bin_size) + 1) * bin_size - 1)
    bin_range = f"${lower_bound:,}-{upper_bound:,}"
    charge_bins[bin_range] = charge_bins.get(bin_range, 0) + 1
distribution_output = "\n".join([f"    - {bin_range:}: {count}" for bin_range, count in sorted(charge_bins.items())])

# Correlation calculation
def calculate_correlation(x, y):
    n = len(x)
    mean_x, mean_y = sum(x) / n, sum(y) / n
    covariance = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
    std_dev_x = (sum((xi - mean_x) ** 2 for xi in x) / n) ** 0.5
    std_dev_y = (sum((yi - mean_y) ** 2 for yi in y) / n) ** 0.5
    return covariance / (std_dev_x * std_dev_y) if std_dev_x and std_dev_y else 0

correlation_age_charges = calculate_correlation(ages_lst, charges_lst)
correlation_bmi_charges = calculate_correlation(bmi_lst, charges_lst)
correlation_children_charges = calculate_correlation(children_lst, charges_lst)

# Print findings
print(f""" Charges Analysis:

    - Average Charges: ${avg_charges:.2f}
    - Median Charges: ${median_charges:.2f}
    - Highest Charge: ${highest_charge:.2f}
    - Lowest Charge: ${lowest_charge:.2f}

    Charges Distribution:
{distribution_output}

    Correlation with Other Variables:
    - Charges vs. Age: {correlation_age_charges:.2f}
    - Charges vs. BMI: {correlation_bmi_charges:.2f}
    - Charges vs. Number of Children: {correlation_children_charges:.2f}
""")


 Charges Analysis:

    - Average Charges: $13270.42
    - Median Charges: $9382.03
    - Highest Charge: $63770.43
    - Lowest Charge: $1121.87

    Charges Distribution:
    - $0-9,999: 712
    - $10,000-19,999: 353
    - $20,000-29,999: 111
    - $30,000-39,999: 83
    - $40,000-49,999: 72
    - $50,000-59,999: 4
    - $60,000-69,999: 3

    Correlation with Other Variables:
    - Charges vs. Age: 400.07
    - Charges vs. BMI: 265.38
    - Charges vs. Number of Children: 90.98



In [148]:

# Smoker and region impact on charges
smoker_region_charges = {
    (region, smoker): sum(charge for r, s, charge in zip(regions_lst, smokers_lst, charges_lst) if r == region and s == smoker) /
                      sum(1 for r, s in zip(regions_lst, smokers_lst) if r == region and s == smoker)
    for region in set(regions_lst) for smoker in set(smokers_lst)
}

# Format and display results
print("Smoker and Region Impact on Charges:")
for (region, smoker), avg_charge in smoker_region_charges.items():
    print(f"    Region: {region.capitalize()}, Smoker: {smoker.capitalize()}, Average Charges: ${avg_charge:.2f}")


# Impact of children on charges by gender
children_gender_charges = {
    (gender, children): sum(charge for g, c, charge in zip(genders_lst, children_lst, charges_lst) if g == gender and c == children) /
                        sum(1 for g, c in zip(genders_lst, children_lst) if g == gender and c == children)
    for gender in set(genders_lst) for children in set(children_lst)
}

print("\nImpact of Children on Charges by Gender:")
for (gender, children), avg_charge in sorted(children_gender_charges.items()):
    print(f"    Gender: {gender.capitalize()}, Children: {children}, Average Charges: ${avg_charge:.2f}")



Smoker and Region Impact on Charges:
    Region: Northwest, Smoker: No, Average Charges: $8556.46
    Region: Northwest, Smoker: Yes, Average Charges: $30192.00
    Region: Southwest, Smoker: No, Average Charges: $8019.28
    Region: Southwest, Smoker: Yes, Average Charges: $32269.06
    Region: Southeast, Smoker: No, Average Charges: $8032.22
    Region: Southeast, Smoker: Yes, Average Charges: $34845.00
    Region: Northeast, Smoker: No, Average Charges: $9165.53
    Region: Northeast, Smoker: Yes, Average Charges: $29673.54

Impact of Children on Charges by Gender:
    Gender: Female, Children: 0, Average Charges: $11905.71
    Gender: Female, Children: 1, Average Charges: $12161.36
    Gender: Female, Children: 2, Average Charges: $13941.32
    Gender: Female, Children: 3, Average Charges: $13865.61
    Gender: Female, Children: 4, Average Charges: $13937.67
    Gender: Female, Children: 5, Average Charges: $9854.01
    Gender: Male, Children: 0, Average Charges: $12832.70
    Gend