In [1]:
import csv
import pprint

In [3]:
class InsuranceAnalysis:
    def __init__(self, file_path):
        """Initialize the class by reading the CSV file."""
        self.data = self.read_csv(file_path)

    def read_csv(self, file_path):
        """Reads CSV file and stores data as a list of dictionaries."""
        with open(file_path, mode='r') as file:
            reader = csv.DictReader(file)
            data = [row for row in reader]
        return data

    def convert_numeric(self, keys):
        """Converts specified keys to numeric values to allow calculations."""
        for row in self.data:
            for key in keys:
                row[key] = float(row[key]) if key in row and row[key] else 0

    def get_average(self, key):
        """Calculates the average of a numeric column."""
        total = sum(row[key] for row in self.data)
        return round(total / len(self.data), 2)

    def count_categories(self, key):
        """Counts occurrences of unique values in a categorical column."""
        counts = {}
        for row in self.data:
            value = row[key]
            counts[value] = counts.get(value, 0) + 1
        return counts

    def compare_smoker_charges(self):
        """Compares average insurance charges for smokers vs non-smokers."""
        smokers = [row['charges'] for row in self.data if row['smoker'] == 'yes']
        non_smokers = [row['charges'] for row in self.data if row['smoker'] == 'no']
        return {
            'Smokers Avg Charges': round(sum(smokers) / len(smokers), 2),
            'Non-Smokers Avg Charges': round(sum(non_smokers) / len(non_smokers), 2)
        }

    def bmi_correlation(self):
        """Finds the correlation between BMI and Charges (basic method)."""
        bmi_charges = [(row['bmi'], row['charges']) for row in self.data]
        bmi_sum, charges_sum = sum(x[0] for x in bmi_charges), sum(x[1] for x in bmi_charges)
        bmi_mean, charges_mean = bmi_sum / len(bmi_charges), charges_sum / len(bmi_charges)
        num = sum((x[0] - bmi_mean) * (x[1] - charges_mean) for x in bmi_charges)
        denom = sum((x[0] - bmi_mean) ** 2 for x in bmi_charges)
        correlation = num / denom if denom != 0 else 0
        return round(correlation, 4)

    def analyze_children_impact(self):
        """Analyzes how the number of children affects insurance charges."""
        children_charges = {}
        for row in self.data:
            key = int(row['children'])
            children_charges.setdefault(key, []).append(row['charges'])
        return {k: round(sum(v)/len(v), 2) for k, v in children_charges.items()}

    def max_charge(self):
        """Returns the highest insurance charge in the dataset."""
        return max(row['charges'] for row in self.data)

    def min_charge(self):
        """Returns the lowest insurance charge in the dataset."""
        return min(row['charges'] for row in self.data)

    def avg_charge_by_gender(self):
        """Returns average charge by gender."""
        gender_totals = {}
        gender_counts = {}
        for row in self.data:
            gender = row['sex']
            gender_totals[gender] = gender_totals.get(gender, 0) + row['charges']
            gender_counts[gender] = gender_counts.get(gender, 0) + 1
        return {k: round(gender_totals[k]/gender_counts[k], 2) for k in gender_totals}

    def avg_charge_by_region(self):
        """Returns average charge by region."""
        region_totals = {}
        region_counts = {}
        for row in self.data:
            region = row['region']
            region_totals[region] = region_totals.get(region, 0) + row['charges']
            region_counts[region] = region_counts.get(region, 0) + 1
        return {k: round(region_totals[k]/region_counts[k], 2) for k in region_totals}

    def smoker_ratio(self):
        """Returns percentage of smokers in the dataset."""
        total = len(self.data)
        smokers = len([row for row in self.data if row['smoker'] == 'yes'])
        return round((smokers / total) * 100, 2)

    def average_bmi_by_gender(self):
        """Returns average BMI by gender."""
        bmi_totals = {}
        counts = {}
        for row in self.data:
            gender = row['sex']
            bmi_totals[gender] = bmi_totals.get(gender, 0) + row['bmi']
            counts[gender] = counts.get(gender, 0) + 1
        return {k: round(bmi_totals[k]/counts[k], 2) for k in bmi_totals}

    def average_age_with_children(self):
        """Returns average age for those with at least one child."""
        parents = [row['age'] for row in self.data if row['children'] > 0]
        return round(sum(parents)/len(parents), 2) if parents else 0

    def analyze_age_groups(self):
        """Analyzes average charges based on age groups."""
        age_brackets = {
            '<20': [], '20-30': [], '30-40': [], '40-50': [], '50-60': [], '60+': []
        }
        for row in self.data:
            age = row['age']
            if age < 20:
                age_brackets['<20'].append(row['charges'])
            elif age < 30:
                age_brackets['20-30'].append(row['charges'])
            elif age < 40:
                age_brackets['30-40'].append(row['charges'])
            elif age < 50:
                age_brackets['40-50'].append(row['charges'])
            elif age < 60:
                age_brackets['50-60'].append(row['charges'])
            else:
                age_brackets['60+'].append(row['charges'])
        return {k: round(sum(v)/len(v), 2) if v else 0 for k, v in age_brackets.items()}

    def generate_report(self):
        """Generates a full analysis report including descriptive statistics."""
        self.convert_numeric(['age', 'bmi', 'children', 'charges'])
        report = {
            'Average Age': self.get_average('age'),
            'Average BMI': self.get_average('bmi'),
            'Average Charges': self.get_average('charges'),
            'Gender Distribution': self.count_categories('sex'),
            'Region Distribution': self.count_categories('region'),
            'Smoker vs Non-Smoker Charges': self.compare_smoker_charges(),
            'BMI vs Charges Correlation': self.bmi_correlation(),
            'Children Impact on Charges': self.analyze_children_impact(),
            'Max Charge': self.max_charge(),
            'Min Charge': self.min_charge(),
            'Avg Charge by Gender': self.avg_charge_by_gender(),
            'Avg Charge by Region': self.avg_charge_by_region(),
            'Smoker Ratio (%)': self.smoker_ratio(),
            'Avg BMI by Gender': self.average_bmi_by_gender(),
            'Avg Age with Children': self.average_age_with_children(),
            'Charges by Age Group': self.analyze_age_groups()
        }
        return report

In [4]:
# Loading the dataset and performing analysis
file_path = "insurance.csv"  
analysis = InsuranceAnalysis(file_path)
results = analysis.generate_report()

# Display the results
pprint.pprint(results)

{'Average Age': 39.21,
 'Average BMI': 30.66,
 'Average Charges': 13270.42,
 'Avg Age with Children': 39.78,
 'Avg BMI by Gender': {'female': 30.38, 'male': 30.94},
 'Avg Charge by Gender': {'female': 12569.58, 'male': 13956.75},
 'Avg Charge by Region': {'northeast': 13406.38,
                          'northwest': 12417.58,
                          'southeast': 14735.41,
                          'southwest': 12346.94},
 'BMI vs Charges Correlation': 393.873,
 'Charges by Age Group': {'20-30': 9561.75,
                          '30-40': 11738.78,
                          '40-50': 14399.2,
                          '50-60': 16495.23,
                          '60+': 21248.02,
                          '<20': 8407.35},
 'Children Impact on Charges': {0: 12365.98,
                                1: 12731.17,
                                2: 15073.56,
                                3: 15355.32,
                                4: 13850.66,
                                5: 8786.04},