# U.S. MEDICAL INSURANCE COSTS - PROJECT

## 0. CSV library
We will need it to work with the provided CSV file containing all the data.

In [1]:
import csv

## 1. Dataset description
- Personal information about 1338 Individuals (US citizens)

#### List of personal details:
- Age: *integer - discrete*
- Sex: *string - nominal*
- Bmi: *float - continuous*
- Number of children: *integer - discrete*
- Smoker: *string - nominal*
- US Region: *string - nominal*
- Insurance charges: *float - continuous*

## 2. Create a list for each variable
Creating a list for each variable will allow us to manipulate and analyze each variable separately.

In [2]:
# One empty list per column of the dataset. The generator yields a fresh list
# for every name, so the seven names never share a list object.
ages, sexes, bmis, children, smokers, regions, charges = ([] for _ in range(7))

## 2. Create a function that fills each variable's list
Create a function that fills each variable's list from the csv file whose path is given as a parameter

The lists will then be passed to a class (`DataAnalysis`, defined in section 3), with the following goals in mind:
1. Find out the average age of the patients in the dataset.
2. Analyze where a majority of the individuals are from
3. Look at the different costs between smokers vs. non-smokers.
4. Figure out what the average age is for someone who has at least one child in this dataset.
5. Find a function predicting insurance cost based on other factors

In [3]:
def feed_list_of_data(csv_file):
    """Append every row of an insurance CSV file to the module-level lists.

    The file must have a header row containing the columns 'age', 'sex',
    'bmi', 'children', 'smoker', 'region' and 'charges'. For each data row,
    one value is appended to each of the lists ages, sexes, bmis, children,
    smokers, regions and charges, so index i of every list refers to the same
    row of the file.

    The lists are appended to, never cleared: calling this function twice
    loads the rows twice.

    Args:
        csv_file: Path of the CSV file to read.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If one of the expected columns is missing.
        ValueError: If a numeric column holds a value float() cannot parse.
    """
    # newline='' hands line-ending handling to the csv module, which is what
    # its documentation requires for quoted fields with embedded newlines.
    with open(csv_file, newline='') as insurance_data:
        for individual in csv.DictReader(insurance_data):
            # All numeric columns are stored as floats, including the
            # integer-valued 'age' and 'children'.
            ages.append(float(individual['age']))
            sexes.append(individual['sex'])
            bmis.append(float(individual['bmi']))
            children.append(float(individual['children']))
            smokers.append(individual['smoker'])
            regions.append(individual['region'])
            charges.append(float(individual['charges']))

In [4]:
# Load 'insurance.csv' (path relative to the working directory) into the seven lists.
feed_list_of_data('insurance.csv')

## 3. Create a class with methods analyzing the dataset
Create a class that performs statistical analysis on the dataset

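Through its methods, the class provides the following analyses:
- average and standard deviation of a numeric variable
- median, first quartile, third quartile and interquartile range of a numeric variable
- a printed distribution summary combining all of the above with the minimum and maximum
- printed frequency tables for sex, smoking status and region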



In [24]:
class DataAnalysis:
    """Descriptive statistics and frequency tables for the insurance dataset.

    The constructor stores the seven lists it is given by reference (no copy).
    The statistics methods take the list to analyse as an argument and never
    modify it, so the stored lists keep their original order.
    """

    def __init__(self, ages, sexes, bmis, children, smokers, regions, charges):
        self.ages = ages
        self.sexes = sexes
        self.bmis = bmis
        self.children = children
        self.smokers = smokers
        self.regions = regions
        self.charges = charges

    def get_average(self, variable_list):
        """Return the arithmetic mean of variable_list.

        Raises ZeroDivisionError when the list is empty.
        """
        return sum(variable_list) / len(variable_list)

    def get_standard_deviation(self, variable_list):
        """Return the sample standard deviation (n - 1 denominator).

        Raises ZeroDivisionError when the list has fewer than two values.
        """
        average = self.get_average(variable_list)
        sum_of_squared_diff = sum(
            (observation - average) ** 2 for observation in variable_list
        )
        return (sum_of_squared_diff / (len(variable_list) - 1)) ** 0.5

    def get_median(self, variable_list):
        """Return the median: the middle value, or the mean of the two middle values."""
        # Work on a sorted copy so the caller's list keeps its order.
        ordered = sorted(variable_list)
        middle = len(ordered) // 2
        if len(ordered) % 2 == 0:
            return (ordered[middle - 1] + ordered[middle]) / 2
        return ordered[middle]

    def get_first_quartile(self, variable_list):
        """Return the first quartile.

        When the length is a multiple of 4, the two values around position
        len // 4 are averaged; otherwise the value at that position is returned.
        """
        ordered = sorted(variable_list)
        position = len(ordered) // 4
        if len(ordered) % 4 == 0:
            return (ordered[position - 1] + ordered[position]) / 2
        return ordered[position]

    def get_third_quartile(self, variable_list):
        """Return the third quartile.

        When the length is a multiple of 4, the two values around position
        3 * len // 4 are averaged; otherwise the value at that position is returned.
        """
        ordered = sorted(variable_list)
        position = 3 * len(ordered) // 4
        if len(ordered) % 4 == 0:
            return (ordered[position - 1] + ordered[position]) / 2
        return ordered[position]

    def get_interquartile_range(self, variable_list):
        """Return the third quartile minus the first quartile."""
        q1 = self.get_first_quartile(variable_list)
        q3 = self.get_third_quartile(variable_list)
        return q3 - q1

    def distribution_analysis(self, variable_list):
        """Print a two-line summary of a numeric variable, rounded to one decimal.

        Line 1: minimum, first quartile, median, third quartile, maximum.
        Line 2: average, standard deviation, interquartile range.
        """
        print(f'| MINIMUM = {round(float(min(variable_list)),1)} | Q1 = {round(self.get_first_quartile(variable_list),1)} | MEDIAN = {round(self.get_median(variable_list),1)} | Q3 = {round(self.get_third_quartile(variable_list),1)} | MAXIMUM = {round(float(max(variable_list)),1)} |')
        print(f'| Average = {round(self.get_average(variable_list), 1)} | Standard Deviation = {round(self.get_standard_deviation(variable_list), 1)} | Interquartile Range = {round(self.get_interquartile_range(variable_list), 1)} |')

    # The three table methods below use fixed padding around each value, so
    # the columns only line up for counts and proportions of the widths the
    # padding was written for.

    def sex_frequency_table(self):
        """Print the count, proportion and percentage of 'male' and 'female' in self.sexes."""
        total = len(self.sexes)
        rule = ' ----------------------------------------------'
        print(rule)
        print('| SEX    | FREQUENCY | PROPORTION | PERCENTAGE |')
        print(rule)
        for label, category in (('Male  ', 'male'), ('Female', 'female')):
            count = self.sexes.count(category)
            print(f'| {label} |    {count}    |    {round(count / total, 2)}    |     {round(count / total * 100)}%    |')
            print(rule)
        print(f'| TOTAL  |    {total}   |    1       |     100%   |')
        print(rule)

    def smoking_frequency_table(self):
        """Print the count, proportion and percentage of 'yes' and 'no' in self.smokers."""
        total = len(self.smokers)
        rule = ' -----------------------------------------------'
        print(rule)
        print('| SMOKING | FREQUENCY | PROPORTION | PERCENTAGE |')
        print(rule)
        yes_count = self.smokers.count("yes")
        print(f'| yes     |    {yes_count}    |     {round(yes_count / total, 2)}    |     {round(yes_count / total * 100)}%    |')
        print(rule)
        # The 'no' row has one space fewer after the count than the 'yes' row.
        no_count = self.smokers.count("no")
        print(f'| no      |    {no_count}   |     {round(no_count / total, 2)}    |     {round(no_count / total * 100)}%    |')
        print(rule)
        print(f'| TOTAL   |    {total}   |     1      |     100%   |')
        print(rule)

    def regions_frequency_table(self):
        """Print the count, proportion and percentage of each of the four regions in self.regions."""
        total = len(self.regions)
        rule = ' -------------------------------------------------'
        print(rule)
        print('| REGION    | FREQUENCY | PROPORTION | PERCENTAGE |')
        print(rule)
        for region in ('northeast', 'northwest', 'southeast', 'southwest'):
            count = self.regions.count(region)
            print(f'| {region} |    {count}    |    {round(count / total, 2)}    |     {round(count / total * 100)}%    |')
            print(rule)
        print(f'| TOTAL     |    {total}   |    1       |     100%   |')
        print(rule)

## 4. Instantiate the data object
Create an object using the class just created

In [25]:
# The object stores references to the seven module-level lists; it does not copy them.
data_insurance = DataAnalysis(ages, sexes, bmis, children, smokers, regions, charges)

## 5. Describe the distribution of numeric variables
By using the distribution_analysis method, describe the distribution of the following numeric variables:
- age
- bmi
- children
- insurance charge

In [26]:
# Age: minimum, quartiles, median, maximum, then average, standard deviation and IQR.
data_insurance.distribution_analysis(data_insurance.ages)

| MINIMUM = 18.0 | Q1 = 27.0 | MEDIAN = 39.0 | Q3 = 51.0 | MAXIMUM = 64.0 |
| Average = 39.2 | Standard Deviation = 14.0 | Interquartile Range = 24.0 |


In [27]:
# BMI: minimum, quartiles, median, maximum, then average, standard deviation and IQR.
data_insurance.distribution_analysis(data_insurance.bmis)

| MINIMUM = 16.0 | Q1 = 26.3 | MEDIAN = 30.4 | Q3 = 34.7 | MAXIMUM = 53.1 |
| Average = 30.7 | Standard Deviation = 6.1 | Interquartile Range = 8.4 |


In [28]:
# Number of children: minimum, quartiles, median, maximum, then average, standard deviation and IQR.
data_insurance.distribution_analysis(data_insurance.children)

| MINIMUM = 0.0 | Q1 = 0.0 | MEDIAN = 1.0 | Q3 = 2.0 | MAXIMUM = 5.0 |
| Average = 1.1 | Standard Deviation = 1.2 | Interquartile Range = 2.0 |


In [29]:
# Insurance charges: minimum, quartiles, median, maximum, then average, standard deviation and IQR.
data_insurance.distribution_analysis(data_insurance.charges)

| MINIMUM = 1121.9 | Q1 = 4738.3 | MEDIAN = 9382.0 | Q3 = 16657.7 | MAXIMUM = 63770.4 |
| Average = 13270.4 | Standard Deviation = 12110.0 | Interquartile Range = 11919.4 |


## 6. Build the frequency table of categorical variables
By using the frequency-table methods (`sex_frequency_table()`, `smoking_frequency_table()` and `regions_frequency_table()`), build the frequency table of the following categorical variables:
- sex
- smoking
- region

In [30]:
# Count, proportion and percentage of 'male' and 'female' entries.
data_insurance.sex_frequency_table()

 ----------------------------------------------
| SEX    | FREQUENCY | PROPORTION | PERCENTAGE |
 ----------------------------------------------
| Male   |    676    |    0.51    |     51%    |
 ----------------------------------------------
| Female |    662    |    0.49    |     49%    |
 ----------------------------------------------
| TOTAL  |    1338   |    1       |     100%   |
 ----------------------------------------------


In [31]:
# Count, proportion and percentage of 'yes' and 'no' entries in the smoker column.
data_insurance.smoking_frequency_table()

 -----------------------------------------------
| SMOKING | FREQUENCY | PROPORTION | PERCENTAGE |
 -----------------------------------------------
| yes     |    274    |     0.2    |     20%    |
 -----------------------------------------------
| no      |    1064   |     0.8    |     80%    |
 -----------------------------------------------
| TOTAL   |    1338   |     1      |     100%   |
 -----------------------------------------------


In [32]:
# Count, proportion and percentage of each of the four regions.
data_insurance.regions_frequency_table()

 -------------------------------------------------
| REGION    | FREQUENCY | PROPORTION | PERCENTAGE |
 -------------------------------------------------
| northeast |    324    |    0.24    |     24%    |
 -------------------------------------------------
| northwest |    325    |    0.24    |     24%    |
 -------------------------------------------------
| southeast |    364    |    0.27    |     27%    |
 -------------------------------------------------
| southwest |    325    |    0.24    |     24%    |
 -------------------------------------------------
| TOTAL     |    1338   |    1       |     100%   |
 -------------------------------------------------
