# U.S. Medical Insurance Costs
### The scope of this project is to use raw python for the analysis without using pandas and numpy.

In [1]:
import csv

In [2]:
#Dictionary that will hold the dataset, keyed by row position: row index -> row
dataDict = {}

In [3]:
#newline="" hands line-ending handling to the csv module, as the csv docs ask for
#files passed to a reader (otherwise newlines inside quoted fields can be misread)
with open("insurance.csv", newline="") as data:
    data_dict = csv.DictReader(data)
    #Key every row by its position in the file: 0, 1, 2, ...
    for counter, row in enumerate(data_dict):
        dataDict[counter] = row

### Our data are identified by columns:
#### Age, Sex, Bmi, Children, Smoker, Region, Charges

In [4]:
#Accessing the first row to get a view of our values and decide if we need to clean or convert them
#age is a string, it must be converted to int
#sex is a string, it can be converted to 0 for male, 1 for female
#bmi is a string, it must be converted to float
#children is a string, it must be converted to int
#smoker is a string, it can be converted to 0 for non-smoker, 1 for smoker
#region is a string, it can either remain this way or be converted, depending on how many regions we explore
#charges is a string, it must be converted to float
print(dataDict[0])

{'age': '19', 'sex': 'female', 'bmi': '27.9', 'children': '0', 'smoker': 'yes', 'region': 'southwest', 'charges': '16884.924'}


### Converting bmi , age, children, charges from string to their corresponding types
#### int for age and children, float for bmi and charges

In [5]:
#Each numeric column paired with the type it is cast to; the casts are applied
#in place, in this order, to every row
numeric_casts = (('bmi', float), ('age', int), ('children', int), ('charges', float))
for record in dataDict.values():
    for field, cast in numeric_casts:
        record[field] = cast(record[field])

In [6]:
#Check that the data type conversion was successful by printing the type
#now stored in each numeric column of the first row
for field in ('bmi', 'age', 'children', 'charges'):
    print(type(dataDict[0][field]))

<class 'float'>
<class 'int'>
<class 'int'>
<class 'float'>


### Smoker , sex and region columns are categorical so they need their corresponding values
#### 0 or 1 for smoker and sex according to our data(yes or no, male or female)
#### and 0 to 3 for region(southwest, southeast, northwest, northeast)

In [7]:
#Transforming smoker to 1 for 'yes' and 0 for anything else.
#Values that are already 0 or 1 are skipped, so re-running this cell does not
#turn every smoker (by then stored as 1, which is not 'yes') into a non-smoker.
for record in dataDict.values():
    if record['smoker'] in (0, 1):
        continue
    record['smoker'] = 1 if record['smoker'] == 'yes' else 0

In [8]:
#Transforming sex to 0 for 'male' and 1 for anything else.
#Values that are already 0 or 1 are skipped, so re-running this cell does not
#turn every male (by then stored as 0, which is not 'male') into 1.
for record in dataDict.values():
    if record['sex'] in (0, 1):
        continue
    record['sex'] = 0 if record['sex'] == 'male' else 1

In [9]:
#Transforming region to a code: the three names below get 0, 1 and 2,
#anything else gets 3.
#Values that are already 0-3 are skipped, so re-running this cell does not
#collapse every region into 3.
region_codes = {'southwest': 0, 'southeast': 1, 'northwest': 2}
for record in dataDict.values():
    if record['region'] in (0, 1, 2, 3):
        continue
    record['region'] = region_codes.get(record['region'], 3)

### Creating empty lists for each of our columns and create a function to pass each corresponding value

In [10]:
#One separate empty list per column, to be filled with that column's values
age, sex, bmi, children, smoker, charges, region = ([] for _ in range(7))

In [11]:
def pass_to_list(dataDict,category_name):
    """Collect one column of the dataset into a list.

    Looks up ``dataDict[0]``, ``dataDict[1]``, ... up to
    ``dataDict[len(dataDict) - 1]`` and returns the ``category_name`` entry of
    each, in that order.
    """
    column = []
    for index in range(len(dataDict)):
        column.append(dataDict[index][category_name])
    return column

In [12]:
#Pull every column out of dataDict, one list per column
(age, smoker, bmi, children, charges, sex, region) = (
    pass_to_list(dataDict, column)
    for column in ('age', 'smoker', 'bmi', 'children', 'charges', 'sex', 'region')
)

### After having our data almost ready it's time to decide what we want to find from our data
#### 1st: Average age of our patients and correlation of age with charges
#### 2nd: Average insurance_cost(charges list)
#### 3rd: How many are smokers and how many are non-smokers, and the average insurance cost of each category. Attempt to calculate correlation with charges
#### 4th:Amount of patients for each region and average insurance cost of each region
#### 5th: Count how many males and females in the dataset to check for proper sampling between sex and the average insurance cost of each category
#### 6th: Average bmi of our patients and the correlation of bmi with charges
#### 7th: Create a dictionary that is going to hold all of our patients

### Before our analysis, let's create a class that is going to hold the majority of analysis methods

In [13]:
class Patient():
    """Holds the dataset as one list per column and offers plain-Python statistics.

    Every constructor argument is a list with one entry per patient. Apart
    from ``create_dictionary``, the methods work only on the lists passed to
    them and never read the stored columns.
    """

    def __init__(self, age, sex, bmi, children, smoker, region, charges):
        self.age = age
        self.sex = sex
        self.bmi = bmi
        self.children = children
        self.smoker = smoker
        self.region = region
        self.charges = charges

    @staticmethod
    def _require_same_length(first, second):
        """Raise ValueError unless both sequences have the same length."""
        if len(first) != len(second):
            raise ValueError(
                "paired lists must have the same length ({} != {})".format(
                    len(first), len(second)
                )
            )

    @staticmethod
    def _mean_or_nan(total, count):
        """Return ``total / count``, or NaN when ``count`` is 0 (empty group)."""
        return total / count if count else float("nan")

    #Methods for analyzing
    def get_mean(self, lst):
        """Return the arithmetic mean of ``lst``.

        Raises:
            ZeroDivisionError: if ``lst`` is empty.
        """
        return sum(lst) / len(lst)

    def variance(self,lst):
        """Return the sample variance of ``lst`` (divides by ``len(lst) - 1``).

        Raises:
            ZeroDivisionError: if ``lst`` has fewer than two values.
        """
        n = len(lst) - 1
        mean = self.get_mean(lst)
        return sum((x - mean) ** 2 for x in lst) / n

    def stdev(self,data):
        """Return the sample standard deviation: the square root of ``variance(data)``."""
        return self.variance(data) ** 0.5

    def get_covariance(self, lst1, lst2):
        """Return the sample covariance of two equally long lists.

        Raises:
            ValueError: if the lists differ in length.
            ZeroDivisionError: if the lists have fewer than two values.
        """
        # Without this check a longer lst2 would be silently truncated while
        # its mean was still taken over the whole list.
        self._require_same_length(lst1, lst2)
        mean_lst1 = self.get_mean(lst1)
        mean_lst2 = self.get_mean(lst2)
        product_total = 0
        for x, y in zip(lst1, lst2):
            product_total += (x - mean_lst1) * (y - mean_lst2)
        return product_total / (len(lst1) - 1)

    def get_correlation(self, lst1, lst2):
        """Return covariance divided by the product of the two standard deviations.

        Raises:
            ValueError: if the lists differ in length.
            ZeroDivisionError: if either list has a standard deviation of 0
                or fewer than two values.
        """
        cov = self.get_covariance(lst1, lst2)
        std1 = self.stdev(lst1)
        std2 = self.stdev(lst2)
        return cov / (std1 * std2)

    def create_dictionary(self):
        """Return ``{index: [age, sex, bmi, children, smoker, region, charges]}``.

        One entry is built for every position in ``self.age``.
        """
        columns = (
            self.age, self.sex, self.bmi, self.children,
            self.smoker, self.region, self.charges,
        )
        return {
            i: [column[i] for column in columns]
            for i in range(len(self.age))
        }

    def _0_1_average(self,charges,smoker):
        """Split ``charges`` by a 0/1 flag list and average each group.

        Entries whose flag equals 0 form the first group; every other entry
        forms the second group.

        Returns:
            A tuple ``(average_0, average_1, count_0, count_1)``. The average
            of a group with no entries is NaN.

        Raises:
            ValueError: if the lists differ in length.
        """
        self._require_same_length(charges, smoker)
        counter_0 = 0
        counter_1 = 0
        total_0 = 0
        total_1 = 0
        for charge, flag in zip(charges, smoker):
            if flag == 0:
                counter_0 += 1
                total_0 += charge
            else:
                counter_1 += 1
                total_1 += charge
        return (
            self._mean_or_nan(total_0, counter_0),
            self._mean_or_nan(total_1, counter_1),
            counter_0,
            counter_1,
        )

    def corr_categorical_cont(self, charges, smoker):
        """Return the point-biserial correlation between ``charges`` and a 0/1 list.

        Computed as ``(mean_1 - mean_0) / s * sqrt(n_0 * n_1 / (n * (n - 1)))``
        where ``s`` is the sample standard deviation of ``charges``. The result
        is NaN when one of the two groups is empty.

        Raises:
            ValueError: if the lists differ in length.
            ZeroDivisionError: if ``charges`` has a standard deviation of 0
                or fewer than two values.
        """
        non_smokers_average, smokers_average, non_smokers, smokers = self._0_1_average(charges, smoker)
        std = self.stdev(charges)
        n = len(charges)
        part1 = (smokers_average - non_smokers_average) / std
        part2 = ((non_smokers * smokers) / (n * (n - 1))) ** 0.5
        return part1 * part2

    def region_average(self, charges, region):
        """Count the entries and average ``charges`` for each of the four regions.

        Region codes 0, 1 and 2 map to Southwest, Southeast and Northwest;
        every other code is counted as Northeast.

        Returns:
            A dict ``{region name: [count, average]}``. The average of a
            region with no entries is NaN.

        Raises:
            ValueError: if the lists differ in length.
        """
        self._require_same_length(charges, region)
        names = ("Southwest", "Southeast", "Northwest", "Northeast")
        counter = [0] * len(names)
        total = [0] * len(names)
        for charge, code in zip(charges, region):
            # Codes 0, 1 and 2 pick their own slot; anything else lands in
            # the last slot (Northeast).
            slot = (0, 1, 2).index(code) if code in (0, 1, 2) else 3
            counter[slot] += 1
            total[slot] += charge
        return {
            name: [counter[slot], self._mean_or_nan(total[slot], counter[slot])]
            for slot, name in enumerate(names)
        }
                
        

### Let's initialize our class by passing our lists

In [14]:
#Hand every column list to the class, naming each argument explicitly
patients = Patient(
    age=age,
    sex=sex,
    bmi=bmi,
    children=children,
    smoker=smoker,
    region=region,
    charges=charges,
)

## 1st: Time to find the average of age in our data and the correlation with the insurance_cost(charges)

In [15]:
#Mean of the age column, and the correlation between the age and charges columns
mean_age = patients.get_mean(age)
correlation_age_charges = patients.get_correlation(age,charges)

In [16]:
#Report the mean age, rounded to a whole number of years
rounded_mean_age = round(mean_age)
print("The average age of our patients is {} years of age.".format(rounded_mean_age))

The average age of our patients is 39 years of age.


### We see that our patients are around 40 years of age, now we can see if age is correlated with insurance cost and if yes how much is affected by it.

In [17]:
#Report the age/charges correlation with two decimals
age_correlation_line = "The correlation value between age and insurance cost is {:.2f}".format(correlation_age_charges)
print(age_correlation_line)

The correlation value between age and insurance cost is 0.30


### As we see, a correlation of 0.30 is a weak positive value, which means that the linear relationship between these 2 values is weak. As age increases, the insurance cost tends to increase only slightly.

## 2nd: We can now calculate the average of the insurance costs to get a better view of our data overall

In [18]:
#Mean of the charges column
insurance_cost_mean = patients.get_mean(charges)

In [19]:
#Report the mean insurance cost with two decimals
mean_cost_line = "The average insurance cost in our data is {:0.2f}$.".format(insurance_cost_mean)
print(mean_cost_line)

The average insurance cost in our data is 13270.42$.


In [20]:
#The largest and smallest charges give a first idea of how our data are distributed
highest_charge = max(charges)
lowest_charge = min(charges)
print("The maximum insurance cost is {:0.2f}$.".format(highest_charge))
print("The minimum insurance cost is {:0.2f}$.".format(lowest_charge))

The maximum insurance cost is 63770.43$.
The minimum insurance cost is 1121.87$.


### We can see that our data has a minimum far below the mean and a maximum almost 5 times bigger than the mean

## 3rd: How many patients are smokers and how many are not smokers. What is the average for each category. Attempt to find correlation between smoking and charges

In [21]:
#Split charges by the smoker flag: (average for flag 0, average for the rest, count of flag 0, count of the rest)
average_non_smokers,average_smokers, num_of_non_smokers, num_of_smokers = patients._0_1_average(charges, smoker)

In [22]:
#Report group size and average insurance cost, non-smokers first
smoking_line = "The average insurance cost for the {} people who are not smoking is {:0.2f}$.".format(num_of_non_smokers, average_non_smokers)
print(smoking_line)
smoking_line = "The average insurance cost for the {} people who are smoking is {:0.2f}$.".format(num_of_smokers, average_smokers)
print(smoking_line)

The average insurance cost for the 1064 people who are not smoking is 8434.27$.
The average insurance cost for the 274 people who are smoking is 32050.23$.


### As we observe, even though the people who don't smoke are ~4 times as many as the people who do smoke, their average insurance cost is almost 4 times lower.
### From this perspective smoking is heavily affecting the insurance cost.

In [23]:
# We use the point-biserial correlation to relate the 0/1 smoker flag to the continuous charges
rpb = patients.corr_categorical_cont(charges, smoker)

In [24]:
#Report the smoking/charges correlation with two decimals
smoking_correlation_line = "The correlation between smoking and insurance cost is {:0.2f}".format(rpb)
print(smoking_correlation_line)

The correlation between smoking and insurance cost is 0.79


### A correlation of 0.79 indicates that smoking is strongly and positively associated with the insurance cost.
### This correlation value also explains the huge difference between the averages of smokers and non-smokers.

## 4th: Amount of patients for each region and their average

In [25]:
#Per-region results keyed by region name: [patient count, average charges]
region_dict = patients.region_average(charges, region)

In [26]:
#Each region_dict value is [patient count, average charges]; unpack it straight into the template
print("Southwest Region has {} patients with an average of {:0.2f}$ insurance cost".format(*region_dict['Southwest']))
print("Southeast Region has {} patients with an average of {:0.2f}$ insurance cost".format(*region_dict['Southeast']))
print("Northwest Region has {} patients with an average of {:0.2f}$ insurance cost".format(*region_dict['Northwest']))
print("Northeast Region has {} patients with an average of {:0.2f}$ insurance cost".format(*region_dict['Northeast']))

Southwest Region has 325 patients with an average of 12346.94$ insurance cost
Southeast Region has 364 patients with an average of 14735.41$ insurance cost
Northwest Region has 325 patients with an average of 12417.58$ insurance cost
Northeast Region has 324 patients with an average of 13406.38$ insurance cost


### Every region has an equally balanced amount of patients with similar average insurance cost. That means that our sample is unbiased toward the regions

## 5th: Check sex category for any bias between male and female

In [27]:
#Same 0/1 split, this time on sex: flag 0 is male, everything else is female
male_average, female_average, male_count, female_count = patients._0_1_average(charges,sex)

In [28]:
#Report group size and average insurance cost, male first
sex_line = "The average insurance cost for the {} male is {:0.2f}$.".format(male_count, male_average)
print(sex_line)
sex_line = "The average insurance cost for the {} female is {:0.2f}$.".format(female_count, female_average)
print(sex_line)

The average insurance cost for the 676 male is 13956.75$.
The average insurance cost for the 662 female is 12569.58$.


### Each sex has an almost equal amount of people with similar average, male seems to have a bit higher average. We can attempt to calculate if sex has anything to do with the prices of charges with point biserial correlation

In [29]:
#Point-biserial correlation between the 0/1 sex flag and charges
rpb2 = patients.corr_categorical_cont(charges, sex)

In [30]:
#Report the sex/charges correlation with two decimals
sex_correlation_line = "The correlation between sex and insurance cost is {:0.2f}".format(rpb2)
print(sex_correlation_line)

The correlation between sex and insurance cost is -0.06


### The correlation of -0.06 is really weak. It can safely be ignored, and we can assume that sex has nothing to do with the insurance costs of each patient.

## 6th: Average bmi and correlation between bmi and insurance cost(charges)

In [31]:
#Mean of the bmi column, and the correlation between the charges and bmi columns
bmi_average = patients.get_mean(bmi)
correlation_bmi_charges = patients.get_correlation(charges, bmi)

In [32]:
#Report the mean bmi with two decimals
bmi_average_line = "The average bmi of our patients is {:0.2f}".format(bmi_average)
print(bmi_average_line)

The average bmi of our patients is 30.66


In [33]:
#Report the bmi/charges correlation with two decimals
bmi_correlation_line = "The correlation between bmi and insurance cost is {:0.2f}".format(correlation_bmi_charges)
print(bmi_correlation_line)

The correlation between bmi and insurance cost is 0.20


### The correlation of 0.20 indicates a very weak relationship between bmi and the insurance cost, but our patients' average bmi is really high and we can attempt to inform them to lower it to be healthier in their lives. According to the CDC, a bmi of 30 and above indicates obesity.

## 7th: If we need to get our data back into a dictionary we can use the class's create_dictionary() method

In [34]:
#Rebuild a row-oriented dictionary: index -> [age, sex, bmi, children, smoker, region, charges]
patients_data = patients.create_dictionary()

In [35]:
#Show the rebuilt record of the first patient
first_patient = patients_data[0]
print(first_patient)

[19, 1, 27.9, 0, 1, 0, 16884.924]


## This marks the end of our analysis on the us-medical-insurance-costs dataset.
### Takeaway results:
### The average age of the patients is at the middle age gap of 40 and has very little to do with their insurance costs!
###  Smokers have a significantly higher average insurance cost, about 4 times that of those who do not smoke, while their count is actually 4 times lower than their counterpart. Smoking heavily affects their cost and we can advise the patient to stop smoking if he wants to lower his cost!
### None of the 4 regions holds any big difference with each other on either count or average. Only the Southeast region with slightly higher count of patients! 
###  It seems our data holds no bias against either sex (male or female) that it currently contains. (Bias against any other sex still exists.) Their count is similar, their average is similar and, according to the correlation value, being male or female has no impact on the insurance cost of the patient!
### Our patients have a really high bmi average of 30.7! According to the CDC that ranks as obese, so we should advise our patients to take better care of their bodies and promote healthy nutrition, even though bmi has a small correlation value with the insurance cost and lowering it has almost no impact on their costs! It does, though, have an impact on their lives!

In [36]:
#Closing message
farewell = "Have a nice day!"
print(farewell)

Have a nice day!
