# PORTFOLIO: US-Medical-Insurance-Costs
## By Juan Miguel Reyes Pérez

We´ll begin by importing insurance.csv into this notebook and inspecting its contents.

In [36]:
import csv

#First we will collect data into lists
ages_lst=[]
sex_lst=[]
bmi_lst=[]
children_lst=[]
smoker_lst=[]
region_lst=[]
charges_lst=[]

#In order to be as efficient as possible, we will loop through the file only once
with open ('insurance.csv') as insurance_file:

    insurance_dict = csv.DictReader(insurance_file)
    for row in insurance_dict:
            # add the data from each row to each list
            ages_lst.append(row['age'])
            sex_lst.append(row['sex'])
            bmi_lst.append(row['bmi'])
            children_lst.append(row['children'])
            smoker_lst.append(row['smoker'])
            region_lst.append(row['region'])
            charges_lst.append(row['charges'])
        
   
    


It seems convenient to arrange this piece of data within a class, lets label it as 'human'. Its fields are as it follows:

In [72]:
class human:
    def __init__(self, age, sex, bmi, children, smoker, region, charges):
        
        self.age=age
        self.sex=sex
        self.bmi=bmi
        self.children=children
        self.smoker=smoker
        self.region=region
        self.charges=charges

    #Here we define methods to perform a better analysis

    #Calculates: age_average, range, and counts how many unique ages there are
    def anal_age(self):
        #average
        average=0.0
        for i in self.age:
            average+=float(i)
        average=int(average/len(self.age))

        #range
        range=[min(self.age),max(self.age)]

        #counts unique ages
        unique_ages=[]
        for i in self.age:
            if i not in unique_ages:
                unique_ages.append(i)
            else:
                continue
        unique_ages.sort()

        return ('Average:', average), ('Range:', range), ('Unique ages:',unique_ages)

    #Count porcentaje of male and females
    def anal_sexes(self):
        female=0.0
        male=0.0
        for i in self.sex:
            if i=='male':
                male+=1
            else:
                female+=1
        return ('% male:',float(male)/len(self.sex)*100),('% female:',float(female)/len(self.sex)*100)
    
    #Calculates: average of childre, porcentage of parents
    def anal_children(self):

        #average plus porcentage
        yes=0
        no=0
        num_children=0.0
        for i in self.children:
            num_children+=float(i)
            if float(i)!=0:
                yes+=1

        porcentage=float(yes)/len(self.children)*100
        return ('Average children:', int(num_children/len(self.children))),('Have children:', porcentage),('No children:', 100-porcentage)

    #Calculates porcentage of smokers
    def anal_smokers(self):
        yes=0
        no=0
        for i in self.smoker:
            if i=='yes':
                yes+=1
        porcentage=float(yes)/len(self.smoker)*100
        return ('Smokers:', porcentage),('No smokers:', 100-porcentage)

    
    #Creates a list with unique regions and nº of inhabitants
    def anal_regions(self):
        regions={}
        for i in self.region:
            if i not in regions:
                regions[i]=0
            else:
                regions[i]+=1                
                
        return (regions)


    #Since the bmi is not a trustworthy indicator of a peatient´s health, there is no need to define an anal_bmi method

    #Calculate de average charges and range.
    def anal_charges(self):
        #average
        average=0
        for i in self.charges:
            average+=float(i)
        average=int(average/len(self.charges))

        #range
        range=[min(self.charges),max(self.charges)]

       
        return ('Average:', average), ('Range:', range)


# I saw the method dictionary in the portfolio example, and thought it could be useful:

# method to create dictionary with all patients information
    def create_dictionary(self):
        self.patients_dict = {}
        self.patients_dict["age"] = [int(age) for age in self.age]
        self.patients_dict["sex"] = self.sex
        self.patients_dict["bmi"] = self.bmi
        self.patients_dict["children"] = self.children
        self.patients_dict["smoker"] = self.smoker
        self.patients_dict["regions"] = self.region
        self.patients_dict["charges"] = self.charges
        return self.patients_dict
  


Now we will instantiate the main variable.

In [73]:
humans_insurance_data = human(ages_lst,sex_lst,bmi_lst,children_lst, smoker_lst, region_lst, charges_lst)


    

¡Great! Everything is ready to begin our analysis:

In [75]:
print('AGE')
print(humans_insurance_data.anal_age())
print('SEX')
print(humans_insurance_data.anal_sexes())
print('CHILDREN')
print(humans_insurance_data.anal_children())
print('SMOKER')
print(humans_insurance_data.anal_smokers())
print('REGION')
print(humans_insurance_data.anal_regions())
print('CHARGES')
print(humans_insurance_data.anal_charges())

AGE
(('Average:', 39), ('Range:', ['18', '64']), ('Unique ages:', ['18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64']))
SEX
(('% male:', 50.52316890881914), ('% female:', 49.47683109118087))
CHILDREN
(('Average children:', 1), ('Have children:', 57.100149476831085), ('No children:', 42.899850523168915))
SMOKER
(('Smokers:', 20.47832585949178), ('No smokers:', 79.52167414050822))
REGION
{'southwest': 324, 'southeast': 363, 'northwest': 324, 'northeast': 323}
CHARGES
(('Average:', 13270), ('Range:', ['10043.249', '9991.03765']))


### FINDINGS

##### Age

    · The verage age is 39. Taking into account the wide range of ages of the sample ([18,64]), it is logical.
    · It could be useful for a more profound analysis to asses the number of patients by age, in order to discuss possible biases.
    
##### Sex

    · The anal_sexes methos shows that both genders are equally well represented in the sample. The only way there could be any bias via this category would be if the genders were not uniformmly distributed among ages. That it, most women could be youngsters.
    
##### Smoking status

    · A 1/5 of the patients are smokers. 
##### Number of children

    · The average of children is of 1 per patient. Since half of the sample have children, we can infer that those who have children have more tan one indeed. Thus must have compensated the fact that the other half does not have any offsprings.


##### Region

    · 4 regions were distinguised:
        {'southwest': 324, 'southeast': 363, 'northwest': 324, 'northeast': 323}
    
    · Again, we must highlight how apparently well scattered the dataset is. Yet, it is hard to derive any conclusion from this as we cannot tell whether a region lacks of variety or not. For instance, smokers and parents could be concentrated in different regions.

##### Charges

    · Data speaks by itself: (('Average:', 13270), ('Range:', ['10043.249', '9991.03765']))
    · *NOTE: the range is too closed to acquire any deeper conlusion, as no association can be made with the other variables. This is, we cannot relate having high charged with a higher number of children.


