In [1]:
import pandas as pd
# Load the raw insurance data and display it to show what the data looks like
df_insurance = pd.read_csv("insurance.csv")
df_insurance

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


# Medical Info Class
This class loads the data from a CSV file (or wraps an existing DataFrame) and provides methods for counting and averaging it.

In [2]:
class medical_info:
    """Counts and averages for the medical insurance dataset.

    The data is expected to contain the columns ``age``, ``sex``, ``bmi``,
    ``children``, ``smoker``, ``region`` and ``charges``.

    Parameters
    ----------
    csv : str or DataFrame-like
        Either a path to a CSV file (loaded with ``pd.read_csv``) or data
        accepted by ``pd.DataFrame``, e.g. an age-bracket slice of the full
        frame.
    """

    # Columns that get averaged. Selecting them explicitly keeps the text
    # columns (sex, smoker, region) out of the grouped means, which would
    # otherwise raise a TypeError on pandas >= 2.0.
    _NUMERIC_COLUMNS = ["age", "bmi", "children", "charges"]

    def __init__(self, csv):
        # A string is a file path; anything else is treated as tabular data so
        # the class also works on age-bracket dataframes.  (Testing
        # `".csv" in csv` on a DataFrame would inspect its column labels.)
        if isinstance(csv, str):
            self.df_insurance = pd.read_csv(csv)
        else:
            self.df_insurance = pd.DataFrame(csv)
        # grouping the data for further analysis
        self.grp_sex = self.df_insurance.groupby("sex")
        self.grp_region = self.df_insurance.groupby("region")
        self.grp_smoker = self.df_insurance.groupby("smoker")
        self.grp_region_smoker = self.df_insurance.groupby(["region", "smoker"])

    def counts(self):
        """Count individuals per group and return a printable report.

        Sets ``males``, ``females``, ``smokers`` and ``nonsmokers`` (scalar
        counts) plus ``count_sex``, ``count_region``, ``count_smokers`` and
        ``count_region_smokers`` (per-group Series), all computed from this
        instance's own data.

        Returns
        -------
        str
            The per-group counts formatted as text.
        """
        sex = self.df_insurance["sex"]
        smoker = self.df_insurance["smoker"]
        # Summing a boolean mask counts matches and yields 0 (rather than an
        # error) when a category is absent from this subset of the data.
        self.males = (sex == "male").sum()
        self.females = (sex == "female").sum()
        self.smokers = (smoker == "yes").sum()
        self.nonsmokers = (smoker == "no").sum()
        self.count_region_smokers = self.grp_region_smoker['age'].count()
        self.count_sex = self.grp_sex['age'].count()
        self.count_region = self.grp_region['age'].count()
        self.count_smokers = self.grp_smoker['age'].count()
        return (f"Number of Males and Females: \n{self.count_sex} \n\nThe number of individuals in each region:\n{self.count_region}\n\n" +
        f"The number of smokers and non-smokers:\n{self.count_smokers}\n\n" + f"The number of smokers in each region:\n{self.count_region_smokers}")

    def averages(self):
        """Compute rounded means and return a printable report.

        Sets ``mean`` (Series of overall means), ``age_mean``, ``bmi_mean``,
        ``children_mean``, ``charges_mean`` (scalars), and ``mean_region`` /
        ``mean_sex`` (per-group DataFrames).  Every value is rounded to two
        decimals.

        Returns
        -------
        str
            The overall and grouped averages formatted as text.
        """
        columns = self._NUMERIC_COLUMNS
        # average of the entire dataset
        self.mean = self.df_insurance[columns].mean().round(2)
        # individual column averages (unpacked in the order of `columns`)
        self.age_mean, self.bmi_mean, self.children_mean, self.charges_mean = self.mean
        # averages by groups
        self.mean_region = self.grp_region[columns].mean().round(2)
        self.mean_sex = self.grp_sex[columns].mean().round(2)
        return (str.format('''The averages are as follows:\nAverage age is {} \nAverage bmi is {} \nAverage num of children is {}\nAverage insurance charges are ${}\n\n''', self.age_mean, self.bmi_mean, self.children_mean, self.charges_mean) +
            str.format("The averages grouped by Region: \n{}", self.mean_region) + "\n\n" +
            str.format("The averages grouped by Sex: \n{}", self.mean_sex))

    def __repr__(self):
        """Return the ``describe()`` summary of the data under a short heading."""
        return "Here are basic statistics on the dataset: \n" + str(self.df_insurance.describe())
    
ins1 = medical_info("insurance.csv")
# Each report is followed by the same blank-line spacing as before,
# expressed through `end=` instead of separate spacer prints.
print(ins1, end="\n\n\n")
print(ins1.counts(), end="\n\n\n")
print(ins1.averages(), end="\n\n\n\n")


Here are basic statistics on the dataset: 
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010


Number of Males and Females: 
sex
female    662
male      676
Name: age, dtype: int64 

The number of individuals in each region:
region
northeast    324
northwest    325
southeast    364
southwest    325
Name: age, dtype: int64

The number of smokers and non-smokers:
smoker
no     1064
yes     274
Name: age, dtype: int64

The number of smokers in each region:
region     smoker
northeast  no        257
           yes    

## Separate by age brackets

In [3]:
#function runs on dataframe and creates a list of variables storing each age bracket
def age_brackets(data):
    """Split ``data`` into five frames by its "age" column.

    The brackets are: under 20, 20-29, 30-39, 40-49, and 50 or older.

    Returns a list of the five sub-frames ordered from youngest to oldest.
    """
    # Lower bounds of every bracket except the youngest one.
    cutoffs = [20, 30, 40, 50]
    ages = data["age"]

    youngest = data.loc[ages < cutoffs[0]]
    # Half-open ranges [low, high) built from consecutive cut points.
    middle = [
        data.loc[(ages >= low) & (ages < high)]
        for low, high in zip(cutoffs, cutoffs[1:])
    ]
    oldest = data.loc[ages >= cutoffs[-1]]

    return [youngest, *middle, oldest]

age_breakdown_list = age_brackets(ins1.df_insurance)
# Show the mean of the numeric columns for every age bracket, youngest first
numeric_columns = ['age', 'bmi', 'children', 'charges']
for bracket_df in age_breakdown_list:
    bracket_means = bracket_df[numeric_columns].mean()
    print(bracket_means)

age           18.496350
bmi           29.971496
children       0.437956
charges     8407.349242
dtype: float64
age           24.467857
bmi           29.786964
children       0.967857
charges     9561.751018
dtype: float64
age            34.420233
bmi            30.443833
children        1.501946
charges     11738.784117
dtype: float64
age            44.573477
bmi            30.709642
children        1.415771
charges     14399.203564
dtype: float64
age            56.602597
bmi            31.660065
children        0.916883
charges     17902.552070
dtype: float64


In [4]:
# Report the summary statistics, averages, and counts for each age bracket
for bracket_df in age_breakdown_list:
    # Separate name for the wrapper so the loop variable is not rebound
    bracket_info = medical_info(bracket_df)
    print(bracket_info, end="\n\n\n")
    print(bracket_info.averages(), end="\n\n\n")
    print(bracket_info.counts())
    print("----------\n----------\n")


Here are basic statistics on the dataset: 
              age         bmi    children       charges
count  137.000000  137.000000  137.000000    137.000000
mean    18.496350   29.971496    0.437956   8407.349242
std      0.501821    6.563831    0.881687  11418.833134
min     18.000000   15.960000    0.000000   1121.873900
25%     18.000000   25.175000    0.000000   1634.573400
50%     18.000000   30.030000    0.000000   2138.070700
75%     19.000000   34.400000    1.000000  13747.872350
max     19.000000   53.130000    5.000000  39722.746200


The averages are as follows:
Average age is 18.5 
Average bmi is 29.97 
Average num of children is 0.44
Average insurance charges are $8407.35

The averages grouped by Region: 
             age    bmi  children  charges
region                                    
northeast  18.00  28.56      0.41  7558.73
northwest  19.00  28.64      0.47  9479.64
southeast  18.08  33.89      0.45  8844.51
southwest  19.00  27.83      0.42  7543.20

The averages gr

### BMI difference function

In [5]:
# function for seeing the difference between each age bracket's BMI
def bmi_difference(firstage, secondage):
    """Print the difference in mean BMI between two data sets.

    Each argument is passed to ``medical_info``, so it may be anything that
    class accepts (for example one of the age-bracket dataframes).

    Prints the subtraction "<first mean> - <second mean> = " followed by the
    result on the next line.
    """
    first_info = medical_info(firstage)
    first_info.averages()  # populates bmi_mean
    second_info = medical_info(secondage)
    second_info.averages()
    print(str(first_info.bmi_mean) + " - " + str(second_info.bmi_mean) + " = ")
    # Both means are already rounded to 2 decimals; round the difference as
    # well so binary floating-point noise (e.g. -0.740000000000002) is not shown.
    print(round(first_info.bmi_mean - second_info.bmi_mean, 2))

# Compare the mean BMI of the under-20 bracket (index 0) with the 40-49 bracket (index 3)
bmi_difference(age_breakdown_list[0], age_breakdown_list[3])

29.97 - 30.71 = 
-0.740000000000002


In [6]:
# Mean charges per sex, expressed as a percentage of the overall mean charge
df_sex = df_insurance.groupby("sex")
df_charges_sex = df_sex["charges"].mean()
# Take the baseline from the data rather than a hard-coded 13270.42, so the
# percentages stay correct if the dataset changes.
overall_charges_mean = df_insurance["charges"].mean()
print(df_charges_sex)
for charge in df_charges_sex:
    print(str(round(charge / overall_charges_mean * 100)) + "% of Average")

sex
female    12569.578844
male      13956.751178
Name: charges, dtype: float64
95% of Average
105% of Average
