# Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Importing data to open in pandas

raw_data = 'LLCP2018.XPT'

cdc_df = pd.read_sas(raw_data, format='xport')

pd.set_option('display.max_columns', 500)

#changing column names to lower case and removing the dash

cdc_df.columns= cdc_df.columns.map(lambda x: x.replace('_',''))
cdc_df.columns= cdc_df.columns.map(lambda x: x.lower())

In [4]:
# subset of the raw data that will be used for this project 

cdc = cdc_df[['age80', 'sex1', 'imprace', 'marital', 'weight2', 'htin4','rfbmi5', 
              'educag', 'income2', 'renthom1', 'employ1', 'genhlth',
              
              'rfbing5', 'hcvu651', 'checkup1','persdoc2', 'totinda', 'rfbing5', 'smoker3',  
              'drnkany5', 
              
              'sleptim1', 'michd', 'chcocncr', 'addepev2',
              'cvdstrk3', 'diabete3', 'asthma3', 'cvdinfr4', 'chcscncr', 'chccopd1', 'chckdny1' ,'drdxar1'
             ]]

cdc.head(2)

Unnamed: 0,age80,sex1,imprace,marital,weight2,htin4,rfbmi5,educag,income2,renthom1,employ1,genhlth,rfbing5,hcvu651,checkup1,persdoc2,totinda,rfbing5.1,smoker3,drnkany5,sleptim1,michd,chcocncr,addepev2,cvdstrk3,diabete3,asthma3,cvdinfr4,chcscncr,chccopd1,chckdny1,drdxar1
0,80.0,2.0,1.0,3.0,130.0,64.0,1.0,4.0,6.0,1.0,2.0,2.0,1.0,9.0,1.0,1.0,2.0,1.0,4.0,2.0,7.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,1.0,2.0,2.0,1.0
1,33.0,2.0,2.0,5.0,200.0,65.0,2.0,4.0,4.0,2.0,1.0,3.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,5.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0


In [5]:
cdc.genhlth.value_counts()

2.0    142197
3.0    138321
1.0     71893
4.0     60762
5.0     23120
7.0       800
9.0       318
Name: genhlth, dtype: int64

In [6]:
# 7 and 9 are used as place holders in the survey (7-i dont know, 9 -refused to answer, they have been either changed 
#a nan value and after dropped or filled with the median or mean value )

#RACE
#1 white  #2 black   #3 Asian   #4native american   #5hispanic   #6 other 

#For sex, replacing placeholders with the median value
for val in cdc.sex1.values:
    if val > 2:
        cdc.sex1 = cdc.sex1.replace({val: cdc.sex1.median()})
        

#for marital status 1) married  2) divorced 3)widowed 4)single
cdc.marital = cdc.marital.fillna(value=cdc.marital.median())
cdc.marital = cdc.marital.replace({5:4})
cdc.marital = cdc.marital.replace({6:4})
cdc.marital = cdc.marital.replace({9:4})


#Dealing with NaN values for weight and height

cdc.htin4 = cdc.htin4.fillna(value=cdc.htin4.mean()) 


#Overweight
cdc.rfbmi5 = cdc.rfbmi5.replace({9:np.nan})


#Education
cdc.educag = cdc.educag.fillna(value=cdc.educag.median())


#binge drinkers
cdc.rfbing5 = cdc.rfbing5.replace({9:cdc.rfbing5.median()})

#Smoker, former smoker or not 

cdc.smoker3 = cdc.smoker3.replace({2:1})
cdc.smoker3 = cdc.smoker3.replace({4:0})
cdc.smoker3 = cdc.smoker3.replace({9:np.nan})
cdc.smoker3 = cdc.smoker3.replace({3:0.5})

#Rent or own a home
cdc.renthom1 = cdc.renthom1.replace({9:0})
cdc.renthom1 = cdc.renthom1.replace({7:cdc.renthom1.median()})
cdc.renthom1 = cdc.renthom1.replace({2:0})
cdc.renthom1 = cdc.renthom1.replace({3:0})
cdc.renthom1 = cdc.renthom1.fillna(value=cdc.renthom1.median())


#Primary physician Do you have a personal doctor or health care provider?
cdc.persdoc2 = cdc.persdoc2.replace({2:1})
cdc.persdoc2 = cdc.persdoc2.replace({3:0})
cdc.persdoc2 = cdc.persdoc2.replace({9:cdc.persdoc2.median()})
cdc.persdoc2 = cdc.persdoc2.replace({7:cdc.persdoc2.median()})


#Income
cdc.income2 = cdc.income2.replace({1:0})# 0 for income 25k or less
cdc.income2 = cdc.income2.replace({2:0}) #0 for income 25k or less
cdc.income2 = cdc.income2.replace({3:0}) #0 for income 25k or less
cdc.income2 = cdc.income2.replace({4:0}) #0 for income 25k or less
cdc.income2 = cdc.income2.replace({5:1}) #1 for income 50k or less
cdc.income2 = cdc.income2.replace({6:1}) #1 for income 50k or less
cdc.income2 = cdc.income2.replace({7:2}) #2 for income 75k or less
cdc.income2 = cdc.income2.replace({8:3}) #3 for income 75k or more
cdc.income2 = cdc.income2.fillna(cdc.income2.median())

#Employment Status
cdc.employ1 = cdc.employ1.replace({2:1})
cdc.employ1 = cdc.employ1.replace({3:0})
cdc.employ1 = cdc.employ1.replace({4:0})
cdc.employ1 = cdc.employ1.replace({5:0})
cdc.employ1 = cdc.employ1.replace({6:0})
cdc.employ1 = cdc.employ1.replace({8:0})
cdc.employ1 = cdc.employ1.replace({7:0.5})
cdc.employ1 = cdc.employ1.replace({9:0})

#general health status

cdc.genhlth = cdc.genhlth.replace({9:cdc.genhlth.median()})
cdc.genhlth = cdc.genhlth.replace({7:cdc.genhlth.median()})
cdc.genhlth = cdc.genhlth.replace({2:1})



#Health Care coverage

cdc.hcvu651 = cdc.hcvu651.replace({2:0})
cdc.hcvu651 = cdc.hcvu651.replace({9:cdc.hcvu651.median()})


#for check up 1 is last routine check up within the past year 
#2- within the past two years
#3- within the past five years
#4- 5 or more years

cdc.checkup1 = cdc.checkup1.replace({7:cdc.checkup1.median()})
cdc.checkup1 = cdc.checkup1.replace({8:cdc.checkup1.median()})
cdc.checkup1 = cdc.checkup1.replace({9:np.nan})
cdc.checkup1 = cdc.checkup1.fillna(value=cdc.checkup1.median())


#DO HAVE ASTHMA

cdc.asthma3 = cdc.asthma3.replace({9: 0})
cdc.asthma3 = cdc.asthma3.replace({7: 0})
cdc.asthma3 = cdc.asthma3.replace({2:0})


#filling placeholders (9, 7) for the question have you had at least one drink of alcohol in the past 30 days?

cdc.drnkany5 = cdc.drnkany5.replace({2:0})
cdc.drnkany5 = cdc.drnkany5.replace({9:cdc.drnkany5.median()})
cdc.drnkany5 = cdc.drnkany5.replace({7:cdc.drnkany5.median()})


#Adults who reported doing physical activity or exercise during the past 30 days
cdc.totinda = cdc.totinda.replace({2:0})
cdc.totinda = cdc.totinda.replace({9:cdc.totinda.median()})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [455]:

#removing outliers by removing data points for weight above 600ibs and height above 85 inches 
cdc = cdc.loc[cdc.weight2 >= 80]
cdc = cdc.loc[cdc.weight2 <= 700]
cdc = cdc.loc[cdc.htin4 <= 90]
cdc = cdc.loc[cdc.htin4 >= 45]

cdc.weight2.max()

663.0

In [456]:
# removing outliers for average sleep time (more than 24 hours is excluded)

for val in cdc.sleptim1.values:
    if val > 24:
        cdc.sleptim1 = cdc.sleptim1.replace({val:cdc.sleptim1.median()})

cdc.sleptim1 = cdc.sleptim1.fillna(value=cdc.sleptim1.mean())

In [457]:
cdc.head(2)

Unnamed: 0,age80,sex1,imprace,marital,weight2,htin4,rfbmi5,educag,income2,renthom1,employ1,genhlth,rfbing5,hcvu651,checkup1,persdoc2,totinda,rfbing5.1,smoker3,drnkany5,sleptim1,michd,chcocncr,addepev2,cvdstrk3,diabete3,asthma3,cvdinfr4,chcscncr,chccopd1,chckdny1,drdxar1
0,80.0,2.0,1.0,3.0,130.0,64.0,1.0,4.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,7.0,2.0,2.0,2.0,2.0,3.0,0.0,2.0,1.0,2.0,2.0,1.0
1,33.0,2.0,2.0,4.0,200.0,65.0,2.0,4.0,0.0,0.0,1.0,3.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,5.0,2.0,2.0,2.0,2.0,3.0,0.0,2.0,2.0,2.0,2.0,2.0


In [459]:
#renaming columns with columns names that are easier to understand 

cdc.columns =['age', 'sex', 'race', 'martial', 'weight', 'height', 'overweight', 
              'education', 'income', 'home','employment', 'health', 'binge','coverage', 
              'checkup', 'primaryphysician','exercise', 'heavyalcohol', 'smoker',
              
              'any_alcohol', 'sleep_avg', 'heart_dis', 'cancer', 
              'depression', 'stroke', 'diabete3', 'asthma3', 'heart_attack', 
              'skin_cancer', 'chronic_bronchitis','kidney_disease', 'arthritis']



In [460]:
cdc.health_coverage.value_counts()

AttributeError: 'DataFrame' object has no attribute 'health_coverage'

In [461]:
#9 AND 7 ARE PLACE HOLDERS IN THE DATA SET THEREFORE I AM REMOVING THEM TO REMOVE MISSING VALUES


cdc.chronic_bronchitis = cdc.chronic_bronchitis.replace({9:np.nan})
cdc.chronic_bronchitis = cdc.chronic_bronchitis.replace({7:np.nan})
cdc.chronic_bronchitis = cdc.chronic_bronchitis.replace({2:0})

cdc.heart_dis = cdc.heart_dis.replace({2:0})


cdc.heart_attack = cdc.heart_attack.replace({7:np.nan})
cdc.heart_attack = cdc.heart_attack.replace({2:0})



cdc.skin_cancer = cdc.skin_cancer.replace({9: np.nan})
cdc.skin_cancer = cdc.skin_cancer.replace({7: np.nan})
cdc.skin_cancer = cdc.skin_cancer.replace({2:0})

cdc.cancer = cdc.cancer.replace({9:np.nan})
cdc.cancer = cdc.cancer.replace({2:0})
cdc.cancer = cdc.cancer.replace({7:np.nan})

cdc.stroke = cdc.stroke.replace({9:np.nan})
cdc.stroke = cdc.stroke.replace({7:0})
cdc.stroke = cdc.stroke.replace({2:0})


cdc.kidney_disease = cdc.kidney_disease.replace({9:np.nan})
cdc.kidney_disease = cdc.kidney_disease.replace({7:0})
cdc.kidney_disease = cdc.kidney_disease.replace({2:0})


cdc.depression = cdc.depression.replace({9:np.nan})
cdc.depression = cdc.depression.replace({7:0})
cdc.depression = cdc.depression.replace({2:0})


#1 Did not graduate High School #2 Graduated High School
#3 Attended College or Technical School #4 Graduated from College 
cdc.education = cdc.education.replace({9:cdc.education.median()})

cdc.arthritis = cdc.arthritis.replace({2:0})

cdc.binge = cdc.binge.replace({9:cdc.binge.median()})


In [462]:
cdc.shape

(407359, 32)

# Feature engineering

BMI using weight and height 

In [463]:
cdc.head(2)

Unnamed: 0,age,sex,race,martial,weight,height,overweight,education,income,home,employment,health,binge,coverage,checkup,primaryphysician,exercise,heavyalcohol,smoker,any_alcohol,sleep_avg,heart_dis,cancer,depression,stroke,diabete3,asthma3,heart_attack,skin_cancer,chronic_bronchitis,kidney_disease,arthritis
0,80.0,2.0,1.0,3.0,130.0,64.0,1.0,4.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,1.0
1,33.0,2.0,2.0,4.0,200.0,65.0,2.0,4.0,0.0,0.0,1.0,3.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,5.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [464]:
cdc['bmi'] = cdc.weight/cdc.height**2*703

Obese using bmi greater than 30 as defined by the CDC

In [465]:
cdc['obese'] = np.where(cdc['bmi']>=30, 1, 0)

In [466]:
115/64**2*703

19.737548828125

In [467]:
#Dealing with outliers and misssing values for diabetes
for val in cdc.diabete3.values:
    if val ==3:
        cdc.diabete3 = cdc.diabete3.replace({val:0})
    if val == 4:
         cdc.diabete3 = cdc.diabete3.replace({val:1})
    if val == 2:
         cdc.diabete3 = cdc.diabete3.replace({val:1})
    if val == 7:
        cdc.diabete3 = cdc.diabete3.replace({val:np.nan})
    if val == 9:
        cdc.diabete3 = cdc.diabete3.replace({val:np.nan})
        

cdc.diabete3 = cdc.diabete3.fillna(value=cdc.diabete3.median())

In [468]:
cdc.head()

Unnamed: 0,age,sex,race,martial,weight,height,overweight,education,income,home,employment,health,binge,coverage,checkup,primaryphysician,exercise,heavyalcohol,smoker,any_alcohol,sleep_avg,heart_dis,cancer,depression,stroke,diabete3,asthma3,heart_attack,skin_cancer,chronic_bronchitis,kidney_disease,arthritis,bmi,obese
0,80.0,2.0,1.0,3.0,130.0,64.0,1.0,4.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,22.312012,0
1,33.0,2.0,2.0,4.0,200.0,65.0,2.0,4.0,0.0,0.0,1.0,3.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.278107,1
2,76.0,2.0,1.0,3.0,142.0,58.0,2.0,2.0,0.0,1.0,0.5,5.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,7.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,29.674792,0
3,66.0,1.0,1.0,2.0,190.0,70.0,2.0,2.0,0.0,1.0,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.259184,0
5,80.0,2.0,1.0,3.0,172.0,62.0,2.0,3.0,99.0,1.0,0.5,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,31.455775,1


In [480]:
cdc.dropna(axis=0, subset=('heart_dis', 'cancer', 
                           'chronic_bronchitis', 'heart_attack', 
                           'diabete3', 'asthma3', 'kidney_disease', 'depression', 
                            'skin_cancer', 'arthritis', 'primaryphysician', 
                           'employment', 'education', 'coverage', 'stroke', 'smoker', 'overweight', 'health' ), inplace=True)

In [481]:
cdc.obese.value_counts()

0    261982
1    122002
Name: obese, dtype: int64

In [482]:
cdc.shape

(383984, 34)

In [483]:
cdc.isna().sum()

age                   0
sex                   0
race                  0
martial               0
weight                0
height                0
overweight            0
education             0
income                0
home                  0
employment            0
health                0
binge                 0
coverage              0
checkup               0
primaryphysician      0
exercise              0
heavyalcohol          0
smoker                0
any_alcohol           0
sleep_avg             0
heart_dis             0
cancer                0
depression            0
stroke                0
diabete3              0
asthma3               0
heart_attack          0
skin_cancer           0
chronic_bronchitis    0
kidney_disease        0
arthritis             0
bmi                   0
obese                 0
dtype: int64

In [484]:
cdc.head()

%store cdc

Stored 'cdc' (DataFrame)
