In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [106]:
survey = pd.read_csv('./SBU_example_Surveydata_2014.csv', encoding = "ISO-8859-1")
print(survey.shape)
survey.head()

(6865, 120)


Unnamed: 0,STATE,GEOSTR,DENSTR2,PRECALL,REPNUM,REPDEPTH,FMONTH,IDATE,IMONTH,IDAY,...,HCVPRIME,HCVPRIMA,HLTHPREG,PREGEVER,BRTHCNTL3,TYPCNTRL2,NOBCUSE2,PFCHLDFT,PRNTLVIT,REGION
0,New York,201,D,To be called,40136,28,April,4252014,4,25,...,Not asked or Missing,Not asked or Missing,Not asked or Missing,Not asked or Missing,,,,Not asked or Missing,,NYS exclusive of NYC
1,New York,202,D,To be called,10105,17,January,3162014,3,16,...,Not asked or Missing,Not asked or Missing,Not asked or Missing,Not asked or Missing,,,,Not asked or Missing,,New York City (NYC)
2,New York,201,D,To be called,110014,4,November,11292014,November,29,...,Not asked or Missing,Not asked or Missing,Not asked or Missing,Not asked or Missing,,,,Not asked or Missing,,NYS exclusive of NYC
3,New York,201,D,To be called,110083,6,November,11252014,November,25,...,Not asked or Missing,Not asked or Missing,Not asked or Missing,Not asked or Missing,,,,Not asked or Missing,,NYS exclusive of NYC
4,New York,202,D,To be called,50129,7,May,10052014,October,5,...,Not asked or Missing,Not asked or Missing,Not asked or Missing,Not asked or Missing,,,,Not asked or Missing,,New York City (NYC)


In [107]:
features = ['NUMADULT', 'GENHLTH', 'HLTHPLN1', 'EXERANY2',
            'AGE', 'MARITAL', 'EDUCA', 'SEX', 'PREGNANT', 'PDIABTST', 'PREDIAB1', 'BMI5CAT', 'CHLDCNT', 'DRNKANY5', 'DRNKDY4',
            'SSBSUGAR', 'SSBFRUT2', 'LIFECHG', 'DIABETE3']
data = survey[features]

In [108]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6865 entries, 0 to 6864
Data columns (total 19 columns):
NUMADULT    4696 non-null float64
GENHLTH     6865 non-null object
HLTHPLN1    6865 non-null object
EXERANY2    6865 non-null object
AGE         6865 non-null object
MARITAL     6865 non-null object
EDUCA       6865 non-null object
SEX         6865 non-null object
PREGNANT    6865 non-null object
PDIABTST    6865 non-null object
PREDIAB1    6865 non-null object
BMI5CAT     6865 non-null object
CHLDCNT     6865 non-null object
DRNKANY5    6865 non-null object
DRNKDY4     6865 non-null object
SSBSUGAR    6865 non-null object
SSBFRUT2    6865 non-null object
LIFECHG     6865 non-null object
DIABETE3    6865 non-null object
dtypes: float64(1), object(18)
memory usage: 1019.1+ KB


In [87]:
data.head()

Unnamed: 0,NUMADULT,GENHLTH,HLTHPLN1,EXERANY2,AGE,MARITAL,EDUCA,SEX,PREGNANT,PDIABTST,PREDIAB1,BMI5CAT,CHLDCNT,DRNKANY5,DRNKDY4,SSBSUGAR,SSBFRUT2,LIFECHG,DIABETE3
0,,Very good,Yes,No,Age 65 or older,Divorced,College 1 year to 3 years (Some college or tec...,Male,Not asked or Missing,Not asked or Missing,Not asked or Missing,Overweight,One child in household,No,Did not drink,Not asked or Missing,Not asked or Missing,Not asked or Missing,No
1,,Very good,Yes,Yes,Age 65 or older,Divorced,College 4 years or more (College graduate),Female,Not asked or Missing,Not asked or Missing,Not asked or Missing,Normal Weight,No children in household,No,Did not drink,Not asked or Missing,Not asked or Missing,Not asked or Missing,No
2,,Good,Yes,Yes,Age 35 - 44,Married,College 4 years or more (College graduate),Female,No,Not asked or Missing,Not asked or Missing,Normal Weight,One child in household,Yes,Dont know/Not sure/Refused/Missing,Not asked or Missing,Not asked or Missing,Not asked or Missing,No
3,,Very good,Yes,Yes,Age 65 or older,Married,College 4 years or more (College graduate),Male,Not asked or Missing,Not asked or Missing,Not asked or Missing,Overweight,No children in household,Yes,Number of drinks per day,Not asked or Missing,Not asked or Missing,Not asked or Missing,No
4,,Excellent,Yes,Yes,Age 35 - 44,Married,College 4 years or more (College graduate),Female,No,Not asked or Missing,Not asked or Missing,Overweight,No children in household,No,Did not drink,Not asked or Missing,Not asked or Missing,Not asked or Missing,No


In [188]:
# factorize DIABETES into health_status
data = data[data.DIABETE3.isin(['No', 'Yes'])]
cat = pd.Categorical(data.DIABETE3, categories=['No', 'Yes'], ordered=True)
labels, unique = pd.factorize(cat, sort=True)
print(unique)
data['health_status'] = labels

# fill NA NUMADULT
data.NUMADULT.fillna(0, inplace=True)

# Process GENHLTH
data = data[data.GENHLTH.isin(['Very good', 'Good', 'Excelent', 'Fair', 'Poor'])]

# Process HLTHPLN1
data = data[data.HLTHPLN1.isin(['No', 'Yes'])]
cat = pd.Categorical(data.HLTHPLN1, categories=['No', 'Yes'], ordered=True)
labels, unique = pd.factorize(cat, sort=True)
data['health_coverage'] = labels

# Process EXERANY2
data = data[data.EXERANY2.isin(['No', 'Yes'])]
cat = pd.Categorical(data.EXERANY2, categories=['No', 'Yes'], ordered=True)
labels, unique = pd.factorize(cat, sort=True)
data['physical_activities'] = labels

# Process Age
data = data[~data.AGE.isin(['Refused'])]

# Process MARITAL
data = data[~data.MARITAL.isin(['Refused', 'Not asked or Missing'])]
data.MARITAL[data.MARITAL == 'A member of an unmarried couple'] = 'Couple'

# Process EDUCA
def education_map(x):
    if x == 'College 4 years or more (College graduate)': return 'College graduate'
    elif x == 'Grade 12 or GED (High school graduate)': return 'High school graduate'
    elif x == 'College 1 year to 3 years (Some college or technical school)': return 'Some college or technical school'
    elif x == 'Grades 9 through 11 (Some high school)': return 'Some high school'
    elif x == 'Grades 1 through 8 (Elementary)': return 'Elementary'
    else: return 'No Education'

data = data[~data.EDUCA.isin(['Refused', 'Not asked or Missing'])]
data['education'] = data.EDUCA.apply(education_map)

# Process Sex
cat = pd.Categorical(data.SEX, categories=['Female', 'Male'], ordered=True)
labels, unique = pd.factorize(cat, sort=True)
data['SEX'] = labels

# Process PDIABTST
data = data[data.PDIABTST.isin(['No', 'Yes', 'Not asked or Missing'])]

# Process PREDIAB1
data = data[data.PREDIAB1.isin(['No', 'Yes', 'Not asked or Missing'])]

# Process BMI5CAT
data = data[~data.BMI5CAT.isin(["Don't know/Not sure/Missing"])]

# Process CHLDCNT
data = data[~data.CHLDCNT.isin(["Don't know/Not sure/Missing"])]

# Process DRNKANY5
data = data[data.DRNKANY5.isin(['No', 'Yes'])]
cat = pd.Categorical(data.DRNKANY5, categories=['No', 'Yes'], ordered=True)
labels, unique = pd.factorize(cat, sort=True)
data['drink30'] = labels

# Process DRNKDY4
data['DRNKDY4'] = data.DRNKDY4.apply(lambda x: 'Did not drink' if x == 'Did not drink' else 'Number of drinks per day')
cat = pd.Categorical(data.DRNKDY4, categories=['Did not drink', 'Number of drinks per day'], ordered=True)
labels, unique = pd.factorize(cat, sort=True)
data['drink_per_day'] = labels

# Process SSBSUGAR
data = data[~data.SSBSUGAR.isin(['Refused', "Don't know/Not sure/Missing"])]

# Process SSBFRUT2
data = data[~data.SSBFRUT2.isin(['Refused', "Don't know/Not sure/Missing"])]

# Process LIFECHG
data = data[~data.LIFECHG.isin(['Refused'])]

[No, Yes]
Categories (2, object): [No < Yes]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [116]:
survey.DIABETE3.value_counts()

No                                           5848
Yes                                           842
No pre-diabetes or borderline diabetes        110
Yes but female told only during pregnancy      51
Refused                                         7
Dont know/Not Sure                             7
Name: DIABETE3, dtype: int64

In [174]:
print(data.health_status.value_counts())

0    3937
1     735
Name: health_status, dtype: int64


In [194]:
data.SSBFRUT2.value_counts()

Not asked or Missing    2500
Never                   1140
Times per week           298
Times per month          288
Times per day            201
Dont know/Not sure       15
Refused                    8
Name: SSBFRUT2, dtype: int64

In [179]:
survey.DRNKDY4.value_counts()

Number of drinks per day               3470
Did not drink                          2791
Dont know/Not sure/Refused/Missing     604
Name: DRNKDY4, dtype: int64

In [161]:
data.columns

Index(['NUMADULT', 'GENHLTH', 'HLTHPLN1', 'EXERANY2', 'AGE', 'MARITAL',
       'EDUCA', 'SEX', 'PREGNANT', 'PDIABTST', 'PREDIAB1', 'BMI5CAT',
       'CHLDCNT', 'DRNKANY5', 'DRNKDY4', 'SSBSUGAR', 'SSBFRUT2', 'LIFECHG',
       'DIABETE3', 'health_status', 'health_coverage', 'physical_activities',
       'education', 'blood_sugar'],
      dtype='object')

In [186]:
data.DRNKDY4.apply(lambda x: 'Did not drink' if x == 'Did not drink' else 'Number of drinks per day').value_counts()

Number of drinks per day    2668
Did not drink               2004
Name: DRNKDY4, dtype: int64