In [337]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier

# Feature importance
[Feature desciptions](https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf)

In [355]:
data_2015 = pd.read_csv("../data/2015.csv")

In [356]:
y = data_2015["_MICHD"].dropna() # Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI), combination of CVDINFR4 and  CVDCRHD4 features

# _MICHD calculation:
# 1 Reported having MI or CHD
# Notes: CVDINFR4=1 OR CVDCRHD4=1
# 2 Did not report having MI or CHD
# Notes: CVDINFR4=2 AND CVDCRHD4=2
# BLANK Not asked or Missing
# Notes: CVDINFR4=7, 9 OR MISSING OR CVDCRHD4=7, 9, OR MISSING

In [357]:
# remove observations where _MICHD is missing
data_2015 = data_2015[~data_2015["_MICHD"].isna()]
# drop target, and features used to create target, from features
data_2015 = data_2015.drop(["_MICHD", "CVDINFR4", "CVDCRHD4"], axis=1)

In [358]:
data_2015.head()

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENUM,PVTRESD1,COLGHOUS,STATERES,CELLFON3,LADULT,NUMADULT,NUMMEN,NUMWOMEN,CTELNUM1,CELLFON2,CADULT,PVTRESD2,CCLGHOUS,CSTATE,LANDLINE,HHADULT,GENHLTH,PHYSHLTH,MENTHLTH,POORHLTH,HLTHPLN1,PERSDOC2,MEDCOST,CHECKUP1,BPHIGH4,BPMEDS,BLOODCHO,CHOLCHK,TOLDHI2,CVDSTRK3,ASTHMA3,ASTHNOW,CHCSCNCR,CHCOCNCR,CHCCOPD1,HAVARTH3,ADDEPEV2,CHCKIDNY,DIABETE3,DIABAGE2,SEX,MARITAL,EDUCA,RENTHOM1,NUMHHOL2,NUMPHON2,CPDEMO1,VETERAN3,EMPLOY1,CHILDREN,INCOME2,INTERNET,WEIGHT2,HEIGHT3,PREGNANT,QLACTLM2,USEEQUIP,BLIND,DECIDE,DIFFWALK,DIFFDRES,DIFFALON,SMOKE100,SMOKDAY2,STOPSMK2,LASTSMK2,USENOW3,ALCDAY5,AVEDRNK2,DRNK3GE5,MAXDRNKS,FRUITJU1,FRUIT1,FVBEANS,FVGREEN,FVORANG,VEGETAB1,EXERANY2,EXRACT11,EXEROFT1,EXERHMM1,EXRACT21,EXEROFT2,EXERHMM2,STRENGTH,LMTJOIN3,ARTHDIS2,ARTHSOCL,JOINPAIN,SEATBELT,FLUSHOT6,FLSHTMY2,IMFVPLAC,PNEUVAC3,HIVTST6,HIVTSTD3,WHRTST10,PDIABTST,PREDIAB1,INSULIN,BLDSUGAR,FEETCHK2,DOCTDIAB,CHKHEMO3,FEETCHK,EYEEXAM,DIABEYE,DIABEDU,PAINACT2,QLMENTL2,QLSTRES2,QLHLTH2,CAREGIV1,CRGVREL1,CRGVLNG1,CRGVHRS1,CRGVPRB1,CRGVPERS,CRGVHOUS,CRGVMST2,CRGVEXPT,VIDFCLT2,VIREDIF3,VIPRFVS2,VINOCRE2,VIEYEXM2,VIINSUR2,VICTRCT4,VIGLUMA2,VIMACDG2,CIMEMLOS,CDHOUSE,CDASSIST,CDHELP,CDSOCIAL,CDDISCUS,WTCHSALT,LONGWTCH,DRADVISE,ASTHMAGE,ASATTACK,ASERVIST,ASDRVIST,ASRCHKUP,ASACTLIM,ASYMPTOM,ASNOSLEP,ASTHMED3,ASINHALR,HAREHAB1,STREHAB1,CVDASPRN,ASPUNSAF,RLIVPAIN,RDUCHART,RDUCSTRK,ARTTODAY,ARTHWGT,ARTHEXER,ARTHEDU,TETANUS,HPVADVC2,HPVADSHT,SHINGLE2,HADMAM,HOWLONG,HADPAP2,LASTPAP2,HPVTEST,HPLSTTST,HADHYST2,PROFEXAM,LENGEXAM,BLDSTOOL,LSTBLDS3,HADSIGM3,HADSGCO1,LASTSIG3,PCPSAAD2,PCPSADI1,PCPSARE1,PSATEST1,PSATIME,PCPSARS1,PCPSADE1,PCDMDECN,SCNTMNY1,SCNTMEL1,SCNTPAID,SCNTWRK1,SCNTLPAD,SCNTLWK1,SXORIENT,TRNSGNDR,RCSGENDR,RCSRLTN2,CASTHDX2,CASTHNO2,EMTSUPRT,LSATISFY,ADPLEASR,ADDOWN,ADSLEEP,ADENERGY,ADEAT1,ADFAIL,ADTHINK,ADMOVE,MISTMNT,ADANXEV,QSTVER,QSTLANG,EXACTOT1,EXACTOT2,MSCODE,_STSTR,_STRWT,_RAWRAKE,_WT2RAKE,_CHISPNC,_CRACE1,_CPRACE,_CLLCPWT,_DUALUSE,_DUALCOR,_LLCPWT,_RFHLTH,_HCVU651,_RFHYPE5,_CHOLCHK,_RFCHOL,_LTASTH1,_CASTHM1,_ASTHMS1,_DRDXAR1,_PRACE1,_MRACE1,_HISPANC,_RACE,_RACEG21,_RACEGR3,_RACE_G1,_AGEG5YR,_AGE65YR,_AGE80,_AGE_G,HTIN4,HTM4,WTKG3,_BMI5,_BMI5CAT,_RFBMI5,_CHLDCNT,_EDUCAG,_INCOMG,_SMOKER3,_RFSMOK3,DRNKANY5,DROCDY3_,_RFBING5,_DRNKWEK,_RFDRHV5,FTJUDA1_,FRUTDA1_,BEANDAY_,GRENDAY_,ORNGDAY_,VEGEDA1_,_MISFRTN,_MISVEGN,_FRTRESP,_VEGRESP,_FRUTSUM,_VEGESUM,_FRTLT1,_VEGLT1,_FRT16,_VEG23,_FRUITEX,_VEGETEX,_TOTINDA,METVL11_,METVL21_,MAXVO2_,FC60_,ACTIN11_,ACTIN21_,PADUR1_,PADUR2_,PAFREQ1_,PAFREQ2_,_MINAC11,_MINAC21,STRFREQ_,PAMISS1_,PAMIN11_,PAMIN21_,PA1MIN_,PAVIG11_,PAVIG21_,PA1VIGM_,_PACAT1,_PAINDX1,_PA150R2,_PA300R2,_PA30021,_PASTRNG,_PAREC1,_PASTAE1,_LMTACT1,_LMTWRK1,_LMTSCL1,_RFSEAT2,_RFSEAT3,_FLSHOT6,_PNEUMO2,_AIDTST3
0,1.0,1.0,b'01292015',b'01',b'29',b'2015',1200.0,2015000000.0,2015000000.0,1.0,1.0,,1.0,2.0,,3.0,1.0,2.0,,,,,,,,,5.0,15.0,18.0,10.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,3.0,,2.0,1.0,4.0,1.0,2.0,,1.0,2.0,8.0,88.0,3.0,2.0,280.0,510.0,,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,3.0,,2.0,3.0,888.0,,,,305.0,310.0,320.0,310.0,305.0,101.0,2.0,,,,,,,888.0,1.0,1.0,1.0,6.0,1.0,1.0,112014.0,1.0,1.0,1.0,,,1.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,b'',,,,,,,,,,,,,,,,,,,,,,,,,10.0,1.0,b'',b'',3.0,11011.0,28.78156,3.0,86.344681,,,,,1.0,0.614125,341.384853,2.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,9.0,1.0,63.0,5.0,70.0,178.0,12701.0,4018.0,4.0,2.0,1.0,2.0,2.0,3.0,1.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,17.0,33.0,67.0,33.0,17.0,100.0,5.397605e-79,5.397605e-79,1.0,1.0,50.0,217.0,2.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,2.0,,,2469.0,423.0,,,,,,,,,5.397605e-79,5.397605e-79,,,,,,,4.0,2.0,3.0,3.0,2.0,2.0,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0
1,1.0,1.0,b'01202015',b'01',b'20',b'2015',1100.0,2015000000.0,2015000000.0,1.0,1.0,,1.0,2.0,,1.0,5.397605e-79,1.0,,,,,,,,,3.0,88.0,88.0,,2.0,1.0,1.0,4.0,3.0,,1.0,4.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,3.0,,2.0,2.0,6.0,1.0,2.0,,2.0,2.0,3.0,88.0,1.0,1.0,165.0,508.0,,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,,3.0,888.0,,,,302.0,305.0,302.0,202.0,202.0,304.0,1.0,64.0,212.0,100.0,69.0,212.0,100.0,888.0,,,,,3.0,2.0,,,2.0,2.0,,,2.0,3.0,,,,,,,,,,,,,,2.0,,,,,,,,1.0,,,,,,,,,,1.0,5.0,5.0,,5.0,2.0,2.0,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,2.0,,,,,,,,,,b'',1.0,2.0,,,2.0,60.0,,,,,,,,,,,,,,,,,,,10.0,1.0,b'',b'',5.0,11011.0,28.78156,1.0,28.78156,,,,,9.0,,108.060903,1.0,2.0,1.0,2.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,7.0,1.0,52.0,4.0,68.0,173.0,7484.0,2509.0,3.0,2.0,1.0,4.0,1.0,1.0,2.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,7.0,17.0,7.0,29.0,29.0,13.0,5.397605e-79,5.397605e-79,1.0,1.0,24.0,78.0,2.0,2.0,1.0,1.0,5.397605e-79,5.397605e-79,1.0,35.0,5.397605e-79,2876.0,493.0,1.0,5.397605e-79,60.0,60.0,2800.0,2800.0,168.0,5.397605e-79,5.397605e-79,5.397605e-79,168.0,5.397605e-79,168.0,5.397605e-79,5.397605e-79,5.397605e-79,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,4.0,2.0,2.0,,,2.0
3,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,1.0,,1.0,2.0,,3.0,1.0,2.0,,,,,,,,,5.0,30.0,30.0,30.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,,2.0,1.0,2.0,1.0,1.0,2.0,3.0,,2.0,1.0,4.0,1.0,2.0,,1.0,2.0,8.0,1.0,8.0,2.0,180.0,507.0,,1.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,,,,3.0,888.0,,,,555.0,101.0,555.0,301.0,301.0,201.0,2.0,,,,,,,888.0,1.0,1.0,1.0,8.0,1.0,1.0,777777.0,5.0,1.0,9.0,,,2.0,3.0,,,,,,,,,,,,,,2.0,,,,,,,,2.0,,,,,,,,,,1.0,1.0,1.0,2.0,1.0,1.0,2.0,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,1.0,2.0,1.0,,,,,,,,b'',4.0,7.0,,,,97.0,,,,,,,,,,,,,,,,,,,10.0,1.0,b'',b'',3.0,11011.0,28.78156,3.0,86.344681,,,,,1.0,0.614125,341.384853,2.0,1.0,2.0,1.0,2.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,9.0,1.0,63.0,5.0,67.0,170.0,8165.0,2819.0,3.0,2.0,2.0,2.0,5.0,4.0,1.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,5.397605e-79,100.0,5.397605e-79,3.0,3.0,14.0,5.397605e-79,5.397605e-79,1.0,1.0,100.0,20.0,1.0,2.0,1.0,1.0,5.397605e-79,5.397605e-79,2.0,,,2469.0,423.0,,,,,,,,,5.397605e-79,5.397605e-79,,,,,,,4.0,2.0,3.0,3.0,2.0,2.0,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,9.0
4,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,1.0,,1.0,2.0,,2.0,1.0,1.0,,,,,,,,,5.0,20.0,88.0,30.0,1.0,1.0,2.0,1.0,3.0,,1.0,1.0,2.0,2.0,2.0,,2.0,2.0,2.0,1.0,2.0,2.0,3.0,,2.0,1.0,5.0,1.0,2.0,,2.0,2.0,8.0,88.0,77.0,1.0,142.0,504.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,3.0,888.0,,,,777.0,102.0,203.0,204.0,310.0,320.0,2.0,,,,,,,888.0,1.0,1.0,1.0,7.0,1.0,2.0,,,1.0,1.0,777777.0,1.0,1.0,3.0,,,,,,,,,,,,,,2.0,,,,,,,,7.0,,,,,,,,,,2.0,,,,,,1.0,777.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,1.0,2.0,5.0,,,,,,,,b'',5.0,5.0,,,,45.0,,,,,,,,,,,,,,,,,,,10.0,1.0,b'',b'',3.0,11011.0,28.78156,2.0,57.56312,,,,,9.0,,258.682223,2.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,9.0,1.0,61.0,5.0,64.0,163.0,6441.0,2437.0,2.0,1.0,1.0,3.0,9.0,4.0,1.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,,200.0,43.0,57.0,33.0,67.0,1.0,5.397605e-79,5.397605e-79,1.0,,200.0,9.0,1.0,1.0,1.0,1.0,5.397605e-79,2.0,,,2543.0,436.0,,,,,,,,,5.397605e-79,5.397605e-79,,,,,,,4.0,2.0,3.0,3.0,2.0,2.0,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0
5,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,1.0,,1.0,2.0,,1.0,5.397605e-79,1.0,,,,,,,,,2.0,88.0,88.0,,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,,2.0,2.0,2.0,1.0,2.0,2.0,3.0,,2.0,3.0,3.0,1.0,2.0,,2.0,2.0,2.0,88.0,6.0,2.0,145.0,502.0,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,3.0,888.0,,,,101.0,101.0,102.0,101.0,102.0,101.0,1.0,18.0,101.0,100.0,73.0,107.0,30.0,888.0,1.0,2.0,3.0,4.0,1.0,1.0,112014.0,1.0,1.0,2.0,,,1.0,3.0,,,,,,,,,,,,,,2.0,,,,,,,,2.0,,,,,,,,,,2.0,,,,,,1.0,415.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,1.0,2.0,1.0,,,,,,,,b'',5.0,5.0,1.0,54.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,b'',b'',5.0,11011.0,28.78156,1.0,28.78156,,,,,9.0,,256.518591,1.0,9.0,2.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,11.0,2.0,73.0,6.0,62.0,157.0,6577.0,2652.0,3.0,2.0,1.0,1.0,4.0,4.0,1.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,100.0,100.0,200.0,100.0,200.0,100.0,5.397605e-79,5.397605e-79,1.0,1.0,200.0,600.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,1.0,50.0,33.0,2099.0,360.0,2.0,1.0,60.0,30.0,1000.0,7000.0,60.0,210.0,5.397605e-79,5.397605e-79,120.0,210.0,330.0,60.0,5.397605e-79,60.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,2.0


In [359]:
missing_data_dict = {}
greater_than_50_missing = {}
for col in data_2015.columns:
    missing_data = data_2015[col].isnull().value_counts()
    try:
        missing_data_perc = (missing_data.loc[True]/(missing_data[True] + missing_data[False]))*100
        if missing_data_perc > 50:
            greater_than_50_missing[col] = missing_data_perc
        missing_data_dict[col] = missing_data_perc
    except KeyError:
        # if all values are missing
        try:
            missing_data_perc = (missing_data.loc[True]/(missing_data[True]))*100
            missing_data_dict[col] = 100
            greater_than_50_missing[col] = missing_data_perc
        except KeyError:
        # if no values are missing
            missing_data_dict[col] = 0



In [360]:
# 150 columns are missing more than 50% of values
len(greater_than_50_missing)

150

In [361]:
# remove columns with greater than 50% missingness (not implemented for now)
# data_2015 = data_2015[[col for col in data_2015.columns if col not in greater_than_50_missing]]

In [362]:
set(data_2015.dtypes)

{dtype('float64'), dtype('O')}

In [363]:
for col in data_2015:
    if data_2015[col].dtype == object:
        print(col)

IDATE
IMONTH
IDAY
IYEAR
PCDMDECN
EXACTOT1
EXACTOT2


In [364]:
# drop columns with object datatype (won't be relevant to prediction anyway)
# IDATE/IMONTH/IDAY/IYEAR: Interview date/month/day/year
# PCDMDECN: Prostate Cancer Screening Decision Making, mostly missing
# EXACTOT1: First Activity Other response description, unstructured text
# EXACTOT2: Second Activity Other response description, unstructured text
# The following columns are completely unpopulated
# PAINACT2: During the past 30 days, for about how many days did pain make it hard for you to do your usual activities, such as self-care, work, or recreation? All missing
# QLMENTL2: During the past 30 days, for about how may days have you felt sad, blue, or depressed? All missing
# QLSTRES2: During the past 30 days, for about how many days have you felt worried, tense, or anxious? All missing
# QLHLTH2: During the past 30 days, for about how many days have you felt very healthy and full of energy? All missing
#drop_cols  = ["IDATE", "IMONTH", "IDAY", "IYEAR", "EXACTOT1", "EXACTOT2", "PCDMDECN"]

drop_cols  = ["IDATE", "IMONTH", "IDAY", "IYEAR", "EXACTOT1", "EXACTOT2", "PCDMDECN", "PAINACT2", "QLMENTL2", "QLSTRES2", "QLHLTH2"]

In [365]:
data_2015 = data_2015.drop(drop_cols, axis=1)

In [366]:
# replace missings values with the mode (we may want to refine this in our classifier)
data_2015_impute = data_2015.fillna(data_2015.mode().iloc[0])

In [367]:
data_2015_impute[data_2015_impute.isnull().any(axis=1)]

Unnamed: 0,_STATE,FMONTH,DISPCODE,SEQNO,_PSU,CTELENUM,PVTRESD1,COLGHOUS,STATERES,CELLFON3,LADULT,NUMADULT,NUMMEN,NUMWOMEN,CTELNUM1,CELLFON2,CADULT,PVTRESD2,CCLGHOUS,CSTATE,LANDLINE,HHADULT,GENHLTH,PHYSHLTH,MENTHLTH,POORHLTH,HLTHPLN1,PERSDOC2,MEDCOST,CHECKUP1,BPHIGH4,BPMEDS,BLOODCHO,CHOLCHK,TOLDHI2,CVDSTRK3,ASTHMA3,ASTHNOW,CHCSCNCR,CHCOCNCR,CHCCOPD1,HAVARTH3,ADDEPEV2,CHCKIDNY,DIABETE3,DIABAGE2,SEX,MARITAL,EDUCA,RENTHOM1,NUMHHOL2,NUMPHON2,CPDEMO1,VETERAN3,EMPLOY1,CHILDREN,INCOME2,INTERNET,WEIGHT2,HEIGHT3,PREGNANT,QLACTLM2,USEEQUIP,BLIND,DECIDE,DIFFWALK,DIFFDRES,DIFFALON,SMOKE100,SMOKDAY2,STOPSMK2,LASTSMK2,USENOW3,ALCDAY5,AVEDRNK2,DRNK3GE5,MAXDRNKS,FRUITJU1,FRUIT1,FVBEANS,FVGREEN,FVORANG,VEGETAB1,EXERANY2,EXRACT11,EXEROFT1,EXERHMM1,EXRACT21,EXEROFT2,EXERHMM2,STRENGTH,LMTJOIN3,ARTHDIS2,ARTHSOCL,JOINPAIN,SEATBELT,FLUSHOT6,FLSHTMY2,IMFVPLAC,PNEUVAC3,HIVTST6,HIVTSTD3,WHRTST10,PDIABTST,PREDIAB1,INSULIN,BLDSUGAR,FEETCHK2,DOCTDIAB,CHKHEMO3,FEETCHK,EYEEXAM,DIABEYE,DIABEDU,CAREGIV1,CRGVREL1,CRGVLNG1,CRGVHRS1,CRGVPRB1,CRGVPERS,CRGVHOUS,CRGVMST2,CRGVEXPT,VIDFCLT2,VIREDIF3,VIPRFVS2,VINOCRE2,VIEYEXM2,VIINSUR2,VICTRCT4,VIGLUMA2,VIMACDG2,CIMEMLOS,CDHOUSE,CDASSIST,CDHELP,CDSOCIAL,CDDISCUS,WTCHSALT,LONGWTCH,DRADVISE,ASTHMAGE,ASATTACK,ASERVIST,ASDRVIST,ASRCHKUP,ASACTLIM,ASYMPTOM,ASNOSLEP,ASTHMED3,ASINHALR,HAREHAB1,STREHAB1,CVDASPRN,ASPUNSAF,RLIVPAIN,RDUCHART,RDUCSTRK,ARTTODAY,ARTHWGT,ARTHEXER,ARTHEDU,TETANUS,HPVADVC2,HPVADSHT,SHINGLE2,HADMAM,HOWLONG,HADPAP2,LASTPAP2,HPVTEST,HPLSTTST,HADHYST2,PROFEXAM,LENGEXAM,BLDSTOOL,LSTBLDS3,HADSIGM3,HADSGCO1,LASTSIG3,PCPSAAD2,PCPSADI1,PCPSARE1,PSATEST1,PSATIME,PCPSARS1,PCPSADE1,SCNTMNY1,SCNTMEL1,SCNTPAID,SCNTWRK1,SCNTLPAD,SCNTLWK1,SXORIENT,TRNSGNDR,RCSGENDR,RCSRLTN2,CASTHDX2,CASTHNO2,EMTSUPRT,LSATISFY,ADPLEASR,ADDOWN,ADSLEEP,ADENERGY,ADEAT1,ADFAIL,ADTHINK,ADMOVE,MISTMNT,ADANXEV,QSTVER,QSTLANG,MSCODE,_STSTR,_STRWT,_RAWRAKE,_WT2RAKE,_CHISPNC,_CRACE1,_CPRACE,_CLLCPWT,_DUALUSE,_DUALCOR,_LLCPWT,_RFHLTH,_HCVU651,_RFHYPE5,_CHOLCHK,_RFCHOL,_LTASTH1,_CASTHM1,_ASTHMS1,_DRDXAR1,_PRACE1,_MRACE1,_HISPANC,_RACE,_RACEG21,_RACEGR3,_RACE_G1,_AGEG5YR,_AGE65YR,_AGE80,_AGE_G,HTIN4,HTM4,WTKG3,_BMI5,_BMI5CAT,_RFBMI5,_CHLDCNT,_EDUCAG,_INCOMG,_SMOKER3,_RFSMOK3,DRNKANY5,DROCDY3_,_RFBING5,_DRNKWEK,_RFDRHV5,FTJUDA1_,FRUTDA1_,BEANDAY_,GRENDAY_,ORNGDAY_,VEGEDA1_,_MISFRTN,_MISVEGN,_FRTRESP,_VEGRESP,_FRUTSUM,_VEGESUM,_FRTLT1,_VEGLT1,_FRT16,_VEG23,_FRUITEX,_VEGETEX,_TOTINDA,METVL11_,METVL21_,MAXVO2_,FC60_,ACTIN11_,ACTIN21_,PADUR1_,PADUR2_,PAFREQ1_,PAFREQ2_,_MINAC11,_MINAC21,STRFREQ_,PAMISS1_,PAMIN11_,PAMIN21_,PA1MIN_,PAVIG11_,PAVIG21_,PA1VIGM_,_PACAT1,_PAINDX1,_PA150R2,_PA300R2,_PA30021,_PASTRNG,_PAREC1,_PASTAE1,_LMTACT1,_LMTWRK1,_LMTSCL1,_RFSEAT2,_RFSEAT3,_FLSHOT6,_PNEUMO2,_AIDTST3


In [368]:
# implement ExtraTrees to quickly identify feature importance 
clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf.fit(data_2015_impute, y)
y_pred = clf.predict(data_2015_impute)

In [375]:
clf.score(data_2015_impute, y)

1.0

In [376]:
# https://www.geeksforgeeks.org/ml-extra-tree-classifier-for-feature-selection/
# normalize feature importance
feature_importances_normalized = np.std([tree.feature_importances_ for tree in
                                        clf.estimators_],
                                        axis = 0)

In [377]:
feature_dict = {col: importance for col, importance in zip(data_2015.columns, feature_importances_normalized)}
sorted_feature_dict = sorted(feature_dict.items(), key=lambda x:x[1])

In [378]:
# top 30 most important features
sorted_feature_dict[-30:]

[('SEX', 0.0017850094713372654),
 ('_LMTWRK1', 0.0018339842384877574),
 ('POORHLTH', 0.0019570276743378096),
 ('CVDSTRK3', 0.002349686285096022),
 ('CHILDREN', 0.0026297813631710886),
 ('TOLDHI2', 0.0026389915142640986),
 ('DIABAGE2', 0.0027250142053644962),
 ('_LMTACT1', 0.0027309602300688474),
 ('DIABETE3', 0.0030081384854693063),
 ('PHYSHLTH', 0.0030172869608923207),
 ('QLACTLM2', 0.0030855497701234506),
 ('CHCCOPD1', 0.003114156536604252),
 ('ARTHSOCL', 0.003480707096235616),
 ('_LMTSCL1', 0.004064015659865208),
 ('USEEQUIP', 0.004095811766096816),
 ('PNEUVAC3', 0.004155743113925382),
 ('_AGE_G', 0.004201285315645771),
 ('_RFCHOL', 0.004439549752659385),
 ('MAXVO2_', 0.004870227854982245),
 ('_RFHYPE5', 0.006116612454442506),
 ('_RFHLTH', 0.00617906630625463),
 ('_DRDXAR1', 0.006342762060457415),
 ('_AGE65YR', 0.0069183605922455834),
 ('DIFFWALK', 0.007450298602939572),
 ('GENHLTH', 0.007890274380902992),
 ('BPHIGH4', 0.0079292581571995),
 ('EMPLOY1', 0.007991433571246652),
 ('_AGE