# Data Exploration

In [1]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

Figure prettifying

In [2]:
# Matplotlib defaults for every figure in this notebook:
# automatic layout adjustment and a higher render resolution.
mpl.rcParams['figure.autolayout'] = True
mpl.rcParams['figure.dpi'] = 150

# Switch on seaborn's default theme.
sns.set()

# Reading the data

`respondent_id` is dropped from the labels so it is not duplicated when features and labels are joined; the copy in the features is kept

In [36]:
# Directory holding the raw CSV files, relative to this notebook.
DATA_PATH = '../../data/raw'

# Features: read as-is (the `respondent_id` column is kept here).
X_train = pd.read_csv(os.path.join(DATA_PATH, 'training_set_features.csv'))

# Labels: read, then remove `respondent_id` so only the label columns remain.
y_train = pd.read_csv(os.path.join(DATA_PATH, 'training_set_labels.csv'))
y_train = y_train.drop(columns='respondent_id')

Join features and labels for convenience

In [37]:
# Put the label columns to the right of the feature columns; rows are
# matched on the index of the two frames.
data = pd.concat((X_train, y_train), axis='columns')
data

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,,0,0
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,0,0
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,,0,1
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg,0,0


## Exploration

In [38]:
# Show the dtype pandas assigned to every column of the joined frame.
data.dtypes

respondent_id                    int64
h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

Missing values

In [39]:
# Fraction of missing values per column: the mean of the boolean NaN mask
# equals (number of NaNs) / (number of rows).
data.isna().mean()

respondent_id                  0.000000
h1n1_concern                   0.003445
h1n1_knowledge                 0.004343
behavioral_antiviral_meds      0.002658
behavioral_avoidance           0.007788
behavioral_face_mask           0.000711
behavioral_wash_hands          0.001573
behavioral_large_gatherings    0.003258
behavioral_outside_home        0.003070
behavioral_touch_face          0.004793
doctor_recc_h1n1               0.080878
doctor_recc_seasonal           0.080878
chronic_med_condition          0.036358
child_under_6_months           0.030704
health_worker                  0.030104
health_insurance               0.459580
opinion_h1n1_vacc_effective    0.014640
opinion_h1n1_risk              0.014528
opinion_h1n1_sick_from_vacc    0.014790
opinion_seas_vacc_effective    0.017299
opinion_seas_risk              0.019246
opinion_seas_sick_from_vacc    0.020107
age_group                      0.000000
education                      0.052683
race                           0.000000


- for those categorical/numeric features with a few % missing, we might reasonably be able to impute using KNN
- more than 5% missing in a column is likely a structural issue in the data (there's probably a pattern to these)
and it may be better to create a new category for these

Thus, for the following columns, it might be useful to replace NaNs with `missing`:
- `health_insurance`
- `income_poverty`
- `rent_or_own`
- `employment_industry`
- `employment_occupation`

In [24]:
# Replace NaNs with an explicit 'missing' category, one column at a time.
# NOTE(review): the markdown list before this cell also names `income_poverty`
# and `rent_or_own`, which are not filled here — confirm whether that is intended.
for column in ('health_insurance', 'employment_industry', 'employment_occupation'):
    data[column] = data[column].fillna('missing')

Unique categories for non-obvious columns

In [44]:
# Share of all rows per `hhs_geo_region` value: non-null `respondent_id`
# count in each group divided by the total number of rows.
(
    data
    .groupby('hhs_geo_region')['respondent_id']
    .count()
    / len(data)
)

hhs_geo_region
atmpeygn    0.076122
bhuqouqj    0.106564
dqpwygqj    0.042161
fpwskwrf    0.122253
kbazzjca    0.107013
lrircsnp    0.077807
lzgpxyit    0.160894
mlyzmhmf    0.083985
oxchjgsf    0.107051
qufhixun    0.116149
Name: respondent_id, dtype: float64

In [29]:
# Share of all rows per `age_group` value: non-null `respondent_id`
# count in each group divided by the total number of rows.
(
    data
    .groupby('age_group')['respondent_id']
    .count()
    / len(data)
)

array(['55 - 64 Years', '35 - 44 Years', '18 - 34 Years', '65+ Years',
       '45 - 54 Years'], dtype=object)

In [46]:
# Share of all rows per `employment_industry` value: non-null `respondent_id`
# count in each group divided by the total number of rows.
(
    data
    .groupby('employment_industry')['respondent_id']
    .count()
    / len(data)
)

employment_industry
arjwrbjb    0.032613
atmlpfrs    0.034673
cfqqtusy    0.012169
dotnnunm    0.007526
fcxhlnwr    0.092410
haxffmxo    0.005542
ldnlellj    0.046093
mcubkhph    0.010297
mfikgejo    0.022990
msuufmds    0.004643
nduyfdeo    0.010709
phxvnwax    0.003332
pxcmvdjn    0.038829
qnlwzans    0.000487
rucpziij    0.019583
saaquncn    0.012656
vjjrobsf    0.019733
wlfvacwt    0.008050
wxleyezf    0.067548
xicduogh    0.031864
xqicxuve    0.019134
Name: respondent_id, dtype: float64

In [47]:
# Share of all rows per `employment_occupation` value: non-null `respondent_id`
# count in each group divided by the total number of rows.
(
    data
    .groupby('employment_occupation')['respondent_id']
    .count()
    / len(data)
)

employment_occupation
bxpfxfdn    0.012394
ccgxvspp    0.012768
cmhcxjea    0.046692
dcjcmpih    0.005542
dlvbwzss    0.008500
emcorrxb    0.047553
haliazsg    0.011083
hfxkjkmi    0.028682
hodpvpew    0.007788
kldqjyjy    0.017561
mxkfnird    0.056502
oijqvulv    0.012881
pvmttkik    0.003669
qxajmpny    0.020519
rcertsgn    0.010334
tfqavkke    0.014528
ukymxvdu    0.013929
uqqtjvyb    0.016924
vlluhbov    0.013255
xgwztkwe    0.040514
xqwwgdyp    0.018160
xtkaffoo    0.066574
xzmlyyjv    0.009286
Name: respondent_id, dtype: float64

In [48]:
# Share of all rows per `education` value: non-null `respondent_id`
# count in each group divided by the total number of rows.
(
    data
    .groupby('education')['respondent_id']
    .count()
    / len(data)
)

education
12 Years            0.217059
< 12 Years          0.088479
College Graduate    0.378066
Some College        0.263714
Name: respondent_id, dtype: float64

Helper function to plot vaccination rates