In [1]:
# Load packages
import os
import pandas as pd
import numpy as np

# Required for basic python plotting functionality
import matplotlib.pyplot as plt


In [2]:
# Load the data into python
training_set_features = os.path.join(os.getcwd(), "Data/training_set_features.csv")
training_set_labels = os.path.join( os.getcwd(), "Data/training_set_labels.csv")

features= pd.read_csv(training_set_features)
labels = pd.read_csv(training_set_labels)

# 1. Train test split
90% for training
10% for  testing

In [79]:
from sklearn.model_selection import train_test_split

y_h1n1 = labels.iloc[:, -2]
y_seasonal = labels.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(features, y_h1n1, test_size=0.1, stratify=y_h1n1, random_state=12345)

# 2. Data cleaning 

### Drop unwanted features

1. _employment_occupation_ 
2. _employment_industry_

In [61]:
X_train = X_train.drop(['employment_occupation', 'employment_industry'], axis = 1)

### Encode string to numbers

In [62]:
string_to_num_map = {'18 - 34 Years': 0, '35 - 44 Years':1, '45 - 54 Years':2, '55 - 64 Years':3, '65+ Years':4 ,
                     '< 12 Years': 0, '12 Years':1, 'Some College':2,'College Graduate':3,
                     'White':0, 'Black':1,'Hispanic':1,'Other or Multiple':1,
                     'Female':0, 'Male':1,
                     'Below Poverty':0, '<= $75,000, Above Poverty':1, '> $75,000':2,
                     'Not Married':0, 'Married':1,
                     'Rent':0, 'Own':1,
                     'Not in Labor Force':0, 'Unemployed':0, 'Employed':1,
                     'MSA, Principle City':0, 'MSA, Not Principle  City':1, 'Non-MSA':2
                     }

In [63]:
encoded_train = X_train.applymap(lambda s: string_to_num_map.get(s) if s in string_to_num_map else s)

In [64]:
pd.set_option('display.max_columns', None)
encoded_train.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children
1691,1691,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,3.0,4.0,5.0,4.0,2.0,5.0,1,,0,0,,,,,lzgpxyit,1,1.0,2.0
23350,23350,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,5.0,1.0,1.0,5.0,4.0,1.0,2,1.0,0,0,2.0,1.0,1.0,0.0,lrircsnp,0,1.0,0.0
10880,10880,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,4.0,2.0,1.0,4.0,2.0,2.0,2,1.0,0,0,1.0,1.0,1.0,1.0,lrircsnp,1,1.0,0.0
23183,23183,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,5.0,2.0,2.0,5.0,4.0,1.0,0,3.0,0,0,1.0,1.0,0.0,1.0,lzgpxyit,0,1.0,0.0
15169,15169,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,5.0,5.0,4.0,5.0,5.0,4.0,0,2.0,0,0,1.0,0.0,0.0,1.0,atmpeygn,1,1.0,0.0


# 3. Impute missing values

In [65]:
num_df = encoded_train.loc[:, encoded_train.columns != 'hhs_geo_region']
cate_df = encoded_train.loc[:, 'hhs_geo_region']

In [66]:
cate_df.isnull().sum()

0

In [67]:
num_df.isnull().sum()

respondent_id                      0
h1n1_concern                      84
h1n1_knowledge                   107
behavioral_antiviral_meds         66
behavioral_avoidance             187
behavioral_face_mask              17
behavioral_wash_hands             38
behavioral_large_gatherings       78
behavioral_outside_home           67
behavioral_touch_face            116
doctor_recc_h1n1                1947
doctor_recc_seasonal            1947
chronic_med_condition            889
child_under_6_months             747
health_worker                    735
health_insurance               11058
opinion_h1n1_vacc_effective      358
opinion_h1n1_risk                352
opinion_h1n1_sick_from_vacc      354
opinion_seas_vacc_effective      420
opinion_seas_risk                468
opinion_seas_sick_from_vacc      481
age_group                          0
education                       1268
race                               0
sex                                0
income_poverty                  3964
m

In our dataset, all the missing values come from numerical variables. We can use KNN to impute them

In [68]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors = 3)
imputed = imputer.fit_transform(num_df)

In [69]:
Imputed_num = pd.DataFrame(np.round(imputed), columns = num_df.columns).set_index('respondent_id')

In [70]:
Imputed_train = pd.merge(Imputed_num, cate_df, left_index = True, right_index =True)

In [71]:
Imputed_train.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,hhs_geo_region
1691.0,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,4.0,5.0,4.0,2.0,5.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,lzgpxyit
23350.0,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,5.0,1.0,1.0,5.0,4.0,1.0,2.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,lrircsnp
10880.0,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,1.0,4.0,2.0,2.0,2.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,lrircsnp
23183.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,5.0,2.0,2.0,5.0,4.0,1.0,0.0,3.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,lzgpxyit
15169.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,5.0,5.0,4.0,5.0,5.0,4.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,atmpeygn


# 4. Create new variables 

### NEW_1 : behavioural_score:
By adding up:
1. behavioral_antiviral_meds
2. behavioral_avoidance
3. behavioral_face_mask
4. behavioral_wash_hands
5. behavioral_large_gatherings
6. behavioral_outside_home
7. behavioral_touch_face

In [72]:
def get_behavioural_score(df):
    df['behavioural_score'] = df.apply(lambda x: x['behavioral_antiviral_meds'] + 
                                       x['behavioral_avoidance']+
                                       x['behavioral_face_mask']+
                                       x['behavioral_wash_hands']+
                                       x['behavioral_large_gatherings']+
                                       x['behavioral_outside_home']+
                                       x['behavioral_touch_face'] ,
                                       axis=1)
    return df

In [73]:
Imputed_train = get_behavioural_score(Imputed_train)

## NEW_2: worry_score

By adding up:

1. opinion_h1n1_risk
2. opinion_h1n1_sick_from_vacc
3. opinion_seas_risk
4. opinion_seas_sick_from_vacc

then adding the flip of:

1. opinion_h1n1_vacc_effective
2. opinion_seas_vacc_effective

In [74]:
def get_worry_score(df):
    df['worry_score'] = df.apply(lambda x: x['opinion_h1n1_risk'] + 
                                       x['opinion_h1n1_sick_from_vacc']+
                                       x['opinion_seas_risk']+
                                       x['opinion_seas_sick_from_vacc']+
                                       (- x['opinion_h1n1_vacc_effective'])+ #just flip no need to subtract tp 4
                                       (- x['opinion_seas_vacc_effective']), #just flip no need to subtract tp 4
                                       axis=1)
    return df

In [75]:
Imputed_train = get_worry_score(Imputed_train)

In [76]:
Imputed_train.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,hhs_geo_region,behavioural_score,worry_score
1691.0,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,4.0,5.0,4.0,2.0,5.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,lzgpxyit,5.0,9.0
23350.0,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,5.0,1.0,1.0,5.0,4.0,1.0,2.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,lrircsnp,5.0,-3.0
10880.0,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,1.0,4.0,2.0,2.0,2.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,lrircsnp,5.0,-1.0
23183.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,5.0,2.0,2.0,5.0,4.0,1.0,0.0,3.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,lzgpxyit,3.0,-1.0
15169.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,5.0,5.0,4.0,5.0,5.0,4.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,atmpeygn,4.0,8.0


In [78]:
#Imputed_train.to_csv(r'/Users/guonaici/Documents/ds4a/project_1/Imputed_train.csv')