In [1]:
# Load packages
import os
import pandas as pd
import numpy as np

In [None]:
# Read the training features and labels from the Data/ folder that sits
# under the current working directory.
training_set_features, training_set_labels = (
    os.path.join(os.getcwd(), relative_csv)
    for relative_csv in (
        "Data/training_set_features.csv",
        "Data/training_set_labels.csv",
    )
)

features = pd.read_csv(training_set_features)
labels = pd.read_csv(training_set_labels)

# 1. Train test split
- 90% for training
- 10% for testing

In [3]:
from sklearn.model_selection import train_test_split

# The two targets are the last two columns of the labels frame
# (second-to-last -> y_h1n1, last -> y_seasonal).
y_h1n1, y_seasonal = labels.iloc[:, -2], labels.iloc[:, -1]

# 90/10 split, stratified on the H1N1 target so both parts keep its class
# balance; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y_h1n1,
    test_size=0.1,
    random_state=12345,
    stratify=y_h1n1,
)

In [4]:
# Dimensions (rows, columns) of the training split.
X_train.shape

(24036, 36)

# 2. Data cleaning 

### Drop unwanted features

1. _employment_occupation_ 
2. _employment_industry_

In [5]:
# Drop the two employment columns. errors='ignore' keeps this cell
# re-runnable: without it, executing the cell a second time raises KeyError
# because the columns have already been removed from X_train.
X_train = X_train.drop(columns=['employment_occupation', 'employment_industry'], errors='ignore')

### Encode strings to numbers

In [6]:
def encode_string_to_numbers (df):
    """Replace known categorical answer strings in ``df`` with integer codes.

    Every cell is looked up in one value -> code table. Cells whose value is
    not in the table (numbers, NaN, any other string) are returned unchanged.
    The lookup is by value only, so it is applied to every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw answers. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    string_to_num_map = {
        # ordered age brackets
        '18 - 34 Years': 0, '35 - 44 Years': 1, '45 - 54 Years': 2, '55 - 64 Years': 3, '65+ Years': 4,
        # ordered education levels
        '< 12 Years': 0, '12 Years': 1, 'Some College': 2, 'College Graduate': 3,
        # collapsed to two codes: 'White' vs. every other listed value
        'White': 0, 'Black': 1, 'Hispanic': 1, 'Other or Multiple': 1,
        'Female': 0, 'Male': 1,
        # ordered income brackets
        'Below Poverty': 0, '<= $75,000, Above Poverty': 1, '> $75,000': 2,
        'Not Married': 0, 'Married': 1,
        'Rent': 0, 'Own': 1,
        # collapsed to two codes: 'Employed' vs. the other two values
        'Not in Labor Force': 0, 'Unemployed': 0, 'Employed': 1,
        # NOTE(review): the double space in 'Not Principle  City' presumably
        # mirrors the raw CSV spelling -- confirm before "fixing" it.
        'MSA, Principle City': 0, 'MSA, Not Principle  City': 1, 'Non-MSA': 2,
    }

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and now
    # emits a FutureWarning; use the new name when this pandas provides it.
    # The check is made on the class so a column called "map" cannot interfere.
    elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap

    # dict.get(s, s) returns the code when s is a known answer, else s itself.
    encoded = elementwise(lambda s: string_to_num_map.get(s, s))
    return encoded

In [7]:
# Encode the training features into a new frame; X_train itself is not changed.
encoded_train = encode_string_to_numbers(X_train)

In [8]:
# Lift pandas' column display limit so the preview shows every column.
# The option is global and stays in effect for the rest of the session.
pd.set_option('display.max_columns', None)
encoded_train.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children
2320,2320,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,4.0,2.0,1.0,4.0,4.0,1.0,0,1.0,1,0,0.0,1.0,0.0,0.0,qufhixun,1,1.0,0.0
15358,15358,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,,5.0,2.0,4.0,4.0,2.0,2.0,2,2.0,0,0,1.0,1.0,1.0,1.0,bhuqouqj,0,1.0,0.0
12872,12872,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,1.0,1.0,4.0,2.0,1.0,2,1.0,1,1,2.0,1.0,0.0,1.0,lzgpxyit,1,0.0,0.0
10185,10185,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,1.0,4.0,2.0,1.0,1,1.0,0,1,1.0,0.0,1.0,1.0,lrircsnp,0,1.0,1.0
20598,20598,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,4.0,3.0,1.0,4,1.0,0,0,,0.0,1.0,0.0,fpwskwrf,2,0.0,0.0


# 3. Impute missing values

In [9]:
from sklearn.impute import KNNImputer

# Shared imputer: each gap is filled from the 3 nearest rows.
imputer = KNNImputer(n_neighbors = 3)

def impute_missing_values(df, fit = True):
    """Fill missing values by KNN imputation and index the result by respondent.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded frame containing a 'respondent_id' column, an
        'hhs_geo_region' column and otherwise only numeric columns.
        It is not modified.
    fit : bool, default True
        When True the module-level ``imputer`` is refitted on ``df`` (the
        original behaviour). Pass False to reuse the fit from a previous
        call, e.g. to impute held-out data with the training neighbours.

    Returns
    -------
    pandas.DataFrame
        The imputed columns rounded to whole numbers, followed by
        'hhs_geo_region', with the 'respondent_id' values as the index
        (kept in their original dtype).

    Raises
    ------
    KeyError
        If 'respondent_id' or 'hhs_geo_region' is missing from ``df``.
    """
    id_col = 'respondent_id'
    region_col = 'hhs_geo_region'

    # respondent_id is only a row identifier. Leaving it in the matrix made it
    # a distance feature, so neighbours were biased towards rows with nearby
    # IDs. The region column is left out because it is a string.
    feature_df = df.drop(columns = [id_col, region_col])

    if fit:
        imputed = imputer.fit_transform(feature_df)
    else:
        imputed = imputer.transform(feature_df)

    # Neighbour averages can be fractional; round them back to whole codes.
    Imputed_df = pd.DataFrame(np.round(imputed),
                              columns = feature_df.columns,
                              index = pd.Index(df[id_col].to_numpy()))

    # Re-attach the region by row position. This does not rely on the input's
    # row index happening to equal respondent_id, as an index merge would.
    Imputed_df[region_col] = df[region_col].to_numpy()

    return Imputed_df

In [10]:
# Impute the encoded training frame (the imputer is fitted on this same frame).
imputed_train = impute_missing_values(encoded_train)

In [11]:
# Preview the imputed training frame.
imputed_train.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,hhs_geo_region
2320.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,4.0,2.0,1.0,4.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,qufhixun
15358.0,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,bhuqouqj
12872.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,1.0,1.0,4.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0,lzgpxyit
10185.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,1.0,4.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,lrircsnp
20598.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,4.0,3.0,1.0,4.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,fpwskwrf


# 4. Create new variables 

### NEW_1 : behavioural_score:
By adding up:
1. behavioral_antiviral_meds
2. behavioral_avoidance
3. behavioral_face_mask
4. behavioral_wash_hands
5. behavioral_large_gatherings
6. behavioral_outside_home
7. behavioral_touch_face

In [12]:
def get_behavioural_score(df):
    """Add a 'behavioural_score' column holding the sum of the seven
    ``behavioral_*`` columns.

    The column is written into ``df`` itself (in place) and the same frame is
    returned. A missing value in any component makes that row's score NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the seven ``behavioral_*`` columns listed below.

    Returns
    -------
    pandas.DataFrame
        ``df``, now carrying the extra 'behavioural_score' column.

    Raises
    ------
    KeyError
        If any of the component columns is missing.
    """
    behaviour_cols = [
        'behavioral_antiviral_meds',
        'behavioral_avoidance',
        'behavioral_face_mask',
        'behavioral_wash_hands',
        'behavioral_large_gatherings',
        'behavioral_outside_home',
        'behavioral_touch_face',
    ]
    # One vectorised column sum instead of a Python call per row.
    # skipna=False keeps the behaviour of chained `+`: NaN propagates.
    df['behavioural_score'] = df[behaviour_cols].sum(axis=1, skipna=False)
    return df

In [13]:
# Adds the 'behavioural_score' column; the function writes into the frame it is given.
imputed_train = get_behavioural_score(imputed_train)

### NEW_2 : worry_score:

By adding up:

1. opinion_h1n1_risk
2. opinion_h1n1_sick_from_vacc
3. opinion_seas_risk
4. opinion_seas_sick_from_vacc

then subtracting (i.e. adding the negated value of):

1. opinion_h1n1_vacc_effective
2. opinion_seas_vacc_effective

In [15]:
def get_worry_score(df):
    """Add a 'worry_score' column: the four risk / sick-from-vaccine opinions
    added together, minus the two vaccine-effectiveness opinions.

    The column is written into ``df`` itself (in place) and the same frame is
    returned. A missing value in any component makes that row's score NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six ``opinion_*`` columns listed below.

    Returns
    -------
    pandas.DataFrame
        ``df``, now carrying the extra 'worry_score' column.

    Raises
    ------
    KeyError
        If any of the component columns is missing.
    """
    added_cols = [
        'opinion_h1n1_risk',
        'opinion_h1n1_sick_from_vacc',
        'opinion_seas_risk',
        'opinion_seas_sick_from_vacc',
    ]
    # The effectiveness opinions enter with a minus sign. They are simply
    # negated rather than reverse-scored against the scale maximum --
    # presumably because a constant offset would shift every row equally.
    subtracted_cols = [
        'opinion_h1n1_vacc_effective',
        'opinion_seas_vacc_effective',
    ]
    # Vectorised column sums instead of a Python call per row.
    # skipna=False keeps the behaviour of chained `+`: NaN propagates.
    df['worry_score'] = (df[added_cols].sum(axis=1, skipna=False)
                         - df[subtracted_cols].sum(axis=1, skipna=False))
    return df

In [16]:
# Adds the 'worry_score' column; the function writes into the frame it is given.
imputed_train = get_worry_score(imputed_train)

In [17]:
# Preview the frame with the two new score columns appended.
imputed_train.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,hhs_geo_region,behavioural_score,worry_score
2320.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,4.0,2.0,1.0,4.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,qufhixun,7.0,0.0
15358.0,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,bhuqouqj,5.0,1.0
12872.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,1.0,1.0,4.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0,lzgpxyit,5.0,-3.0
10185.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,1.0,4.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,lrircsnp,4.0,-2.0
20598.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,4.0,3.0,1.0,4.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,fpwskwrf,3.0,-1.0


In [18]:
# imputed_train.to_csv(r'/Users/guonaici/Documents/ds4a/project_1/imputed_train.csv')

# 5. Convert hhs_geo_region to one-hot columns (if needed)

In [19]:
from sklearn.preprocessing import OneHotEncoder

# Shared encoder for the region column; its fitted categories are reused below.
enc = OneHotEncoder()

def hot_encode (df, fit = True):
    """One-hot encode the 'hhs_geo_region' column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'hhs_geo_region' column. It is not modified.
    fit : bool, default True
        When True the module-level ``enc`` is refitted on ``df`` (the
        original behaviour). Pass False to reuse the categories learned by a
        previous call, so that another frame (e.g. held-out data) gets the
        same columns in the same order.

    Returns
    -------
    pandas.DataFrame
        Dense indicator columns with default integer labels, one per
        category in ``enc.categories_[0]``, sharing ``df``'s index.

    Raises
    ------
    KeyError
        If 'hhs_geo_region' is missing from ``df``.
    """
    # Double brackets keep a one-column DataFrame: the encoder expects 2-D input.
    region = df[['hhs_geo_region']]
    if fit:
        encoded = enc.fit_transform(region)
    else:
        encoded = enc.transform(region)
    encoded_df = pd.DataFrame(encoded.toarray(), index = df.index)
    return encoded_df

In [20]:
# One-hot encode the region column of the imputed training frame (fits `enc`).
hot_encoded = hot_encode(imputed_train)

In [21]:
# Region codes learned by the encoder; used below to name the one-hot columns.
enc.categories_

[array(['atmpeygn', 'bhuqouqj', 'dqpwygqj', 'fpwskwrf', 'kbazzjca',
        'lrircsnp', 'lzgpxyit', 'mlyzmhmf', 'oxchjgsf', 'qufhixun'],
       dtype=object)]

In [22]:
# Take the column names straight from the fitted encoder instead of a
# hand-copied list, so names and order always match what `enc` produced,
# even if the set of regions in the data changes.
hot_encoded.columns = list(enc.categories_[0])

In [25]:
# Attach the indicator columns to the imputed frame by index, then discard
# the raw region column they replace.
imputed_train_hot = (
    pd.merge(imputed_train, hot_encoded, left_index=True, right_index=True)
    .drop(columns='hhs_geo_region')
)

In [26]:
# Preview the final training frame with the region expanded into indicator columns.
imputed_train_hot.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,behavioural_score,worry_score,atmpeygn,bhuqouqj,dqpwygqj,fpwskwrf,kbazzjca,lrircsnp,lzgpxyit,mlyzmhmf,oxchjgsf,qufhixun
2320.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,4.0,2.0,1.0,4.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
15358.0,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,5.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12872.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,1.0,1.0,4.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0,5.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10185.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,1.0,4.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,4.0,-2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
20598.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,4.0,3.0,1.0,4.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,3.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [223]:
# imputed_train_hot.to_csv(r'/Users/guonaici/Documents/ds4a/project_1/imputed_train_hot_encoded.csv')