In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
features = pd.read_csv('../Data/training_set_features.csv')
labels = pd.read_csv('../Data/training_set_labels.csv')

In [3]:
df = pd.merge(features, labels, on='respondent_id', how='inner')

In [4]:
df = df.drop(columns=['employment_occupation', 'employment_industry', 'health_insurance', 'respondent_id'])

In [5]:
seas_df = df.drop(columns=['h1n1_concern',
                          'h1n1_knowledge',
                          'doctor_recc_h1n1',
                          'opinion_h1n1_vacc_effective',
                          'opinion_h1n1_risk',
                          'opinion_h1n1_sick_from_vacc',
                          'h1n1_vaccine'])

h1n1_df = df.drop(columns=['doctor_recc_seasonal',
                          'opinion_seas_vacc_effective',
                          'opinion_seas_risk',
                          'opinion_seas_sick_from_vacc',
                          'seasonal_vaccine'])

In [6]:
categorical_columns = [
    'age_group',
    'health_worker',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
    'chronic_med_condition',
    'child_under_6_months',
]

numerical_columns = [
    'household_children',
    'household_adults',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_risk',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',    
]

## Deal with NAs

In [7]:
((df.isnull().sum() / len(df)) * 100).sort_values()

seasonal_vaccine                0.000000
census_msa                      0.000000
hhs_geo_region                  0.000000
sex                             0.000000
race                            0.000000
age_group                       0.000000
h1n1_vaccine                    0.000000
behavioral_face_mask            0.071142
behavioral_wash_hands           0.157262
behavioral_antiviral_meds       0.265848
behavioral_outside_home         0.307036
behavioral_large_gatherings     0.325757
h1n1_concern                    0.344479
h1n1_knowledge                  0.434343
behavioral_touch_face           0.479275
behavioral_avoidance            0.778822
household_children              0.932340
household_adults                0.932340
opinion_h1n1_risk               1.452803
opinion_h1n1_vacc_effective     1.464036
opinion_h1n1_sick_from_vacc     1.479013
opinion_seas_vacc_effective     1.729884
opinion_seas_risk               1.924589
opinion_seas_sick_from_vacc     2.010709
health_worker   

In [8]:
df = df.dropna()

## Initial Run

In [9]:
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
y = df[['h1n1_vaccine', 'seasonal_vaccine']]
y_h1n1 = df[['h1n1_vaccine']]
y_seas = df[['seasonal_vaccine']]

#### Categorical

In [10]:
#Get Binary Data for Categorical Variables
cat_df = X[categorical_columns]
recat_df = pd.get_dummies(cat_df, drop_first=True)

#### Numerical

In [11]:
num_df = X[numerical_columns]

In [12]:
from sklearn.preprocessing import StandardScaler

#Scale Numerical Data
scaler = StandardScaler()
scaled_num = scaler.fit_transform(num_df)
scaled_num_df = pd.DataFrame(scaled_num, index=num_df.index, columns=num_df.columns)

In [13]:
encoded_df = pd.concat([recat_df, scaled_num_df], axis=1)

In [14]:
encoded_df

Unnamed: 0,health_worker,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,age_group_35 - 44 Years,age_group_45 - 54 Years,age_group_55 - 64 Years,age_group_65+ Years,household_children,household_adults,h1n1_concern,h1n1_knowledge,opinion_h1n1_risk,opinion_h1n1_vacc_effective,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc
0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,-0.587363,-1.210309,-0.692976,-2.184621,-1.053267,-0.938890,-0.256907,-1.940145,-1.266410,-0.079177
1,0.0,0.0,0.0,0.0,0.0,1,0,0,0,-0.587363,-1.210309,1.558709,1.169714,1.270301,1.097177,1.221174,-0.056950,-0.545672,1.429261
3,0.0,0.0,1.0,1.0,0.0,0,0,0,1,-0.587363,-1.210309,-0.692976,-0.507454,0.495778,-0.938890,1.960215,0.884647,0.895803,-0.833396
4,0.0,0.0,0.0,0.0,0.0,0,1,0,0,-0.587363,0.129688,0.432866,-0.507454,0.495778,-0.938890,-0.256907,-0.998548,-1.266410,1.429261
5,0.0,0.0,1.0,0.0,0.0,0,0,0,1,2.610248,1.469685,1.558709,-0.507454,-0.278745,1.097177,-0.995948,0.884647,0.895803,1.429261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26700,0.0,1.0,0.0,0.0,0.0,0,0,1,0,-0.587363,0.129688,1.558709,-0.507454,-0.278745,0.079144,1.960215,0.884647,0.895803,2.183480
26701,0.0,0.0,0.0,0.0,1.0,0,0,0,0,-0.587363,2.809681,0.432866,1.169714,-0.278745,0.079144,1.221174,-0.056950,-0.545672,1.429261
26702,0.0,0.0,0.0,0.0,0.0,0,0,0,1,-0.587363,-1.210309,0.432866,-2.184621,-1.053267,-0.938890,-0.995948,0.884647,-0.545672,-0.079177
26703,1.0,1.0,1.0,0.0,0.0,0,0,0,0,-0.587363,0.129688,-0.692976,1.169714,-0.278745,0.079144,-0.256907,0.884647,-1.266410,-0.833396


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(encoded_df, y, test_size=0.3, random_state=42)

In [16]:
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
X = np.asarray(encoded_df)

# Neural Network

In [21]:
from tensorflow import keras
model = keras.Sequential([
    keras.layers.Dense(20, activation='relu', input_dim=19),
    keras.layers.LeakyReLU(200),
    keras.layers.LeakyReLU(300),
    keras.layers.LeakyReLU(500),
    keras.layers.Dense(15, activation='relu'),
    keras.layers.Dense(2, activation='softmax')
])

In [22]:
model.compile(optimizer='sgd',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.AUC(from_logits=True)])

In [23]:
history = model.fit(
    X_train,
    y_train,
    batch_size=200,
    epochs=5000,
    validation_data=(X_test, y_test)
)

Epoch 1/5000




Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
Epoch 41/5000
Epoch 42/5000
Epoch 43/5000
Epoch 44/5000
Epoch 45/5000
Epoch 46/5000
Epoch 47/5000
Epoch 48/5000
Epoch 49/5000
Epoch 50/5000
Epoch 51/5000
Epoch 52/5000
Epoch 53/5000
Epoch 54/5000
Epoch 55/5000
Epoch 56/5000
Epoch 57/5000
Epoch 58/5000
Epoch 59/5000
Epoch 60/5000
Epoch 61/5000
Epoch 62/5000
Epoch 63/5000
Epoch 64/5000
Epoch 65/5000
Epoch 66/5000
Epoch 67/5000
Epoch 68/5000
Epoch 69/5000
Epoch 70/5000
Epoch 71/5000
Epoch 72/5000
Epoch 73/5000


KeyboardInterrupt: 

In [24]:
from sklearn.metrics import roc_auc_score

y_predicted = model.predict(X_test)

roc_auc_score(y_test, y_predicted)

0.6281072393384263

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


model = RandomForestRegressor(
n_estimators=800,
min_samples_split=2,
min_samples_leaf=4,
max_features='sqrt',
max_depth=20,
bootstrap=False)

model.fit(X_train, y_train)

In [41]:
from sklearn.metrics import roc_auc_score

y_predicted = model.predict(X_test)

roc_auc_score(y_test, y_predicted)

0.7784016773908989

## Submission Data

In [None]:
test_data = pd.read_csv('../Data/test_set_features.csv')
df_full = test_data

In [None]:
df = df_full.drop(columns=['employment_occupation', 'employment_industry', 'health_insurance', 'respondent_id'])

categorical_columns = [
    'age_group',
    'health_worker',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
]

numerical_columns = [
    'household_children',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_risk',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',    
]

In [None]:
for column in numerical_columns:
    df[column] = df[column].fillna(df[column].mean())

df['health_worker'] = df['health_worker'].fillna(0)
df['doctor_recc_h1n1'] = df['doctor_recc_h1n1'].fillna(1)
df['doctor_recc_seasonal'] = df['doctor_recc_seasonal'].fillna(1)

In [None]:
X = df

#Get Binary Data for Categorical Variables
cat_df = X[categorical_columns]
recat_df = pd.get_dummies(cat_df, drop_first=True)

num_df = X[numerical_columns]

from sklearn.preprocessing import StandardScaler

#Scale Numerical Data
scaler = StandardScaler()
scaled_num = scaler.fit_transform(num_df)
scaled_num_df = pd.DataFrame(scaled_num, index=num_df.index, columns=num_df.columns)

encoded_df = pd.concat([recat_df, scaled_num_df], axis=1)

X = np.asarray(encoded_df)

In [None]:
y = model.predict(X)
y_df = pd.DataFrame(y, columns=['h1n1_vaccine', 'seasonal_vaccine'])

In [None]:
results = pd.concat([df_full, y_df], axis=1)

In [None]:
results = results[['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']]

In [None]:
results.to_csv('../Submissions/Submission 6.29.21.csv', index=False)