In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
features = pd.read_csv('../Data/training_set_features.csv')
labels = pd.read_csv('../Data/training_set_labels.csv')

In [3]:
df = pd.merge(features, labels, on='respondent_id', how='inner')

In [4]:
df = df.drop(columns=['employment_occupation', 'employment_industry', 'health_insurance', 'respondent_id'])

In [5]:
seas_df = df.drop(columns=['h1n1_concern',
                          'h1n1_knowledge',
                          'doctor_recc_h1n1',
                          'opinion_h1n1_vacc_effective',
                          'opinion_h1n1_risk',
                          'opinion_h1n1_sick_from_vacc',
                          'h1n1_vaccine'])

h1n1_df = df.drop(columns=['doctor_recc_seasonal',
                          'opinion_seas_vacc_effective',
                          'opinion_seas_risk',
                          'opinion_seas_sick_from_vacc',
                          'seasonal_vaccine'])

In [6]:
categorical_columns = [
    'sex',
    'hhs_geo_region',
    'census_msa',
    'race',
    'age_group',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_antiviral_meds',
    'behavioral_outside_home',
    'behavioral_large_gatherings',
    'behavioral_touch_face',
    'behavioral_avoidance',
    'health_worker',
    'child_under_6_months',
    'chronic_med_condition',
    'education',
    'marital_status',
    'employment_status',
    'rent_or_own',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
    'income_poverty'
]

numerical_columns = [
    'household_children',
    'household_adults',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_risk',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    
]

for column in categorical_columns:
    curr_col = df[column]
    df.loc[df[column] == 1, column] = 'Yes'
    df.loc[df[column] == 0, column] = 'No'

## Initial Run

In [7]:
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
y = df[['h1n1_vaccine', 'seasonal_vaccine']]
y_h1n1 = df[['h1n1_vaccine']]
y_seas = df[['seasonal_vaccine']]

#### Categorical

In [8]:
#Get Binary Data for Categorical Variables
cat_df = X[categorical_columns]

In [9]:
recat_df = pd.get_dummies(data=cat_df).columns

#### Numerical

In [10]:
num_df = X[numerical_columns]

In [11]:
from sklearn.preprocessing import StandardScaler

#Scale Numerical Data
scaler = StandardScaler()
scaled_num = scaler.fit_transform(num_df)
scaled_num_df = pd.DataFrame(scaled_num, index=num_df.index, columns=num_df.columns)

In [12]:
encoded_df = pd.concat([num_df, scaled_num_df], axis=1)

In [13]:
encoded_df

Unnamed: 0,household_children,household_adults,h1n1_concern,h1n1_knowledge,opinion_h1n1_risk,opinion_h1n1_vacc_effective,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_children.1,household_adults.1,h1n1_concern.1,h1n1_knowledge.1,opinion_h1n1_risk.1,opinion_h1n1_vacc_effective.1,opinion_h1n1_sick_from_vacc.1,opinion_seas_vacc_effective.1,opinion_seas_risk.1,opinion_seas_sick_from_vacc.1
0,0.0,0.0,1.0,0.0,1.0,3.0,2.0,2.0,1.0,2.0,-0.587363,-1.210309,-0.692976,-2.184621,-1.053267,-0.938890,-0.256907,-1.940145,-1.266410,-0.079177
1,0.0,0.0,3.0,2.0,4.0,5.0,4.0,4.0,2.0,4.0,-0.587363,-1.210309,1.558709,1.169714,1.270301,1.097177,1.221174,-0.056950,-0.545672,1.429261
3,0.0,0.0,1.0,1.0,3.0,3.0,5.0,5.0,4.0,1.0,-0.587363,-1.210309,-0.692976,-0.507454,0.495778,-0.938890,1.960215,0.884647,0.895803,-0.833396
4,0.0,1.0,2.0,1.0,3.0,3.0,2.0,3.0,1.0,4.0,-0.587363,0.129688,0.432866,-0.507454,0.495778,-0.938890,-0.256907,-0.998548,-1.266410,1.429261
5,3.0,2.0,3.0,1.0,2.0,5.0,1.0,5.0,4.0,4.0,2.610248,1.469685,1.558709,-0.507454,-0.278745,1.097177,-0.995948,0.884647,0.895803,1.429261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26700,0.0,1.0,3.0,1.0,2.0,4.0,5.0,5.0,4.0,5.0,-0.587363,0.129688,1.558709,-0.507454,-0.278745,0.079144,1.960215,0.884647,0.895803,2.183480
26701,0.0,3.0,2.0,2.0,2.0,4.0,4.0,4.0,2.0,4.0,-0.587363,2.809681,0.432866,1.169714,-0.278745,0.079144,1.221174,-0.056950,-0.545672,1.429261
26702,0.0,0.0,2.0,0.0,1.0,3.0,1.0,5.0,2.0,2.0,-0.587363,-1.210309,0.432866,-2.184621,-1.053267,-0.938890,-0.995948,0.884647,-0.545672,-0.079177
26703,0.0,1.0,1.0,2.0,2.0,4.0,2.0,5.0,1.0,1.0,-0.587363,0.129688,-0.692976,1.169714,-0.278745,0.079144,-0.256907,0.884647,-1.266410,-0.833396


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(encoded_df, y, test_size=0.3, random_state=42)

In [15]:
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
X = np.asarray(encoded_df)

# Neural Network

In [16]:
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Dense(32, activation='relu', input_dim=20),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(2, activation='hard_sigmoid')
])

In [17]:
model.compile(optimizer='adam', 
              loss=tf.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [18]:
history = model.fit(
    X_train,
    y_train,
    batch_size=100,
    epochs=10000,
    validation_data=(X_test, y_test)
)

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoc

KeyboardInterrupt: 

In [40]:
y_true = y
y_predicted = model.predict(X)
y_predicted_binary = np.where(y_predicted > 0.5, 1, 0)

In [42]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y, y_predicted)

0.7321256215606469

## Submission Data

In [71]:
test_data = pd.read_csv('../Data/test_set_features.csv')
df_full = test_data

In [72]:
df = df_full.drop(columns=['employment_occupation', 'employment_industry', 'health_insurance', 'respondent_id'])

categorical_columns = [
    'sex',
    'hhs_geo_region',
    'census_msa',
    'race',
    'age_group',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_antiviral_meds',
    'behavioral_outside_home',
    'behavioral_large_gatherings',
    'behavioral_touch_face',
    'behavioral_avoidance',
    'health_worker',
    'child_under_6_months',
    'chronic_med_condition',
    'education',
    'marital_status',
    'employment_status',
    'rent_or_own',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
    'income_poverty'
]

numerical_columns = [
    'household_children',
    'household_adults',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_risk',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    
]

for column in categorical_columns:
    curr_col = df[column]
    df.loc[df[column] == 1, column] = 'Yes'
    df.loc[df[column] == 0, column] = 'No'

In [73]:
X = df

#Get Binary Data for Categorical Variables
cat_df = X[categorical_columns]
recat_df = pd.get_dummies(data=cat_df).columns

num_df = X[numerical_columns]

from sklearn.preprocessing import StandardScaler

#Scale Numerical Data
scaler = StandardScaler()
scaled_num = scaler.fit_transform(num_df)
scaled_num_df = pd.DataFrame(scaled_num, index=num_df.index, columns=num_df.columns)

encoded_df = pd.concat([num_df, scaled_num_df], axis=1)

X = np.asarray(encoded_df)

In [74]:
y = model.predict(X)
y_df = pd.DataFrame(y, columns=['h1n1_vaccine', 'seasonal_vaccine'])

In [75]:
results = pd.concat([df_full, y_df], axis=1)

In [77]:
results = results[['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']]

In [79]:
results.to_csv('../Submissions/Submission 6.6.21.csv', index=False)