In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import matthews_corrcoef
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import scipy.stats as stats
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Functions

def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    roc = roc_auc_score(y_test, predictions)
    print('Model Performance')
    print('Average Error: {:0.4f} degrees'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%'.format(accuracy))
    print(f'AUC = {roc}')
    return accuracy

In [3]:
features = pd.read_csv('../Data/training_set_features.csv')
labels = pd.read_csv('../Data/training_set_labels.csv')

df = pd.merge(features, labels, on='respondent_id', how='inner')

In [4]:
df['sex'] = df['sex'].map({
    'Female': 0,
    'Male': 1
})

df['marital_status'] = df['marital_status'].map({
    'Not Married': 0,
    'Married': 1
})

df['rent_or_own'] = df['rent_or_own'].map({
    'Rent': 0,
    'Own': 1
})
    
h1n1_num_cols = [
    'opinion_h1n1_sick_from_vacc',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_seas_vacc_effective',
    'opinion_h1n1_vacc_effective',
    'opinion_seas_risk',
    'opinion_h1n1_risk',
    'child_under_6_months',
    'marital_status',
    'rent_or_own',
    'behavioral_touch_face',
    'behavioral_face_mask',
    'chronic_med_condition',
    'behavioral_wash_hands',
    'doctor_recc_seasonal',
    'doctor_recc_h1n1'
]

h1n1_cat_cols = [
    'age_group',
    'hhs_geo_region',
    'income_poverty'
]

seas_num_cols = [
    'behavioral_wash_hands',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'marital_status',
    'chronic_med_condition',
    'rent_or_own',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal'
]

seas_cat_cols = [
    'age_group',
    'income_poverty'
]

h1n1_cols = h1n1_num_cols + h1n1_cat_cols
seas_cols = seas_num_cols + seas_cat_cols

num_cols = list(set(h1n1_num_cols + seas_num_cols))
cat_cols = list(set(h1n1_cat_cols + seas_cat_cols))

all_cols = list(set(num_cols + cat_cols))

In [5]:
opinion_h1n1_sick_from_vacc_mean = df['opinion_h1n1_sick_from_vacc'].mean()
h1n1_concern_mean = df['h1n1_concern'].mean()
h1n1_knowledge_mean = df['h1n1_knowledge'].mean()
opinion_seas_vacc_effective_mean = df['opinion_seas_vacc_effective'].mean()
opinion_h1n1_vacc_effective_mean = df['opinion_h1n1_vacc_effective'].mean()
opinion_seas_risk_mean = df['opinion_seas_risk'].mean()
opinion_h1n1_risk_mean = df['opinion_h1n1_risk'].mean()
behavioral_wash_hands_mean = df['behavioral_wash_hands'].mean()
sex_mean = df['sex'].mean()
child_under_6_months_mean = df['child_under_6_months'].mean()
marital_status_mean = df['marital_status'].mean()
rent_or_own_mean = df['rent_or_own'].mean()
behavioral_touch_face_mean = df['behavioral_touch_face'].mean()
behavioral_face_mask_mean = df['behavioral_face_mask'].mean()
chronic_med_condition_mean = df['chronic_med_condition'].mean()
doctor_recc_seasonal_mean = df['doctor_recc_seasonal'].mean()
doctor_recc_h1n1_mean = df['doctor_recc_h1n1'].mean()

df['opinion_h1n1_sick_from_vacc'] = df['opinion_h1n1_sick_from_vacc'].fillna(opinion_h1n1_sick_from_vacc_mean)
df['h1n1_concern'] = df['h1n1_concern'].fillna(h1n1_concern_mean)
df['h1n1_knowledge'] = df['h1n1_knowledge'].fillna(h1n1_knowledge_mean)
df['opinion_seas_vacc_effective'] = df['opinion_seas_vacc_effective'].fillna(opinion_seas_vacc_effective_mean)
df['opinion_h1n1_vacc_effective'] = df['opinion_h1n1_vacc_effective'].fillna(opinion_h1n1_vacc_effective_mean)
df['opinion_seas_risk'] = df['opinion_seas_risk'].fillna(opinion_seas_risk_mean)
df['opinion_h1n1_risk'] = df['opinion_h1n1_risk'].fillna(opinion_h1n1_risk_mean)
df['behavioral_wash_hands'] = df['behavioral_wash_hands'].fillna(behavioral_wash_hands_mean)
df['sex'] = df['sex'].fillna(sex_mean)
df['child_under_6_months'] = df['child_under_6_months'].fillna(child_under_6_months_mean)
df['marital_status'] = df['marital_status'].fillna(marital_status_mean)
df['rent_or_own'] = df['rent_or_own'].fillna(rent_or_own_mean)
df['behavioral_touch_face'] = df['behavioral_touch_face'].fillna(behavioral_touch_face_mean)
df['behavioral_face_mask'] = df['behavioral_face_mask'].fillna(behavioral_face_mask_mean)
df['chronic_med_condition'] = df['chronic_med_condition'].fillna(chronic_med_condition_mean)
df['doctor_recc_seasonal'] = df['doctor_recc_seasonal'].fillna(doctor_recc_seasonal_mean)
df['doctor_recc_h1n1'] = df['doctor_recc_h1n1'].fillna(doctor_recc_h1n1_mean)

df['age_group'] = df['age_group'].fillna('None')
df['income_poverty'] = df['income_poverty'].fillna('None')
df['hhs_geo_region'] = df['hhs_geo_region'].fillna('None')

In [6]:
all_cols

['marital_status',
 'age_group',
 'h1n1_concern',
 'h1n1_knowledge',
 'behavioral_wash_hands',
 'income_poverty',
 'opinion_seas_vacc_effective',
 'child_under_6_months',
 'chronic_med_condition',
 'doctor_recc_h1n1',
 'opinion_h1n1_vacc_effective',
 'opinion_seas_risk',
 'behavioral_face_mask',
 'doctor_recc_seasonal',
 'opinion_h1n1_sick_from_vacc',
 'behavioral_touch_face',
 'opinion_h1n1_risk',
 'hhs_geo_region',
 'rent_or_own']

# Preprocessing

In [7]:
X = df[all_cols]
y_h1n1 = df[['h1n1_vaccine']]
y_seas = df[['seasonal_vaccine']]
y = df[['h1n1_vaccine', 'seasonal_vaccine']]

In [8]:
#Categorical
cat_df = X[cat_cols]
recat_df = pd.get_dummies(data=cat_df, drop_first=True)

#Numerical
num_df = X[num_cols]

scaler = StandardScaler()
scaled_num = scaler.fit_transform(num_df)
scaled_num_df = pd.DataFrame(scaled_num, index=num_df.index, columns=num_df.columns)

encoded_df = pd.concat([recat_df, scaled_num_df], axis=1)

encoded_df['opinion_seas_risk'] = encoded_df['opinion_seas_risk'] * 1
encoded_df['opinion_h1n1_risk'] = encoded_df['opinion_h1n1_risk'] * 1

### Train-Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(encoded_df, y, test_size=0.3, random_state=42)

y_train_h1n1 = y_train['h1n1_vaccine']
y_train_seas = y_train['seasonal_vaccine']

y_test_h1n1 = y_test['h1n1_vaccine']
y_test_seas = y_test['seasonal_vaccine']

In [10]:
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

y_train_h1n1 = np.asarray(y_train_h1n1)
y_test_h1n1 = np.asarray(y_test_h1n1)

y_train_seas = np.asarray(y_train_seas)
y_test_seas = np.asarray(y_test_seas)

# Random Forest

### H1N1 Model

In [11]:
#H1N1 Model
model_h1n1 = RandomForestRegressor(n_estimators=800,
                              min_samples_split=2,
                              min_samples_leaf=4,
                              max_features='sqrt',
                              max_depth=20,
                              bootstrap=False)
model_h1n1.fit(X_train, y_train_h1n1)

RandomForestRegressor(bootstrap=False, max_depth=20, max_features='sqrt',
                      min_samples_leaf=4, n_estimators=800)

In [12]:
#Base Model
base_model = RandomForestRegressor(n_estimators=10, random_state=42)
base_model.fit(X_train, y_train_h1n1)

RandomForestRegressor(n_estimators=10, random_state=42)

In [13]:
evaluate(base_model, X_test, y_test_h1n1)

Model Performance
Average Error: 0.2510 degrees
Accuracy = nan%
AUC = 0.7644042825062549


nan

In [14]:
evaluate(model_h1n1, X_test, y_test_h1n1)

Model Performance
Average Error: 0.2464 degrees
Accuracy = -inf%
AUC = 0.8231329195341048


-inf

In [15]:
y_predicted_h1n1 = model_h1n1.predict(X_test)

### Seasonal Model

In [16]:
#Seasonal Model
model_seas = RandomForestRegressor(n_estimators=800,
                              min_samples_split=2,
                              min_samples_leaf=4,
                              max_features='sqrt',
                              max_depth=20,
                              bootstrap=False)

model_seas.fit(X_train, y_train_seas)

RandomForestRegressor(bootstrap=False, max_depth=20, max_features='sqrt',
                      min_samples_leaf=4, n_estimators=800)

In [17]:
#Base Model
base_model = RandomForestRegressor(n_estimators=10, random_state=42)
base_model.fit(X_train, y_train_seas)

RandomForestRegressor(n_estimators=10, random_state=42)

In [18]:
evaluate(base_model, X_test, y_test_seas)

Model Performance
Average Error: 0.3313 degrees
Accuracy = nan%
AUC = 0.7933126371358868


nan

In [19]:
evaluate(model_seas, X_test, y_test_seas)

Model Performance
Average Error: 0.3294 degrees
Accuracy = -inf%
AUC = 0.8466392132441946


-inf

In [20]:
y_predicted_seas = model_seas.predict(X_test)

In [21]:
y_predicted = np.vstack((y_predicted_h1n1, y_predicted_seas)).T

In [22]:
roc_auc_score(y_test, y_predicted)

0.8348860663891498

# Neural Network

### H1N1

In [23]:
h1n1_mc = ModelCheckpoint('..Models/h1n1_best_model.h5', monitor='val_auc', mode='max', verbose=1, save_best_only=True)

model_h1n1 = keras.Sequential([
    keras.layers.Dense(100, activation='hard_sigmoid', input_dim=32),
    keras.layers.LeakyReLU(500),
    keras.layers.LeakyReLU(800),
    keras.layers.LeakyReLU(200),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [24]:
model_h1n1.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.AUC(from_logits=True)])

In [25]:
history = model_h1n1.fit(
    X_train,
    y_train_h1n1,
    batch_size=200,
    epochs=150,
    validation_data=(X_test, y_test_h1n1),
    callbacks=[h1n1_mc]
)

Epoch 1/150

Epoch 00001: val_auc improved from -inf to 0.79946, saving model to ..Models\h1n1_best_model.h5
Epoch 2/150

Epoch 00002: val_auc improved from 0.79946 to 0.81182, saving model to ..Models\h1n1_best_model.h5
Epoch 3/150

Epoch 00003: val_auc improved from 0.81182 to 0.81478, saving model to ..Models\h1n1_best_model.h5
Epoch 4/150

Epoch 00004: val_auc improved from 0.81478 to 0.81482, saving model to ..Models\h1n1_best_model.h5
Epoch 5/150

Epoch 00005: val_auc improved from 0.81482 to 0.81730, saving model to ..Models\h1n1_best_model.h5
Epoch 6/150

Epoch 00006: val_auc improved from 0.81730 to 0.81753, saving model to ..Models\h1n1_best_model.h5
Epoch 7/150

Epoch 00007: val_auc improved from 0.81753 to 0.81816, saving model to ..Models\h1n1_best_model.h5
Epoch 8/150

Epoch 00008: val_auc improved from 0.81816 to 0.81931, saving model to ..Models\h1n1_best_model.h5
Epoch 9/150

Epoch 00009: val_auc did not improve from 0.81931
Epoch 10/150

Epoch 00010: val_auc improved 

In [26]:
model_h1n1 = load_model('..Models/h1n1_best_model.h5')

y_predicted_h1n1 = model_h1n1.predict(X_test)
roc_auc_score(y_test_h1n1, y_predicted_h1n1)

0.8254230555587214

### Seasonal

In [27]:
from tensorflow import keras
from keras.callbacks import ModelCheckpoint

seas_mc = ModelCheckpoint('..Models/seas_best_model.h5', monitor='val_auc_1', mode='max', verbose=1, save_best_only=True)


model_seas = keras.Sequential([
    keras.layers.Dense(100, activation='hard_sigmoid', input_dim=32),
    keras.layers.LeakyReLU(500),
    keras.layers.LeakyReLU(600),
    keras.layers.LeakyReLU(820),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [28]:
model_seas.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.AUC(from_logits=True)])

In [29]:
history = model_seas.fit(
    X_train,
    y_train_seas,
    batch_size=200,
    epochs=150,
    validation_data=(X_test, y_test_seas),
    callbacks=[seas_mc]
)

Epoch 1/150

Epoch 00001: val_auc_1 improved from -inf to 0.83209, saving model to ..Models\seas_best_model.h5
Epoch 2/150

Epoch 00002: val_auc_1 improved from 0.83209 to 0.83935, saving model to ..Models\seas_best_model.h5
Epoch 3/150

Epoch 00003: val_auc_1 improved from 0.83935 to 0.84070, saving model to ..Models\seas_best_model.h5
Epoch 4/150

Epoch 00004: val_auc_1 improved from 0.84070 to 0.84133, saving model to ..Models\seas_best_model.h5
Epoch 5/150

Epoch 00005: val_auc_1 improved from 0.84133 to 0.84184, saving model to ..Models\seas_best_model.h5
Epoch 6/150

Epoch 00006: val_auc_1 improved from 0.84184 to 0.84286, saving model to ..Models\seas_best_model.h5
Epoch 7/150

Epoch 00007: val_auc_1 improved from 0.84286 to 0.84364, saving model to ..Models\seas_best_model.h5
Epoch 8/150

Epoch 00008: val_auc_1 did not improve from 0.84364
Epoch 9/150

Epoch 00009: val_auc_1 improved from 0.84364 to 0.84440, saving model to ..Models\seas_best_model.h5
Epoch 10/150

Epoch 00010:

In [30]:
model_seas = load_model('..Models/seas_best_model.h5')

y_predicted_seas = model_seas.predict(X_test)
roc_auc_score(y_test_seas, y_predicted_seas)

0.8505707400027179

In [31]:
y_predicted_h1n1 = model_h1n1.predict(X_test)
y_predicted_seas = model_seas.predict(X_test)
y_predicted = np.concatenate((y_predicted_h1n1, y_predicted_seas), axis=1)
roc_auc_score(y_test, y_predicted)

0.8379968977807197

# Submission Data

In [32]:
test_data = pd.read_csv('../Data/test_set_features.csv')
df_full = test_data
df = test_data

In [33]:
df['sex'] = df['sex'].map({
    'Female': 0,
    'Male': 1
})

df['marital_status'] = df['marital_status'].map({
    'Not Married': 0,
    'Married': 1
})

df['rent_or_own'] = df['rent_or_own'].map({
    'Rent': 0,
    'Own': 1
})
    
h1n1_num_cols = [
    'opinion_h1n1_sick_from_vacc',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_seas_vacc_effective',
    'opinion_h1n1_vacc_effective',
    'opinion_seas_risk',
    'opinion_h1n1_risk',
    'child_under_6_months',
    'marital_status',
    'rent_or_own',
    'behavioral_touch_face',
    'behavioral_face_mask',
    'chronic_med_condition',
    'behavioral_wash_hands',
    'doctor_recc_seasonal',
    'doctor_recc_h1n1'
]

h1n1_cat_cols = [
    'age_group',
    'hhs_geo_region',
    'income_poverty'
]

seas_num_cols = [
    'behavioral_wash_hands',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'marital_status',
    'chronic_med_condition',
    'rent_or_own',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal'
]

seas_cat_cols = [
    'age_group',
    'income_poverty'
]

h1n1_cols = h1n1_num_cols + h1n1_cat_cols
seas_cols = seas_num_cols + seas_cat_cols

num_cols = list(set(h1n1_num_cols + seas_num_cols))
cat_cols = list(set(h1n1_cat_cols + seas_cat_cols))

all_cols = list(set(num_cols + cat_cols))

In [34]:
df['opinion_h1n1_sick_from_vacc'] = df['opinion_h1n1_sick_from_vacc'].fillna(opinion_h1n1_sick_from_vacc_mean)
df['h1n1_concern'] = df['h1n1_concern'].fillna(h1n1_concern_mean)
df['h1n1_knowledge'] = df['h1n1_knowledge'].fillna(h1n1_knowledge_mean)
df['opinion_seas_vacc_effective'] = df['opinion_seas_vacc_effective'].fillna(opinion_seas_vacc_effective_mean)
df['opinion_h1n1_vacc_effective'] = df['opinion_h1n1_vacc_effective'].fillna(opinion_h1n1_vacc_effective_mean)
df['opinion_seas_risk'] = df['opinion_seas_risk'].fillna(opinion_seas_risk_mean)
df['opinion_h1n1_risk'] = df['opinion_h1n1_risk'].fillna(opinion_h1n1_risk_mean)
df['behavioral_wash_hands'] = df['behavioral_wash_hands'].fillna(behavioral_wash_hands_mean)
df['sex'] = df['sex'].fillna(sex_mean)
df['child_under_6_months'] = df['child_under_6_months'].fillna(child_under_6_months_mean)
df['marital_status'] = df['marital_status'].fillna(marital_status_mean)
df['rent_or_own'] = df['rent_or_own'].fillna(rent_or_own_mean)
df['behavioral_touch_face'] = df['behavioral_touch_face'].fillna(behavioral_touch_face_mean)
df['behavioral_face_mask'] = df['behavioral_face_mask'].fillna(behavioral_face_mask_mean)
df['chronic_med_condition'] = df['chronic_med_condition'].fillna(chronic_med_condition_mean)
df['doctor_recc_seasonal'] = df['doctor_recc_seasonal'].fillna(doctor_recc_seasonal_mean)
df['doctor_recc_h1n1'] = df['doctor_recc_h1n1'].fillna(doctor_recc_h1n1_mean)

df['age_group'] = df['age_group'].fillna('None')
df['income_poverty'] = df['income_poverty'].fillna('None')
df['hhs_geo_region'] = df['hhs_geo_region'].fillna('None')

In [35]:
X = df[all_cols]

In [36]:
#Categorical
cat_df = X[cat_cols]
recat_df = pd.get_dummies(data=cat_df, drop_first=True)

#Numerical
num_df = X[num_cols]

scaled_num = scaler.transform(num_df)
scaled_num_df = pd.DataFrame(scaled_num, index=num_df.index, columns=num_df.columns)

encoded_df = pd.concat([recat_df, scaled_num_df], axis=1)

encoded_df['opinion_seas_risk'] = encoded_df['opinion_seas_risk'] * 1
encoded_df['opinion_h1n1_risk'] = encoded_df['opinion_h1n1_risk'] * 1

X = np.asarray(encoded_df)

In [37]:
model_h1n1 = load_model('..Models/h1n1_best_model.h5')
model_seas = load_model('..Models/seas_best_model.h5')

In [38]:
y_h1n1 = model_h1n1.predict(X)
y_seas = model_seas.predict(X)

y = np.concatenate((y_h1n1, y_seas), axis=1)

In [39]:
results = pd.DataFrame(y, columns=['h1n1_vaccine', 'seasonal_vaccine'])

submission = pd.concat([df_full, results], axis=1)
submission = submission[['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']]

In [40]:
today = datetime.today().date()

submission.to_csv(f'../Data/Neural Network Submission {today}.csv', index=False)

In [41]:
print(datetime.today().date())

2021-07-02


# Test

In [64]:
training_features = pd.read_csv('../Data/training_set_features.csv')
training_targets = pd.read_csv('../Data/training_set_labels.csv')
df_full = pd.merge(training_features, training_targets, on='respondent_id', how='left')
df = df_full

In [65]:
df['sex'] = df['sex'].map({
    'Female': 0,
    'Male': 1
})

df['marital_status'] = df['marital_status'].map({
    'Not Married': 0,
    'Married': 1
})

df['rent_or_own'] = df['rent_or_own'].map({
    'Rent': 0,
    'Own': 1
})
    
h1n1_num_cols = [
    'opinion_h1n1_sick_from_vacc',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_seas_vacc_effective',
    'opinion_h1n1_vacc_effective',
    'opinion_seas_risk',
    'opinion_h1n1_risk',
    'child_under_6_months',
    'marital_status',
    'rent_or_own',
    'behavioral_touch_face',
    'behavioral_face_mask',
    'chronic_med_condition',
    'behavioral_wash_hands',
    'doctor_recc_seasonal',
    'doctor_recc_h1n1'
]

h1n1_cat_cols = [
    'age_group',
    'hhs_geo_region',
    'income_poverty'
]

seas_num_cols = [
    'behavioral_wash_hands',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'marital_status',
    'chronic_med_condition',
    'rent_or_own',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal'
]

seas_cat_cols = [
    'age_group',
    'income_poverty'
]

h1n1_cols = h1n1_num_cols + h1n1_cat_cols
seas_cols = seas_num_cols + seas_cat_cols

num_cols = list(set(h1n1_num_cols + seas_num_cols))
cat_cols = list(set(h1n1_cat_cols + seas_cat_cols))

all_cols = list(set(num_cols + cat_cols))

In [66]:
df['opinion_h1n1_sick_from_vacc'] = df['opinion_h1n1_sick_from_vacc'].fillna(opinion_h1n1_sick_from_vacc_mean)
df['h1n1_concern'] = df['h1n1_concern'].fillna(h1n1_concern_mean)
df['h1n1_knowledge'] = df['h1n1_knowledge'].fillna(h1n1_knowledge_mean)
df['opinion_seas_vacc_effective'] = df['opinion_seas_vacc_effective'].fillna(opinion_seas_vacc_effective_mean)
df['opinion_h1n1_vacc_effective'] = df['opinion_h1n1_vacc_effective'].fillna(opinion_h1n1_vacc_effective_mean)
df['opinion_seas_risk'] = df['opinion_seas_risk'].fillna(opinion_seas_risk_mean)
df['opinion_h1n1_risk'] = df['opinion_h1n1_risk'].fillna(opinion_h1n1_risk_mean)
df['behavioral_wash_hands'] = df['behavioral_wash_hands'].fillna(behavioral_wash_hands_mean)
df['sex'] = df['sex'].fillna(sex_mean)
df['child_under_6_months'] = df['child_under_6_months'].fillna(child_under_6_months_mean)
df['marital_status'] = df['marital_status'].fillna(marital_status_mean)
df['rent_or_own'] = df['rent_or_own'].fillna(rent_or_own_mean)
df['behavioral_touch_face'] = df['behavioral_touch_face'].fillna(behavioral_touch_face_mean)
df['behavioral_face_mask'] = df['behavioral_face_mask'].fillna(behavioral_face_mask_mean)
df['chronic_med_condition'] = df['chronic_med_condition'].fillna(chronic_med_condition_mean)
df['doctor_recc_seasonal'] = df['doctor_recc_seasonal'].fillna(doctor_recc_seasonal_mean)
df['doctor_recc_h1n1'] = df['doctor_recc_h1n1'].fillna(doctor_recc_h1n1_mean)

df['age_group'] = df['age_group'].fillna('None')
df['income_poverty'] = df['income_poverty'].fillna('None')
df['hhs_geo_region'] = df['hhs_geo_region'].fillna('None')

In [67]:
X = df[all_cols]
y = df[['h1n1_vaccine', 'seasonal_vaccine']]

In [68]:
#Categorical
cat_df = X[cat_cols]
recat_df = pd.get_dummies(data=cat_df, drop_first=True)

#Numerical
num_df = X[num_cols]

scaled_num = scaler.transform(num_df)
scaled_num_df = pd.DataFrame(scaled_num, index=num_df.index, columns=num_df.columns)

encoded_df = pd.concat([recat_df, scaled_num_df], axis=1)

encoded_df['opinion_seas_risk'] = encoded_df['opinion_seas_risk'] * 1
encoded_df['opinion_h1n1_risk'] = encoded_df['opinion_h1n1_risk'] * 1

X = np.asarray(encoded_df)

In [69]:
model_h1n1 = load_model('..Models/h1n1_best_model.h5')
model_seas = load_model('..Models/seas_best_model.h5')

In [70]:
y_h1n1 = model_h1n1.predict(X)
y_seas = model_seas.predict(X)

y_comb = np.concatenate((y_h1n1, y_seas), axis=1)

In [71]:
results = pd.DataFrame(y_comb, columns=['h1n1_vaccine_pred', 'seasonal_vaccine_pred'])

submission = pd.concat([df_full, results], axis=1)
submission = submission[['respondent_id', 'h1n1_vaccine', 'h1n1_vaccine_pred', 'seasonal_vaccine', 'seasonal_vaccine_pred']]

In [72]:
y_true = np.asarray(submission[['h1n1_vaccine', 'seasonal_vaccine']])
y_pred = np.asarray(submission[['h1n1_vaccine_pred', 'seasonal_vaccine_pred']])

In [74]:
roc_auc_score(y_true, y_pred)

0.8460556503605772