In [96]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import matthews_corrcoef
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score
import scipy.stats as stats
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from datetime import datetime
from category_encoders import OrdinalEncoder
from catboost import CatBoostClassifier, CatBoostRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

import warnings
warnings.filterwarnings("ignore")

In [97]:
#Functions

def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    roc = roc_auc_score(y_test, predictions)
    print('Model Performance')
    print('Average Error: {:0.4f} degrees'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%'.format(accuracy))
    print(f'AUC = {roc}')
    return accuracy

In [98]:
train = pd.read_csv('../Data/training_set_features.csv', index_col='respondent_id')
test = pd.read_csv('../Data/test_set_features.csv', index_col ='respondent_id')
labels = pd.read_csv('../Data/training_set_labels.csv', index_col='respondent_id')

In [99]:
train.loc[(train['age_group'] == '65+ Years') & (train['employment_status'].isnull()), 'employment_status'] = 'Not in Labor Force'

In [100]:
num_cols = list(train.select_dtypes('number').columns)

cat_cols = [
    'race',
    'sex',
    'marital_status',
    'rent_or_own',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation'
]

ord_cols = [
    'age_group',
    'education',
    'income_poverty',
    'employment_status'
]




#Impute Train
for col in num_cols:
    train[col] = train[col].fillna(value=-1)
    test[col] = test[col].fillna(value=-1)

for col in (cat_cols + ord_cols):
    train[col] = train[col].fillna(value='None')
    test[col] = test[col].fillna(value='None')
test_labels = labels.copy()    
labels['h1n1_vaccine'] = labels['h1n1_vaccine'].map({0: 'No', 1: 'Yes'})
labels['seasonal_vaccine'] = labels['seasonal_vaccine'].map({0: 'No', 1: 'Yes'})

In [101]:
train['age_group'] = train['age_group'].map({
    '18 - 34 Years': 1,
    '35 - 44 Years': 2,
    '45 - 54 Years': 3,
    '55 - 64 Years': 4,
    '65+ Years': 5
})

train['education'] = train['education'].map({
    '< 12 Years': 1,
    '12 Years': 2,
    'Some College': 3,
    'College Graduate': 4,
    'None': -1
})

train['income_poverty'] = train['income_poverty'].map({
    'None': -1,
    'Below Poverty': 1,
    '<= $75,000, Above Poverty': 2,
    '> $75,000': 3
})

train['employment_status'] = train['employment_status'].map({
    'None': -1,
    'Unemployed': 1,
    'Employed': 2,
    'Not in Labor Force': 3
})




test['education'] = test['education'].map({
    '< 12 Years': 1,
    '12 Years': 2,
    'Some College': 3,
    'College Graduate': 4,
    'None': -1
})

test['income_poverty'] = test['income_poverty'].map({
    'None': -1,
    'Below Poverty': 1,
    '<= $75,000, Above Poverty': 2,
    '> $75,000': 3
})

test['employment_status'] = test['employment_status'].map({
    'None': -1,
    'Unemployed': 1,
    'Employed': 2,
    'Not in Labor Force': 3
})

In [102]:
for x in train[ord_cols].columns:
    print(x, train[x].unique())

age_group [4 2 1 5 3]
education [ 1  2  4  3 -1]
income_poverty [ 1  2  3 -1]
employment_status [ 3  2  1 -1]


In [103]:
all_cols = train.columns
best_cols = [2, 3, 6, 7, 10, 12, 13, 14, 16, 21, 22, 25, 26, 27, 28, 29, 30, 31, 32]

In [104]:
best_cols_names = [all_cols[x] for x in best_cols]

In [105]:
used_cols = list(set(num_cols + best_cols_names))
train = train[used_cols]
test = test[used_cols]

h1n1_labels = labels[['h1n1_vaccine']]
seas_labels = labels[['seasonal_vaccine']]

# Transformation

In [106]:
#H1N1 Balancing
yeses = h1n1_labels[h1n1_labels['h1n1_vaccine'] == 'Yes']
len_of_yes = len(yeses)
nos = h1n1_labels[h1n1_labels['h1n1_vaccine'] == 'No'].sample(len_of_yes, random_state=42)

indices = np.concatenate((yeses.index.values, nos.index.values))
h1n1_labels_balanced = h1n1_labels.iloc[indices, :]
h1n1_train_balanced = train.iloc[indices, :]

enc = OneHotEncoder(categories='auto')
h1n1_labels_balanced_arr = np.array(h1n1_labels_balanced['h1n1_vaccine']).reshape(-1,1)
h1n1_labels_trans = enc.fit_transform(h1n1_labels_balanced_arr).toarray()
h1n1_test_trans = enc.transform(np.array(h1n1_labels['h1n1_vaccine']).reshape(-1,1)).toarray()

In [107]:
#Seasonal Balancing
yeses = seas_labels[seas_labels['seasonal_vaccine'] == 'Yes']
len_of_yes = len(yeses)
nos = seas_labels[seas_labels['seasonal_vaccine'] == 'No'].sample(len_of_yes, random_state=42)

indices = np.concatenate((yeses.index.values, nos.index.values))
seas_labels_balanced = seas_labels.iloc[indices, :]
seas_train_balanced = train.iloc[indices, :]

enc = OneHotEncoder(categories='auto')
seas_labels_balanced_arr = np.array(seas_labels_balanced['seasonal_vaccine']).reshape(-1,1)
seas_labels_trans = enc.fit_transform(seas_labels_balanced_arr).toarray()
seas_test_trans = enc.transform(np.array(seas_labels['seasonal_vaccine']).reshape(-1,1)).toarray()

In [108]:
cat_cols = train.select_dtypes('object').columns

In [109]:
ct = ColumnTransformer([('scaler', StandardScaler(), num_cols),
                       ('cat', OneHotEncoder(), cat_cols)]
                       , remainder='passthrough')

In [110]:
h1n1_train_trans = ct.fit_transform(h1n1_train_balanced)
seas_train_trans = ct.fit_transform(seas_train_balanced)
test = ct.transform(test)

In [111]:
true_features = train.sample(1500, random_state=42)
true_indices = true_features.index.values
true_labels = labels.iloc[true_indices,:]

true_features = ct.fit_transform(true_features)
true_labels['h1n1_vaccine'] = true_labels['h1n1_vaccine'].map({'Yes': 1, 'No': 0})
true_labels['seasonal_vaccine'] = true_labels['seasonal_vaccine'].map({'Yes': 1, 'No': 0})
true_labels_rf = true_labels.copy()
true_labels = np.asarray(true_labels)

# H1N1

## Random Forest

## Train Test Split

In [112]:
X = h1n1_train_trans
y = h1n1_labels_balanced['h1n1_vaccine'].map({'Yes': 1, 'No': 0}).to_numpy()

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [114]:
#H1N1 Model
# model_h1n1 = RandomForestRegressor(n_estimators=1200,
#                                min_samples_split=2,
#                                min_samples_leaf=3,
#                                max_features='sqrt',
#                                max_depth=20,
#                                bootstrap=True)
model_h1n1 = CatBoostRegressor(n_estimators=150)
model_h1n1.fit(X_train, y_train)

Learning rate set to 0.238654
0:	learn: 0.4605087	total: 2.46ms	remaining: 367ms
1:	learn: 0.4363125	total: 5.11ms	remaining: 378ms
2:	learn: 0.4217797	total: 7.69ms	remaining: 377ms
3:	learn: 0.4113525	total: 10.2ms	remaining: 373ms
4:	learn: 0.4041361	total: 12.7ms	remaining: 367ms
5:	learn: 0.3997603	total: 15.1ms	remaining: 363ms
6:	learn: 0.3957630	total: 17.7ms	remaining: 362ms
7:	learn: 0.3927565	total: 20.3ms	remaining: 361ms
8:	learn: 0.3908970	total: 22.9ms	remaining: 358ms
9:	learn: 0.3884160	total: 25.2ms	remaining: 353ms
10:	learn: 0.3864804	total: 27.7ms	remaining: 350ms
11:	learn: 0.3846070	total: 30.2ms	remaining: 348ms
12:	learn: 0.3829816	total: 32.8ms	remaining: 346ms
13:	learn: 0.3820948	total: 35.5ms	remaining: 344ms
14:	learn: 0.3809459	total: 37.9ms	remaining: 341ms
15:	learn: 0.3799199	total: 40.4ms	remaining: 338ms
16:	learn: 0.3789953	total: 42.9ms	remaining: 336ms
17:	learn: 0.3784690	total: 45.5ms	remaining: 334ms
18:	learn: 0.3776037	total: 47.9ms	remaining

<catboost.core.CatBoostRegressor at 0x28f07eb5cd0>

In [58]:
#Base Model
base_model = RandomForestClassifier(n_estimators=10, random_state=42)
base_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=10, random_state=42)

In [85]:
evaluate(base_model, X_test, y_test)

Model Performance
Average Error: 0.3257 degrees
Accuracy = nan%
AUC = 0.8098067289872746


nan

In [115]:
evaluate(model_h1n1, X_test, y_test)

Model Performance
Average Error: 0.3109 degrees
Accuracy = -inf%
AUC = 0.8530047184267125


-inf

In [663]:
y_predicted_h1n1 = model_h1n1.predict(X_test)

## Neural Network

### Train Test Split

In [17]:
X = h1n1_train_trans
y = h1n1_labels_trans

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [19]:
h1n1_mc = ModelCheckpoint('..Models/h1n1_best_model.h5', monitor='val_auc', mode='max', verbose=0, save_best_only=True)

model_h1n1 = keras.Sequential([
    keras.layers.Dense(200, activation='sigmoid', input_dim=46),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(800, activation='relu'),
    keras.layers.Dense(500, activation='relu'),
    keras.layers.Dense(250, activation='relu'),
    keras.layers.Dense(2, activation='softmax')
])

In [20]:
model_h1n1.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=[tf.keras.metrics.AUC(from_logits=False), 'accuracy'])

In [21]:
EarlyStopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=5, verbose=0,
    mode='auto', baseline=None, restore_best_weights=True
)

history = model_h1n1.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=150,
    validation_data=(X_test, y_test),
    callbacks=[h1n1_mc, EarlyStopping],
    shuffle=True,
    verbose=1
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150


In [22]:
#model_h1n1 = load_model('..Models/h1n1_best_model.h5')

y_predicted_h1n1 = model_h1n1.predict(X_test)
roc_auc_score(y_test, y_predicted_h1n1)

0.8567961634138328

# Seasonal

## Random Forest Model

### Train Test Split

In [116]:
X = seas_train_trans
y = seas_labels_balanced['seasonal_vaccine'].map({'Yes': 1, 'No': 0}).to_numpy()

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [118]:
#Seasonal Model
# model_seas = RandomForestRegressor(n_estimators=800,
#                               min_samples_split=2,
#                               min_samples_leaf=4,
#                               max_features='sqrt',
#                               max_depth=20,
#                               bootstrap=False)
model_seas = CatBoostRegressor(n_estimators=150)
model_seas.fit(X_train, y_train)

Learning rate set to 0.274428
0:	learn: 0.4621088	total: 3.77ms	remaining: 562ms
1:	learn: 0.4389792	total: 7.64ms	remaining: 565ms
2:	learn: 0.4250228	total: 11.4ms	remaining: 560ms
3:	learn: 0.4153538	total: 15.1ms	remaining: 551ms
4:	learn: 0.4088632	total: 18.7ms	remaining: 543ms
5:	learn: 0.4033330	total: 22.7ms	remaining: 544ms
6:	learn: 0.4006570	total: 27ms	remaining: 551ms
7:	learn: 0.3988498	total: 31ms	remaining: 550ms
8:	learn: 0.3968799	total: 35ms	remaining: 549ms
9:	learn: 0.3955938	total: 38.9ms	remaining: 544ms
10:	learn: 0.3943830	total: 42.9ms	remaining: 542ms
11:	learn: 0.3933230	total: 46.7ms	remaining: 538ms
12:	learn: 0.3924473	total: 50.6ms	remaining: 533ms
13:	learn: 0.3915054	total: 54.8ms	remaining: 533ms
14:	learn: 0.3907711	total: 59.2ms	remaining: 533ms
15:	learn: 0.3903538	total: 63.3ms	remaining: 530ms
16:	learn: 0.3893615	total: 67.2ms	remaining: 526ms
17:	learn: 0.3889123	total: 71ms	remaining: 521ms
18:	learn: 0.3883255	total: 74.9ms	remaining: 516ms


<catboost.core.CatBoostRegressor at 0x28f07e6f7c0>

In [64]:
#Base Model
base_model = RandomForestRegressor(n_estimators=10, random_state=42)
base_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=42)

In [65]:
evaluate(base_model, X_test, y_test)

Model Performance
Average Error: 0.3257 degrees
Accuracy = nan%
AUC = 0.8098067289872746


nan

In [119]:
evaluate(model_seas, X_test, y_test)

Model Performance
Average Error: 0.3121 degrees
Accuracy = -inf%
AUC = 0.8574910841385694


-inf

In [693]:
y_predicted_seas = model_seas.predict(X_test)

In [694]:
y_predicted = np.vstack((y_predicted_h1n1, y_predicted_seas)).T

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1 and the array at index 1 has size 7461

In [695]:
roc_auc_score(y_test, y_predicted_seas)

0.8560692889802611

## Neural Network

### Train Test Split

In [23]:
X = seas_train_trans
y = seas_labels_trans

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [25]:
from tensorflow import keras
from keras.callbacks import ModelCheckpoint

seas_mc = ModelCheckpoint('..Models/seas_best_model.h5', monitor='val_auc_2', mode='max', verbose=1, save_best_only=True)


model_seas = keras.Sequential([
    keras.layers.Dense(100, activation='sigmoid', input_dim=46),
    keras.layers.LeakyReLU(500),
    keras.layers.LeakyReLU(600),
    keras.layers.LeakyReLU(820),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(2, activation='softmax')
])

In [26]:
model_seas.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=[tf.keras.metrics.AUC(from_logits=False)])

In [27]:
history = model_seas.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=150,
    validation_data=(X_test, y_test),
    callbacks=[seas_mc, EarlyStopping]
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150


In [28]:
#model_seas = load_model('..Models/seas_best_model.h5')

y_predicted_seas = model_seas.predict(X_test)
roc_auc_score(y_test, y_predicted_seas)

0.8565332263597631

In [120]:
#Full Data Test - Random Forest Network
X_test = true_features
y_test = true_labels_rf.to_numpy()

y_predicted_h1n1 = model_h1n1.predict(X_test).reshape(-1,1)
y_predicted_seas = model_seas.predict(X_test).reshape(-1,1)

y_predicted = np.concatenate((y_predicted_h1n1, y_predicted_seas), axis=1)
roc_auc_score(y_test, y_predicted)

0.9110327767453076

(1500, 2)

(1500, 2)

In [685]:
y_predicted_h1n1

array([[0.32140653],
       [0.4491121 ],
       [0.28784873],
       ...,
       [0.44708622],
       [0.35714154],
       [0.13289283]])

In [684]:
pd.DataFrame(y_predicted)

Unnamed: 0,0,1
0,0.321407,0.254216
1,0.449112,0.241889
2,0.287849,0.524687
3,0.528227,0.277097
4,0.237950,0.188313
...,...,...
1495,0.429888,0.621552
1496,0.310791,0.758748
1497,0.447086,0.199280
1498,0.357142,0.575851


In [30]:
#Full Data Test - Neural Network
X_test = true_features
y_test = true_labels

y_predicted_h1n1 = model_h1n1.predict(X_test)[:,1].reshape(-1,1)
y_predicted_seas = model_seas.predict(X_test)[:,1].reshape(-1,1)

y_predicted = np.concatenate((y_predicted_h1n1, y_predicted_seas), axis=1)
roc_auc_score(y_test, y_predicted)

0.8763100493167153

In [None]:
y_predicted

# Submission

In [121]:
test = pd.read_csv('../Data/test_set_features.csv')
full_test = test.copy()

In [122]:
num_cols = list(test.select_dtypes('number').columns)

cat_cols = [
    'race',
    'sex',
    'marital_status',
    'rent_or_own',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation'
]

ord_cols = [
    'age_group',
    'education',
    'income_poverty',
    'employment_status'
]


#Impute Test
for col in num_cols:
    test[col] = test[col].fillna(value=-1)


for col in (cat_cols+ord_cols):
    test[col] = test[col].fillna(value='None')

    
test['age_group'] = test['age_group'].map({
    '18 - 34 Years': 1,
    '35 - 44 Years': 2,
    '45 - 54 Years': 3,
    '55 - 64 Years': 4,
    '65+ Years': 5
})
    
test['education'] = test['education'].map({
    '< 12 Years': 1,
    '12 Years': 2,
    'Some College': 3,
    'College Graduate': 4,
    'None': -1
})

test['income_poverty'] = test['income_poverty'].map({
    'None': -1,
    'Below Poverty': 1,
    '<= $75,000, Above Poverty': 2,
    '> $75,000': 3
})

test['employment_status'] = test['employment_status'].map({
    'None': -1,
    'Unemployed': 1,
    'Employed': 2,
    'Not in Labor Force': 3
})

In [123]:
test = test[used_cols]

In [124]:
test = ct.transform(test)

In [125]:
y_h1n1 = model_h1n1.predict(test).reshape(-1,1)
y_seas = model_seas.predict(test).reshape(-1,1)

y_comb = np.concatenate((y_h1n1, y_seas), axis=1)

In [126]:
results = pd.DataFrame(y_comb, columns=['h1n1_vaccine', 'seasonal_vaccine'])

submission = pd.concat([full_test, results], axis=1)
submission = submission[['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']]

In [127]:
today = datetime.today().date()

submission.to_csv(f'../Submissions/Neural Network Submission {today}.csv', index=False)

In [128]:
submission

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.439759,0.261469
1,26708,0.174511,-0.015732
2,26709,0.548362,0.847890
3,26710,0.896150,0.926527
4,26711,0.652385,0.495263
...,...,...,...
26703,53410,0.616330,0.461431
26704,53411,0.384029,0.117051
26705,53412,0.521775,0.114166
26706,53413,-0.066346,0.444741
