In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import matthews_corrcoef
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score
import scipy.stats as stats
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from datetime import datetime
from category_encoders import OrdinalEncoder, TargetEncoder
from catboost import CatBoostClassifier, CatBoostRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Functions

def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    roc = roc_auc_score(y_test, predictions)
    print('Model Performance')
    print('Average Error: {:0.4f} degrees'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%'.format(accuracy))
    print(f'AUC = {roc}')
    return accuracy

In [3]:
train = pd.read_csv('../Data/training_set_features.csv', index_col='respondent_id')
test = pd.read_csv('../Data/test_set_features.csv', index_col ='respondent_id')
labels = pd.read_csv('../Data/training_set_labels.csv', index_col='respondent_id')

In [4]:
train.loc[(train['age_group'] == '65+ Years') & (train['employment_status'].isnull()), 'employment_status'] = 'Not in Labor Force'

In [5]:
num_cols = list(train.select_dtypes('number').columns)

cat_cols = [
    'race',
    'sex',
    'marital_status',
    'rent_or_own',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation'
]

ord_cols = [
    'age_group',
    'education',
    'income_poverty',
    'employment_status'
]




#Impute Train
for col in num_cols:
    train[col] = train[col].fillna(value=-1)
    test[col] = test[col].fillna(value=-1)

for col in (cat_cols + ord_cols):
    train[col] = train[col].fillna(value='None')
    test[col] = test[col].fillna(value='None')
test_labels = labels.copy()    

In [6]:
train['age_group'] = train['age_group'].map({
    '18 - 34 Years': 1,
    '35 - 44 Years': 2,
    '45 - 54 Years': 3,
    '55 - 64 Years': 4,
    '65+ Years': 5
})

train['education'] = train['education'].map({
    '< 12 Years': 1,
    '12 Years': 2,
    'Some College': 3,
    'College Graduate': 4,
    'None': -1
})

train['income_poverty'] = train['income_poverty'].map({
    'None': -1,
    'Below Poverty': 1,
    '<= $75,000, Above Poverty': 2,
    '> $75,000': 3
})

train['employment_status'] = train['employment_status'].map({
    'None': -1,
    'Unemployed': 1,
    'Employed': 2,
    'Not in Labor Force': 3
})




test['education'] = test['education'].map({
    '< 12 Years': 1,
    '12 Years': 2,
    'Some College': 3,
    'College Graduate': 4,
    'None': -1
})

test['income_poverty'] = test['income_poverty'].map({
    'None': -1,
    'Below Poverty': 1,
    '<= $75,000, Above Poverty': 2,
    '> $75,000': 3
})

test['employment_status'] = test['employment_status'].map({
    'None': -1,
    'Unemployed': 1,
    'Employed': 2,
    'Not in Labor Force': 3
})

In [7]:
for x in train[ord_cols].columns:
    print(x, train[x].unique())

age_group [4 2 1 5 3]
education [ 1  2  4  3 -1]
income_poverty [ 1  2  3 -1]
employment_status [ 3  2  1 -1]


In [8]:
all_cols = train.columns

In [72]:
best_cols_names = [all_cols[x] for x in best_cols]

In [10]:
h1n1_labels = labels[['h1n1_vaccine']]
seas_labels = labels[['seasonal_vaccine']]

# Transformation

In [11]:
#H1N1 Balancing
yeses = h1n1_labels[h1n1_labels['h1n1_vaccine'] == 1]
len_of_yes = len(yeses)
nos = h1n1_labels[h1n1_labels['h1n1_vaccine'] == 0].sample(len_of_yes, random_state=42)

indices = np.concatenate((yeses.index.values, nos.index.values))
h1n1_labels_balanced = h1n1_labels.iloc[indices, :]
h1n1_train_balanced = train.iloc[indices, :]

# enc = OneHotEncoder(categories='auto')
# h1n1_labels_balanced_arr = np.array(h1n1_labels_balanced['h1n1_vaccine']).reshape(-1,1)
# h1n1_labels_trans = enc.fit_transform(h1n1_labels_balanced_arr).toarray()
# h1n1_test_trans = enc.transform(np.array(h1n1_labels['h1n1_vaccine']).reshape(-1,1)).toarray()

In [12]:
#Seasonal Balancing
yeses = seas_labels[seas_labels['seasonal_vaccine'] == 1]
len_of_yes = len(yeses)
nos = seas_labels[seas_labels['seasonal_vaccine'] == 0].sample(len_of_yes, random_state=42)

indices = np.concatenate((yeses.index.values, nos.index.values))
seas_labels_balanced = seas_labels.iloc[indices, :]
seas_train_balanced = train.iloc[indices, :]

# enc = OneHotEncoder(categories='auto')
# seas_labels_balanced_arr = np.array(seas_labels_balanced['seasonal_vaccine']).reshape(-1,1)
# seas_labels_trans = enc.fit_transform(seas_labels_balanced_arr).toarray()
# seas_test_trans = enc.transform(np.array(seas_labels['seasonal_vaccine']).reshape(-1,1)).toarray()

In [13]:
cat_cols = train.select_dtypes('object').columns

In [14]:
ct_h1n1 = ColumnTransformer([('scaler', StandardScaler(), num_cols),
                       ('cat', TargetEncoder(cols=cat_cols, smoothing=100, min_samples_leaf=10), cat_cols)]
                       , remainder='passthrough')

ct_seas = ColumnTransformer([('scaler', StandardScaler(), num_cols),
                       ('cat', TargetEncoder(cols=cat_cols, smoothing=100, min_samples_leaf=10), cat_cols)]
                       , remainder='passthrough')

In [15]:
h1n1_train_trans = ct_h1n1.fit_transform(h1n1_train_balanced, h1n1_labels_balanced)
seas_train_trans = ct_seas.fit_transform(seas_train_balanced, seas_labels_balanced)
test_h1n1 = ct_h1n1.transform(test)
test_seas = ct_seas.transform(test)

In [16]:
true_features = train.sample(1500, random_state=42)
true_indices = true_features.index.values
true_labels = labels.iloc[true_indices,:]

true_features = ct_seas.transform(true_features)
true_labels_rf = true_labels.copy()
true_labels = np.asarray(true_labels)

In [80]:
true_labels

array([[0, 0],
       [0, 1],
       [0, 0],
       ...,
       [0, 0],
       [1, 1],
       [0, 0]], dtype=int64)

# H1N1

## Random Forest

## Train Test Split

In [20]:
X = h1n1_train_trans
y = h1n1_labels_balanced

Unnamed: 0_level_0,h1n1_vaccine
respondent_id,Unnamed: 1_level_1
7,1
10,1
11,1
16,1
26,1
...,...
14227,0
1668,0
20168,0
968,0


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [None]:
#H1N1 Model
# model_h1n1 = RandomForestRegressor(n_estimators=1200,
#                                min_samples_split=2,
#                                min_samples_leaf=3,
#                                max_features='sqrt',
#                                max_depth=20,
#                                bootstrap=True)
model_h1n1 = CatBoostRegressor(n_estimators=150)
model_h1n1.fit(X_train, y_train)

In [None]:
#Base Model
base_model = RandomForestClassifier(n_estimators=10, random_state=42)
base_model.fit(X_train, y_train)

In [None]:
evaluate(base_model, X_test, y_test)

In [None]:
evaluate(model_h1n1, X_test, y_test)

In [None]:
y_predicted_h1n1 = model_h1n1.predict(X_test)

## Neural Network

### Train Test Split

In [108]:
X = h1n1_train_trans
y = h1n1_labels_balanced

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [110]:
h1n1_mc = ModelCheckpoint('..Models/h1n1_best_model.h5', monitor='val_auc', mode='max', verbose=0, save_best_only=True)

model_h1n1 = keras.Sequential([
    keras.layers.Dense(200, activation='relu', input_dim=31),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(800, activation='relu'),
    keras.layers.Dense(500, activation='relu'),
    keras.layers.Dense(250, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [111]:
model_h1n1.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=[tf.keras.metrics.AUC(from_logits=False), 'accuracy'])

In [112]:
EarlyStopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=5, verbose=0,
    mode='auto', baseline=None, restore_best_weights=True
)

history = model_h1n1.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=150,
    validation_data=(X_test, y_test),
    callbacks=[h1n1_mc, EarlyStopping],
    shuffle=True,
    verbose=1
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150


In [113]:
#model_h1n1 = load_model('..Models/h1n1_best_model.h5')

y_predicted_h1n1 = model_h1n1.predict(X_test)
roc_auc_score(y_test, y_predicted_h1n1)

0.8567688992454382

# Seasonal

## Random Forest Model

### Train Test Split

In [134]:
X = seas_train_trans
y = seas_labels_balanced

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [140]:
#Seasonal Model
model_seas = RandomForestRegressor(n_estimators=800,
                                    min_samples_split=2,
                                    min_samples_leaf=4,
                                    max_features='sqrt',
                                    max_depth=20,bootstrap=False)
#model_seas = CatBoostRegressor(n_estimators=150)
model_seas.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, max_depth=20, max_features='sqrt',
                      min_samples_leaf=4, n_estimators=800)

In [141]:
#Base Model
base_model = RandomForestRegressor(n_estimators=10, random_state=42)
base_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=42)

In [142]:
evaluate(base_model, X_test, y_test)

ValueError: Unable to coerce to Series, length must be 1: given 7461

In [143]:
evaluate(model_seas, X_test, y_test)

ValueError: Unable to coerce to Series, length must be 1: given 7461

In [None]:
y_predicted_seas = model_seas.predict(X_test)

In [None]:
y_predicted = np.vstack((y_predicted_h1n1, y_predicted_seas)).T

In [None]:
roc_auc_score(y_test, y_predicted_seas)

## Neural Network

### Train Test Split

In [119]:
X = seas_train_trans
y = seas_labels_balanced

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [121]:
from tensorflow import keras
from keras.callbacks import ModelCheckpoint

seas_mc = ModelCheckpoint('..Models/seas_best_model.h5', monitor='val_auc_2', mode='max', verbose=1, save_best_only=True)


model_seas = keras.Sequential([
    keras.layers.Dense(100, activation='relu', input_dim=31),
    keras.layers.LeakyReLU(500),
    keras.layers.LeakyReLU(600),
    keras.layers.LeakyReLU(820),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [122]:
model_seas.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=[tf.keras.metrics.AUC(from_logits=False)])

In [123]:
history = model_seas.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=150,
    validation_data=(X_test, y_test),
    callbacks=[seas_mc, EarlyStopping]
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150


In [124]:
#model_seas = load_model('..Models/seas_best_model.h5')

y_predicted_seas = model_seas.predict(X_test)
roc_auc_score(y_test, y_predicted_seas)

0.8510831972210825

In [None]:
#Full Data Test - Random Forest Network
X_test = true_features
y_test = true_labels_rf.to_numpy()

y_predicted_h1n1 = model_h1n1.predict(X_test).reshape(-1,1)
y_predicted_seas = model_seas.predict(X_test).reshape(-1,1)

y_predicted = np.concatenate((y_predicted_h1n1, y_predicted_seas), axis=1)
roc_auc_score(y_test, y_predicted)

In [None]:
y_predicted_h1n1

In [None]:
pd.DataFrame(y_predicted)

In [125]:
#Full Data Test - Neural Network
X_test = true_features
y_test = true_labels

y_predicted_h1n1 = model_h1n1.predict(X_test)
y_predicted_seas = model_seas.predict(X_test)

y_predicted = np.concatenate((y_predicted_h1n1, y_predicted_seas), axis=1)
roc_auc_score(y_test, y_predicted)

0.8794472852265591

array([[0.48894233, 0.36433274],
       [0.4735105 , 0.14668101],
       [0.4070636 , 0.91004425],
       ...,
       [0.5524947 , 0.20513529],
       [0.17454535, 0.38615042],
       [0.0442811 , 0.08562094]], dtype=float32)

# Submission

In [126]:
test = pd.read_csv('../Data/test_set_features.csv')
full_test = test.copy()

In [127]:
num_cols = list(test.select_dtypes('number').columns)

cat_cols = [
    'race',
    'sex',
    'marital_status',
    'rent_or_own',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation'
]

ord_cols = [
    'age_group',
    'education',
    'income_poverty',
    'employment_status'
]


#Impute Test
for col in num_cols:
    test[col] = test[col].fillna(value=-1)


for col in (cat_cols+ord_cols):
    test[col] = test[col].fillna(value='None')

    
test['age_group'] = test['age_group'].map({
    '18 - 34 Years': 1,
    '35 - 44 Years': 2,
    '45 - 54 Years': 3,
    '55 - 64 Years': 4,
    '65+ Years': 5
})
    
test['education'] = test['education'].map({
    '< 12 Years': 1,
    '12 Years': 2,
    'Some College': 3,
    'College Graduate': 4,
    'None': -1
})

test['income_poverty'] = test['income_poverty'].map({
    'None': -1,
    'Below Poverty': 1,
    '<= $75,000, Above Poverty': 2,
    '> $75,000': 3
})

test['employment_status'] = test['employment_status'].map({
    'None': -1,
    'Unemployed': 1,
    'Employed': 2,
    'Not in Labor Force': 3
})

In [128]:
test = test[used_cols]

In [129]:
test_h1n1 = ct_h1n1.transform(test)
test_seas = ct_seas.transform(test)

In [130]:
y_h1n1 = model_h1n1.predict(test_h1n1)
y_seas = model_seas.predict(test_seas)

y_comb = np.concatenate((y_h1n1, y_seas), axis=1)

In [131]:
y_comb

array([[0.32304165, 0.26871872],
       [0.15819561, 0.13347876],
       [0.46876505, 0.80193985],
       ...,
       [0.3317409 , 0.21633819],
       [0.08595854, 0.41861343],
       [0.8881855 , 0.7818744 ]], dtype=float32)

In [132]:
results = pd.DataFrame(y_comb, columns=['h1n1_vaccine', 'seasonal_vaccine'])

submission = pd.concat([full_test, results], axis=1)
submission = submission[['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']]

In [133]:
today = datetime.today().date()

submission.to_csv(f'../Submissions/Neural Network Submission {today}.csv', index=False)

In [None]:
submission