In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
features = pd.read_csv('../Data/training_set_features.csv')
labels = pd.read_csv('../Data/training_set_labels.csv')

In [4]:
df = pd.merge(features, labels, on='respondent_id', how='inner')

In [5]:
df = df.drop(columns=['employment_occupation', 'employment_industry', 'health_insurance', 'respondent_id'])

In [6]:
seas_df = df.drop(columns=['h1n1_concern',
                          'h1n1_knowledge',
                          'doctor_recc_h1n1',
                          'opinion_h1n1_vacc_effective',
                          'opinion_h1n1_risk',
                          'opinion_h1n1_sick_from_vacc',
                          'h1n1_vaccine'])

h1n1_df = df.drop(columns=['doctor_recc_seasonal',
                          'opinion_seas_vacc_effective',
                          'opinion_seas_risk',
                          'opinion_seas_sick_from_vacc',
                          'seasonal_vaccine'])

In [7]:
categorical_columns = [
    'sex',
    'hhs_geo_region',
    'census_msa',
    'race',
    'age_group',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_antiviral_meds',
    'behavioral_outside_home',
    'behavioral_large_gatherings',
    'behavioral_touch_face',
    'behavioral_avoidance',
    'health_worker',
    'child_under_6_months',
    'chronic_med_condition',
    'education',
    'marital_status',
    'employment_status',
    'rent_or_own',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
    'income_poverty'
]

numerical_columns = [
    'household_children',
    'household_adults',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_risk',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    
]

for column in categorical_columns:
    curr_col = df[column]
    df.loc[df[column] == 1, column] = 'Yes'
    df.loc[df[column] == 0, column] = 'No'



## Deal with NAs

In [8]:
((df.isnull().sum() / len(df)) * 100).sort_values()

seasonal_vaccine                0.000000
census_msa                      0.000000
hhs_geo_region                  0.000000
sex                             0.000000
race                            0.000000
age_group                       0.000000
h1n1_vaccine                    0.000000
behavioral_face_mask            0.071142
behavioral_wash_hands           0.157262
behavioral_antiviral_meds       0.265848
behavioral_outside_home         0.307036
behavioral_large_gatherings     0.325757
h1n1_concern                    0.344479
h1n1_knowledge                  0.434343
behavioral_touch_face           0.479275
behavioral_avoidance            0.778822
household_children              0.932340
household_adults                0.932340
opinion_h1n1_risk               1.452803
opinion_h1n1_vacc_effective     1.464036
opinion_h1n1_sick_from_vacc     1.479013
opinion_seas_vacc_effective     1.729884
opinion_seas_risk               1.924589
opinion_seas_sick_from_vacc     2.010709
health_worker   

In [144]:
for column in numerical_columns:
    df[column] = df[column].fillna(df[column].mean())

df = df.dropna()

## Initial Run

In [146]:
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
y = df[['h1n1_vaccine', 'seasonal_vaccine']]
y_h1n1 = df[['h1n1_vaccine']]
y_seas = df[['seasonal_vaccine']]

KeyError: "['h1n1_vaccine' 'seasonal_vaccine'] not found in axis"

#### Categorical

In [11]:
#Get Binary Data for Categorical Variables
cat_df = X[categorical_columns]

In [12]:
recat_df = pd.get_dummies(data=cat_df)

#### Numerical

In [13]:
num_df = X[numerical_columns]

In [14]:
from sklearn.preprocessing import StandardScaler

#Scale Numerical Data
scaler = StandardScaler()
scaled_num = scaler.fit_transform(num_df)
scaled_num_df = pd.DataFrame(scaled_num, index=num_df.index, columns=num_df.columns)

In [15]:
encoded_df = pd.concat([recat_df, scaled_num_df], axis=1)

In [17]:
encoded_df

Unnamed: 0,sex_Female,sex_Male,hhs_geo_region_atmpeygn,hhs_geo_region_bhuqouqj,hhs_geo_region_dqpwygqj,hhs_geo_region_fpwskwrf,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,...,household_children,household_adults,h1n1_concern,h1n1_knowledge,opinion_h1n1_risk,opinion_h1n1_vacc_effective,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc
0,1,0,0,0,0,0,0,0,0,0,...,-0.578667,-1.182177,-0.680609,-2.046928,-1.052050,-0.850610,-0.264426,-1.880954,-1.253366,-0.089516
1,0,1,0,1,0,0,0,0,0,0,...,-0.578667,-1.182177,1.520279,1.195647,1.298784,1.149360,1.214180,-0.024126,-0.524309,1.426260
2,0,1,0,0,0,0,0,0,0,0,...,-0.578667,1.484891,-0.680609,-0.425641,-1.052050,-0.850610,-1.003729,-0.024126,-1.253366,-0.089516
3,1,0,0,0,0,0,0,1,0,0,...,-0.578667,-1.182177,-0.680609,-0.425641,0.515173,-0.850610,1.953484,0.904289,0.933803,-0.847404
4,1,0,0,0,0,0,0,0,0,0,...,-0.578667,0.151357,0.419835,-0.425641,0.515173,-0.850610,-0.264426,-0.952540,-1.253366,1.426260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,1,0,0,0,0,0,0,0,0,0,...,-0.578667,-1.182177,0.419835,-2.046928,-1.052050,-0.850610,-1.003729,0.904289,-0.524309,-0.089516
26703,0,1,0,0,0,0,0,0,1,0,...,-0.578667,0.151357,-0.680609,1.195647,-0.268439,0.149375,-0.264426,0.904289,-1.253366,-0.847404
26704,1,0,0,0,0,0,0,0,1,0,...,-0.578667,-1.182177,0.419835,1.195647,1.298784,0.149375,-0.264426,0.904289,0.933803,-0.089516
26705,1,0,0,0,0,0,0,1,0,0,...,-0.578667,0.151357,-0.680609,-0.425641,-1.052050,-0.850610,-0.264426,-1.880954,-1.253366,-0.089516


In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(encoded_df, y, test_size=0.3, random_state=42)

In [19]:
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
X = np.asarray(encoded_df)

# Neural Network

In [98]:
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Dense(60, activation='selu', input_dim=84),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(200, activation='selu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(5, activation='swish'),
    keras.layers.Dense(2, activation='swish')
])

In [99]:
model.compile(optimizer='adam', 
              loss=tf.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [100]:
history = model.fit(
    X_train,
    y_train,
    batch_size=200,
    epochs=5000,
    validation_data=(X_test, y_test)
)

Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
Epoch 41/5000
Epoch 42/5000
Epoch 43/5000
Epoch 44/5000
Epoch 45/5000
Epoch 46/5000
Epoch 47/5000
Epoch 48/5000
Epoch 49/5000
Epoch 50/5000
Epoch 51/5000
Epoch 52/5000
Epoch 53/5000
Epoch 54/5000
Epoch 55/5000
Epoch 56/5000
Epoch 57/5000
Epoch 58/5000
Epoch 59/5000
Epoch 60/5000
Epoch 61/5000
Epoch 62/5000
Epoch 63/5000
Epoch 64/5000
Epoch 65/5000
Epoch 66/5000
Epoch 67/5000
Epoch 68/5000
Epoch 69/5000
Epoch 70/5000
Epoch 71/5000

KeyboardInterrupt: 

In [101]:
y_true = y_test
y_predicted = model.predict(X_test)
y_predicted_binary = np.where(y_predicted > 0.5, 1, 0)

In [105]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_true, y_predicted)

0.8265410454290134

# Random Forest

In [104]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

model = RandomForestRegressor()
model.fit(X_train, y_train)

y_predicted = model.predict(X_test)

r2_score(y_predicted, y_test)

-1.1959004149577708

In [109]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [110]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [111]:
model = rf_random
model.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 30.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 74.9min finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [112]:
model.best_params_

{'n_estimators': 800,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': False}

In [114]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy


base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)



best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

  mape = 100 * np.mean(errors / y_test)
  mape = 100 * np.mean(errors / y_test)


Model Performance
Average Error: 0.2832 degrees.
Accuracy = nan%.
Model Performance
Average Error: 0.2890 degrees.
Accuracy = -inf%.


  mape = 100 * np.mean(errors / y_test)


In [116]:
y_predicted = best_random.predict(X_test)
model = best_random

from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_predicted)

0.843480834828371

## Submission Data

In [117]:
test_data = pd.read_csv('../Data/test_set_features.csv')
df_full = test_data

In [118]:
df = df_full.drop(columns=['employment_occupation', 'employment_industry', 'health_insurance', 'respondent_id'])

categorical_columns = [
    'sex',
    'hhs_geo_region',
    'census_msa',
    'race',
    'age_group',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_antiviral_meds',
    'behavioral_outside_home',
    'behavioral_large_gatherings',
    'behavioral_touch_face',
    'behavioral_avoidance',
    'health_worker',
    'child_under_6_months',
    'chronic_med_condition',
    'education',
    'marital_status',
    'employment_status',
    'rent_or_own',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
    'income_poverty'
]

numerical_columns = [
    'household_children',
    'household_adults',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_risk',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    
]

for column in categorical_columns:
    curr_col = df[column]
    df.loc[df[column] == 1, column] = 'Yes'
    df.loc[df[column] == 0, column] = 'No'

In [119]:
for column in numerical_columns:
    df[column] = df[column].fillna(df[column].mean())

df['health_worker'] = df['health_worker'].fillna(0)
df['behavioral_face_mask'] = df['behavioral_face_mask'].fillna(0)
df['behavioral_wash_hands'] = df['behavioral_wash_hands'].fillna(0)
df['behavioral_antiviral_meds'] = df['behavioral_antiviral_meds'].fillna(0)
df['behavioral_outside_home'] = df['behavioral_outside_home'].fillna(0)
df['behavioral_large_gatherings'] = df['behavioral_large_gatherings'].fillna(0)
df['behavioral_touch_face'] = df['behavioral_touch_face'].fillna(0)
df['behavioral_avoidance'] = df['behavioral_avoidance'].fillna(0)
df['child_under_6_months'] = df['child_under_6_months'].fillna(0)
df['chronic_med_condition'] = df['chronic_med_condition'].fillna(0)
df['marital_status'] = df['marital_status'].fillna('Not Married')
df['rent_or_own'] = df['rent_or_own'].fillna('Rent')
df['education'] = df['education'].fillna('Some College')
df['employment_status'] = df['employment_status'].fillna('Employed')
df['doctor_recc_h1n1'] = df['doctor_recc_h1n1'].fillna(1)
df['doctor_recc_seasonal'] = df['doctor_recc_seasonal'].fillna(1)
df['income_poverty'] = df['income_poverty'].fillna('<= $75,000, Above Poverty')

In [120]:
X = df

#Get Binary Data for Categorical Variables
cat_df = X[categorical_columns]
recat_df = pd.get_dummies(data=cat_df)

num_df = X[numerical_columns]

from sklearn.preprocessing import StandardScaler

#Scale Numerical Data
scaler = StandardScaler()
scaled_num = scaler.fit_transform(num_df)
scaled_num_df = pd.DataFrame(scaled_num, index=num_df.index, columns=num_df.columns)

encoded_df = pd.concat([recat_df, scaled_num_df], axis=1)

X = np.asarray(encoded_df)

In [121]:
y = model.predict(X)
y_df = pd.DataFrame(y, columns=['h1n1_vaccine', 'seasonal_vaccine'])

In [122]:
results = pd.concat([df_full, y_df], axis=1)

In [123]:
results = results[['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']]

In [124]:
results.to_csv('../Submissions/Submission 6.29.21.csv', index=False)

In [128]:
from sklearn.feature_selection import SelectFromModel

sel = SelectFromModel(RandomForestRegressor(n_estimators= 800,
 min_samples_split= 2,
 min_samples_leaf= 4,
max_features= 'sqrt',
 max_depth= 20,
 bootstrap= False))
sel.fit(X_train, y_train)

In [136]:
selected_feat= encoded_df.columns[(sel.get_support())]
len(selected_feat)

17

In [137]:
selected_feat

Index(['age_group_18 - 34 Years', 'age_group_65+ Years', 'health_worker_No',
       'health_worker_Yes', 'doctor_recc_h1n1_No', 'doctor_recc_h1n1_Yes',
       'doctor_recc_seasonal_No', 'doctor_recc_seasonal_Yes',
       'household_children', 'h1n1_concern', 'h1n1_knowledge',
       'opinion_h1n1_risk', 'opinion_h1n1_vacc_effective',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc'],
      dtype='object')

In [142]:
pd.Series(sel.estimator_, feature_importances_.ravel()).hist()

NameError: name 'feature_importances_' is not defined

In [143]:
for x in selected_feat:
    print(x)

age_group_18 - 34 Years
age_group_65+ Years
health_worker_No
health_worker_Yes
doctor_recc_h1n1_No
doctor_recc_h1n1_Yes
doctor_recc_seasonal_No
doctor_recc_seasonal_Yes
household_children
h1n1_concern
h1n1_knowledge
opinion_h1n1_risk
opinion_h1n1_vacc_effective
opinion_h1n1_sick_from_vacc
opinion_seas_vacc_effective
opinion_seas_risk
opinion_seas_sick_from_vacc
