In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import matthews_corrcoef

In [2]:
#Functions

def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    roc = roc_auc_score(y_test, predictions)
    print('Model Performance')
    print('Average Error: {:0.4f} degrees'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%'.format(accuracy))
    print(f'AUC = {roc}')
    return accuracy

In [6]:
features = pd.read_csv('../Data/training_set_features.csv')
labels = pd.read_csv('../Data/training_set_labels.csv')

df = pd.merge(features, labels, on='respondent_id', how='inner')

df['sex'] = df['sex'].map({
    'Female': 0,
    'Male': 1
})

df['marital_status'] = df['marital_status'].map({
    'Not Married': 0,
    'Married': 1
})

df['rent_or_own'] = df['rent_or_own'].map({
    'Rent': 0,
    'Own': 1
})
for col in df.columns:
    print(col, df[col].unique())

respondent_id [    0     1     2 ... 26704 26705 26706]
h1n1_concern [ 1.  3.  2.  0. nan]
h1n1_knowledge [ 0.  2.  1. nan]
behavioral_antiviral_meds [ 0.  1. nan]
behavioral_avoidance [ 0.  1. nan]
behavioral_face_mask [ 0.  1. nan]
behavioral_wash_hands [ 0.  1. nan]
behavioral_large_gatherings [ 0.  1. nan]
behavioral_outside_home [ 1.  0. nan]
behavioral_touch_face [ 1.  0. nan]
doctor_recc_h1n1 [ 0. nan  1.]
doctor_recc_seasonal [ 0. nan  1.]
chronic_med_condition [ 0.  1. nan]
child_under_6_months [ 0.  1. nan]
health_worker [ 0.  1. nan]
health_insurance [ 1. nan  0.]
opinion_h1n1_vacc_effective [ 3.  5.  4.  2.  1. nan]
opinion_h1n1_risk [ 1.  4.  3.  2.  5. nan]
opinion_h1n1_sick_from_vacc [ 2.  4.  1.  5.  3. nan]
opinion_seas_vacc_effective [ 2.  4.  5.  3.  1. nan]
opinion_seas_risk [ 1.  2.  4.  3.  5. nan]
opinion_seas_sick_from_vacc [ 2.  4.  1.  5. nan  3.]
age_group ['55 - 64 Years' '35 - 44 Years' '18 - 34 Years' '65+ Years'
 '45 - 54 Years']
education ['< 12 Years' '

In [241]:
categorical_columns = [
    #'education',
    #'income_poverty',
    #'marital_status',
    #'rent_or_own',
    #'employment_industry',
    #'employment_occupation',
    'sex',
    #'hhs_geo_region',
    'census_msa',
    'race',
    'age_group',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_antiviral_meds',
    'behavioral_outside_home',
    'behavioral_large_gatherings',
    'behavioral_touch_face',
    'behavioral_avoidance',
    'health_worker',
    'child_under_6_months',
    'chronic_med_condition',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
]

numerical_columns = [
    'household_children',
    'household_adults',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_risk',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    #'health_insurance'
]

targets = ['h1n1_vaccine', 'seasonal_vaccine']

In [242]:
df.isnull().sum().sort_values()

respondent_id                      0
census_msa                         0
hhs_geo_region                     0
sex                                0
race                               0
age_group                          0
h1n1_vaccine                       0
seasonal_vaccine                   0
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_antiviral_meds         71
behavioral_outside_home           82
behavioral_large_gatherings       87
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_touch_face            128
behavioral_avoidance             208
household_children               249
household_adults                 249
opinion_h1n1_risk                388
opinion_h1n1_vacc_effective      391
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
health_worker                    804
child_under_6_months             820
c

In [243]:
for column in numerical_columns:
    df[column] = df[column].fillna(df[column].mean())

health_worker_mean = df['health_worker'].mean()
behavioral_face_mask_mean = df['health_worker'].mean()
behavioral_wash_hands_mean = df['behavioral_wash_hands'].mean()
behavioral_antiviral_meds_mean = df['behavioral_antiviral_meds'].mean()
behavioral_outside_home_mean = df['behavioral_outside_home'].mean()
behavioral_large_gatherings_mean = df['behavioral_large_gatherings'].mean()
behavioral_touch_face_mean = df['behavioral_touch_face'].mean()
behavioral_avoidance_mean = df['behavioral_avoidance'].mean()
child_under_6_months_mean = df['child_under_6_months'].mean()
chronic_med_condition_mean = df['chronic_med_condition'].mean()
doctor_recc_h1n1_mean = df['doctor_recc_h1n1'].mean()
doctor_recc_seasonal_mean = df['doctor_recc_h1n1'].mean()

df['health_worker'] = df['health_worker'].fillna(health_worker_mean)
df['behavioral_face_mask'] = df['behavioral_face_mask'].fillna(behavioral_face_mask_mean)
df['behavioral_wash_hands'] = df['behavioral_wash_hands'].fillna(behavioral_wash_hands_mean)
df['behavioral_antiviral_meds'] = df['behavioral_antiviral_meds'].fillna(behavioral_antiviral_meds_mean)
df['behavioral_outside_home'] = df['behavioral_outside_home'].fillna(behavioral_outside_home_mean)
df['behavioral_large_gatherings'] = df['behavioral_large_gatherings'].fillna(behavioral_large_gatherings_mean)
df['behavioral_touch_face'] = df['behavioral_touch_face'].fillna(behavioral_touch_face_mean)
df['behavioral_avoidance'] = df['behavioral_avoidance'].fillna(behavioral_avoidance_mean)
df['child_under_6_months'] = df['child_under_6_months'].fillna(child_under_6_months_mean)
df['chronic_med_condition'] = df['chronic_med_condition'].fillna(chronic_med_condition_mean)
df['doctor_recc_h1n1'] = df['doctor_recc_h1n1'].fillna(doctor_recc_h1n1_mean)
df['doctor_recc_seasonal'] = df['doctor_recc_seasonal'].fillna(doctor_recc_seasonal_mean)

all_cols = categorical_columns + numerical_columns + targets
df = df[all_cols]

# Preprocessing

In [244]:
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
y = df[['h1n1_vaccine', 'seasonal_vaccine']]

for column in numerical_columns:
    df[column] = np.log(df[column])

df

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,sex,census_msa,race,age_group,behavioral_face_mask,behavioral_wash_hands,behavioral_antiviral_meds,behavioral_outside_home,behavioral_large_gatherings,behavioral_touch_face,...,h1n1_concern,h1n1_knowledge,opinion_h1n1_risk,opinion_h1n1_vacc_effective,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,h1n1_vaccine,seasonal_vaccine
0,Female,Non-MSA,White,55 - 64 Years,0.0,0.0,0.0,1.0,0.0,1.000000,...,0.000000,-inf,0.000000,1.098612,0.693147,0.693147,0.000000,0.693147,0,0
1,Male,"MSA, Not Principle City",White,35 - 44 Years,0.0,1.0,0.0,1.0,0.0,1.000000,...,1.098612,0.693147,1.386294,1.609438,1.386294,1.386294,0.693147,1.386294,0,1
2,Male,"MSA, Not Principle City",White,18 - 34 Years,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,1.098612,0.000000,1.386294,0.000000,0.693147,0,0
3,Female,"MSA, Principle City",White,65+ Years,0.0,1.0,0.0,0.0,1.0,0.000000,...,0.000000,0.000000,1.098612,1.098612,1.609438,1.609438,1.386294,0.000000,0,1
4,Female,"MSA, Not Principle City",White,45 - 54 Years,0.0,1.0,0.0,0.0,1.0,1.000000,...,0.693147,0.000000,1.098612,1.098612,0.693147,1.098612,0.000000,1.386294,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,Female,Non-MSA,White,65+ Years,0.0,0.0,0.0,1.0,0.0,0.000000,...,0.693147,-inf,0.000000,1.098612,0.000000,1.609438,0.693147,0.693147,0,0
26703,Male,"MSA, Principle City",White,18 - 34 Years,0.0,1.0,0.0,0.0,0.0,0.000000,...,0.000000,0.693147,0.693147,1.386294,0.693147,1.609438,0.000000,0.000000,0,0
26704,Female,"MSA, Not Principle City",White,55 - 64 Years,1.0,1.0,0.0,0.0,1.0,1.000000,...,0.693147,0.693147,1.386294,1.386294,0.693147,1.609438,1.386294,0.693147,0,1
26705,Female,Non-MSA,Hispanic,18 - 34 Years,0.0,0.0,0.0,0.0,0.0,0.677264,...,0.000000,0.000000,0.000000,1.098612,0.693147,0.693147,0.000000,0.693147,0,0


In [222]:
#Categorical
cat_df = X[categorical_columns]
recat_df = pd.get_dummies(data=cat_df)

#Numerical
num_df = X[numerical_columns]

scaler = StandardScaler()
scaled_num = scaler.fit_transform(num_df)
scaled_num_df = pd.DataFrame(scaled_num, index=num_df.index, columns=num_df.columns)

encoded_df = pd.concat([recat_df, scaled_num_df], axis=1)

In [223]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(encoded_df, y, test_size=0.3, random_state=42)

y_train_h1n1 = y_train['h1n1_vaccine']
y_train_seas = y_train['seasonal_vaccine']

y_test_h1n1 = y_test['h1n1_vaccine']
y_test_seas = y_test['seasonal_vaccine']

In [224]:
#Create arrays
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

y_train_h1n1 = np.asarray(y_train_h1n1)
y_test_h1n1 = np.asarray(y_test_h1n1)

y_train_seas = np.asarray(y_train_seas)
y_test_seas = np.asarray(y_test_seas)

# Random Forest

In [225]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

model = RandomForestRegressor()
model.fit(X_train, y_train_h1n1)

y_pred_h1n1 = model.predict(X_test)

In [226]:
#Set up RandomSearch
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num=11)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap}

rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf, 
                               param_distributions=random_grid, 
                               n_iter=50, 
                               cv=3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs=-1)

### H1N1

In [227]:
#model = rf_random
model_h1n1 = RandomForestRegressor(n_estimators=800,
                              min_samples_split=2,
                              min_samples_leaf=4,
                              max_features='sqrt',
                              max_depth=20,
                              bootstrap=False)
model_h1n1.fit(X_train, y_train_h1n1)

RandomForestRegressor(bootstrap=False, max_depth=20, max_features='sqrt',
                      min_samples_leaf=4, n_estimators=800)

In [228]:
#model.best_params_

In [229]:
base_model = RandomForestRegressor(n_estimators=10, random_state=42)
base_model.fit(X_train, y_train_h1n1)
evaluate(base_model, X_test, y_test_h1n1)

Model Performance
Average Error: 0.2483 degrees
Accuracy = nan%
AUC = 0.7816175070667295


  mape = 100 * np.mean(errors / y_test)
  mape = 100 * np.mean(errors / y_test)


nan

In [230]:
#best_random = rf_random.best_estimator_
evaluate(model_h1n1, X_test, y_test_h1n1)

Model Performance
Average Error: 0.2423 degrees
Accuracy = -inf%
AUC = 0.8301045010895534


  mape = 100 * np.mean(errors / y_test)


-inf

In [231]:
y_predicted_h1n1 = model_h1n1.predict(X_test)

### Seasonal

In [232]:
#model = rf_random
model_seas = RandomForestRegressor(n_estimators=800,
                              min_samples_split=2,
                              min_samples_leaf=4,
                              max_features='sqrt',
                              max_depth=20,
                              bootstrap=False)

model_seas.fit(X_train, y_train_seas)

RandomForestRegressor(bootstrap=False, max_depth=20, max_features='sqrt',
                      min_samples_leaf=4, n_estimators=800)

In [233]:
#model.best_params_

In [234]:
base_model = RandomForestRegressor(n_estimators=10, random_state=42)
base_model.fit(X_train, y_train_seas)
base_accuracy = evaluate(base_model, X_test, y_test_seas)

Model Performance
Average Error: 0.3251 degrees
Accuracy = nan%
AUC = 0.8082471249693057


  mape = 100 * np.mean(errors / y_test)
  mape = 100 * np.mean(errors / y_test)


In [235]:
evaluate(model_seas, X_test, y_test_seas)

Model Performance
Average Error: 0.3228 degrees
Accuracy = -inf%
AUC = 0.8529219770128039


  mape = 100 * np.mean(errors / y_test)


-inf

In [236]:
y_predicted_seas = model_seas.predict(X_test)

### Combine Targets

In [237]:
y_predicted = np.vstack((y_predicted_h1n1, y_predicted_seas)).T
y_predicted

array([[0.19288598, 0.29934187],
       [0.26939945, 0.19817748],
       [0.06233589, 0.71491743],
       ...,
       [0.0611781 , 0.10010754],
       [0.08652902, 0.13497373],
       [0.10255067, 0.85995094]])

In [238]:
roc_auc_score(y_test, y_predicted)

0.8415132390511786

# Transform Test Data

In [160]:
df_full = pd.read_csv('../Data/test_set_features.csv')

df = df_full.drop(columns=['employment_occupation', 'employment_industry', 'health_insurance', 'respondent_id'])

In [161]:
categorical_columns = [
    #'education',
    #'income_poverty',
    #'marital_status',
    #'rent_or_own',
    #'employment_industry',
    #'employment_occupation',
    'sex',
    #'hhs_geo_region',
    #'census_msa',
    'race',
    'age_group',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_antiviral_meds',
    'behavioral_outside_home',
    'behavioral_large_gatherings',
    'behavioral_touch_face',
    'behavioral_avoidance',
    'health_worker',
    #'child_under_6_months',
    'chronic_med_condition',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
]

numerical_columns = [
    #'household_children',
    #'household_adults',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_risk',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    #'health_insurance'
]


In [162]:
for column in numerical_columns:
    df[column] = df[column].fillna(df[column].mean())

health_worker_mean = df['health_worker'].mean()
behavioral_face_mask_mean = df['health_worker'].mean()
behavioral_wash_hands_mean = df['behavioral_wash_hands'].mean()
behavioral_antiviral_meds_mean = df['behavioral_antiviral_meds'].mean()
behavioral_outside_home_mean = df['behavioral_outside_home'].mean()
behavioral_large_gatherings_mean = df['behavioral_large_gatherings'].mean()
behavioral_touch_face_mean = df['behavioral_touch_face'].mean()
behavioral_avoidance_mean = df['behavioral_avoidance'].mean()
child_under_6_months_mean = df['child_under_6_months'].mean()
chronic_med_condition_mean = df['chronic_med_condition'].mean()
doctor_recc_h1n1_mean = df['doctor_recc_h1n1'].mean()
doctor_recc_seasonal_mean = df['doctor_recc_h1n1'].mean()

df['health_worker'] = df['health_worker'].fillna(health_worker_mean)
df['behavioral_face_mask'] = df['behavioral_face_mask'].fillna(behavioral_face_mask_mean)
df['behavioral_wash_hands'] = df['behavioral_wash_hands'].fillna(behavioral_wash_hands_mean)
df['behavioral_antiviral_meds'] = df['behavioral_antiviral_meds'].fillna(behavioral_antiviral_meds_mean)
df['behavioral_outside_home'] = df['behavioral_outside_home'].fillna(behavioral_outside_home_mean)
df['behavioral_large_gatherings'] = df['behavioral_large_gatherings'].fillna(behavioral_large_gatherings_mean)
df['behavioral_touch_face'] = df['behavioral_touch_face'].fillna(behavioral_touch_face_mean)
df['behavioral_avoidance'] = df['behavioral_avoidance'].fillna(behavioral_avoidance_mean)
df['child_under_6_months'] = df['child_under_6_months'].fillna(child_under_6_months_mean)
df['chronic_med_condition'] = df['chronic_med_condition'].fillna(chronic_med_condition_mean)
df['doctor_recc_h1n1'] = df['doctor_recc_h1n1'].fillna(doctor_recc_h1n1_mean)
df['doctor_recc_seasonal'] = df['doctor_recc_seasonal'].fillna(doctor_recc_seasonal_mean)

full_cols = categorical_columns + numerical_columns
df = df[full_cols]

# Preprocessing

In [163]:
X = df.copy()

In [164]:
#Categorical
cat_df = X[categorical_columns]
recat_df = pd.get_dummies(data=cat_df)

#Numerical
num_df = X[numerical_columns]

scaler = StandardScaler()
scaled_num = scaler.fit_transform(num_df)
scaled_num_df = pd.DataFrame(scaled_num, index=num_df.index, columns=num_df.columns)

encoded_df = pd.concat([recat_df, scaled_num_df], axis=1)
X = np.asarray(encoded_df)

In [165]:
results_h1n1 = model_h1n1.predict(X)
results_seas = model_seas.predict(X)

results = np.vstack((results_h1n1, results_seas)).T

In [166]:
results = pd.DataFrame(results, columns=['h1n1_vaccine', 'seasonal_vaccine'])

submission = pd.concat([df_full, results], axis=1)
submission = submission[['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']]

In [167]:
submission.to_csv('../Submissions/Submission 7.1.21.csv', index=False)