In [2]:
! pip install keras-tuner



# **Loading the data**

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [6]:
# Load the cleaned dataset
clean_df = pd.read_csv('clean_joined_df.csv', index_col='respondent_id')
print("clean_df.shape", clean_df.shape)
clean_df.head()


clean_df.shape (26707, 48)


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,hhs_geo_region_oxchjgsf,hhs_geo_region_qufhixun,"census_msa_MSA, Principle City",census_msa_Non-MSA,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,False,False,False,False,True,False,False,True,0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,False,False,False,False,False,False,False,False,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,True,False,False,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,False,True,False,False,False,False,True,False,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,False,False,False,False,False,True,False,False,0,0


## **Preprocessing Categorical Data**

In [7]:
# Separate numeric and categorical columns
numeric_cols = clean_df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = clean_df.select_dtypes(include=['object']).columns

In [9]:
# Display value counts for each categorical column
for col in categorical_cols:
    print(f"\nValue counts in column '{col}':")
    print(clean_df[col].value_counts())


In [10]:
clean_df.head()

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,hhs_geo_region_oxchjgsf,hhs_geo_region_qufhixun,"census_msa_MSA, Principle City",census_msa_Non-MSA,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,False,False,False,False,True,False,False,True,0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,False,False,False,False,False,False,False,False,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,True,False,False,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,False,True,False,False,False,False,True,False,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,False,False,False,False,False,True,False,False,0,0


In [11]:
clean_df.describe()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,income_poverty,employment_status,household_adults,household_children,h1n1_vaccine,seasonal_vaccine
count,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,...,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0
mean,1.618486,1.262532,0.048714,0.727749,0.068933,0.825888,0.35864,0.337315,0.677264,0.202494,...,2.719162,2.118112,3.186131,2.984049,2.154005,2.328678,0.886499,0.529599,0.212454,0.465608
std,0.908741,0.616805,0.215273,0.445127,0.253345,0.379213,0.478828,0.472076,0.46641,0.401866,...,1.371662,1.31948,1.45732,0.974074,0.576428,0.573996,0.749901,0.925264,0.409052,0.498825
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2.0,1.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0
50%,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,2.0,2.0,3.0,3.0,2.0,2.0,1.0,0.0,0.0,0.0
75%,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,4.0,2.118112,5.0,4.0,3.0,3.0,1.0,1.0,0.0,1.0
max,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,5.0,5.0,5.0,4.0,3.0,3.0,3.0,3.0,1.0,1.0


In [12]:
data_type = clean_df.dtypes
print(data_type)

h1n1_concern                      float64
h1n1_knowledge                    float64
behavioral_antiviral_meds         float64
behavioral_avoidance              float64
behavioral_face_mask              float64
behavioral_wash_hands             float64
behavioral_large_gatherings       float64
behavioral_outside_home           float64
behavioral_touch_face             float64
doctor_recc_h1n1                  float64
doctor_recc_seasonal              float64
chronic_med_condition             float64
child_under_6_months              float64
health_worker                     float64
health_insurance                  float64
opinion_h1n1_vacc_effective       float64
opinion_h1n1_risk                 float64
opinion_h1n1_sick_from_vacc       float64
opinion_seas_vacc_effective       float64
opinion_seas_risk                 float64
opinion_seas_sick_from_vacc       float64
age_group                         float64
education                         float64
income_poverty                    

In [14]:
def identify_numerical_columns(df):
    """
    Identifies numerical columns in a DataFrame.

    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    list: A list of numerical column names.
    """
    # Select columns with numeric data types
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    return numeric_cols

# Example usage:
numerical_columns = identify_numerical_columns(clean_df)
print("Numerical Columns:", numerical_columns)


Numerical Columns: ['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group', 'education', 'income_poverty', 'employment_status', 'household_adults', 'household_children', 'h1n1_vaccine', 'seasonal_vaccine']


In [16]:
from sklearn.preprocessing import MinMaxScaler

# Define numerical columns
numeric_cols = [
    # Behavioral factors
    'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask',
    'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
    'behavioral_touch_face',

    # Doctor recommendations
    'doctor_recc_h1n1', 'doctor_recc_seasonal',

    # Health-related factors
    'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance',

    # Opinions on vaccines
    'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',

    # Concerns and knowledge
    'h1n1_concern', 'h1n1_knowledge',

    # Ordinal data
    'age_group', 'education', 'income_poverty', 'employment_status',

    # Household demographics
    'household_adults', 'household_children'
]

In [17]:
# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Scale only numeric columns
clean_df[numeric_cols] = scaler.fit_transform(clean_df[numeric_cols])


In [18]:
clean_df.describe()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,income_poverty,employment_status,household_adults,household_children,h1n1_vaccine,seasonal_vaccine
count,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,...,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0
mean,0.539495,0.631266,0.048714,0.727749,0.068933,0.825888,0.35864,0.337315,0.677264,0.202494,...,0.42979,0.279528,0.546533,0.66135,0.577002,0.664339,0.2955,0.176533,0.212454,0.465608
std,0.302914,0.308403,0.215273,0.445127,0.253345,0.379213,0.478828,0.472076,0.46641,0.401866,...,0.342915,0.32987,0.36433,0.324691,0.288214,0.286998,0.249967,0.308421,0.409052,0.498825
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.333333,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.25,0.0,0.25,0.333333,0.5,0.5,0.0,0.0,0.0,0.0
50%,0.666667,0.5,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.25,0.25,0.5,0.666667,0.5,0.5,0.333333,0.0,0.0,0.0
75%,0.666667,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.75,0.279528,1.0,1.0,1.0,1.0,0.333333,0.333333,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Training Data



In [26]:
# Separate features (X) and targets (y)
X = clean_df.drop(['h1n1_vaccine', 'seasonal_vaccine'], axis=1)
y_h1n1 = clean_df['h1n1_vaccine']  # H1N1 vaccine
y_seasonal = clean_df['seasonal_vaccine']  # Seasonal vaccine


In [27]:
from sklearn.model_selection import train_test_split

# Split for H1N1 vaccine prediction
X_train, X_test, y_h1n1_train, y_h1n1_test = train_test_split(X, y_h1n1, test_size=0.2, random_state=42)

# Split for Seasonal vaccine prediction
X_train, X_test, y_seasonal_train, y_seasonal_test = train_test_split(X, y_seasonal, test_size=0.2, random_state=42)

print("H1N1 Training Data:", X_train.shape, y_h1n1_train.shape)
print("Seasonal Training Data:", X_train.shape, y_seasonal_train.shape)

H1N1 Training Data: (21365, 46) (21365,)
Seasonal Training Data: (21365, 46) (21365,)


# **Build the Neural Network Model**


In [28]:
import tensorflow as tf
import sklearn as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [29]:
import keras_tuner as kt

In [30]:
# Create the Keras Sequential model
model = tf.keras.models.Sequential()

# Single Prediction Model for H1N1

In [42]:
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow Keras Tuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation', ['relu', 'tanh', 'softmax'])

    # Allow Keras Tuner to decide the number of neurons in the first layer
    nn_model.add(tf.keras.layers.Dense(
        units=hp.Int('first_units', min_value=1, max_value=30, step=5),
        activation=activation, input_dim=46))  # Set input_dim to 46

    # Allow Keras Tuner to decide the number of hidden layers and their neurons
    for i in range(hp.Int('num_layers', 1, 10)):
        nn_model.add(tf.keras.layers.Dense(
            units=hp.Int('units_' + str(i), min_value=1, max_value=30, step=5),
            activation=activation))

    # Output layer with 1 unit (for a single target variable) and sigmoid activation
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(
        loss="binary_crossentropy",  # Loss for binary classification
        optimizer='adam',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]  # Include AUC as a metric
    )

    return nn_model


In [43]:
# Set up Hyperband tuner
tuner = kt.Hyperband(
    create_model,
    objective="val_auc",
    max_epochs=20,
    factor=3,
    hyperband_iterations=1,
    directory="my_dir",
    project_name="h1n1_vaccine_prediction"
)


Reloading Tuner from my_dir/h1n1_vaccine_prediction/tuner0.json


In [46]:
tuner.search(
    X_train, y_h1n1_train,  # Correctly reshaped training target
    epochs=20,
    validation_data=(X_test, y_h1n1_test),  # Correctly reshaped test target
    batch_size=32
)


Trial 26 Complete [00h 00m 51s]
val_auc: 0.8265522718429565

Best val_auc So Far: 0.8326616287231445
Total elapsed time: 00h 15m 23s


In [47]:
# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [48]:
# Build the best model
best_H1N1_model = tuner.hypermodel.build(best_hps)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [49]:
# Get the best hyperparameters for H1N1 model
best_hps_H1N1 = tuner.get_best_hyperparameters(num_trials=1)[0]

# Print the hyperparameter values
print("Best Hyperparameters for H1N1 Model:")
for param, value in best_hps_H1N1.values.items():
    print(f"{param}: {value}")


Best Hyperparameters for H1N1 Model:
activation: tanh
first_units: 11
num_layers: 5
units_0: 26
units_1: 6
units_2: 26
units_3: 16
units_4: 16
units_5: 26
units_6: 26
units_7: 21
units_8: 21
tuner/epochs: 20
tuner/initial_epoch: 0
tuner/bracket: 0
tuner/round: 0


In [50]:
best_H1N1_model.summary()

In [51]:
# Train the best model
history = best_H1N1_model.fit(
    X_train,
    y_h1n1_train,
    epochs=20,
    validation_data=(X_test, y_h1n1_test),
    batch_size=32
)

Epoch 1/20
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8004 - auc: 0.6842 - loss: 0.4723 - val_accuracy: 0.8285 - val_auc: 0.8124 - val_loss: 0.4036
Epoch 2/20
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8267 - auc: 0.8148 - loss: 0.4013 - val_accuracy: 0.8326 - val_auc: 0.8194 - val_loss: 0.3931
Epoch 3/20
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8301 - auc: 0.8243 - loss: 0.3952 - val_accuracy: 0.8340 - val_auc: 0.8260 - val_loss: 0.3886
Epoch 4/20
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8299 - auc: 0.8213 - loss: 0.3940 - val_accuracy: 0.8355 - val_auc: 0.8229 - val_loss: 0.3902
Epoch 5/20
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8359 - auc: 0.8301 - loss: 0.3860 - val_accuracy: 0.8341 - val_auc: 0.8265 - val_loss: 0.3918
Epoch 6/20
[1m668/668[0m [3

In [52]:
loss, accuracy, auc = best_H1N1_model.evaluate(X_test, y_h1n1_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}, Test AUC: {auc}')

[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8461 - auc: 0.8431 - loss: 0.3700
Test Loss: 0.3839156925678253, Test Accuracy: 0.8377012610435486, Test AUC: 0.8300442099571228


In [81]:
best_H1N1_model.save("new_best_H1N1_model.h5")



# New Single Prediction Model for Seasonal

In [54]:
def create_seasonal_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow Keras Tuner to decide activation functions in hidden layers
    activation = hp.Choice('activation', ['relu', 'tanh', 'softmax'])

    # First layer
    nn_model.add(tf.keras.layers.Dense(
        units=hp.Int('first_units', min_value=1, max_value=30, step=5),
        activation=activation, input_dim=46))  # Set input_dim to 46 (same as seasonal data)

    # Hidden layers
    for i in range(hp.Int('num_layers', 1, 10)):
        nn_model.add(tf.keras.layers.Dense(
            units=hp.Int('units_' + str(i), min_value=1, max_value=30, step=5),
            activation=activation))

    # Output layer for predicting probabilities
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(
        optimizer='adam',
        loss="binary_crossentropy",
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
    )

    return nn_model


In [55]:
# Set up Hyperband tuner
tuner_seasonal = kt.Hyperband(
    create_seasonal_model,
    objective="val_auc",  # Optimize for validation AUC
    max_epochs=20,  # Each trial can run up to 20 epochs
    factor=3,  # Reduce resources by a factor of 3 in successive rounds
    hyperband_iterations=2,  # Number of rounds of Hyperband
    directory="my_dir",
    project_name="seasonal_vaccine_prediction"
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [56]:
# Run the tuner
tuner_seasonal.search(
    X_train, y_seasonal_train,
    epochs=20,
    validation_data=(X_test, y_seasonal_test),
    batch_size=32
)

Trial 60 Complete [00h 00m 51s]
val_auc: 0.8527311086654663

Best val_auc So Far: 0.8538097143173218
Total elapsed time: 00h 24m 22s


In [57]:
# Get the best hyperparameters
best_hps_seasonal = tuner_seasonal.get_best_hyperparameters(num_trials=1)[0]

In [58]:
# Build the best seasonal model
best_seasonal_model = tuner_seasonal.hypermodel.build(best_hps_seasonal)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [59]:
# Get the best hyperparameters
best_hps_seasonal = tuner_seasonal.get_best_hyperparameters(num_trials=1)[0]

# Print the hyperparameter values
print("Best Hyperparameters for Seasonal Model:")
for param, value in best_hps_seasonal.values.items():
    print(f"{param}: {value}")


Best Hyperparameters for Seasonal Model:
activation: tanh
first_units: 21
num_layers: 3
units_0: 11
units_1: 6
units_2: 16
units_3: 21
units_4: 1
units_5: 26
units_6: 21
units_7: 11
tuner/epochs: 20
tuner/initial_epoch: 7
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: 0013
units_8: 6
units_9: 6


In [60]:
best_seasonal_model.summary()

In [61]:
# Train the best seasonal model
history_seasonal = best_seasonal_model.fit(
    X_train,
    y_seasonal_train,
    epochs=20,
    validation_data=(X_test, y_seasonal_test),
    batch_size=32
)


Epoch 1/20
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.6719 - auc: 0.7275 - loss: 0.5988 - val_accuracy: 0.7705 - val_auc: 0.8389 - val_loss: 0.4957
Epoch 2/20
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7566 - auc: 0.8310 - loss: 0.5067 - val_accuracy: 0.7748 - val_auc: 0.8444 - val_loss: 0.4884
Epoch 3/20
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7674 - auc: 0.8403 - loss: 0.4945 - val_accuracy: 0.7772 - val_auc: 0.8487 - val_loss: 0.4852
Epoch 4/20
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7640 - auc: 0.8365 - loss: 0.5000 - val_accuracy: 0.7630 - val_auc: 0.8492 - val_loss: 0.5020
Epoch 5/20
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7650 - auc: 0.8377 - loss: 0.4977 - val_accuracy: 0.7787 - val_auc: 0.8504 - val_loss: 0.4875
Epoch 6/20
[1m668/668[0m [3

In [62]:
# Evaluate the model
seasonal_loss, seasonal_accuracy, seasonal_auc = best_seasonal_model.evaluate(X_test, y_seasonal_test)
print(f'Test Loss: {seasonal_loss}, Test Accuracy: {seasonal_accuracy}, Test AUC: {seasonal_auc}')


[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7762 - auc: 0.8552 - loss: 0.4785
Test Loss: 0.48429346084594727, Test Accuracy: 0.7748034596443176, Test AUC: 0.8500616550445557


In [80]:
best_seasonal_model.save("new_best_seasonal_model.h5")



# Make Predictions


In [92]:
final_test_df = pd.read_csv('new_test_df.csv',
                      index_col='respondent_id'
)

In [93]:
print(final_test_df.shape)

(26708, 46)


In [94]:
final_test_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,hhs_geo_region_dqpwygqj,hhs_geo_region_fpwskwrf,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,hhs_geo_region_oxchjgsf,hhs_geo_region_qufhixun,"census_msa_MSA, Principle City",census_msa_Non-MSA
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,0.666667,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,False,False,False,False,False,True,False,False,False,False
26708,0.333333,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,True
26709,0.666667,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,False,False,False,True,False,False,False,False,False,True
26710,0.333333,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,False,False,False,True,False,False,False,False,False,False
26711,1.000000,0.5,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,False,False,False,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,0.333333,0.5,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,True,False,False,False,False,False,False,False,True,False
53411,1.000000,0.5,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,False,False,False,False,False,False,False,True,False,True
53412,0.000000,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
53413,1.000000,0.5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [83]:
from tensorflow.keras.models import load_model

# Load the model from the .h5 file
loaded_h1n1_model = load_model("new_best_H1N1_model.h5")

# Display the model summary to verify
loaded_h1n1_model.summary()




In [84]:
from tensorflow.keras.models import load_model

# Load the model from the .h5 file
loaded_seasonal_model = load_model("new_best_seasonal_model.h5")

# Display the model summary to verify
loaded_seasonal_model.summary()




In [95]:
# Predict probabilities for the H1N1 vaccine
h1n1_probabilities = loaded_h1n1_model.predict(final_test_df)

# Print the first few predicted probabilities
print("Predicted Probabilities (H1N1):", h1n1_probabilities[:10])


[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Predicted Probabilities (H1N1): [[0.08356459]
 [0.03880857]
 [0.5616102 ]
 [0.6002395 ]
 [0.18743873]
 [0.5162081 ]
 [0.3634452 ]
 [0.13014518]
 [0.0301398 ]
 [0.22724323]]


In [96]:
# Create a DataFrame for probabilities
h1n1_predictions = pd.DataFrame({
    'Probability': h1n1_probabilities.flatten()  # Flatten to make it a 1D array
})

# Display the first few rows of the DataFrame
print(h1n1_predictions.head())


   Probability
0     0.083565
1     0.038809
2     0.561610
3     0.600240
4     0.187439


In [97]:
# Predict probabilities for the Seasonal vaccine
seasonal_probabilities = loaded_seasonal_model.predict(final_test_df)

# Print the first few predicted probabilities
print("Predicted Probabilities (Seasonal):", seasonal_probabilities[:10])


[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Predicted Probabilities (Seasonal): [[0.2222093 ]
 [0.042957  ]
 [0.8019461 ]
 [0.92790216]
 [0.3837767 ]
 [0.9073807 ]
 [0.4699863 ]
 [0.21128768]
 [0.12509023]
 [0.9239656 ]]


In [98]:
# Create a DataFrame for seasonal predictions
seasonal_predictions_df = pd.DataFrame({
    'Probability': seasonal_probabilities.flatten(),  # Flatten to make it 1D
})

# Display the first few rows of the DataFrame
print(seasonal_predictions_df.head())


   Probability
0     0.222209
1     0.042957
2     0.801946
3     0.927902
4     0.383777


In [99]:
# Assuming `final_test_df` has a column or index that serves as respondent_id
# If not, generate a range index for respondent IDs
respondent_ids = final_test_df.index  # Use index from `final_test_df`

# Combine predictions into a single DataFrame
combined_predictions_df = pd.DataFrame({
    'respondent_id': respondent_ids,  # Use respondent_id as identifier
    'h1n1_vaccine': h1n1_predictions['Probability'],  # H1N1 probabilities
    'seasonal_vaccine': seasonal_predictions_df['Probability']  # Seasonal probabilities
})

# Set respondent_id as the index
combined_predictions_df.set_index('respondent_id', inplace=True)

# Display the combined DataFrame
print(combined_predictions_df)


               h1n1_vaccine  seasonal_vaccine
respondent_id                                
26707              0.083565          0.222209
26708              0.038809          0.042957
26709              0.561610          0.801946
26710              0.600240          0.927902
26711              0.187439          0.383777
...                     ...               ...
53410              0.432690          0.393226
53411              0.079297          0.144138
53412              0.109496          0.131219
53413              0.059095          0.385360
53414              0.560786          0.621552

[26708 rows x 2 columns]


In [100]:
# Save the combined DataFrame to a CSV file
combined_predictions_df.to_csv("new_combined_predictions.csv")
print("Combined predictions saved to new_combined_predictions.csv")


Combined predictions saved to new_combined_predictions.csv
