In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [3]:
# Read the cleaned H1N1 flu vaccine CSV and discard the respondent_id column
# in the same expression; errors="ignore" keeps this a no-op when the column
# is absent from the file.
file_path = "H1N1_Flu_Vaccines_Cleaned.csv"
df = pd.read_csv(file_path).drop(columns=["respondent_id"], errors="ignore")

In [4]:
# Integer-encode every object-dtype column on a copy of df, leaving df itself
# untouched. One fitted LabelEncoder is kept per column (keyed by column name)
# so the integer codes can be mapped back to the original labels later.
categorical_cols = df.select_dtypes(include=['object']).columns
df_encoded = df.copy()
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df_encoded[col] = label_encoders[col].fit_transform(df[col])


In [5]:
# Separate the two vaccine outcome columns from the predictor columns.
# X holds every encoded column except the outcomes; each outcome becomes its
# own target Series.
target_cols = ["h1n1_vaccine", "seasonal_vaccine"]
X = df_encoded.drop(columns=target_cols)
y_h1n1, y_seasonal = (df_encoded[name] for name in target_cols)

In [6]:
# Split the features and BOTH targets in a single call (80% train / 20% test).
# train_test_split applies one shared shuffle to every array it is given, so
# the rows of X_train / X_test are aligned with both target vectors by
# construction. The previous version called it twice and overwrote
# X_train / X_test, which stayed aligned only because both calls used the same
# random_state on the same X.
(
    X_train, X_test,
    y_train_h1n1, y_test_h1n1,
    y_train_seasonal, y_test_seasonal,
) = train_test_split(X, y_h1n1, y_seasonal, test_size=0.2, random_state=42)


In [7]:
# Candidate hyperparameter values that the randomized search draws from.
# Each keyword is a RandomForestClassifier constructor argument; max_depth=None
# is included as one of the depth options.
param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [8]:
def _tune_random_forest(features, target, search_space, seed=42):
    """Run a randomized hyperparameter search for a random forest classifier.

    The search configuration (10 sampled candidates, 3-fold cross-validation,
    accuracy scoring, all CPU cores) lives here once so that every target is
    tuned with exactly the same settings.

    Parameters
    ----------
    features : training feature matrix passed to ``fit``.
    target : training labels passed to ``fit``.
    search_space : dict mapping hyperparameter names to candidate values.
    seed : random_state used for both the forest and the search.

    Returns
    -------
    (base_forest, search) : the unfitted base estimator that was handed to the
        search, and the fitted RandomizedSearchCV object.
    """
    base_forest = RandomForestClassifier(random_state=seed)
    search = RandomizedSearchCV(
        base_forest,
        search_space,
        n_iter=10,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        random_state=seed,
    )
    search.fit(features, target)
    return base_forest, search


# Randomized Search for H1N1 model
rf_h1n1, random_search_h1n1 = _tune_random_forest(X_train, y_train_h1n1, param_dist)
best_h1n1 = random_search_h1n1.best_estimator_

# Randomized Search for Seasonal Flu model
rf_seasonal, random_search_seasonal = _tune_random_forest(X_train, y_train_seasonal, param_dist)
best_seasonal = random_search_seasonal.best_estimator_

In [9]:
# Score the held-out feature rows with each tuned forest.
# Both models are given the same X_test.
y_pred_h1n1 = best_h1n1.predict(X=X_test)
y_pred_seasonal = best_seasonal.predict(X=X_test)

In [10]:
# Accuracy on the held-out rows, computed separately for each target
acc_h1n1 = accuracy_score(y_true=y_test_h1n1, y_pred=y_pred_h1n1)
acc_seasonal = accuracy_score(y_true=y_test_seasonal, y_pred=y_pred_seasonal)

In [11]:
# Per-class precision / recall / f1 text tables for each target.
# classification_report returns a formatted string here, which is printed later.
report_h1n1 = classification_report(y_true=y_test_h1n1, y_pred=y_pred_h1n1)
report_seasonal = classification_report(y_true=y_test_seasonal, y_pred=y_pred_seasonal)


In [12]:
# Build the banner and one summary per target, then print them in order.
# Accuracies are rendered as percentages with two decimals, each followed by
# its classification report.
banner = f"\nRandom Forest Model Performance (Tuned with Randomized Search):\n"
h1n1_summary = f"H1N1 Vaccine Prediction:\n- Accuracy: {acc_h1n1:.2%}\n{report_h1n1}"
seasonal_summary = f"Seasonal Flu Vaccine Prediction:\n- Accuracy: {acc_seasonal:.2%}\n{report_seasonal}"
for section in (banner, h1n1_summary, seasonal_summary):
    print(section)


Random Forest Model Performance (Tuned with Randomized Search):

H1N1 Vaccine Prediction:
- Accuracy: 84.07%
              precision    recall  f1-score   support

           0       0.86      0.96      0.90      4212
           1       0.72      0.41      0.52      1130

    accuracy                           0.84      5342
   macro avg       0.79      0.68      0.71      5342
weighted avg       0.83      0.84      0.82      5342

Seasonal Flu Vaccine Prediction:
- Accuracy: 78.70%
              precision    recall  f1-score   support

           0       0.80      0.82      0.81      2891
           1       0.78      0.75      0.76      2451

    accuracy                           0.79      5342
   macro avg       0.79      0.78      0.78      5342
weighted avg       0.79      0.79      0.79      5342

