In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
import numpy as np


In [2]:
# Read the cleaned survey CSV into a DataFrame
file_path = "/content/H1N1_Flu_Vaccines_Cleaned(Shiva).csv"

# respondent_id only identifies a row, so it is discarded when present;
# errors='ignore' turns the drop into a no-op if the column is missing.
df = pd.read_csv(file_path).drop(columns=["respondent_id"], errors='ignore')

# NOTE(review): the rendered frame below has an "Unnamed: 0" header on its first
# column. If that is a real CSV column (e.g. a saved index) rather than the
# display index, it is carried into the feature matrix — confirm.
df

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,h1n1_vaccine,seasonal_vaccine
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Female,Below Poverty,Not Married,Own,Not in Labor Force,Non-MSA,0.0,0.0,0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Male,Below Poverty,Not Married,Rent,Employed,"MSA, Not Principle City",0.0,0.0,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,"MSA, Not Principle City",2.0,0.0,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Female,Below Poverty,Not Married,Rent,Not in Labor Force,"MSA, Principle City",0.0,0.0,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,Female,"<= $75,000, Above Poverty",Married,Own,Employed,"MSA, Not Principle City",1.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,Non-MSA,0.0,0.0,0,0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,Male,"<= $75,000, Above Poverty",Not Married,Rent,Employed,"MSA, Principle City",1.0,0.0,0,0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,Female,"<= $75,000, Above Poverty",Not Married,Own,Employed,"MSA, Not Principle City",0.0,0.0,0,1
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,Female,"<= $75,000, Above Poverty",Married,Rent,Employed,Non-MSA,1.0,0.0,0,0


In [3]:
# Integer-encode every object-dtype column on a copy of the frame, so the
# original `df` keeps its string values.
df_encoded = df.copy()
categorical_cols = df.select_dtypes(include=['object']).columns

# One fitted LabelEncoder per column, kept by column name for later reference
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df_encoded[col] = label_encoders[col].fit_transform(df[col])


In [4]:
# Separate the two vaccine targets from the feature matrix
y_h1n1 = df_encoded["h1n1_vaccine"]
y_seasonal = df_encoded["seasonal_vaccine"]
X = df_encoded.drop(columns=["h1n1_vaccine", "seasonal_vaccine"])

# 80/20 hold-out split. A single call partitions the features and both targets
# with the same seeded shuffle, so all three stay row-aligned.
(
    X_train, X_test,
    y_train_h1n1, y_test_h1n1,
    y_train_seasonal, y_test_seasonal,
) = train_test_split(X, y_h1n1, y_seasonal, test_size=0.2, random_state=42)


In [5]:
# Baseline (untuned) random forests: one per target, identical settings.
# Both names are re-bound to fresh estimators in the randomized-search cell below.
baseline_params = {"n_estimators": 100, "random_state": 42}

rf_h1n1 = RandomForestClassifier(**baseline_params)
rf_seasonal = RandomForestClassifier(**baseline_params)

# Fit each forest on the shared training features against its own target
rf_h1n1.fit(X_train, y_train_h1n1)
rf_seasonal.fit(X_train, y_train_seasonal)

In [6]:
# Candidate values the randomized search samples from, keyed by
# RandomForestClassifier parameter name (max_depth=None means no depth cap).
param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [7]:
# Settings shared by both searches: 10 sampled candidates, 3-fold CV,
# accuracy as the selection metric, seeded for repeatable sampling.
search_options = dict(n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)

# --- H1N1 target ---
rf_h1n1 = RandomForestClassifier(random_state=42)
random_search_h1n1 = RandomizedSearchCV(rf_h1n1, param_dist, **search_options)
random_search_h1n1.fit(X_train, y_train_h1n1)
best_h1n1 = random_search_h1n1.best_estimator_

# --- Seasonal flu target ---
rf_seasonal = RandomForestClassifier(random_state=42)
random_search_seasonal = RandomizedSearchCV(rf_seasonal, param_dist, **search_options)
random_search_seasonal.fit(X_train, y_train_seasonal)
best_seasonal = random_search_seasonal.best_estimator_



In [8]:
# H1N1: predict on the held-out features with the tuned model, then score
y_pred_h1n1 = best_h1n1.predict(X_test)
acc_h1n1 = accuracy_score(y_true=y_test_h1n1, y_pred=y_pred_h1n1)

# Seasonal flu: same steps with its own tuned model and target
y_pred_seasonal = best_seasonal.predict(X_test)
acc_seasonal = accuracy_score(y_true=y_test_seasonal, y_pred=y_pred_seasonal)


In [9]:
# Text reports (per-class precision / recall / f1 / support) for each target
report_h1n1 = classification_report(y_true=y_test_h1n1, y_pred=y_pred_h1n1)
report_seasonal = classification_report(y_true=y_test_seasonal, y_pred=y_pred_seasonal)

In [10]:
# Build the three output sections first, then emit them one per line
results_header = "\nRandom Forest Model Performance (Tuned with Randomized Search):\n"
h1n1_summary = f"H1N1 Vaccine Prediction:\n- Accuracy: {acc_h1n1:.2%}\n{report_h1n1}"
seasonal_summary = f"Seasonal Flu Vaccine Prediction:\n- Accuracy: {acc_seasonal:.2%}\n{report_seasonal}"

print(results_header, h1n1_summary, seasonal_summary, sep="\n")



Random Forest Model Performance (Tuned with Randomized Search):

H1N1 Vaccine Prediction:
- Accuracy: 84.07%
              precision    recall  f1-score   support

           0       0.86      0.96      0.90      4212
           1       0.72      0.41      0.52      1130

    accuracy                           0.84      5342
   macro avg       0.79      0.68      0.71      5342
weighted avg       0.83      0.84      0.82      5342

Seasonal Flu Vaccine Prediction:
- Accuracy: 78.70%
              precision    recall  f1-score   support

           0       0.80      0.82      0.81      2891
           1       0.78      0.75      0.76      2451

    accuracy                           0.79      5342
   macro avg       0.79      0.78      0.78      5342
weighted avg       0.79      0.79      0.79      5342



In [16]:
import pickle

In [17]:
# Output file names (relative to the working directory) for the pickled models
h1n1_model, seasonal_model = "rf_h1n1_model.pkl", "rf_seasonal_model.pkl"

In [18]:
# Persist the tuned, fitted estimators chosen by the randomized searches.
#
# `rf_h1n1` / `rf_seasonal` must NOT be pickled here: the randomized-search cell
# rebinds them to fresh RandomForestClassifier instances that are only passed to
# RandomizedSearchCV as templates. The search fits clones, so those objects stay
# unfitted and a model loaded from such a pickle cannot predict. The fitted
# models are `best_h1n1` / `best_seasonal` (the searches' `best_estimator_`).
#
# Note: the `label_encoders` built during preprocessing are not saved by this
# cell; a consumer of these files needs the same categorical encoding.
with open(h1n1_model, 'wb') as f:
    pickle.dump(best_h1n1, f)

with open(seasonal_model, 'wb') as f:
    pickle.dump(best_seasonal, f)

print("Models saved successfully!")

Models saved successfully!


In [14]:
# Bare attribute access: the cell only displays the bound `predict` method of
# `rf_h1n1`; the method is not called, so no prediction is produced here.
rf_h1n1.predict

In [15]:
# import pandas as pd
# import numpy as np
# import pickle
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# # Load your dataset (Update the path accordingly)
# df = pd.read_csv(r"/content/H1N1_Flu_Vaccines_Cleaned(Shiva).csv")

# # Print available columns to debug
# print("Dataset Columns:", df.columns.tolist())

# # Define categorical features (only those present in the dataset)
# categorical_features = [
#     'age_group', 'education', 'race', 'sex', 'income_poverty',
#     'marital_status', 'rent_or_own', 'employment_status', 'census_msa'
# ]

# # Check which categorical features are in the dataset
# available_categorical_features = [col for col in categorical_features if col in df.columns]

# # Apply one-hot encoding only to available categorical features
# df = pd.get_dummies(df, columns=available_categorical_features, drop_first=True)

# # Define features and targets
# features = [
#     'h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
#     'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
#     'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face',
#     'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition',
#     'child_under_6_months', 'health_worker', 'health_insurance',
#     'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
#     'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
#     'household_adults', 'household_children'
# ] + list(df.columns.difference(['h1n1_vaccine', 'seasonal_vaccine']))  # Add all encoded categorical features

# # Define target variables
# target_h1n1 = 'h1n1_vaccine'
# target_seasonal = 'seasonal_vaccine'

# # Ensure all selected features exist in the dataset
# features = [col for col in features if col in df.columns]

# # Split data into train and test sets
# X = df[features]
# y_h1n1 = df[target_h1n1]
# y_seasonal = df[target_seasonal]

# X_train, X_test, y_train_h1n1, y_test_h1n1 = train_test_split(X, y_h1n1, test_size=0.2, random_state=42)
# X_train, X_test, y_train_seasonal, y_test_seasonal = train_test_split(X, y_seasonal, test_size=0.2, random_state=42)

# # Train RandomForest models
# rf_h1n1 = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_h1n1.fit(X_train, y_train_h1n1)

# rf_seasonal = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_seasonal.fit(X_train, y_train_seasonal)

# # Evaluate models
# y_pred_h1n1 = rf_h1n1.predict(X_test)
# y_pred_seasonal = rf_seasonal.predict(X_test)

# print(f"H1N1 Vaccine Model Accuracy: {accuracy_score(y_test_h1n1, y_pred_h1n1) * 100:.2f}%")
# print(f"Seasonal Flu Vaccine Model Accuracy: {accuracy_score(y_test_seasonal, y_pred_seasonal) * 100:.2f}%")

# # Save models using pickle
# model_paths = {
#     "RF_H1N1": r"C:\Users\DELL\.spyder-py3\rf_h1n1_model.pkl",
#     "RF_Seasonal": r"C:\Users\DELL\.spyder-py3\rf_seasonal_model.pkl"
# }

# with open(model_paths["RF_H1N1"], 'wb') as f:
#     pickle.dump(rf_h1n1, f)

# with open(model_paths["RF_Seasonal"], 'wb') as f:
#     pickle.dump(rf_seasonal, f)

# print("✅ Models trained and saved successfully!")
