In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
df = pd.read_csv("H1N1_Flu_Vaccines_Cleaned.csv")

In [None]:
df.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Female,Below Poverty,Not Married,Own,Not in Labor Force,Non-MSA,0.0,0.0,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Male,Below Poverty,Not Married,Rent,Employed,"MSA, Not Principle City",0.0,0.0,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,"MSA, Not Principle City",2.0,0.0,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Female,Below Poverty,Not Married,Rent,Not in Labor Force,"MSA, Principle City",0.0,0.0,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Female,"<= $75,000, Above Poverty",Married,Own,Employed,"MSA, Not Principle City",1.0,0.0,0,0


In [None]:
# after transform all the objects are converted to integers:

#  21  age_group                    26707 non-null  int64
#  22  education                    26707 non-null  int64
#  23  race                         26707 non-null  int64
#  24  sex                          26707 non-null  int64
#  25  income_poverty               26707 non-null  int64
#  26  marital_status               26707 non-null  int64
#  27  rent_or_own                  26707 non-null  int64
#  28  employment_status            26707 non-null  int64
#  29  census_msa                   26707 non-null  int64
#  30  household_adults             26707 non-null  float64
#  31  household_children           26707 non-null  float64
#  32  h1n1_vaccine                 26707 non-null  int64
#  33  seasonal_vaccine             26707 non-null  int64
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26707 non-null  float64
 2   h1n1_knowledge               26707 non-null  float64
 3   behavioral_antiviral_meds    26707 non-null  float64
 4   behavioral_avoidance         26707 non-null  float64
 5   behavioral_face_mask         26707 non-null  float64
 6   behavioral_wash_hands        26707 non-null  float64
 7   behavioral_large_gatherings  26707 non-null  float64
 8   behavioral_outside_home      26707 non-null  float64
 9   behavioral_touch_face        26707 non-null  float64
 10  doctor_recc_h1n1             26707 non-null  float64
 11  doctor_recc_seasonal         26707 non-null  float64
 12  chronic_med_condition        26707 non-null  float64
 13  child_under_6_mo

In [None]:
df["age_group"].unique()

array(['55 - 64 Years', '35 - 44 Years', '18 - 34 Years', '65+ Years',
       '45 - 54 Years'], dtype=object)

In [None]:
# Drop respondent_id as it's not a predictive feature
df.drop(columns=["respondent_id"], inplace=True)

# Identify categorical columns
categorical_cols = df.select_dtypes(include=["object"]).columns

In [None]:
# Encode categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Split features and targets
X = df.drop(columns=["h1n1_vaccine", "seasonal_vaccine"])
y_h1n1 = df["h1n1_vaccine"]
y_seasonal = df["seasonal_vaccine"]

In [None]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Train-test split for both targets
X_train, X_test, y_train_h1n1, y_test_h1n1 = train_test_split(X_scaled, y_h1n1, test_size=0.2, random_state=42)
X_train, X_test, y_train_seasonal, y_test_seasonal = train_test_split(X_scaled, y_seasonal, test_size=0.2, random_state=42)

In [None]:
X_train.shape

(21365, 32)

In [None]:
# Train SVM models
svm_model = SVC(kernel="linear", random_state=42)
h1n1 = svm_model.fit(X_train, y_train_h1n1)

seasonal = svm_model.fit(X_train, y_train_seasonal)

In [None]:
seasonal.score(X_train, y_train_seasonal)

0.771401825415399

In [None]:
seasonal.score(X_test, y_test_seasonal)

0.7798577311868214

In [None]:
h1n1.score(X_train, y_train_h1n1)

0.6494266323426164

In [None]:
# Predictions
y_pred_h1n1 = h1n1.predict(X_test)
y_pred_seasonal = seasonal.predict(X_test)

In [None]:
# Calculate accuracies
accuracy_h1n1 = accuracy_score(y_test_h1n1, y_pred_h1n1)
accuracy_seasonal = accuracy_score(y_test_seasonal, y_pred_seasonal)

In [None]:
accuracy_h1n1, accuracy_seasonal

(0.6396480718831898, 0.7798577311868214)

In [None]:
#confusion matrix
conf_h1n1 = confusion_matrix(y_test_h1n1,y_pred_h1n1)
conf_h1n1

array([[2663, 1549],
       [ 376,  754]], dtype=int64)

In [None]:
conf_seasonal = confusion_matrix(y_test_seasonal,y_pred_seasonal)
conf_seasonal

array([[2377,  514],
       [ 662, 1789]], dtype=int64)

In [None]:
classification_report(y_test_h1n1,y_pred_h1n1)

'              precision    recall  f1-score   support\n\n           0       0.88      0.63      0.73      4212\n           1       0.33      0.67      0.44      1130\n\n    accuracy                           0.64      5342\n   macro avg       0.60      0.65      0.59      5342\nweighted avg       0.76      0.64      0.67      5342\n'

In [None]:
classification_report(y_test_seasonal,y_pred_seasonal)

'              precision    recall  f1-score   support\n\n           0       0.78      0.82      0.80      2891\n           1       0.78      0.73      0.75      2451\n\n    accuracy                           0.78      5342\n   macro avg       0.78      0.78      0.78      5342\nweighted avg       0.78      0.78      0.78      5342\n'

In [None]:
# Generate the classification report
report_h1n1 = classification_report(y_test_h1n1, y_pred_h1n1)
report_seasonal = classification_report(y_test_seasonal, y_pred_seasonal)

In [None]:
# Print reports properly formatted
print("H1N1 Vaccine Prediction Report:\n")
print(report_h1n1)
print("\nSeasonal Flu Vaccine Prediction Report:\n")
print(report_seasonal)

H1N1 Vaccine Prediction Report:

              precision    recall  f1-score   support

           0       0.88      0.63      0.73      4212
           1       0.33      0.67      0.44      1130

    accuracy                           0.64      5342
   macro avg       0.60      0.65      0.59      5342
weighted avg       0.76      0.64      0.67      5342


Seasonal Flu Vaccine Prediction Report:

              precision    recall  f1-score   support

           0       0.78      0.82      0.80      2891
           1       0.78      0.73      0.75      2451

    accuracy                           0.78      5342
   macro avg       0.78      0.78      0.78      5342
weighted avg       0.78      0.78      0.78      5342



In [None]:
# Define parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Kernel coefficient
    'kernel': ['rbf']  # RBF kernel
}

In [None]:
# Grid search for H1N1 vaccine prediction
grid_search_h1n1 = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_h1n1.fit(X_train, y_train_h1n1)


In [None]:
# Train optimized model
best_svm_h1n1 = grid_search_h1n1.best_estimator_
y_pred_h1n1 = best_svm_h1n1.predict(X_test)
accuracy_h1n1 = accuracy_score(y_test_h1n1, y_pred_h1n1)

In [None]:
# Grid search for Seasonal vaccine prediction
grid_search_seasonal = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_seasonal.fit(X_train, y_train_seasonal)

In [None]:
# Train optimized model
best_svm_seasonal = grid_search_seasonal.best_estimator_
y_pred_seasonal = best_svm_seasonal.predict(X_test)
accuracy_seasonal = accuracy_score(y_test_seasonal, y_pred_seasonal)

In [None]:
# Print results
print("Optimized H1N1 Accuracy:", accuracy_h1n1)
print("Optimized Seasonal Vaccine Accuracy:", accuracy_seasonal)
print("Best Parameters H1N1:", grid_search_h1n1.best_params_)
print("Best Parameters Seasonal:", grid_search_seasonal.best_params_)

Optimized H1N1 Accuracy: 0.837701235492325
Optimized Seasonal Vaccine Accuracy: 0.7877199550730064
Best Parameters H1N1: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best Parameters Seasonal: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}


In [None]:
# 1. Set Up the Input Function
def get_full_user_input():
    user_data = {
        "h1n1_concern": float(input("Rate your concern about H1N1 (0-3): ")),
        "h1n1_knowledge": float(input("Rate your knowledge about H1N1 (0-3): ")),
        "behavioral_antiviral_meds": int(input("Do you take antiviral meds? (0 = No, 1 = Yes): ")),
        "behavioral_avoidance": int(input("Do you avoid public places? (0 = No, 1 = Yes): ")),
        "behavioral_face_mask": int(input("Do you wear a face mask? (0 = No, 1 = Yes): ")),
        "behavioral_wash_hands": int(input("Do you frequently wash hands? (0 = No, 1 = Yes): ")),
        "behavioral_large_gatherings": int(input("Do you attend large gatherings? (0 = No, 1 = Yes): ")),
        "behavioral_outside_home": int(input("Do you go outside often? (0 = No, 1 = Yes): ")),
        "behavioral_touch_face": int(input("Do you touch your face often? (0 = No, 1 = Yes): ")),
        "doctor_recc_h1n1": int(input("Did your doctor recommend the H1N1 vaccine? (0 = No, 1 = Yes): ")),
        "doctor_recc_seasonal": int(input("Did your doctor recommend the seasonal flu vaccine? (0 = No, 1 = Yes): ")),
        "chronic_med_condition": int(input("Do you have a chronic medical condition? (0 = No, 1 = Yes): ")),
        "child_under_6_months": int(input("Do you have a child under 6 months old? (0 = No, 1 = Yes): ")),
        "health_worker": int(input("Are you a healthcare worker? (0 = No, 1 = Yes): ")),
        "health_insurance": int(input("Do you have health insurance? (0 = No, 1 = Yes): ")),
        "opinion_h1n1_vacc_effective": float(input("How effective do you think the H1N1 vaccine is? (1-5): ")),
        "opinion_h1n1_risk": float(input("What do you think is the risk of getting H1N1? (1-5): ")),
        "opinion_h1n1_sick_from_vacc": float(input("Do you think the H1N1 vaccine can make you sick? (1-5): ")),
        "opinion_seas_vacc_effective": float(input("How effective do you think the seasonal flu vaccine is? (1-5): ")),
        "opinion_seas_risk": float(input("What do you think is the risk of getting the seasonal flu? (1-5): ")),
        "opinion_seas_sick_from_vacc": float(input("Do you think the seasonal flu vaccine can make you sick? (1-5): ")),
        "age_group": input("What is your age group? (18 - 34 Years, 35 - 44 Years, etc.): "),
        "education": input("What is your highest education level? (e.g., Less than High School, College Graduate): "),
        "race": input("What is your race/ethnicity? (e.g., White, Black, Hispanic): "),
        "sex": input("What is your gender? (Female/Male): "),
        "income_poverty": input("What is your income level? (e.g., <= $75,000, Above Poverty): "),
        "marital_status": input("What is your marital status? (e.g., Married, Not Married): "),
        "rent_or_own": input("Do you rent or own your home? (e.g., Rent, Own): "),
        "employment_status": input("What is your employment status? (e.g., Employed, Unemployed): "),
        "hhs_geo_region": input("Enter your geographic region code (e.g., region 1, region 2): "),
        "census_msa": input("Are you in a metro or non-metro area? (e.g., MSA, Non-MSA): "),
        "household_adults": int(input("How many adults are in your household?: ")),
        "household_children": int(input("How many children are in your household?: ")),
        "employment_industry": input("Enter your employment industry (e.g., Healthcare, Education, etc.): "),
        "employment_occupation": input("Enter your employment occupation (e.g., Sales, Engineering, etc.): ")
    }
    return user_data

In [None]:
user_data

{'h1n1_concern': 1.0,
 'h1n1_knowledge': 2.0,
 'behavioral_antiviral_meds': 1,
 'behavioral_avoidance': 1,
 'behavioral_face_mask': 1,
 'behavioral_wash_hands': 0,
 'behavioral_large_gatherings': 1,
 'behavioral_outside_home': 1,
 'behavioral_touch_face': 0,
 'doctor_recc_h1n1': 1,
 'doctor_recc_seasonal': 1,
 'chronic_med_condition': 0,
 'child_under_6_months': 0,
 'health_worker': 0,
 'health_insurance': 1,
 'opinion_h1n1_vacc_effective': 3.0,
 'opinion_h1n1_risk': 2.0,
 'opinion_h1n1_sick_from_vacc': 4.0,
 'opinion_seas_vacc_effective': 3.0,
 'opinion_seas_risk': 2.0,
 'opinion_seas_sick_from_vacc': 4.0,
 'age_group': '18-34',
 'education': 'less than high school',
 'race': 'black',
 'sex': 'male',
 'income_poverty': '<=75000',
 'marital_status': 'not married',
 'rent_or_own': 'onw',
 'employment_status': 'employed',
 'hhs_geo_region': 'region 2',
 'census_msa': 'msa',
 'household_adults': 2,
 'household_children': 4,
 'employment_industry': 'education',
 'employment_occupation': 's

In [None]:
# 2. Preprocess the User Input
def preprocess_user_input(user_data, top_features):
    preprocessor = StandardScaler() # Replace with your actual preprocessor
    # Convert to DataFrame
    user_df = pd.DataFrame([user_data])

    # Apply preprocessing
    user_processed = preprocessor.transform(user_df)

    # Select top features
    user_processed = user_processed[:, top_features]

    return user_processed

In [None]:
# 3. Make the Prediction
def predict_vaccine(user_data):
    # Preprocess user input
    user_processed = preprocess_user_input(user_data, top_features)

    # Make prediction with the ensemble model (MLP + XGBoost)
    xgb_pred = xgb_final.predict(user_processed)
    mlp_pred = (best_mlp.predict(user_processed) > 0.5).astype(int)

    # Ensemble prediction
    final_pred = (0.6 * xgb_pred + 0.4 * mlp_pred).round()

    if final_pred == 1:
        print("Prediction: You are likely to get the H1N1 vaccine!")
    else:
        print("Prediction: You are less likely to get the H1N1 vaccine!")


In [None]:
# 4. Putting It All Together

# Get user data
user_data = get_full_user_input()

# Predict based on user data
predict_vaccine(user_data)


Rate your concern about H1N1 (0-3): 1
Rate your knowledge about H1N1 (0-3): 1
Do you take antiviral meds? (0 = No, 1 = Yes): 1
Do you avoid public places? (0 = No, 1 = Yes): 1
Do you wear a face mask? (0 = No, 1 = Yes): 1
Do you frequently wash hands? (0 = No, 1 = Yes): 1
Do you attend large gatherings? (0 = No, 1 = Yes): 1
Do you go outside often? (0 = No, 1 = Yes): 1
Do you touch your face often? (0 = No, 1 = Yes): 1
Did your doctor recommend the H1N1 vaccine? (0 = No, 1 = Yes): 1
Did your doctor recommend the seasonal flu vaccine? (0 = No, 1 = Yes): 1
Do you have a chronic medical condition? (0 = No, 1 = Yes): 11
Do you have a child under 6 months old? (0 = No, 1 = Yes): 1
Are you a healthcare worker? (0 = No, 1 = Yes): 1
Do you have health insurance? (0 = No, 1 = Yes): 1
How effective do you think the H1N1 vaccine is? (1-5): 1
What do you think is the risk of getting H1N1? (1-5): 1
Do you think the H1N1 vaccine can make you sick? (1-5): 1
How effective do you think the seasonal flu

NameError: name 'top_features' is not defined

In [None]:
import pickle
with open("SVM_h1n1.pkl","wb") as file:
    pickle.dump(svm_model,file)