In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [51]:
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
%matplotlib inline

In [5]:
from sklearn.datasets import load_breast_cancer

In [13]:
df = pd.read_csv("H1N1_Flu_Vaccines_Cleaned.csv")

In [60]:
df.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,h1n1_vaccine,seasonal_vaccine
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0,2,1,0,1,2,0.0,0.0,0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1,2,1,1,0,0,0.0,0.0,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,1,0,0,0,2.0,0.0,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,2,1,1,1,1,0.0,0.0,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1.0,0.0,0,0


In [62]:
# after transform all the objects are converted to integers:

#  21  age_group                    26707 non-null  int64  
#  22  education                    26707 non-null  int64  
#  23  race                         26707 non-null  int64  
#  24  sex                          26707 non-null  int64  
#  25  income_poverty               26707 non-null  int64  
#  26  marital_status               26707 non-null  int64  
#  27  rent_or_own                  26707 non-null  int64  
#  28  employment_status            26707 non-null  int64  
#  29  census_msa                   26707 non-null  int64  
#  30  household_adults             26707 non-null  float64
#  31  household_children           26707 non-null  float64
#  32  h1n1_vaccine                 26707 non-null  int64  
#  33  seasonal_vaccine             26707 non-null  int64 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 26707 non-null  float64
 1   h1n1_knowledge               26707 non-null  float64
 2   behavioral_antiviral_meds    26707 non-null  float64
 3   behavioral_avoidance         26707 non-null  float64
 4   behavioral_face_mask         26707 non-null  float64
 5   behavioral_wash_hands        26707 non-null  float64
 6   behavioral_large_gatherings  26707 non-null  float64
 7   behavioral_outside_home      26707 non-null  float64
 8   behavioral_touch_face        26707 non-null  float64
 9   doctor_recc_h1n1             26707 non-null  float64
 10  doctor_recc_seasonal         26707 non-null  float64
 11  chronic_med_condition        26707 non-null  float64
 12  child_under_6_months         26707 non-null  float64
 13  health_worker   

In [65]:
df["age_group"].unique()

array([3, 1, 0, 4, 2], dtype=int64)

In [16]:
# Drop respondent_id as it's not a predictive feature
df.drop(columns=["respondent_id"], inplace=True)

# Identify categorical columns
categorical_cols = df.select_dtypes(include=["object"]).columns

KeyError: "['respondent_id'] not found in axis"

In [17]:
# Encode categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [18]:
# Split features and targets
X = df.drop(columns=["h1n1_vaccine", "seasonal_vaccine"])
y_h1n1 = df["h1n1_vaccine"]
y_seasonal = df["seasonal_vaccine"]

In [19]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [30]:
# Train-test split for both targets
X_train, X_test, y_train_h1n1, y_test_h1n1 = train_test_split(X_scaled, y_h1n1, test_size=0.2, random_state=42)
X_train, X_test, y_train_seasonal, y_test_seasonal = train_test_split(X_scaled, y_seasonal, test_size=0.2, random_state=42)

In [32]:
X_train.shape

(21365, 32)

In [38]:
# Train SVM models
svm_model = SVC(kernel="rbf", random_state=40)
h1n1 = svm_model.fit(X_train, y_train_h1n1)

seasonal = svm_model.fit(X_train, y_train_seasonal)

In [43]:
seasonal.score(X_train, y_train_seasonal)

0.8256962321553943

In [45]:
seasonal.score(X_test, y_test_seasonal)

0.7843504305503557

In [44]:
h1n1.score(X_train, y_train_h1n1)

0.6521413526796161

In [47]:
# Predictions
y_pred_h1n1 = h1n1.predict(X_test)
y_pred_seasonal = seasonal.predict(X_test)

In [48]:
# Calculate accuracies
accuracy_h1n1 = accuracy_score(y_test_h1n1, y_pred_h1n1)
accuracy_seasonal = accuracy_score(y_test_seasonal, y_pred_seasonal)

In [49]:
accuracy_h1n1, accuracy_seasonal

(0.6370273305877948, 0.7843504305503557)

In [53]:
#confusion matrix
conf_h1n1 = confusion_matrix(y_test_h1n1,y_pred_h1n1)
conf_h1n1

array([[2623, 1589],
       [ 350,  780]], dtype=int64)

In [54]:
conf_seasonal = confusion_matrix(y_test_seasonal,y_pred_seasonal)
conf_seasonal

array([[2356,  535],
       [ 617, 1834]], dtype=int64)

In [56]:
classification_report(y_test_h1n1,y_pred_h1n1)

'              precision    recall  f1-score   support\n\n           0       0.88      0.62      0.73      4212\n           1       0.33      0.69      0.45      1130\n\n    accuracy                           0.64      5342\n   macro avg       0.61      0.66      0.59      5342\nweighted avg       0.77      0.64      0.67      5342\n'

In [57]:
classification_report(y_test_seasonal,y_pred_seasonal)

'              precision    recall  f1-score   support\n\n           0       0.79      0.81      0.80      2891\n           1       0.77      0.75      0.76      2451\n\n    accuracy                           0.78      5342\n   macro avg       0.78      0.78      0.78      5342\nweighted avg       0.78      0.78      0.78      5342\n'

In [58]:
# Generate the classification report
report_h1n1 = classification_report(y_test_h1n1, y_pred_h1n1)
report_seasonal = classification_report(y_test_seasonal, y_pred_seasonal)

In [59]:
# Print reports properly formatted
print("H1N1 Vaccine Prediction Report:\n")
print(report_h1n1)
print("\nSeasonal Flu Vaccine Prediction Report:\n")
print(report_seasonal)

H1N1 Vaccine Prediction Report:

              precision    recall  f1-score   support

           0       0.88      0.62      0.73      4212
           1       0.33      0.69      0.45      1130

    accuracy                           0.64      5342
   macro avg       0.61      0.66      0.59      5342
weighted avg       0.77      0.64      0.67      5342


Seasonal Flu Vaccine Prediction Report:

              precision    recall  f1-score   support

           0       0.79      0.81      0.80      2891
           1       0.77      0.75      0.76      2451

    accuracy                           0.78      5342
   macro avg       0.78      0.78      0.78      5342
weighted avg       0.78      0.78      0.78      5342

