In [1]:
#importing libraries
import os

import numpy as np
import pandas as pd

# Load the survey dataset.
# The location can be overridden through the H1N1_DATA_PATH environment
# variable so the notebook also runs on machines that do not have the
# original author's drive layout; when the variable is unset, the original
# path is used unchanged.
DATA_PATH = os.environ.get(
    "H1N1_DATA_PATH",
    r"E:\Python\New folder\h1n1_vaccine_prediction.csv",
)
df = pd.read_csv(DATA_PATH)

# Preview the first rows
df.head()

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,...,race,sex,income_level,marital_status,housing_status,employment,census_msa,no_of_adults,no_of_children,h1n1_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,Non-MSA,0.0,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,White,Male,Below Poverty,Not Married,Rent,Employed,"MSA, Not Principle City",0.0,0.0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,"MSA, Not Principle City",2.0,0.0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,"MSA, Principle City",0.0,0.0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,"MSA, Not Principle City",1.0,0.0,0


In [2]:
# Count the null entries in every column
missing_data = df.isna().sum()

# Show only the columns that contain at least one null
missing_data.loc[missing_data.gt(0)]

h1n1_worry                      92
h1n1_awareness                 116
antiviral_medication            71
contact_avoidance              208
bought_face_mask                19
wash_hands_frequently           42
avoid_large_gatherings          87
reduced_outside_home_cont       82
avoid_touch_face               128
dr_recc_h1n1_vacc             2160
dr_recc_seasonal_vacc         2160
chronic_medic_condition        971
cont_child_undr_6_mnths        820
is_health_worker               804
has_health_insur             12274
is_h1n1_vacc_effective         391
is_h1n1_risky                  388
sick_from_h1n1_vacc            395
is_seas_vacc_effective         462
is_seas_risky                  514
sick_from_seas_vacc            537
qualification                 1407
income_level                  4423
marital_status                1408
housing_status                2042
employment                    1463
no_of_adults                   249
no_of_children                 249
dtype: int64

In [3]:
# Relative frequency of each value of the target column
target_distribution = df.loc[:, 'h1n1_vaccine'].value_counts(normalize=True)

target_distribution

0    0.787546
1    0.212454
Name: h1n1_vaccine, dtype: float64

In [4]:
# Separate the features into continuous (float64) and categorical (object)
# columns, based purely on their dtype
continuous_features = df.select_dtypes(include=['float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# Impute missing values: column median for float columns, most frequent value
# for object columns.
#
# The filled column is assigned back to the frame instead of calling
# `df[feature].fillna(..., inplace=True)`. That form mutates an intermediate
# object returned by `df[feature]`; pandas 2.2 warns about it, and with
# Copy-on-Write enabled it leaves `df` unchanged, so the nulls would survive.
#
# NOTE(review): the medians/modes are computed on the full dataset, i.e.
# before the train/test split performed in a later cell, so held-out rows
# contribute to the imputation values. Consider fitting the imputation on the
# training split only.
for feature in continuous_features:
    df[feature] = df[feature].fillna(df[feature].median())

for feature in categorical_features:
    # mode() may return several values on ties; the first one is used
    df[feature] = df[feature].fillna(df[feature].mode()[0])

# Check if there are any more missing values
missing_after_imputation = df.isnull().sum().sum()

missing_after_imputation

0

In [5]:
# Replace every column listed in `categorical_features` with indicator
# columns, dropping the first level of each so the indicators are not redundant
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_features,
    drop_first=True,
)

# Show the encoded frame's dimensions together with its first rows
(df_encoded.shape, df_encoded.head())

((26707, 44),
    unique_id  h1n1_worry  h1n1_awareness  antiviral_medication  \
 0          0         1.0             0.0                   0.0   
 1          1         3.0             2.0                   0.0   
 2          2         1.0             1.0                   0.0   
 3          3         1.0             1.0                   0.0   
 4          4         2.0             1.0                   0.0   
 
    contact_avoidance  bought_face_mask  wash_hands_frequently  \
 0                0.0               0.0                    0.0   
 1                1.0               0.0                    1.0   
 2                1.0               0.0                    0.0   
 3                1.0               0.0                    1.0   
 4                1.0               0.0                    1.0   
 
    avoid_large_gatherings  reduced_outside_home_cont  avoid_touch_face  ...  \
 0                     0.0                        1.0               1.0  ...   
 1                     0

In [6]:
from sklearn.model_selection import train_test_split

# Target is the vaccination flag; the features are every other column except
# the row identifier
y = df_encoded['h1n1_vaccine']
X = df_encoded.drop(['unique_id', 'h1n1_vaccine'], axis=1)

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

(X_train.shape, X_test.shape)

((21365, 42), (5342, 42))

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
# Candidate models keyed by display name; seeds are fixed where the estimator
# accepts one
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'KNN': KNeighborsClassifier()
}

# Per-model metrics, filled in by the loop below
results = {}

# Fit each model on the training split and score it on the held-out split
for name, clf in classifiers.items():
    # fit() returns the estimator itself, so predict() can be chained
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    # Full per-class report as a nested dict; only the positive class is kept
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    f1_vaccinated = report['1']['f1-score']

    results[name] = {
        'Accuracy': accuracy,
        'F1 Score (Vaccinated)': f1_vaccinated,
    }

results

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


{'Random Forest': {'Accuracy': 0.8320853612879071,
  'F1 Score (Vaccinated)': 0.5035971223021583},
 'Logistic Regression': {'Accuracy': 0.8360164732309996,
  'F1 Score (Vaccinated)': 0.5233949945593036},
 'Decision Tree': {'Accuracy': 0.7484088356420816,
  'F1 Score (Vaccinated)': 0.4305084745762711},
 'SVM': {'Accuracy': 0.8375140396855111,
  'F1 Score (Vaccinated)': 0.5112612612612613},
 'KNN': {'Accuracy': 0.8135529764133284,
  'F1 Score (Vaccinated)': 0.45274725274725275}}