Import the necessary libraries

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


In [None]:

# Load the feature and label CSV files for the training data.
# Note: `y` is only the raw labels table here; a later cell rebinds `y` to the
# single target column.
x = pd.read_csv("./Data/training_set_features.csv")
y = pd.read_csv("./Data/training_set_labels.csv")

# Join features and labels on the shared respondent key.
# validate="one_to_one" makes pandas raise a MergeError if respondent_id is
# duplicated in either file, instead of silently multiplying rows in `df`.
# The join is still pandas' default inner join, so a respondent present in only
# one of the two files is not carried into `df`.
df = pd.merge(x, y, on="respondent_id", validate="one_to_one")



Pick the target variable that the model will predict
We are predicting whether a respondent received the H1N1 vaccine (`h1n1_vaccine`): 0 = no, 1 = yes

In [2]:
# Label to model: whether the respondent got the H1N1 vaccine (0 = no, 1 = yes).
target = "h1n1_vaccine"

# Columns that must not be used as features: the row identifier and the two
# vaccine columns (seasonal_vaccine is presumably the second label; keeping it
# would risk leaking outcome information -- TODO confirm against the data docs).
non_feature_columns = ['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']

X = df.drop(columns=non_feature_columns)
y = df[target]

Split the data into training and test sets

In [None]:


# Hold out 20% of the rows for testing. Stratifying on y keeps the target's
# class balance approximately equal in both partitions, and the fixed
# random_state makes the split reproducible across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=y)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)



In [None]:
# Identify column types: numerical vs. categorical.
numeric_columns = X_train.select_dtypes(include=['number']).columns.tolist()

# Treat every non-numeric column as categorical. Taking the complement of the
# numeric selection (rather than include=['object']) guarantees each column of
# X_train lands in exactly one of the two lists; columns with other dtypes
# (category, string, bool) would otherwise appear in neither list and be
# silently dropped by a ColumnTransformer built from them.
categorical_columns = X_train.select_dtypes(exclude=['number']).columns.tolist()


In [8]:
# Numeric columns: replace missing values with the column median.
median_imputer = SimpleImputer(strategy='median')
numeric_transformer = Pipeline(steps=[('imputer', median_imputer)])

# Categorical columns: replace missing values with the literal 'Missing', then
# one-hot encode. handle_unknown='ignore' lets transform() accept categories
# that were not present when the encoder was fitted, and sparse_output=False
# makes the encoder return a dense array.
missing_label_imputer = SimpleImputer(strategy='constant', fill_value='Missing')
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
categorical_transformer = Pipeline(steps=[
    ('imputer', missing_label_imputer),
    ('encoder', one_hot_encoder),
])



In [11]:
# Preprocessing pipeline: route each group of columns to its own transformer.
# Columns that appear in neither list are dropped, because ColumnTransformer's
# default is remainder='drop'.
column_routes = [
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_routes)


In [None]:
# Learn the imputation values and one-hot categories from the training rows
# only, then apply that fitted preprocessing to both partitions so nothing is
# learned from the test rows.
X_train_clean = preprocessor.fit_transform(X_train)
X_test_clean = preprocessor.transform(X_test)

# Report the resulting matrix shapes; both should have the same column count.
for label, matrix in (
    ("Training data shape:", X_train_clean),
    ("Test data shape:", X_test_clean),
):
    print(label, matrix.shape)



Training data shape: (21365, 112)
Test data shape: (5342, 112)
