In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [12]:
# Load the csv files
train_features = pd.read_csv('training_set_features.csv')
test_features = pd.read_csv('test_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')

train_features.drop(columns=['respondent_id'])
test_features.drop(columns=['respondent_id'])
train_labels.drop(columns=['respondent_id'])

Unnamed: 0,xyz_vaccine,seasonal_vaccine
0,0,0
1,0,1
2,0,0
3,0,1
4,0,0
...,...,...
26702,0,0
26703,0,0
26704,0,1
26705,0,0


In [13]:
# Separate the labels into categorical data and numerical data using List Comprehension
categorical_labels = [i for i in train_features.columns if train_features[i].dtype == 'object']
numerical_labels = [i for i in train_features.columns if train_features[i].dtype != 'object']

In [14]:
numerical_pre_processing = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pre_processing = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [15]:
pre_processing = ColumnTransformer(
    transformers=[
        ('num', numerical_pre_processing, numerical_labels),
        ('cat', categorical_pre_processing, categorical_labels)
    ],
    remainder='passthrough'
)

In [16]:
x_train = train_features
y_train = train_labels
x_test = test_features

In [17]:
x_train_pre_processed = pre_processing.fit_transform(x_train)
x_test_pre_processed = pre_processing.transform(x_test)

In [18]:
print(x_train_pre_processed.shape, x_test_pre_processed.shape)
print(y_train.shape)

(26707, 106) (26708, 106)
(26707, 3)


In [19]:
x_train, x_test, y_train, y_test = train_test_split(x_train_pre_processed, y_train, test_size=0.2, random_state=42)

In [10]:
Classifier = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42), n_jobs=-1)

In [None]:
Classifier.fit(x_train, y_train)
y_pred = Classifier.predict_proba(x_test)

In [None]:
roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], y_pred_proba[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], y_pred_proba[1][:, 1])
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

In [None]:
output = pd.Dataframe({
    'respondent_id' : test_features.index,
    'xyz_vaccine' : y_pred_proba[0][:, 1],
    'seasonal_vaccine' : y_pred_proba[1][:, 1]
})

In [None]:
output.to_csv('submission.csv')