In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier

train_features = pd.read_csv('/content/training_set_features.csv')
train_labels = pd.read_csv('/content/training_set_labels.csv')
test_features = pd.read_csv('/content/test_set_features.csv')


train_data = pd.merge(train_features, train_labels, on='respondent_id')


categorical_cols = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status',
                    'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
                    'employment_industry', 'employment_occupation']
numerical_cols = train_data.columns.difference(categorical_cols + ['respondent_id', 'xyz_vaccine', 'seasonal_vaccine']).tolist()


num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_cols),
        ('cat', cat_pipeline, categorical_cols)
    ])


X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]

X_preprocessed = preprocessor.fit_transform(X)


X_train, X_val, y_train, y_val = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)


model = MultiOutputClassifier(GradientBoostingClassifier())


model.fit(X_train, y_train)


y_pred_proba = model.predict_proba(X_val)
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred_proba[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred_proba[1][:, 1])

print(f"ROC AUC for XYZ vaccine: {roc_auc_xyz}")
print(f"ROC AUC for Seasonal vaccine: {roc_auc_seasonal}")


X_test = test_features.drop(columns=['respondent_id'])
X_test_preprocessed = preprocessor.transform(X_test)


y_test_pred_proba = model.predict_proba(X_test_preprocessed)

submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': y_test_pred_proba[0][:, 1],
    'seasonal_vaccine': y_test_pred_proba[1][:, 1]
})


submission.to_csv('/content/submission_format.csv', index=False)


ROC AUC for XYZ vaccine: 0.8707206128297573
ROC AUC for Seasonal vaccine: 0.8638179293043691
