In [4]:
import pandas as pd
import gdown
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

# Google Drive file IDs of the three CSV files used by this script
train_features_id = '1ANxZrRmci0Rdj3rOm8P1Q1-0NtdwvPJG'
train_labels_id = '15ykhdVYvYD7MkaeEm-wrNV8SGvvgaDuL'
test_features_id = '1ec5DgtDNs_zY17ot7ESggU7nBb-mqZo4'

# Local file names the downloads are written to
train_features_path = 'training_set_features.csv'
train_labels_path = 'training_set_labels.csv'
test_features_path = 'test_set_features.csv'

# Fetch each file from Google Drive in order (train features, train labels,
# test features), keeping gdown's progress output enabled.
for _file_id, _destination in (
    (train_features_id, train_features_path),
    (train_labels_id, train_labels_path),
    (test_features_id, test_features_path),
):
    gdown.download(
        'https://drive.google.com/uc?id=' + _file_id,
        _destination,
        quiet=False,
    )

# Read the downloaded CSVs into DataFrames
train_features, train_labels, test_features = (
    pd.read_csv(_csv_path)
    for _csv_path in (train_features_path, train_labels_path, test_features_path)
)

# Merge training features and labels on the shared respondent key.
# validate='one_to_one' makes pandas raise a MergeError if respondent_id is
# duplicated on either side, instead of silently multiplying training rows.
train_data = train_features.merge(
    train_labels, on='respondent_id', validate='one_to_one'
)

# Separate features and targets
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
X = train_data.drop(columns=['respondent_id'] + target_columns)
y = train_data[target_columns]

# Columns to be scaled (numeric) and one-hot encoded (everything else).
# Selecting the generic 'number' kind and its complement puts every feature
# column in exactly one group. Matching only exact dtypes such as
# object/int64/float64 would leave other dtypes (int32, float32, bool, ...)
# in neither group, and ColumnTransformer drops unlisted columns by default.
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

# Classifier: a seeded random forest wrapped so it can predict both targets
model = MultiOutputClassifier(RandomForestClassifier(random_state=42))

# Numeric columns: fill missing values with the column mean, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full estimator: preprocessing followed by the classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model on the training split only
pipeline.fit(X_train, y_train)

# predict_proba on the multi-output model yields one probability array per
# target, in the column order of y: index 0 -> xyz_vaccine, 1 -> seasonal_vaccine
y_pred_proba = pipeline.predict_proba(X_val)

# Calculate ROC AUC for each target from column 1 of its probability array.
# Column 1 is the second class in classes_ (presumably label 1 -- confirm the
# targets are 0/1). Array slicing replaces a per-row Python loop.
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred_proba[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred_proba[1][:, 1])

# Print the scores
print(f'ROC AUC for XYZ Vaccine: {roc_auc_xyz}')
print(f'ROC AUC for Seasonal Vaccine: {roc_auc_seasonal}')
print(f'Average ROC AUC: {(roc_auc_xyz + roc_auc_seasonal) / 2}')

# Prepare the test data: the identifier column is not a model feature
X_test = test_features.drop(columns=['respondent_id'])

# Predict probabilities; the result holds one array per target
# (index 0 -> xyz_vaccine, index 1 -> seasonal_vaccine)
test_pred_proba = pipeline.predict_proba(X_test)

# Prepare the submission file: respondent id plus the column-1 probability
# for each target, taken with array slicing instead of a per-row loop
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred_proba[0][:, 1],
    'seasonal_vaccine': test_pred_proba[1][:, 1],
})

# Save the submission file
submission.to_csv('submission.csv', index=False)

# Trigger a browser download when running inside Google Colab. The import is
# guarded so the script still completes (with the CSV on disk) elsewhere,
# instead of failing on the import after all the work is done.
try:
    from google.colab import files
except ImportError:
    print('google.colab is not available; submission.csv was saved to the working directory.')
else:
    files.download('submission.csv')


Downloading...
From: https://drive.google.com/uc?id=1ANxZrRmci0Rdj3rOm8P1Q1-0NtdwvPJG
To: /content/training_set_features.csv
100%|██████████| 4.65M/4.65M [00:00<00:00, 49.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=15ykhdVYvYD7MkaeEm-wrNV8SGvvgaDuL
To: /content/training_set_labels.csv
100%|██████████| 283k/283k [00:00<00:00, 5.08MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ec5DgtDNs_zY17ot7ESggU7nBb-mqZo4
To: /content/test_set_features.csv
100%|██████████| 4.66M/4.66M [00:00<00:00, 32.4MB/s]


ROC AUC for XYZ Vaccine: 0.864173999277244
ROC AUC for Seasonal Vaccine: 0.8570519011081396
Average ROC AUC: 0.8606129501926918


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>