In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Directory holding the competition CSVs. The default is the Colab location
# this notebook was written against; change it here (once) to run elsewhere.
DATA_DIR = '/content'

# Labelled training data: features and labels are stored in separate files.
training_features = pd.read_csv(f'{DATA_DIR}/training_set_features.csv')
training_labels = pd.read_csv(f'{DATA_DIR}/training_set_labels.csv')
# Rows to score for the submission.
test_features = pd.read_csv(f'{DATA_DIR}/test_set_features.csv')
# Loaded for reference; presumably the template of the expected submission
# layout (verify against the file itself).
submission_format = pd.read_csv(f'{DATA_DIR}/submission_format.csv')

In [4]:
# Join the label columns onto the feature rows by respondent.
# validate='one_to_one' makes pandas raise MergeError when 'respondent_id' is
# duplicated in either frame, instead of silently multiplying rows in the
# training set. how='inner' is pandas' default, spelled out so the join type
# is visible to the reader.
training_data = pd.merge(
    training_features,
    training_labels,
    on='respondent_id',
    how='inner',
    validate='one_to_one',
)

In [6]:
# Identifier column and the two label columns, named once and reused below.
ID_COLUMN = 'respondent_id'
TARGET_COLUMNS = ['xyz_vaccine', 'seasonal_vaccine']

# Feature matrix: every merged column except the identifier and the labels.
X = training_data.drop(columns=[ID_COLUMN, *TARGET_COLUMNS])
# One label series per vaccine, in the same order as TARGET_COLUMNS.
y_xyz, y_seasonal = (training_data[target] for target in TARGET_COLUMNS)
# Only the identifier is dropped from the test features.
X_test = test_features.drop(columns=[ID_COLUMN])

In [7]:
# Partition the feature columns by dtype so each group can be preprocessed
# differently. A column whose dtype is neither int64/float64 nor object ends
# up in neither list.
NUMERIC_DTYPES = ['int64', 'float64']
CATEGORICAL_DTYPES = ['object']

numeric_features = X.select_dtypes(include=NUMERIC_DTYPES).columns
categorical_features = X.select_dtypes(include=CATEGORICAL_DTYPES).columns

In [8]:
# Numeric preprocessing: fill missing values with the column median, then
# rescale with StandardScaler (default settings).
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [9]:
# Categorical preprocessing: fill missing values with the most frequent
# category, then one-hot encode. handle_unknown='ignore' stops the encoder
# from raising on categories it did not see during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [10]:
# Send each column group through its own transformer. Columns are selected by
# the name lists derived from X's dtypes. No `remainder` is passed, so
# scikit-learn's default handling applies to any column in neither list.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [11]:
# End-to-end estimator: preprocessing followed by a random forest. The forest
# is seeded with a fixed random_state.
classifier = RandomForestClassifier(random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [12]:
# Hold out 20% of the rows for validation. X and both label series go through
# ONE train_test_split call, so all three are partitioned on the same rows by
# construction rather than by two separate calls happening to share a
# random_state. This also avoids assigning to `_`, which IPython uses for the
# previous cell's output.
(
    X_train, X_val,
    y_train_xyz, y_val_xyz,
    y_train_seasonal, y_val_seasonal,
) = train_test_split(X, y_xyz, y_seasonal, test_size=0.2, random_state=42)

In [13]:
# Validate the xyz target: fit on the training split and score the held-out
# rows. Pipeline.fit returns the fitted pipeline, so the calls can be chained.
# Column 1 of predict_proba is used as the positive-class score, which
# assumes the labels are 0/1 (TODO confirm).
y_pred_val_xyz = model.fit(X_train, y_train_xyz).predict_proba(X_val)[:, 1]
roc_auc_xyz = roc_auc_score(y_true=y_val_xyz, y_score=y_pred_val_xyz)
print(f'ROC AUC for xyz vaccine: {roc_auc_xyz}')

ROC AUC for xyz vaccine: 0.8294325525888947


In [14]:
# Validate the seasonal target. This refits the same `model` object, so any
# earlier fit held by it is replaced. Column 1 of predict_proba is used as the
# positive-class score, which assumes the labels are 0/1 (TODO confirm).
y_pred_val_seasonal = (
    model.fit(X_train, y_train_seasonal).predict_proba(X_val)[:, 1]
)
roc_auc_seasonal = roc_auc_score(
    y_true=y_val_seasonal,
    y_score=y_pred_val_seasonal,
)
print(f'ROC AUC for seasonal vaccine: {roc_auc_seasonal}')

ROC AUC for seasonal vaccine: 0.8518072872366175


In [15]:
# Final xyz model: refit on every labelled row (no hold-out), then take
# column 1 of predict_proba for the test rows.
y_test_pred_xyz = model.fit(X, y_xyz).predict_proba(X_test)[:, 1]

In [16]:
# Final seasonal model: refit the same pipeline on every labelled row, then
# take column 1 of predict_proba for the test rows.
y_test_pred_seasonal = model.fit(X, y_seasonal).predict_proba(X_test)[:, 1]

In [17]:
# Assemble the submission: start from the test set's identifier column and
# attach one predicted-probability column per vaccine, in this column order.
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=y_test_pred_xyz,
    seasonal_vaccine=y_test_pred_seasonal,
)

In [18]:
# Write the predictions to the working directory without the DataFrame index,
# then confirm on stdout.
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
print("Submission file created successfully!")

Submission file created successfully!
