In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, roc_curve

In [4]:
data = pd.read_csv('test_set_features.csv')

In [5]:
X = data.drop(['respondent_id', 'doctor_recc_xyz', 'doctor_recc_seasonal'], axis=1)
y_xyz = data['doctor_recc_xyz']
y_seasonal = data['doctor_recc_seasonal']

In [6]:
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

In [7]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

In [8]:
model = RandomForestClassifier()


In [9]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

In [13]:
# Remove rows with NaN in y_xyz
mask_xyz = ~y_xyz.isna()
X_cleaned_xyz = X[mask_xyz]
y_cleaned_xyz = y_xyz[mask_xyz]

# Remove rows with NaN in y_seasonal
mask_seasonal = ~y_seasonal.isna()
X_cleaned_seasonal = X[mask_seasonal]
y_cleaned_seasonal = y_seasonal[mask_seasonal]

# Perform train_test_split
X_train, X_test, y_train_xyz, y_test_xyz = train_test_split(X_cleaned_xyz, y_cleaned_xyz, test_size=0.2, stratify=y_cleaned_xyz, random_state=42)
X_train, X_test, y_train_seasonal, y_test_seasonal = train_test_split(X_cleaned_seasonal, y_cleaned_seasonal, test_size=0.2, stratify=y_cleaned_seasonal, random_state=42)


In [14]:
pipeline.fit(X_train, y_train_xyz)
y_pred_xyz = pipeline.predict_proba(X_test)[:, 1]
roc_auc_xyz = roc_auc_score(y_test_xyz, y_pred_xyz)

In [15]:
pipeline.fit(X_train, y_train_seasonal)
y_pred_seasonal = pipeline.predict_proba(X_test)[:, 1]
roc_auc_seasonal = roc_auc_score(y_test_seasonal, y_pred_seasonal)

In [16]:
print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')

ROC AUC for xyz_vaccine: 0.4971592631893577
ROC AUC for seasonal_vaccine: 0.7108429722274466


In [19]:
predictions_xyz = pipeline.predict_proba(X)[:, 1]
predictions_seasonal = pipeline.predict_proba(X)[:, 1]

In [20]:
submission = pd.DataFrame({
    'respondent_id': data['respondent_id'],
    'xyz_vaccine': predictions_xyz,
    'seasonal_vaccine': predictions_seasonal
})

submission.to_csv('submission.csv', index=False)