In [1]:
import pandas as pd

In [5]:
# Read the feature table and the label table from their CSV files.
df_features = pd.read_csv('training_set_features.csv')
df_labels = pd.read_csv('training_set_labels.csv')

# Join the labels onto the features by respondent_id, then discard the id
# column from the joined frame so it is not used as a model input.
# df_features itself keeps respondent_id.
df = (
    df_features
    .merge(df_labels, on='respondent_id')
    .drop(columns=['respondent_id'])
)


# For this competition, there are two target variables:
- `xyz_vaccine` - Whether respondent received xyz flu vaccine.
- `seasonal_vaccine` - Whether respondent received seasonal flu vaccine.

In [7]:
# The two label columns form the multi-output target; every remaining
# column of df is kept as a model input.
target_variables = ['xyz_vaccine', 'seasonal_vaccine']
y = df[target_variables]
X = df.drop(target_variables, axis='columns')

In [8]:
# Hold out a quarter of the rows for evaluation; the fixed random_state
# makes the split repeatable across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=48,
)

Separating numerical and categorical features for preprocessing

In [10]:
# Column labels grouped by dtype: int64/float64 columns get the numeric
# preprocessing branch, object-dtype columns get the categorical one.
numerical_features = X.select_dtypes(['int64', 'float64']).columns
categorical_features = X.select_dtypes('object').columns


### Preprocessing

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier

In [13]:

# Numeric branch: fill missing values with the column median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from failing
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


## Model : 
- Logistic Regression
- Random Forest
- XGBoost

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [16]:
# Candidate models keyed by display name. Each base estimator is wrapped in
# MultiOutputClassifier so that one classifier is fitted per target column.
models = {
    label: MultiOutputClassifier(base_estimator)
    for label, base_estimator in (
        ('Logistic Regression', LogisticRegression(max_iter=1000)),
        ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=48)),
        ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    )
}

In [17]:
from sklearn.metrics import roc_auc_score


# Fit every candidate on the training split and score it on the held-out
# split. The value stored per model is the plain average of the two
# per-target ROC AUC scores.
results = {}
for name, model in models.items():
    # Same preprocessing for every candidate; only the classifier step differs.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])
    pipeline.fit(X_train, y_train)

    # predict_proba yields one probability array per target, in the column
    # order of y (xyz_vaccine first, seasonal_vaccine second). Column 1 of
    # each array is used as the score (presumably the positive class,
    # assuming 0/1 labels).
    proba_xyz, proba_seasonal = pipeline.predict_proba(X_test)

    roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], proba_xyz[:, 1])
    roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], proba_seasonal[:, 1])
    mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

    results[name] = mean_roc_auc
    print(f'{name} Mean ROC AUC: {mean_roc_auc:.4f}')


Logistic Regression Mean ROC AUC: 0.8459
Random Forest Mean ROC AUC: 0.8410
XGBoost Mean ROC AUC: 0.8387


- As we can see, the scores are quite close, but overall Logistic Regression gets the job done.

In [18]:
# Prepare the final pipeline from the best-scoring model.
from sklearn.base import clone

# Pick the model name with the highest mean ROC AUC recorded in `results`.
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]

# Clone the whole pipeline before the final fit. `preprocessor` and
# `best_model` are the same instances used by the evaluation pipelines, and
# fitting them again in place would silently overwrite their train-split
# state. clone() produces unfitted copies with identical parameters, so the
# objects in `models` and the shared `preprocessor` are left untouched.
best_pipeline = clone(Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_model)
]))

# Train the best model on the entire dataset
best_pipeline.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_hom...
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
      

In [19]:

# Predict probabilities for the submission.
# predict_proba is evaluated once and its result reused for both targets;
# calling it separately per column would repeat the full preprocessing and
# inference pass for no benefit.
# NOTE(review): df_features is the frame read from
# 'training_set_features.csv', i.e. the same features the pipeline was
# fitted from. Confirm whether a separate held-out feature file should be
# scored here instead.
submission_proba = best_pipeline.predict_proba(df_features)

# One row per respondent; index 0/1 selects the target, column 1 the
# probability used as the prediction for that target.
submission = pd.DataFrame({
    'respondent_id': df_features['respondent_id'],
    'xyz_vaccine': submission_proba[0][:, 1],
    'seasonal_vaccine': submission_proba[1][:, 1]
})

In [30]:
# Write the submission table to disk without the DataFrame index column,
# then report which model produced it.
submission.to_csv(path_or_buf='submission.csv', index=False)
print(f'Best model: {best_model_name}')

Best model: Logistic Regression
