In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Load datasets
train_features = pd.read_csv(r'C:\Users\hitik\Downloads\dataset and all\training_set_features.csv')
train_labels = pd.read_csv(r'C:\Users\hitik\Downloads\dataset and all\training_set_labels.csv')
test_features = pd.read_csv(r'C:\Users\hitik\Downloads\dataset and all\test_set_features.csv')
submission_format = pd.read_csv(r'C:\Users\hitik\Downloads\dataset and all\submission_format.csv')

In [3]:
# Separate target variables and drop unnecessary columns
train_features = train_features.drop(['respondent_id', 'employment_industry', 'employment_occupation', 'health_insurance'], axis=1)
target_xyz = train_labels['xyz_vaccine']
target_seasonal = train_labels['seasonal_vaccine']
test_ids = test_features['respondent_id']
test_features = test_features.drop(['respondent_id', 'employment_industry', 'employment_occupation', 'health_insurance'], axis=1)

In [4]:
# Identify numerical and categorical columns
numeric_cols = train_features.select_dtypes(include=['float64']).columns
categorical_cols = train_features.select_dtypes(include=['object']).columns


In [9]:
# Define transformers for numerical and categorical columns
num_transformer = Pipeline(steps=[
    ('impute', IterativeImputer(max_iter=12, tol=0.001, random_state=43)),  
    ('scale', StandardScaler())
])
cat_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [6]:
# Combine transformers
preprocess = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numeric_cols),
        ('cat', cat_transformer, categorical_cols)
    ])

In [10]:
# Define cross-validator
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=43)  

# Hyperparameters for the model
xgb_hyperparams = {'n_estimators': 110, 'max_depth': 6,'min_child_weight': 14, 'eta': 0.08, 'reg_lambda': 0.14,'random_state': 43} 
bagging_hyperparams = {'n_estimators': 45, 'random_state': 43,'max_samples': 0.85, 'max_features': 0.88, 'bootstrap': True}

In [8]:
# Function to evaluate the model
def evaluate_model(features, labels):
    class_weight = (len(labels) - labels.sum()) / labels.sum()
    xgb_model = XGBClassifier(objective='binary:logistic', scale_pos_weight=class_weight, **xgb_hyperparams)
    bagging_model = BaggingClassifier(estimator=xgb_model, **bagging_hyperparams)
    auc_scores = []

    for train_idx, test_idx in kf.split(features, labels):
        X_train, X_val = features[train_idx], features[test_idx]
        y_train, y_val = labels[train_idx], labels[test_idx]

        bagging_model.fit(X_train, y_train)
        y_pred_proba = bagging_model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred_proba, average="macro")
        auc_scores.append(auc)
    
    return np.mean(auc_scores)

In [11]:
# Preprocess training features
processed_train_features = preprocess.fit_transform(train_features)

In [12]:
# Evaluate models
print("Evaluating xyz_vaccine model...")
roc_auc_xyz = evaluate_model(processed_train_features, target_xyz.values.ravel())
print("Mean ROC AUC Score for xyz_vaccine:", roc_auc_xyz)

print("\nEvaluating seasonal_vaccine model...")
roc_auc_seasonal = evaluate_model(processed_train_features, target_seasonal.values.ravel())
print("Mean ROC AUC Score for seasonal_vaccine:", roc_auc_seasonal)

overall_auc = (roc_auc_xyz + roc_auc_seasonal) / 2
print("\nOverall ROC AUC Score:", overall_auc)

Evaluating xyz_vaccine model...
Mean ROC AUC Score for xyz_vaccine: 0.8380653228436256

Evaluating seasonal_vaccine model...
Mean ROC AUC Score for seasonal_vaccine: 0.8589333717046748

Overall ROC AUC Score: 0.8484993472741502


In [13]:
 # Preprocess test features
processed_test_features = preprocess.transform(test_features)


In [14]:
# Function to train and predict
def train_and_predict(features, labels, test_data):
    class_weight = (len(labels) - labels.sum()) / labels.sum()
    final_model = BaggingClassifier(estimator=XGBClassifier(objective='binary:logistic', scale_pos_weight=class_weight, **xgb_hyperparams), **bagging_hyperparams)
    final_model.fit(features, labels)
    predictions = final_model.predict_proba(test_data)[:, 1]
    return np.round(predictions, 1)

In [15]:
xyz_predictions = train_and_predict(processed_train_features, target_xyz.values.ravel(), processed_test_features)
seasonal_predictions = train_and_predict(processed_train_features, target_seasonal.values.ravel(), processed_test_features)

In [16]:
# Create DataFrame for predictions
submission_df = pd.DataFrame({
    'respondent_id': test_ids,
    'xyz_vaccine': xyz_predictions,
    'seasonal_vaccine': seasonal_predictions
})

In [17]:
# Save results to CSV file
submission_df.to_csv(r'C:\Users\hitik\Downloads\dataset and all\results.csv', index=False)