In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, roc_auc_score

















In [30]:
# Load the datasets
train_features = pd.read_csv('/kaggle/input/train-set/training_set_features.csv')
train_labels = pd.read_csv('/kaggle/input/train-labels/training_set_labels.csv')
test_features = pd.read_csv('/kaggle/input/test-set/test_set_features.csv')

In [32]:

# Merge training features and labels on respondent_id
train_data = pd.merge(train_features, train_labels, on='respondent_id')

In [33]:
# Define preprocessing for numerical features
numerical_cols = ['xyz_concern', 'xyz_knowledge', 'household_adults', 'household_children']  # Adjust if necessary
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [35]:

# Define preprocessing for categorical features
categorical_cols = [col for col in train_features.columns if col not in numerical_cols and col != 'respondent_id']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [36]:

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [37]:

# Apply the preprocessing pipeline to the data
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y_xyz = train_data['xyz_vaccine']
y_seasonal = train_data['seasonal_vaccine']
X_test = test_features.drop(columns=['respondent_id'])
X = preprocessor.fit_transform(X)
X_test = preprocessor.transform(X_test)

In [38]:

# Split the training data for cross-validation
X_train, X_val, y_train_xyz, y_val_xyz = train_test_split(X, y_xyz, test_size=0.2, random_state=42)
_, _, y_train_seasonal, y_val_seasonal = train_test_split(X, y_seasonal, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')


In [39]:
# Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'alpha': [0, 0.1, 1],
    'lambda': [1, 1.5, 2]
}


In [40]:

# Define a function to perform hyperparameter tuning and model training
def tune_and_train(X_train, y_train, X_val, y_val):
    search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=10, cv=3, verbose=1, n_jobs=-1)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    y_val_pred = best_model.predict_proba(X_val)[:, 1]
    roc_auc = roc_auc_score(y_val, y_val_pred)
    print(f"Validation ROC AUC Score: {roc_auc}")
    return best_model

In [41]:

# Tune and train for each target variable
best_model_xyz = tune_and_train(X_train, y_train_xyz, X_val, y_val_xyz)
best_model_seasonal = tune_and_train(X_train, y_train_seasonal, X_val, y_val_seasonal)

# Make predictions on the test set
xyz_vaccine_probs = best_model_xyz.predict_proba(X_test)[:, 1]
seasonal_vaccine_probs = best_model_seasonal.predict_proba(X_test)[:, 1]

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Validation ROC AUC Score: 0.8399520123708915
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Validation ROC AUC Score: 0.8625604215505258


In [42]:

# Prepare the submission
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': xyz_vaccine_probs,
    'seasonal_vaccine': seasonal_vaccine_probs
})

In [43]:

# Save the submission to a CSV file
submission.to_csv('submission3.csv', index=False)