In [41]:
import pandas as pd

# Read the three competition tables from the Kaggle input directories.
train_features = pd.read_csv('/kaggle/input/train-set/training_set_features.csv')
train_labels = pd.read_csv('/kaggle/input/train-labels/training_set_labels.csv')
test_features = pd.read_csv('/kaggle/input/test-set/test_set_features.csv')

# Show which columns each table carries, one heading per table.
for heading, frame in (
    ("Training Features Columns:", train_features),
    ("Training Labels Columns:", train_labels),
    ("Test Features Columns:", test_features),
):
    print(heading, frame.columns)

# Preview the leading rows of every table (one table per printed block).
print(
    train_features.head(),
    train_labels.head(),
    test_features.head(),
    sep='\n',
)



Training Features Columns: Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')
Training Labels Columns: Index(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], dtype='object

In [42]:
# Check that respondent_id is unique within each table, so it can serve as a join key.
assert train_features['respondent_id'].is_unique, "duplicate respondent_id in train_features"
assert train_labels['respondent_id'].is_unique, "duplicate respondent_id in train_labels"
assert test_features['respondent_id'].is_unique, "duplicate respondent_id in test_features"

# Check that features and labels describe exactly the same respondents.
# pd.merge defaults to an inner join, which would otherwise silently drop
# every respondent that is missing from either table.
assert set(train_features['respondent_id']) == set(train_labels['respondent_id']), (
    "train_features and train_labels do not cover the same respondent_id values"
)

# Merge training features and labels on respondent_id.
# validate='one_to_one' makes pandas raise a MergeError if the key is
# duplicated on either side, instead of fanning rows out.
train_data = pd.merge(
    train_features,
    train_labels,
    on='respondent_id',
    validate='one_to_one',
)

# Display the first few rows of the merged training data
print(train_data.head())



   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [44]:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Numeric branch: fill gaps with the column median, then standardise.
numerical_cols = ['xyz_concern', 'xyz_knowledge', 'household_adults', 'household_children']  # Adjust if necessary
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: every remaining feature column (the id excluded),
# kept in the order in which it appears in train_features.
non_categorical = set(numerical_cols) | {'respondent_id'}
categorical_cols = [name for name in train_features.columns if name not in non_categorical]
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories not seen during fit are ignored rather than raising an error.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Split targets from inputs; the respondent id is never used as a feature.
target_cols = ['xyz_vaccine', 'seasonal_vaccine']
y_train = train_data[target_cols]

# Fit the preprocessing on the training rows only, then reuse the fitted
# transformer unchanged on the test rows.
X_train = preprocessor.fit_transform(train_data.drop(columns=['respondent_id'] + target_cols))
X_test = preprocessor.transform(test_features.drop(columns=['respondent_id']))


In [45]:
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

# Base learner: an XGBoost classifier evaluated with log loss.
xgb_model = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
)

# y_train has two target columns, so wrap the base learner in a
# multi-output classifier; n_jobs=-1 requests all available workers.
multi_target_model = MultiOutputClassifier(estimator=xgb_model, n_jobs=-1)

# Fit on the preprocessed training matrix (left as the last expression so
# the notebook still displays the fitted estimator).
multi_target_model.fit(X_train, y_train)





In [46]:
# Predict class probabilities; the result is indexed first by target
# (0 -> xyz_vaccine, 1 -> seasonal_vaccine, matching y_train's column order).
test_predictions = multi_target_model.predict_proba(X_test)

# Keep column index 1 of each per-target array
# (presumably the probability of label 1 -- confirm against classes_).
xyz_vaccine_probs, seasonal_vaccine_probs = (
    test_predictions[target_idx][:, 1] for target_idx in range(2)
)

# Assemble the submission: respondent ids alongside the two probability columns.
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=xyz_vaccine_probs,
    seasonal_vaccine=seasonal_vaccine_probs,
)

# Write the submission without the DataFrame index.
submission.to_csv('submission.csv', index=False)
