In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [4]:

# Read the four competition CSV files in a single pass, in the same order as the
# names on the left. NOTE(review): these are absolute paths on one Windows
# machine; the cell only runs where those files exist at exactly these locations.
submission_format, X_train, y_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'C:\Users\OWNER\Downloads\dataset and all\submission_format.csv',
        r'C:\Users\OWNER\Downloads\dataset and all\training_set_features.csv',
        r'C:\Users\OWNER\Downloads\dataset and all\training_set_labels.csv',
        r'C:\Users\OWNER\Downloads\dataset and all\test_set_features.csv',
    )
)

In [5]:
# Attach the labels to their feature rows, matching on 'respondent_id'.
# validate='one_to_one' makes pandas raise MergeError when the key is duplicated
# on either side, instead of silently multiplying training rows.
train_data = X_train.merge(y_train, on='respondent_id', validate='one_to_one')

In [6]:

# Split the merged frame into model inputs (everything except the id and the
# two targets) and the two target columns.
X_train = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y_train = train_data[['xyz_vaccine', 'seasonal_vaccine']]
# X_test is replaced by its own result, so running this cell a second time used
# to raise KeyError ('respondent_id' already gone). errors='ignore' turns the
# drop into a no-op in that case, which keeps the cell safe to re-run.
X_test = X_test.drop(columns=['respondent_id'], errors='ignore')

In [7]:
# Identify categorical and numerical columns by dtype.
# Categorical features: the object-dtype columns.
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
# Numerical features: 'number' matches every numeric dtype rather than only
# int64/float64, so a column stored as e.g. int32 or float32 is not left out of
# both lists (and therefore out of the model's inputs).
numerical_cols = X_train.select_dtypes(include='number').columns.tolist()

In [8]:
# Numeric branch: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' is set so that a category not seen
# during fitting does not raise at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Send each column list through its matching branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


In [9]:
# Full model: shared preprocessing followed by one logistic regression per
# target column (MultiOutputClassifier wraps the base estimator).
clf = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        MultiOutputClassifier(LogisticRegression(max_iter=10000, random_state=2)),
    ),
])

# Train on the labelled data.
clf.fit(X_train, y_train)

# Class probabilities for the test rows; y_pred is indexed per target below.
y_pred = clf.predict_proba(X_test)

# Start from the submission template and fill both target columns.
# Index 0 feeds 'xyz_vaccine' and index 1 feeds 'seasonal_vaccine'; column 1 of
# each array is taken -- presumably the probability of label 1, TODO confirm
# against the fitted classifier's classes_.
# Values are written by row position, so this assumes the template rows are in
# the same order as X_test.
submission = submission_format.assign(
    xyz_vaccine=y_pred[0][:, 1],
    seasonal_vaccine=y_pred[1][:, 1],
)

# Write the result without the index column.
submission.to_csv('project_output.csv', index=False)

