In [35]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split


In [37]:
# Load the datasets
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

In [38]:
# Separate features and target variables
X = train_features
y = train_labels[['xyz_vaccine', 'seasonal_vaccine']]


In [41]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [43]:
# Define ordinal features and their ordering
ordinal_features = {
    'xyz_concern': [0, 1, 2, 3],
    'xyz_knowledge': [0, 1, 2],
    'opinion_xyz_vacc_effective': [1, 2, 3, 4, 5],
    'opinion_xyz_risk': [1, 2, 3, 4, 5],
    'opinion_xyz_sick_from_vacc': [1, 2, 3, 4, 5],
    'opinion_seas_vacc_effective': [1, 2, 3, 4, 5],
    'opinion_seas_risk': [1, 2, 3, 4, 5],
    'opinion_seas_sick_from_vacc': [1, 2, 3, 4, 5]
}

ordinal_cols = list(ordinal_features.keys())

In [45]:


# Preprocessing pipelines for numerical, ordinal, and nominal categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Fill missing numerical values with median
    ('scaler', StandardScaler())  # Scale numerical values
])
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing ordinal values with the most frequent value
    ('ordinal', OrdinalEncoder(categories=[ordinal_features[col] for col in ordinal_cols]))  # Encode ordinal values
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing categorical values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical values
])

In [47]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('ord', ordinal_transformer, ordinal_cols),
        ('cat', categorical_transformer, [col for col in categorical_cols if col not in ordinal_cols])
    ])

In [49]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [50]:
# Create a multilabel classifier
multi_target_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(LogisticRegression()))
])

In [53]:
# Train the model
multi_target_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
# Predict probabilities on validation set
y_val_prob = multi_target_model.predict_proba(X_val)

In [55]:
# Separate the probabilities for each target
y_xyz_val_prob = y_val_prob[0][:, 1]
y_seasonal_val_prob = y_val_prob[1][:, 1]

In [59]:
# Evaluation on validation set
y_val_pred = multi_target_model.predict(X_val)
y_xyz_val_pred, y_seasonal_val_pred = y_val_pred[:, 0], y_val_pred[:, 1]

In [61]:
print('XYZ Vaccine Model Validation:')
print('Accuracy:', accuracy_score(y_val['xyz_vaccine'], y_xyz_val_pred))
print('ROC-AUC:', roc_auc_score(y_val['xyz_vaccine'], y_xyz_val_prob))
print(classification_report(y_val['xyz_vaccine'], y_xyz_val_pred))

print('Seasonal Flu Vaccine Model Validation:')
print('Accuracy:', accuracy_score(y_val['seasonal_vaccine'], y_seasonal_val_pred))
print('ROC-AUC:', roc_auc_score(y_val['seasonal_vaccine'], y_seasonal_val_prob))
print(classification_report(y_val['seasonal_vaccine'], y_seasonal_val_pred))

XYZ Vaccine Model Validation:
Accuracy: 0.8397603893672781
ROC-AUC: 0.8314106766171664
              precision    recall  f1-score   support

           0       0.86      0.95      0.90      4212
           1       0.70      0.42      0.53      1130

    accuracy                           0.84      5342
   macro avg       0.78      0.69      0.72      5342
weighted avg       0.83      0.84      0.82      5342

Seasonal Flu Vaccine Model Validation:
Accuracy: 0.7865967802321228
ROC-AUC: 0.8561016257632651
              precision    recall  f1-score   support

           0       0.79      0.82      0.81      2891
           1       0.78      0.75      0.76      2451

    accuracy                           0.79      5342
   macro avg       0.79      0.78      0.78      5342
weighted avg       0.79      0.79      0.79      5342



In [63]:
# Predict probabilities on test set
y_test_prob = multi_target_model.predict_proba(test_features)

In [65]:
# Separate the probabilities for each target
y_xyz_test_prob = y_test_prob[0][:, 1]
y_seasonal_test_prob = y_test_prob[1][:, 1]

In [67]:
# Round probabilities to one decimal place
y_xyz_test_prob = y_xyz_test_prob.round(1)
y_seasonal_test_prob = y_seasonal_test_prob.round(1)

In [69]:
# Create a submission DataFrame
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': y_xyz_test_prob,
    'seasonal_vaccine': y_seasonal_test_prob
})

In [71]:
# Save the submission file as CSV
submission.to_csv('submission.csv', index=False)
print(submission.head())

   respondent_id  xyz_vaccine  seasonal_vaccine
0          26707          0.1               0.3
1          26708          0.0               0.0
2          26709          0.4               0.5
3          26710          0.5               0.9
4          26711          0.1               0.5


In [74]:
submission.to_excel('submission.xlsx', index=False)