In [None]:
import pandas as pd
import numpy as np

In [None]:
# File names of the CSV inputs: training features, training labels, and the
# test-set features that are scored at the end of the notebook.
features = 'training_set_features.csv'
labels = 'training_set_labels.csv'
test_features = 'test_set_features.csv'


In [None]:
# Read each CSV into its own DataFrame; the paths come from the cell above.
features_df = pd.read_csv(filepath_or_buffer=features)
labels_df = pd.read_csv(filepath_or_buffer=labels)
test_features_df = pd.read_csv(filepath_or_buffer=test_features)

In [None]:
# Join features to labels on the respondent key (default inner join).
# validate='one_to_one' makes pandas raise a MergeError if respondent_id is
# duplicated on either side, instead of silently multiplying rows in the
# training table.
data = pd.merge(
    features_df,
    labels_df,
    on='respondent_id',
    validate='one_to_one',
)

In [None]:
# Split the merged table: X keeps every column except the row id and the two
# targets; y keeps only the two target columns.
X = data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = data.loc[:, ['xyz_vaccine', 'seasonal_vaccine']]


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
numerical_cols = X.select_dtypes(exclude='object').columns
categorical_cols = X.select_dtypes(include='object').columns


In [None]:
# Numeric branch: fill missing values with the column median, then pass the
# result through StandardScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])


In [None]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets transform() accept
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [None]:
# Route each column group through its own pipeline: numerical_cols go to the
# numeric branch, categorical_cols to the categorical branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Apply the preprocessing pipeline: fit on the training split only, then
# reuse the fitted preprocessor on the hold-out split.
# NOTE(review): X_train and X_test are rebound to the transformed output, so
# re-running this cell without first re-running the train/test split cell
# feeds already-transformed data back into the preprocessor.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)



In [None]:
from sklearn.linear_model import LogisticRegression

# One independent logistic-regression classifier per target variable, both
# created with max_iter=1000.
model_xyz, model_seasonal = (LogisticRegression(max_iter=1000) for _ in range(2))


In [None]:
# Fit each classifier on the shared training matrix against its own target
# column. The second fit stays last so its return value is the cell output.
model_xyz.fit(X_train, y_train.loc[:, 'xyz_vaccine'])
model_seasonal.fit(X_train, y_train.loc[:, 'seasonal_vaccine'])


In [None]:
# Hold-out scores: keep column 1 of each model's predict_proba output
# (presumably the probability of label 1 — confirm against model.classes_).
y_prob_xyz, y_prob_seasonal = (
    clf.predict_proba(X_test)[:, 1] for clf in (model_xyz, model_seasonal)
)


In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC on the hold-out split, one score per target.
auc_xyz = roc_auc_score(y_true=y_test['xyz_vaccine'], y_score=y_prob_xyz)
auc_seasonal = roc_auc_score(y_true=y_test['seasonal_vaccine'], y_score=y_prob_seasonal)


In [None]:
# Show both hold-out AUC values, xyz first and seasonal second, one per line.
print(auc_xyz, auc_seasonal, sep='\n')


0.8313516375463279
0.8560635216059745


In [None]:
# Score-time preprocessing: reuse the already-fitted preprocessor (transform
# only, no refit). The id column is removed first, as was done when the
# training feature matrix was built.
X_test_new = preprocessor.transform(
    test_features_df.drop(columns=['respondent_id'])
)


In [None]:
# Score the new test set with both fitted models, keeping column 1 of each
# predict_proba output (same convention as the hold-out scoring).
test_prob_xyz, test_prob_seasonal = (
    clf.predict_proba(X_test_new)[:, 1] for clf in (model_xyz, model_seasonal)
)


In [None]:
# Assemble the output table: the respondent id from the test features plus
# one predicted-probability column per target.
final_dataset = test_features_df[['respondent_id']].assign(
    xyz_vaccine=test_prob_xyz,
    seasonal_vaccine=test_prob_seasonal,
)


In [None]:
# Preview the first three rows of the output table.
final_dataset.head(n=3)


Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.050004,0.297041
1,26708,0.04635,0.046415
2,26709,0.366987,0.514891


In [None]:
# Write the output table to disk; index=False leaves the DataFrame index out
# of the file.
final_dataset.to_csv(path_or_buf='final_dataset.csv', index=False)