In [32]:

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Read the training features, training labels and test features from CSV
# files located in the current working directory (paths are relative).
train_features, train_labels, test_features = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)

# Preprocessing configuration: which columns are treated as categorical
# (imputed with the most frequent value and label-encoded further down) and
# which as numerical (median-imputed and standardized further down).
# The order of the names is kept stable on purpose.
categorical_features = [
    # Respondent / household attributes
    # (presumably survey demographics -- confirm against the data dictionary).
    'age_group',
    'education',
    'race',
    'sex',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
    'income_poverty',
    # Columns about the "xyz" vaccine topic in general.
    'xyz_concern',
    'xyz_knowledge',
    # Columns sharing the `behavioral_` prefix.
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    # Doctor recommendation and health-related columns.
    'doctor_recc_xyz',
    'doctor_recc_seasonal',
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'health_insurance',
    # Columns sharing the `opinion_` prefix.
    'opinion_xyz_vacc_effective',
    'opinion_xyz_risk',
    'opinion_xyz_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
]
numerical_features = [
    'household_adults',
    'household_children',
]

# Fill in missing values. Each imputer learns its fill values from the
# training frame only and re-uses them on the test frame, so nothing is
# estimated from the test rows.
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

# Numerical columns are processed first, then the categorical ones; both
# frames are updated in place.
for _imputer, _columns in (
    (imputer_num, numerical_features),
    (imputer_cat, categorical_features),
):
    train_features[_columns] = _imputer.fit_transform(train_features[_columns])
    test_features[_columns] = _imputer.transform(test_features[_columns])

# Encode categorical variables as integer codes.
# A dedicated LabelEncoder is fitted for every column and stored in
# `label_encoders`, so each column's value -> code mapping stays available
# afterwards (e.g. for inverse_transform). Re-fitting one shared encoder in
# the loop would leave only the mapping of the last column behind.
# NOTE(review): integer codes impose an arbitrary order on nominal columns,
# which a linear model will treat as numeric -- consider one-hot encoding.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    # Values are compared as strings so mixed numeric/text columns encode
    # consistently in both frames.
    train_values = train_features[col].astype(str)
    test_values = test_features[col].astype(str)

    # The mapping is learned from the training frame only.
    train_features[col] = le.fit_transform(train_values)

    # LabelEncoder.transform raises a ValueError on labels it was not fitted
    # on; fail with the same exception type but name the column and values.
    unseen = sorted(set(test_values) - set(le.classes_))
    if unseen:
        raise ValueError(
            f"Column {col!r} has test categories not present in the training data: {unseen}"
        )

    test_features[col] = le.transform(test_values)
    label_encoders[col] = le

# Standardize the numerical columns with a StandardScaler whose statistics
# come from the training frame only; the fitted scaler is then applied to
# both frames in place.
scaler = StandardScaler().fit(train_features[numerical_features])
for _frame in (train_features, test_features):
    _frame[numerical_features] = scaler.transform(_frame[numerical_features])

# Sanity check: abort if any cell in either frame is still missing. The check
# covers every column of the frames, not only the ones listed in the feature
# lists, and the failure message names the offending columns.
assert not train_features.isnull().values.any(), (
    f"Train features contain NaN values in columns: {train_features.columns[train_features.isnull().any()]}"
)
assert not test_features.isnull().values.any(), (
    f"Test features contain NaN values in columns: {test_features.columns[test_features.isnull().any()]}"
)

# Feature matrix: every training column except the respondent identifier.
X = train_features.drop('respondent_id', axis=1)

# One target vector per vaccine, taken straight from the labels frame.
# NOTE(review): labels are matched to features by row position only; nothing
# here aligns the two frames on an id column -- confirm they share row order.
y_xyz, y_seasonal = (
    train_labels[target_name]
    for target_name in ('xyz_vaccine', 'seasonal_vaccine')
)

# Split the training data into training (80%) and validation (20%) sets.
# Features and both targets are split in ONE call so that they are guaranteed
# to share the same row partition. Splitting twice and overwriting
# X_train / X_val only stays consistent as long as both calls happen to use
# identical arguments and seed.
(
    X_train,
    X_val,
    y_train_xyz,
    y_val_xyz,
    y_train_seasonal,
    y_val_seasonal,
) = train_test_split(X, y_xyz, y_seasonal, test_size=0.2, random_state=42)

# Fit one logistic regression per target on the shared training features.
# max_iter is raised to 1000 to give the solver more iterations to converge;
# `fit` returns the estimator itself, so construction and fitting are chained.
model_xyz = LogisticRegression(max_iter=1000).fit(X_train, y_train_xyz)
model_seasonal = LogisticRegression(max_iter=1000).fit(X_train, y_train_seasonal)

# Score the validation rows with both models. Column 1 of predict_proba is
# the probability of the second entry in `model.classes_`
# (presumably label 1, i.e. "vaccinated" -- confirm the label encoding).
y_pred_val_xyz = model_xyz.predict_proba(X_val)[:, 1]
y_pred_val_seasonal = model_seasonal.predict_proba(X_val)[:, 1]

# Report the area under the ROC curve for each target.
_auc_xyz = roc_auc_score(y_val_xyz, y_pred_val_xyz)
_auc_seasonal = roc_auc_score(y_val_seasonal, y_pred_val_seasonal)
print("Validation ROC AUC for xyz_vaccine: ", _auc_xyz)
print("Validation ROC AUC for seasonal_vaccine: ", _auc_seasonal)

# Predict on the test set: drop the identifier column so the test matrix has
# the same feature columns the models were fitted on, then take column 1 of
# predict_proba from each model.
X_test = test_features.drop('respondent_id', axis=1)
y_pred_test_xyz, y_pred_test_seasonal = (
    _fitted_model.predict_proba(X_test)[:, 1]
    for _fitted_model in (model_xyz, model_seasonal)
)

# Assemble the submission table: the respondent id (as a string) followed by
# one predicted-probability column per target, in that column order.
submission = pd.DataFrame(
    {'respondent_id': test_features['respondent_id'].astype(str)}
).assign(
    **{
        'xyz_vaccine': y_pred_test_xyz,
        'seasonal_vaccine': y_pred_test_seasonal,
    }
)

# Write the table to the working directory without the DataFrame index.
submission.to_csv('submission.csv', index=False)

Validation ROC AUC for xyz_vaccine:  0.8276874332921531
Validation ROC AUC for seasonal_vaccine:  0.8501901185759038
