In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Load the data.
# All input CSVs live in one directory; keeping that directory in a single
# constant lets the script be pointed at another location in one place.
DATA_DIR = "C:/Users/Himanshu Malik/Downloads/Compressed/dataset and all"

train_features = pd.read_csv(f"{DATA_DIR}/training_set_features.csv")
train_labels = pd.read_csv(f"{DATA_DIR}/training_set_labels.csv")
test_features = pd.read_csv(f"{DATA_DIR}/test_set_features.csv")

# Merge training features and labels on respondent_id.
# validate="one_to_one" makes pandas raise a MergeError when respondent_id is
# duplicated on either side, instead of silently multiplying training rows.
train_df = pd.merge(
    train_features,
    train_labels,
    on="respondent_id",
    validate="one_to_one",
)

# Separate features and target variables.
# The identifier and both targets are removed from the training features;
# only the identifier is removed from the test features.
X = train_df.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y_xyz = train_df['xyz_vaccine']
y_seasonal = train_df['seasonal_vaccine']
test_X = test_features.drop(columns=['respondent_id'])

# Preprocessing. Every transformer is fitted on the training features (X) and
# then applied, unchanged, to the test features (test_X).

# Fill missing values with each column's most frequent value.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)
X_imputed = imputer.transform(X)
test_X_imputed = imputer.transform(test_X)

# One-hot encode the object-dtype columns; categories unseen during fitting
# are ignored at transform time (handle_unknown='ignore').
# NOTE(review): the encoder reads the raw X / test_X frames, not the imputed
# arrays above, so the imputed values of these columns are thrown away when
# the columns are dropped below — confirm this is intended.
categorical_columns = X.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(X[categorical_columns])
X_encoded = encoder.transform(X[categorical_columns])
test_X_encoded = encoder.transform(test_X[categorical_columns])

# Put the imputed arrays back into labelled DataFrames and keep only the
# columns that were not one-hot encoded.
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns).drop(
    columns=categorical_columns
)
test_X_imputed_df = pd.DataFrame(test_X_imputed, columns=test_X.columns).drop(
    columns=categorical_columns
)

# Place the encoded columns first, followed by the remaining imputed columns.
X_final = pd.concat(
    [pd.DataFrame(X_encoded), X_imputed_df.reset_index(drop=True)],
    axis=1,
)
test_X_final = pd.concat(
    [pd.DataFrame(test_X_encoded), test_X_imputed_df.reset_index(drop=True)],
    axis=1,
)

# The encoded frame has integer column labels and the imputed frame has
# string labels; cast everything to str so the labels are of a single type.
X_final.columns = X_final.columns.astype(str)
test_X_final.columns = test_X_final.columns.astype(str)

# Split the data.
# A single call splits the features and both targets together, so every
# array shares exactly the same train/validation rows. Two separate calls
# would only line up because they use the same random_state, and the second
# call would silently overwrite X_train / X_val.
(
    X_train, X_val,
    y_xyz_train, y_xyz_val,
    y_seasonal_train, y_seasonal_val,
) = train_test_split(X_final, y_xyz, y_seasonal, test_size=0.2, random_state=42)


def _fit_and_score(features_train, target_train, features_val, target_val):
    """Fit a logistic regression and score it on the validation rows.

    Args:
        features_train: Feature matrix used for fitting.
        target_train: Target values aligned with ``features_train``.
        features_val: Feature matrix used for validation.
        target_val: Target values aligned with ``features_val``.

    Returns:
        A ``(model, val_probs, roc_auc)`` tuple: the fitted estimator, column 1
        of its ``predict_proba`` output on ``features_val``, and the ROC AUC of
        those probabilities against ``target_val``.
    """
    model = LogisticRegression(random_state=42, max_iter=1000)
    model.fit(features_train, target_train)
    val_probs = model.predict_proba(features_val)[:, 1]
    return model, val_probs, roc_auc_score(target_val, val_probs)


# Train the model for xyz_vaccine using Logistic Regression
model_xyz, xyz_pred_probs, xyz_roc_auc = _fit_and_score(
    X_train, y_xyz_train, X_val, y_xyz_val
)

# Train the model for seasonal_vaccine using Logistic Regression
model_seasonal, seasonal_pred_probs, seasonal_roc_auc = _fit_and_score(
    X_train, y_seasonal_train, X_val, y_seasonal_val
)

# Print the ROC AUC scores
print(f'ROC AUC for xyz_vaccine: {xyz_roc_auc}')
print(f'ROC AUC for seasonal_vaccine: {seasonal_roc_auc}')

# Calculate and print the mean of the two ROC AUC scores
mean_roc_auc = (xyz_roc_auc + seasonal_roc_auc) / 2
print(f'Mean ROC AUC: {mean_roc_auc}')

# Score the test set with both fitted models, keeping column 1 of each
# predict_proba output (the same column that was scored during validation).
test_xyz_pred_probs, test_seasonal_pred_probs = (
    fitted_model.predict_proba(test_X_final)[:, 1]
    for fitted_model in (model_xyz, model_seasonal)
)

# Assemble the submission: one row per test respondent, with the identifier
# followed by one probability column per target.
submission_columns = {
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_xyz_pred_probs,
    'seasonal_vaccine': test_seasonal_pred_probs,
}
submission = pd.DataFrame(submission_columns)

# Write the submission next to the input data, without the DataFrame index.
submission_path = "C:/Users/Himanshu Malik/Downloads/Compressed/dataset and all/submission_final.csv"
submission.to_csv(submission_path, index=False)

ROC AUC for xyz_vaccine: 0.8318514736656331
ROC AUC for seasonal_vaccine: 0.8561786808368972
Mean ROC AUC: 0.8440150772512651
