In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from google.colab import drive, files
drive.mount('/content/drive')
train_features=pd.read_csv('/content/drive/MyDrive/training_set_features.csv')
train_labels=pd.read_csv('/content/drive/MyDrive/training_set_labels.csv')
test_data=pd.read_csv('/content/drive/MyDrive/test_set_features.csv')

#Separating the categorical and numerical columns
categorical_cols=train_features.select_dtypes(include=['object']).columns
numerical_cols=train_features.select_dtypes(exclude=['object']).columns

#Encoding the categorical columns numerically and one-hot encode
encoders={}
for col in categorical_cols:
    encoders[col]=LabelEncoder()
    train_features[col]=encoders[col].fit_transform(train_features[col])
    test_data[col]=encoders[col].transform(test_data[col])

ohe=OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe_train_features=ohe.fit_transform(train_features[categorical_cols])
ohe_test_features=ohe.transform(test_data[categorical_cols])

X_train=pd.concat([train_features[numerical_cols], pd.DataFrame(ohe_train_features, index=train_features.index)], axis=1)
X_test=pd.concat([test_data[numerical_cols], pd.DataFrame(ohe_test_features, index=test_data.index, columns=X_train.columns[len(numerical_cols):])], axis=1)

#Converting column names to strings
X_train.columns=X_train.columns.astype(str)
X_test.columns=X_test.columns.astype(str)

#Imputing missing values in the numerical columns
numerical_imputer=SimpleImputer(strategy='mean')
X_train[numerical_cols]=numerical_imputer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols]=numerical_imputer.transform(X_test[numerical_cols])

#Separating labels
y_train_xyz=train_labels['xyz_vaccine']
y_train_seasonal=train_labels['seasonal_vaccine']

#Spliting the training data into train and validation sets
X_train, X_val, y_train_xyz, y_val_xyz, y_train_seasonal, y_val_seasonal=train_test_split(X_train, y_train_xyz, y_train_seasonal, test_size=0.2, random_state=42)

#Training
xyz_model=LogisticRegression(max_iter=1000)
xyz_model.fit(X_train, y_train_xyz)

seasonal_model=LogisticRegression(max_iter=1000)
seasonal_model.fit(X_train, y_train_seasonal)

#Prediction
xyz_val_probs=xyz_model.predict_proba(X_val)[:, 1]
seasonal_val_probs=seasonal_model.predict_proba(X_val)[:, 1]

#Calculating scores
xyz_val_auc=roc_auc_score(y_val_xyz, xyz_val_probs)
seasonal_val_auc=roc_auc_score(y_val_seasonal, seasonal_val_probs)

print(f"xyz_vaccine: {xyz_val_auc:.4f}")
print(f"seasonal_vaccine: {seasonal_val_auc:.4f}")
print(f"Mean ROC AUC: {(xyz_val_auc + seasonal_val_auc) / 2:.4f}")

X_test=pd.DataFrame(data=X_test, columns=X_train.columns)

#Probabilities for the test set
xyz_test_probs=xyz_model.predict_proba(X_test)[:, 1]
seasonal_test_probs=seasonal_model.predict_proba(X_test)[:, 1]

#Saving into new file
submission=pd.DataFrame({'respondent_id': test_data['respondent_id'], 'xyz_vaccine': xyz_test_probs, 'seasonal_vaccine': seasonal_test_probs})
submission.to_csv('Output_RoopeshRanjan.csv', index=False)
files.download('Output_RoopeshRanjan.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
xyz_vaccine: 0.8243
seasonal_vaccine: 0.8304
Mean ROC AUC: 0.8273


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>