In [1]:
import pandas as pd
import os

train_features = pd.read_csv("training_set_features.csv")
train_labels = pd.read_csv("training_set_labels.csv")
test_features = pd.read_csv("test_set_features.csv")
train_features = train_features.merge(train_labels, on='respondent_id')

In [2]:
from sklearn.impute import SimpleImputer

X = train_features.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y_xyz = train_features[['xyz_vaccine']].values
y_s = train_features[['seasonal_vaccine']].values
X_test = test_features.drop(columns=['respondent_id'])

imputer = SimpleImputer(strategy='most_frequent')
X_i = imputer.fit_transform(X)
X_test_i = imputer.transform(X_test)
X_i = pd.DataFrame(X_i, columns=X.columns)
X_test_i = pd.DataFrame(X_test_i, columns=X_test.columns)

In [3]:
from sklearn.preprocessing import LabelEncoder

headings = {}
for column in X.columns:
    if X[column].dtype == 'object':
        le = LabelEncoder()
        X_i[column] = le.fit_transform(X_i[column])
        X_test_i[column] = le.transform(X_test_i[column])
        headings[column] = le

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X_i)
X_test = scaler.transform(X_test_i)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

lr_xyz = LogisticRegression(max_iter=1000)
lr_xyz.fit(X_i, y_xyz.ravel())

lr_seasonal = LogisticRegression(max_iter=1000)
lr_seasonal.fit(X_i, y_s.ravel())

roc_auc_xyz_lr = roc_auc_score(y_xyz, lr_xyz.predict_proba(X_i)[:, 1],average='macro')
roc_auc_seasonal_lr = roc_auc_score(y_s, lr_seasonal.predict_proba(X_i)[:, 1],average='macro')
print(f"ROC AUC in Logistic {(roc_auc_xyz_lr+roc_auc_seasonal_lr )/2}")

xyz_vaccine_lr = lr_xyz.predict_proba(X_test_i)[:, 1] 
seasonal_vaccine_lr = lr_seasonal.predict_proba(X_test_i)[:, 1] 

ROC AUC in Logistic 0.8398170406107611


In [6]:
from sklearn.naive_bayes import GaussianNB

gnb_xyz = GaussianNB()
gnb_seasonal = GaussianNB()

gnb_xyz.fit(X_i, y_xyz.ravel())
gnb_seasonal.fit(X_i, y_s.ravel())

roc_auc_xyz_gnb = roc_auc_score(y_xyz, gnb_xyz.predict_proba(X_i)[:, 1])
roc_auc_seasonal_gnb = roc_auc_score(y_s, gnb_seasonal.predict_proba(X_i)[:, 1]) 
print(f"ROC AUC in Guassian Naive Bias is {(roc_auc_xyz_gnb+roc_auc_seasonal_gnb)/2}")

xyz_vaccine_gnb = gnb_xyz.predict_proba(X_test_i)[:, 1]
seasonal_vaccine_gnb = gnb_seasonal.predict_proba(X_test_i)[:, 1]

ROC AUC in Guassian Naive Bias is 0.7931017761076203


In [7]:
from sklearn import svm

svm_xyz = svm.SVC(probability=True, kernel='rbf')
svm_seasonal = svm.SVC(probability=True, kernel='rbf')

svm_xyz.fit(X_i, y_xyz.ravel())
svm_seasonal.fit(X_i, y_s.ravel())

roc_auc_xyz_svm = roc_auc_score(y_xyz, svm_xyz.predict_proba(X_i)[:, 1])
roc_auc_seasonal_svm = roc_auc_score(y_s, svm_seasonal.predict_proba(X_i)[:, 1])
print(f"ROC AUC in SVM is {(roc_auc_xyz_svm+roc_auc_seasonal_svm)/2}")

xyz_vaccine_svm = svm_xyz.predict_proba(X_test_i)[:, 1]
seasonal_vaccine_svm = svm_seasonal.predict_proba(X_test_i)[:, 1]

ROC AUC in SVM is 0.8434537851078453


In [8]:
test_ids = test_features['respondent_id']
submission = pd.DataFrame({
    'respondent_id': test_ids,
    'xyz_vaccine': xyz_vaccine_lr,
    'seasonal_vaccine': seasonal_vaccine_lr
})
submission.to_csv('submission.csv', index=False)