In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

In [52]:
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

In [54]:
train_data = pd.merge(train_features, train_labels, on='respondent_id')

In [56]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   xyz_concern                  26615 non-null  float64
 2   xyz_knowledge                26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_xyz              24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [58]:
X = train_data.drop(['respondent_id','census_msa','race','rent_or_own','employment_industry', 'xyz_vaccine','seasonal_vaccine'], axis=1)
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]
test_data = test_features.drop(['respondent_id'], axis=1)

In [60]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   xyz_concern                  26615 non-null  float64
 1   xyz_knowledge                26591 non-null  float64
 2   behavioral_antiviral_meds    26636 non-null  float64
 3   behavioral_avoidance         26499 non-null  float64
 4   behavioral_face_mask         26688 non-null  float64
 5   behavioral_wash_hands        26665 non-null  float64
 6   behavioral_large_gatherings  26620 non-null  float64
 7   behavioral_outside_home      26625 non-null  float64
 8   behavioral_touch_face        26579 non-null  float64
 9   doctor_recc_xyz              24547 non-null  float64
 10  doctor_recc_seasonal         24547 non-null  float64
 11  chronic_med_condition        25736 non-null  float64
 12  child_under_6_months         25887 non-null  float64
 13  health_worker   

In [62]:
def fill_missing_values(df):
    for column in df.columns:
        if np.issubdtype(df[column].dtype, np.number):
            df.fillna({column: df[column].mean()}, inplace=True)
        else:
            df.fillna({column: df[column].mode()[0]}, inplace=True)

    return df

In [64]:
X_filled = fill_missing_values(X)
test_data_filled = fill_missing_values(test_data)

In [66]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   xyz_concern                  26707 non-null  float64
 1   xyz_knowledge                26707 non-null  float64
 2   behavioral_antiviral_meds    26707 non-null  float64
 3   behavioral_avoidance         26707 non-null  float64
 4   behavioral_face_mask         26707 non-null  float64
 5   behavioral_wash_hands        26707 non-null  float64
 6   behavioral_large_gatherings  26707 non-null  float64
 7   behavioral_outside_home      26707 non-null  float64
 8   behavioral_touch_face        26707 non-null  float64
 9   doctor_recc_xyz              26707 non-null  float64
 10  doctor_recc_seasonal         26707 non-null  float64
 11  chronic_med_condition        26707 non-null  float64
 12  child_under_6_months         26707 non-null  float64
 13  health_worker   

In [68]:
X = pd.get_dummies(X_filled, drop_first=True)
test_data = pd.get_dummies(test_data_filled, drop_first=True)

In [70]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 67 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   xyz_concern                           26707 non-null  float64
 1   xyz_knowledge                         26707 non-null  float64
 2   behavioral_antiviral_meds             26707 non-null  float64
 3   behavioral_avoidance                  26707 non-null  float64
 4   behavioral_face_mask                  26707 non-null  float64
 5   behavioral_wash_hands                 26707 non-null  float64
 6   behavioral_large_gatherings           26707 non-null  float64
 7   behavioral_outside_home               26707 non-null  float64
 8   behavioral_touch_face                 26707 non-null  float64
 9   doctor_recc_xyz                       26707 non-null  float64
 10  doctor_recc_seasonal                  26707 non-null  float64
 11  chronic_med_con

In [72]:
X, test_data = X.align(test_data, join='left', axis=1, fill_value=0)

In [74]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [76]:
xyz = SVC(probability=True)
seasonal = SVC(probability=True)

In [78]:
xyz.fit(X_train, y_train['xyz_vaccine'])
seasonal.fit(X_train, y_train['seasonal_vaccine'])

In [79]:
xyz_prob = xyz.predict_proba(X_valid)[:, 1]
seasonal_prob = seasonal.predict_proba(X_valid)[:, 1]

In [80]:
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], xyz_prob)
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], seasonal_prob)

In [81]:
print('ROC AUC Score for xyz_vaccine:', roc_auc_xyz)
print('ROC AUC Score for seasonal_vaccine:', roc_auc_seasonal)
print('Average ROC AUC Score:', (roc_auc_xyz + roc_auc_seasonal) / 2)

ROC AUC Score for xyz_vaccine: 0.8303410937920717
ROC AUC Score for seasonal_vaccine: 0.8605956186884263
Average ROC AUC Score: 0.845468356240249


In [82]:
test_xyz_prob = xyz.predict_proba(test_data)[:, 1]
test_seasonal_prob = seasonal.predict_proba(test_data)[:, 1]

In [83]:
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_xyz_prob,
    'seasonal_vaccine': test_seasonal_prob
})

In [84]:
submission.to_csv('Final.csv',index=False)