In [14]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Read the training features, training labels and test features, in that order.
xTrain, yTrain, xTest = map(
    pd.read_csv,
    (
        "data/training_set_features.csv",  # 26707 x 36
        "data/training_set_labels.csv",  # 26707 x 3
        "data/test_set_features.csv",  # 26708 x 36
    ),
)

In [3]:
# Summary statistics of the numeric training columns; `count` below the row
# total reveals how many values are missing per column.
xTrain.describe()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,health_worker,health_insurance,opinion_xyz_vacc_effective,opinion_xyz_risk,opinion_xyz_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
count,26707.0,26615.0,26591.0,26636.0,26499.0,26688.0,26665.0,26620.0,26625.0,26579.0,...,25903.0,14433.0,26316.0,26319.0,26312.0,26245.0,26193.0,26170.0,26458.0,26458.0
mean,13353.0,1.618486,1.262532,0.048844,0.725612,0.068982,0.825614,0.35864,0.337315,0.677264,...,0.111918,0.87972,3.850623,2.342566,2.35767,4.025986,2.719162,2.118112,0.886499,0.534583
std,7709.791156,0.910311,0.618149,0.215545,0.446214,0.253429,0.379448,0.47961,0.472802,0.467531,...,0.315271,0.3253,1.007436,1.285539,1.362766,1.086565,1.385055,1.33295,0.753422,0.928173
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,6676.5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0
50%,13353.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,4.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0
75%,20029.5,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,5.0,4.0,4.0,5.0,4.0,4.0,1.0,1.0
max,26706.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0


In [4]:
# Column dtypes of the features and of the labels, followed by a preview of
# the first feature rows.
print(xTrain.dtypes)
print(yTrain.dtypes)
xTrain.head()

respondent_id                    int64
xyz_concern                    float64
xyz_knowledge                  float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_xyz                float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_xyz_vacc_effective     float64
opinion_xyz_risk               float64
opinion_xyz_sick_from_vacc     float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [5]:
# Show only the object-dtype (string-valued) columns of the training features.
xTrain.select_dtypes('object')

Unnamed: 0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
0,55 - 64 Years,< 12 Years,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,,
1,35 - 44 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",pxcmvdjn,xgwztkwe
2,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",rucpziij,xtkaffoo
3,65+ Years,12 Years,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",,
4,45 - 54 Years,Some College,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...
26702,65+ Years,Some College,White,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,,
26703,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",fcxhlnwr,cmhcxjea
26704,55 - 64 Years,Some College,White,Female,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",,
26705,18 - 34 Years,Some College,Hispanic,Female,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,fcxhlnwr,haliazsg


In [13]:
def multi_output_roc_auc_score(y_true, y_pred_proba):
    """Return the mean ROC AUC over the outputs of a multi-output classifier.

    Parameters
    ----------
    y_true : DataFrame or array-like of shape (n_samples, n_outputs)
        True labels, one column per output.
    y_pred_proba : sequence of 2-D arrays
        One array per output, in the same order as the columns of ``y_true``.
        Column 1 of each array is used as the score passed to
        ``roc_auc_score`` (presumably the positive-class probability, as
        returned by ``predict_proba`` -- confirm for non-binary outputs).

    Returns
    -------
    float
        Unweighted mean of the per-output ROC AUC values. With two outputs
        this equals ``(auc_0 + auc_1) / 2``.
    """
    # Positional indexing on a plain array works for both DataFrames and
    # ndarrays; the previous `.iloc` indexing required a pandas object.
    labels = np.asarray(y_true)
    per_output_auc = [
        roc_auc_score(labels[:, idx], proba[:, 1])
        for idx, proba in enumerate(y_pred_proba)
    ]
    return np.mean(per_output_auc)

def getCvs(estimator,cv=5):
    """Cross-validate ``estimator`` on the training data and print the scores.

    Uses the module-level ``xTrain`` as features, every column of ``yTrain``
    except the first as targets (the first is presumably the respondent id --
    confirm against the labels file), and the module-level ``scorer``.

    Parameters
    ----------
    estimator : estimator object
        Model or pipeline handed to ``cross_val_score``.
    cv : int, default 5
        Forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    The array of per-fold scores returned by ``cross_val_score``.
    """
    targets = yTrain.iloc[:, 1:]
    fold_scores = cross_val_score(
        estimator,
        xTrain,
        targets,
        scoring=scorer,
        cv=cv,
    )
    print(f"Cross Validation Scores: {fold_scores}")
    mean_score = np.mean(fold_scores)
    print(f"Average CVS: {mean_score:.5f}")
    return fold_scores
    
# Scorer that feeds predict_proba output into multi_output_roc_auc_score.
# Equivalent call from sklearn 1.4 onwards:
# scorer = make_scorer(multi_output_roc_auc_score, response_method="predict_proba", greater_is_better=True)
# using sklearn 1.2.2
scorer = make_scorer(multi_output_roc_auc_score, greater_is_better=True, needs_proba=True)

In [26]:
# Numeric branch: fill missing values with the column median, then standardise.
ss = Pipeline(steps=[
    ('fillNa', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
ohe = Pipeline(steps=[
    ('fillCatNa', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Columns are routed by dtype. Columns that are neither float64 nor object
# (e.g. the int64 respondent_id) are not listed in either group.
numerical_features = xTrain.select_dtypes('float64').columns
categorical_features = xTrain.select_dtypes('object').columns

preProcessor = ColumnTransformer(transformers=[
    ('numerical', ss, numerical_features),
    ('categorical', ohe, categorical_features),
])

In [27]:
# Random Forest
# Baseline: one RandomForestClassifier per target column (via
# MultiOutputClassifier) behind the shared preprocessor, scored with getCvs.
# RandomForestClassifier is imported in the notebook's top import cell.
model = MultiOutputClassifier(RandomForestClassifier(random_state=71))
ppl = Pipeline(steps=[('preprocessor',preProcessor),('classifier',model)])

print("Random Forest")
# getCvs already prints the scores; the trailing ';' keeps the returned array
# from being echoed a second time as the cell output.
getCvs(ppl);

Random Forest
Cross Validation Scores: [0.83700679 0.83263141 0.84314177 0.84414667 0.83442334]
Average CVS: 0.83827


array([0.83700679, 0.83263141, 0.84314177, 0.84414667, 0.83442334])

In [40]:
# Logistic Regression
# Grid-search C and the solver of a per-target LogisticRegression.
# LogisticRegression and GridSearchCV are imported in the notebook's top
# import cell.
model = MultiOutputClassifier(LogisticRegression(random_state=71,max_iter=2500))
ppl = Pipeline(steps=[('preprocessor',preProcessor),('classifier',model)])

# 'classifier__estimator__<param>' addresses the LogisticRegression wrapped by
# the MultiOutputClassifier that sits in the pipeline's 'classifier' step.
param_grid = {
    'classifier__estimator__C': [10**x for x in range(-3,3)],
    'classifier__estimator__solver': ['liblinear','sag','lbfgs','saga','newton-cg']
}

grid_search = GridSearchCV(estimator=ppl, param_grid=param_grid,scoring=scorer,cv=5)
grid_search.fit(xTrain,yTrain.iloc[:,1:])

print("Logistic Regression")
print("Best parameters:")
for name, value in grid_search.best_params_.items():
    print(f"{name}: {value}")
print("Best score:",grid_search.best_score_)

# Result of a previous run, kept for reference:
#   classifier__estimator__C: 0.1
#   classifier__estimator__solver: saga
#   Best score: 0.8436856697800124
#
# The tuned, already refit pipeline is available as
# grid_search.best_estimator_ (GridSearchCV uses refit=True by default), so
# there is no need to rebuild the pipeline by hand from best_params_.

Logistic Regression
Best parameters:
classifier__estimator__C: 0.1
classifier__estimator__solver: saga
Best score: 0.8436856697800124


In [7]:
# Use the pipeline selected by the grid search for the final predictions.
# Fitting the `ppl` that was built before the search would train
# LogisticRegression with its default C and solver and silently discard the
# tuned hyper-parameters. GridSearchCV (refit=True by default) has already
# refit the best pipeline on the full training data, so no further fit is
# needed here.
ppl = grid_search.best_estimator_ # X will use specified columns only

In [10]:
# Pass the test features with the same column layout that was used for
# fitting. The preprocessor picks its columns by name, so respondent_id is
# ignored here exactly as it was during training; there is no need to slice it
# off for prediction only.
yPred = ppl.predict_proba(xTest)

In [11]:
# Build the submission table: the respondent id plus, for each target, column 1
# of the corresponding predict_proba array (yPred holds one array per target).
result = pd.DataFrame({'respondent_id': xTest['respondent_id']}).assign(
    xyz_vaccine=yPred[0][:, 1],
    seasonal_vaccine=yPred[1][:, 1],
)

# Write without the DataFrame index so the file contains only the three columns.
result.to_csv('results.csv', index=False)