In [134]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt


In [156]:
# Load the datasets
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

# Merge features and labels on the training data
train_data = pd.merge(train_features, train_labels, on='respondent_id')

# Drop respondent_id as it is not a feature
X_train = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y_train = train_data[['xyz_vaccine', 'seasonal_vaccine']]

# Test set (respondent_id is needed for submission, so save it separately)
X_test = test_features.drop(columns=['respondent_id'])
test_respondent_id = test_features['respondent_id']


In [11]:
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(include=['number']).columns

In [158]:
X_train.columns[X_train.isna().any()]

Index(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'education',
       'income_poverty', 'marital_status', 'rent_or_own', 'employment_status',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')

In [159]:
X_train.describe()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,health_worker,health_insurance,opinion_xyz_vacc_effective,opinion_xyz_risk,opinion_xyz_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
count,26615.0,26591.0,26636.0,26499.0,26688.0,26665.0,26620.0,26625.0,26579.0,24547.0,...,25903.0,14433.0,26316.0,26319.0,26312.0,26245.0,26193.0,26170.0,26458.0,26458.0
mean,1.618486,1.262532,0.048844,0.725612,0.068982,0.825614,0.35864,0.337315,0.677264,0.220312,...,0.111918,0.87972,3.850623,2.342566,2.35767,4.025986,2.719162,2.118112,0.886499,0.534583
std,0.910311,0.618149,0.215545,0.446214,0.253429,0.379448,0.47961,0.472802,0.467531,0.414466,...,0.315271,0.3253,1.007436,1.285539,1.362766,1.086565,1.385055,1.33295,0.753422,0.928173
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0
50%,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,4.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0
75%,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,5.0,4.0,4.0,5.0,4.0,4.0,1.0,1.0
max,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0


In [160]:
X_train.isnull().sum()

xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
marital_status                  1408
r

In [161]:
X_train.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [162]:

X_train['health_insurance'].value_counts()

health_insurance
1.0    12697
0.0     1736
Name: count, dtype: int64

In [163]:
X_train['employment_industry'].value_counts()

employment_industry
fcxhlnwr    2468
wxleyezf    1804
ldnlellj    1231
pxcmvdjn    1037
atmlpfrs     926
arjwrbjb     871
xicduogh     851
mfikgejo     614
vjjrobsf     527
rucpziij     523
xqicxuve     511
saaquncn     338
cfqqtusy     325
nduyfdeo     286
mcubkhph     275
wlfvacwt     215
dotnnunm     201
haxffmxo     148
msuufmds     124
phxvnwax      89
qnlwzans      13
Name: count, dtype: int64

In [164]:
X_train.shape

(26707, 35)

In [165]:
counts = (joined_df(['xyz_concern','xyz_knowledge']))
concern_counts = counts.sum(axis=1)
concern_counts

NameError: name 'joined_df' is not defined

In [166]:
X_train['child_under_6_months'].fillna(0,inplace=True)

In [167]:
X_train['household_adults'].fillna(0,inplace=True)

In [168]:
X_train['household_children'].fillna(0,inplace=True)

In [169]:
X_train.isna().sum()


xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months               0
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
marital_status                  1408
r

In [170]:
no_null_cols = [col for col in X_train.columns if X_train[col].isna().sum()==0]

## drop columns without missing info to get a clearer view of patterns
X_train.drop(columns=no_null_cols, axis=1)

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,education,income_poverty,marital_status,rent_or_own,employment_status,employment_industry,employment_occupation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,2.0,1.0,2.0,< 12 Years,Below Poverty,Not Married,Own,Not in Labor Force,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,4.0,2.0,4.0,12 Years,Below Poverty,Not Married,Rent,Employed,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,4.0,1.0,2.0,College Graduate,"<= $75,000, Above Poverty",Not Married,Own,Employed,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,5.0,4.0,1.0,12 Years,Below Poverty,Not Married,Rent,Not in Labor Force,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,3.0,1.0,4.0,Some College,"<= $75,000, Above Poverty",Married,Own,Employed,wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,5.0,2.0,2.0,Some College,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,,
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,5.0,1.0,1.0,College Graduate,"<= $75,000, Above Poverty",Not Married,Rent,Employed,fcxhlnwr,cmhcxjea
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,5.0,4.0,2.0,Some College,,Not Married,Own,,,
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,2.0,1.0,2.0,Some College,"<= $75,000, Above Poverty",Married,Rent,Employed,fcxhlnwr,haliazsg


In [171]:
X_train.loc[X_train['employment_status'] == 'Unemployed', 'employment_industry'] = 'not employed'

## if a person is not in the labor force, change their 'employment_industry' to 'not_employed'
X_train.loc[X_train['employment_status'] == 'Not in Labor Force', 'employment_industry'] = 'not employed'


In [172]:
X_train['employment_industry'].isna().sum()

1646

In [173]:
X_train.loc[X_train['employment_status'] == 'Unemployed', 'employment_occupation'] = 'not employed'

## if a person is not in the labor force, change their 'employment_industry' to 'not_employed'
X_train.loc[X_train['employment_status'] == 'Not in Labor Force', 'employment_occupation'] = 'not employed'


In [174]:
X_train.isna().sum()


xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months               0
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
marital_status                  1408
r

In [175]:
for col in X_train.columns:
    display(X_train[col].value_counts())

xyz_concern
2.0    10575
1.0     8153
3.0     4591
0.0     3296
Name: count, dtype: int64

xyz_knowledge
1.0    14598
2.0     9487
0.0     2506
Name: count, dtype: int64

behavioral_antiviral_meds
0.0    25335
1.0     1301
Name: count, dtype: int64

behavioral_avoidance
1.0    19228
0.0     7271
Name: count, dtype: int64

behavioral_face_mask
0.0    24847
1.0     1841
Name: count, dtype: int64

behavioral_wash_hands
1.0    22015
0.0     4650
Name: count, dtype: int64

behavioral_large_gatherings
0.0    17073
1.0     9547
Name: count, dtype: int64

behavioral_outside_home
0.0    17644
1.0     8981
Name: count, dtype: int64

behavioral_touch_face
1.0    18001
0.0     8578
Name: count, dtype: int64

doctor_recc_xyz
0.0    19139
1.0     5408
Name: count, dtype: int64

doctor_recc_seasonal
0.0    16453
1.0     8094
Name: count, dtype: int64

chronic_med_condition
0.0    18446
1.0     7290
Name: count, dtype: int64

child_under_6_months
0.0    24569
1.0     2138
Name: count, dtype: int64

health_worker
0.0    23004
1.0     2899
Name: count, dtype: int64

health_insurance
1.0    12697
0.0     1736
Name: count, dtype: int64

opinion_xyz_vacc_effective
4.0    11683
5.0     7166
3.0     4723
2.0     1858
1.0      886
Name: count, dtype: int64

opinion_xyz_risk
2.0    9919
1.0    8139
4.0    5394
5.0    1750
3.0    1117
Name: count, dtype: int64

opinion_xyz_sick_from_vacc
2.0    9129
1.0    8998
4.0    5850
5.0    2187
3.0     148
Name: count, dtype: int64

opinion_seas_vacc_effective
4.0    11629
5.0     9973
2.0     2206
1.0     1221
3.0     1216
Name: count, dtype: int64

opinion_seas_risk
2.0    8954
4.0    7630
1.0    5974
5.0    2958
3.0     677
Name: count, dtype: int64

opinion_seas_sick_from_vacc
1.0    11870
2.0     7633
4.0     4852
5.0     1721
3.0       94
Name: count, dtype: int64

age_group
65+ Years        6843
55 - 64 Years    5563
45 - 54 Years    5238
18 - 34 Years    5215
35 - 44 Years    3848
Name: count, dtype: int64

education
College Graduate    10097
Some College         7043
12 Years             5797
< 12 Years           2363
Name: count, dtype: int64

race
White                21222
Black                 2118
Hispanic              1755
Other or Multiple     1612
Name: count, dtype: int64

sex
Female    15858
Male      10849
Name: count, dtype: int64

income_poverty
<= $75,000, Above Poverty    12777
> $75,000                     6810
Below Poverty                 2697
Name: count, dtype: int64

marital_status
Married        13555
Not Married    11744
Name: count, dtype: int64

rent_or_own
Own     18736
Rent     5929
Name: count, dtype: int64

employment_status
Employed              13560
Not in Labor Force    10231
Unemployed             1453
Name: count, dtype: int64

hhs_geo_region
lzgpxyit    4297
fpwskwrf    3265
qufhixun    3102
oxchjgsf    2859
kbazzjca    2858
bhuqouqj    2846
mlyzmhmf    2243
lrircsnp    2078
atmpeygn    2033
dqpwygqj    1126
Name: count, dtype: int64

census_msa
MSA, Not Principle  City    11645
MSA, Principle City          7864
Non-MSA                      7198
Name: count, dtype: int64

household_adults
1.0    14474
0.0     8305
2.0     2803
3.0     1125
Name: count, dtype: int64

household_children
0.0    18921
1.0     3175
2.0     2864
3.0     1747
Name: count, dtype: int64

employment_industry
not employed    11684
fcxhlnwr         2468
wxleyezf         1804
ldnlellj         1231
pxcmvdjn         1037
atmlpfrs          926
arjwrbjb          871
xicduogh          851
mfikgejo          614
vjjrobsf          527
rucpziij          523
xqicxuve          511
saaquncn          338
cfqqtusy          325
nduyfdeo          286
mcubkhph          275
wlfvacwt          215
dotnnunm          201
haxffmxo          148
msuufmds          124
phxvnwax           89
qnlwzans           13
Name: count, dtype: int64

employment_occupation
not employed    11684
xtkaffoo         1778
mxkfnird         1509
emcorrxb         1270
cmhcxjea         1247
xgwztkwe         1082
hfxkjkmi          766
qxajmpny          548
xqwwgdyp          485
kldqjyjy          469
uqqtjvyb          452
tfqavkke          388
ukymxvdu          372
vlluhbov          354
oijqvulv          344
ccgxvspp          341
bxpfxfdn          331
haliazsg          296
rcertsgn          276
xzmlyyjv          248
dlvbwzss          227
hodpvpew          208
dcjcmpih          148
pvmttkik           98
Name: count, dtype: int64

In [176]:
behavior_cols = [col for col in X_train.columns if 'behavioral' in col]
behavior_cols

['behavioral_antiviral_meds',
 'behavioral_avoidance',
 'behavioral_face_mask',
 'behavioral_wash_hands',
 'behavioral_large_gatherings',
 'behavioral_outside_home',
 'behavioral_touch_face']

In [177]:
X_train['behav_score'] = X_train[behavior_cols].sum(axis=1)

## ## check counts of unique values in new col and plot distribution
X_train['behav_score'].value_counts()

behav_score
3.0    7256
2.0    4584
5.0    4504
4.0    4331
1.0    2715
0.0    2109
6.0    1037
7.0     171
Name: count, dtype: int64

In [178]:
 X_train['behav_to_risk'] = ( X_train['behav_score'] + 1) / X_train['opinion_seas_risk']

## ## check counts of unique values in new col and plot distribution
X_train['behav_to_risk'].value_counts()

behav_to_risk
1.000000    4871
2.000000    3563
1.500000    3256
3.000000    2364
4.000000    1429
1.250000    1404
2.500000    1340
0.500000    1283
0.750000    1210
6.000000     840
5.000000     783
1.200000     710
0.800000     697
0.600000     363
1.750000     347
0.250000     296
1.400000     268
3.500000     231
1.333333     170
0.400000     152
7.000000     144
0.666667     116
1.666667      96
0.333333      80
0.200000      79
1.600000      62
8.000000      24
2.333333      15
Name: count, dtype: int64

In [179]:
def is_older_65(row):
    if row['age_group'] == '65+ Years':
        return 1
    else:
        return 0

## apply the function to create new column
X_train['older_65'] = X_train.apply(lambda x: is_older_65(x), axis=1)

## check counts of unique values in new col and plot distribution
X_train['older_65'].value_counts()

older_65
0    19864
1     6843
Name: count, dtype: int64

In [181]:
def calc_high_risk(row):
    risk = 0
    if row['older_65'] == 1:
        risk += 1
    if row['child_under_6_months'] == 1:
        risk += 1
    if row['chronic_med_condition'] == 1:
        risk += 1
    return risk

## apply the function to create new column
X_train['high_risk_compl'] = X_train.apply(lambda x: calc_high_risk(x), axis=1)

## check counts of unique values in new col and plot distribution
X_train['high_risk_compl'].value_counts()

high_risk_compl
0    13931
1     9426
2     3205
3      145
Name: count, dtype: int64

In [182]:
X_train['high_risk_cat'] = X_train['high_risk_compl'].map({0:'low risk', 1:'med risk',
                                                 2:'high risk', 3:'high risk'})

## check counts of unique values in new col
X_train['high_risk_cat'].value_counts()

high_risk_cat
low risk     13931
med risk      9426
high risk     3350
Name: count, dtype: int64

In [183]:

## change binary variable from floats to strings without altering NaN values
 ## (NaN will be automatically filled with 'missing' during preprocessing)
X_train['doctor_recc_seasonal'] = X_train['doctor_recc_seasonal'].map({1.0: '1', 0.0: '0'})
X_train['doctor_recc_seasonal'].value_counts(dropna=False)


doctor_recc_seasonal
0      16453
1       8094
NaN     2160
Name: count, dtype: int64

In [185]:
## change binary variable from floats to strings without altering NaN values
 ## (NaN will be automatically filled with 'missing' during preprocessing)
X_train['health_insurance'] = X_train['health_insurance'].map({1.0: '1', 0.0: '0'})
X_train['health_insurance'].value_counts(dropna=False)


health_insurance
NaN    26707
Name: count, dtype: int64

In [186]:
## define a function to return make combine people of color into one category 
 ## since they're so underrepresented in the dataset
def race_func(row):
    if row['race'] == 'White':
        return 'White'
    else:
        return 'POC'

## apply the function to create new column
X_train['race'] = X_train.apply(lambda x: race_func(x), axis=1)

## check counts of unique values in new col and plot distribution
X_train['race'].value_counts(dropna=False)

race
White    21222
POC       5485
Name: count, dtype: int64

In [187]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   xyz_concern                  26615 non-null  float64
 1   xyz_knowledge                26591 non-null  float64
 2   behavioral_antiviral_meds    26636 non-null  float64
 3   behavioral_avoidance         26499 non-null  float64
 4   behavioral_face_mask         26688 non-null  float64
 5   behavioral_wash_hands        26665 non-null  float64
 6   behavioral_large_gatherings  26620 non-null  float64
 7   behavioral_outside_home      26625 non-null  float64
 8   behavioral_touch_face        26579 non-null  float64
 9   doctor_recc_xyz              24547 non-null  float64
 10  doctor_recc_seasonal         24547 non-null  object 
 11  chronic_med_condition        25736 non-null  float64
 12  child_under_6_months         26707 non-null  float64
 13  health_worker   

In [188]:

## create df with remaining null values filled in with 'missing' for vizualizations
X_train_missing = X_train.fillna(value='missing')
df_missing.head()

NameError: name 'df_missing' is not defined

In [189]:
X_train.isnull().sum()

xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months               0
health_worker                    804
health_insurance               26707
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
marital_status                  1408
r

In [190]:
X_train['health_insurance'].fillna(0,inplace=True)

In [191]:
X_train.dropna

<bound method DataFrame.dropna of        xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              1.0            0.0                        0.0   
1              3.0            2.0                        0.0   
2              1.0            1.0                        0.0   
3              1.0            1.0                        0.0   
4              2.0            1.0                        0.0   
...            ...            ...                        ...   
26702          2.0            0.0                        0.0   
26703          1.0            2.0                        0.0   
26704          2.0            2.0                        0.0   
26705          1.0            1.0                        0.0   
26706          0.0            0.0                        0.0   

       behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                       0.0                   0.0                    0.0   
1                       1.0                  

In [249]:
X_train.isull().sum()

AttributeError: 'NoneType' object has no attribute 'isull'

In [248]:
X_train = X_train.dropna(inplace=True)

AttributeError: 'NoneType' object has no attribute 'dropna'

In [218]:
y_train.head()

Unnamed: 0,xyz_vaccine,seasonal_vaccine
0,0,0
1,0,1
2,0,0
3,0,1
4,0,0


In [222]:

# Create a column transformer with OneHotEncoder for categorical variables and StandardScaler for numerical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Create a preprocessing and modeling pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(MultinomialNB(), n_jobs=-1))
])



In [234]:
y_train

array([[0, 0],
       [0, 1],
       [0, 0],
       ...,
       [0, 1],
       [0, 0],
       [0, 0]], dtype=int64)

In [243]:
y_train = y_train.reshape(1, -1)
# Train the model
model.fit(X_train, y_train)

ValueError: Expected 2D array, got scalar array instead:
array=None.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [247]:
X.train.shape()

In [241]:
# Train the model
model.fit(X_train, y_train)

ValueError: Expected 2D array, got scalar array instead:
array=None.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [209]:
# Predict probabilities for the test set
y_pred_prob = model.predict_proba(X_test)

# Extract the probabilities for each target variable
xyz_vaccine_prob = y_pred_prob[0][:, 1]
seasonal_vaccine_prob = y_pred_prob[1][:, 1]


NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [210]:
# Split the training data into a train and validation set for evaluation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model on the split training set
model.fit(X_train_split, y_train_split)

# Predict probabilities for the validation set
y_val_pred_prob = model.predict_proba(X_val_split)

# Extract the probabilities for each target variable
xyz_vaccine_val_prob = y_val_pred_prob[0][:, 1]
seasonal_vaccine_val_prob = y_val_pred_prob[1][:, 1]

# Calculate ROC and AUC for each label
fpr_xyz, tpr_xyz, _ = roc_curve(y_val_split['xyz_vaccine'], xyz_vaccine_val_prob)
fpr_seasonal, tpr_seasonal, _ = roc_curve(y_val_split['seasonal_vaccine'], seasonal_vaccine_val_prob)

auc_xyz = roc_auc_score(y_val_split['xyz_vaccine'], xyz_vaccine_val_prob)
auc_seasonal = roc_auc_score(y_val_split['seasonal_vaccine'], seasonal_vaccine_val_prob)

# Plot ROC curves
plt.figure()
plt.plot(fpr_xyz, tpr_xyz, label=f'XYZ Vaccine (AUC = {auc_xyz:.2f})')
plt.plot(fpr_seasonal, tpr_seasonal, label=f'Seasonal Vaccine (AUC = {auc_seasonal:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for XYZ and Seasonal Vaccines')
plt.legend(loc='lower right')
plt.show()

print(f'XYZ Vaccine AUC: {auc_xyz:.2f}')
print(f'Seasonal Vaccine AUC: {auc_seasonal:.2f}')


TypeError: Expected sequence or array-like, got <class 'NoneType'>

In [211]:
# Prepare the submission DataFrame
submission = pd.DataFrame({
    'respondent_id': test_respondent_id,
    'xyz_vaccine': xyz_vaccine_prob,
    'seasonal_vaccine': seasonal_vaccine_prob
})

# Save to CSV
submission.to_csv('submission.csv', index=False)


NameError: name 'xyz_vaccine_prob' is not defined