In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

In [2]:
df = pd.read_csv('training_set_features.csv')
df_labels = pd.read_csv('training_set_labels.csv')
df_test = pd.read_csv('test_set_features.csv')

In [3]:
df.drop(['employment_industry', 'employment_occupation'], axis = 1, inplace = True)
df_test.drop(['employment_industry', 'employment_occupation'], axis = 1, inplace = True)
df.drop(['health_insurance'], axis =1, inplace = True)
df_test.drop(['health_insurance'], axis =1, inplace = True)

In [4]:
res_id = df_test['respondent_id']
df_labels.drop(['respondent_id'], axis =1, inplace = True)
df = pd.concat([df, df_labels], axis = 1)

In [5]:
df.dropna(inplace=True)

In [6]:
df

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,xyz_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,0,1
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,0,0
5,5,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,atmpeygn,"MSA, Principle City",2.0,3.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26700,26700,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,"> $75,000",Married,Own,Not in Labor Force,lzgpxyit,"MSA, Principle City",1.0,0.0,0,1
26701,26701,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,"> $75,000",Not Married,Rent,Not in Labor Force,fpwskwrf,"MSA, Principle City",3.0,0.0,0,0
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,0,0
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,0,0


In [7]:
x = df.select_dtypes(include=['object']).columns
print(x)

Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa'],
      dtype='object')


In [8]:
df[x] = df[x].astype('category')
for i in range(len(x)):
    df[x[i]] = df[x[i]].cat.codes

In [9]:
independent_xyz = ['xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_xyz_vacc_effective', 'opinion_xyz_risk', 'marital_status',
       'rent_or_own', 'employment_status',
       'opinion_xyz_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'hhs_geo_region', 'census_msa', 'household_adults', 'household_children']
dependent_xyz = ['xyz_vaccine']

independent_seas = ['behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_seas_vacc_effective', 'marital_status',
       'rent_or_own', 'employment_status',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'hhs_geo_region', 'census_msa', 'household_adults', 'household_children']
dependent_seas = ['seasonal_vaccine']

In [10]:
sm = SMOTE(random_state = 42)
df_oversampled, df_y_oversampled = sm.fit_resample(df[independent_xyz], df[dependent_xyz])
df_xyz = pd.DataFrame(df_oversampled, columns=independent_xyz)

In [11]:
df_xyz = pd.concat([df_xyz, df_y_oversampled], axis = 1)

In [12]:
df_xyz

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,age_group,education,race,sex,income_poverty,hhs_geo_region,census_msa,household_adults,household_children,xyz_vaccine
0,1.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,1.000000,0.000000,...,3,1,3,0,2,8,2,0.000000,0.000000,0
1,3.000000,2.000000,0.000000,1.000000,0.0,1.000000,0.000000,1.000000,1.000000,0.000000,...,1,0,3,1,2,1,0,0.000000,0.000000,0
2,1.000000,1.000000,0.000000,1.000000,0.0,1.000000,1.000000,0.000000,0.000000,0.000000,...,4,0,3,0,2,5,1,0.000000,0.000000,0
3,2.000000,1.000000,0.000000,1.000000,0.0,1.000000,1.000000,0.000000,1.000000,0.000000,...,2,3,3,0,0,9,0,1.000000,0.000000,0
4,3.000000,1.000000,0.000000,1.000000,0.0,1.000000,0.000000,0.000000,1.000000,0.000000,...,4,0,3,1,0,0,1,2.000000,3.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30251,2.448026,1.551974,0.000000,1.000000,0.0,1.000000,0.000000,0.448026,1.000000,0.551974,...,3,2,3,0,0,6,2,1.000000,0.000000,1
30252,1.000000,1.000000,0.000000,0.046777,0.0,0.046777,0.000000,0.000000,0.000000,0.000000,...,1,0,3,1,1,5,1,1.000000,2.000000,1
30253,1.974272,2.000000,0.974272,1.000000,0.0,1.000000,0.974272,0.000000,1.000000,0.974272,...,1,2,3,0,0,8,2,1.000000,2.000000,1
30254,2.000000,2.000000,0.000000,1.000000,0.0,1.000000,0.000000,0.000000,1.000000,0.171350,...,3,2,3,0,0,1,0,0.171350,0.000000,1


In [13]:
sm = SMOTE(random_state = 42)
df_oversampled, df_y_oversampled = sm.fit_resample(df[independent_seas], df[dependent_seas])
df_seas = pd.DataFrame(df_oversampled, columns=independent_seas)

In [14]:
df_seas = pd.concat([df_seas, df_y_oversampled], axis = 1)
df_seas.columns

Index(['behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_seas_vacc_effective', 'marital_status', 'rent_or_own',
       'employment_status', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
       'age_group', 'education', 'race', 'sex', 'income_poverty',
       'hhs_geo_region', 'census_msa', 'household_adults',
       'household_children', 'seasonal_vaccine'],
      dtype='object')

In [15]:
scaler = MinMaxScaler()
df_xyz[independent_xyz] = scaler.fit_transform(df_xyz[independent_xyz])

df_seas[independent_seas] = scaler.fit_transform(df_seas[independent_seas])

In [16]:
x = df_test.select_dtypes(include=['object']).columns
print(x)

Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa'],
      dtype='object')


In [17]:
df_test[x] = df_test[x].astype('category')
for i in range(len(x)):
    df_test[x[i]] = df_test[x[i]].cat.codes

In [18]:
independent_test = ['xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker','opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children']

In [19]:
df_test[independent_test] = scaler.fit_transform(df_test[independent_test])

In [20]:
for col in independent_test:
    df_test[col].fillna(df_test[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[col].fillna(df_test[col].mean(), inplace=True)


In [21]:
x_train_xyz, x_test_xyz, y_train_xyz, y_test_xyz = train_test_split(df_xyz[independent_xyz], df_xyz[dependent_xyz[0]], random_state=42)

In [22]:
x_train_seas, x_test_seas, y_train_seas, y_test_seas = train_test_split(df_seas[independent_seas], df_seas[dependent_seas[0]], random_state=42)

In [23]:
model1 = LogisticRegression()
model1.fit(x_train_xyz, y_train_xyz)

In [24]:
model2 = LogisticRegression()
model2.fit(x_train_seas, y_train_seas)

In [25]:
pred1 = model1.predict(x_test_xyz)
pred2 = model2.predict(x_test_seas)

In [26]:
roc_auc_score(y_test_xyz, pred1)

0.7865590510774376

In [27]:
roc_auc_score(y_test_seas, pred2)

0.7768696857901627

In [28]:
pred3 = model1.predict_proba(df_test[independent_xyz])
pred4 = model2.predict_proba(df_test[independent_seas])

In [29]:
pred3 = pd.Series(pred3[:,1])
pred4 = pd.Series(pred4[:,1])
submission = pd.concat([res_id, pred3, pred4], axis = 1)

In [30]:
submission.set_index('respondent_id', inplace= True)
submission

Unnamed: 0_level_0,0,1
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.148231,0.350207
26708,0.033640,0.047630
26709,0.353290,0.969686
26710,0.642198,0.997856
26711,0.367743,0.713595
...,...,...
53410,0.828542,0.790417
53411,0.045568,0.258067
53412,0.150657,0.201310
53413,0.050495,0.940969


In [31]:
submission.to_csv('submission_LOG.csv')

Applying the same function on a SVM model, we see that,

In [32]:
svm1 = SVC(random_state=42)
svm1.fit(x_train_xyz, y_train_xyz)

In [33]:
svm2 = SVC(random_state=42)
svm2.fit(x_train_seas, y_train_seas)

In [34]:
pred1 = svm1.predict(x_test_xyz)
pred2 = svm2.predict(x_test_seas)

In [35]:
roc_auc_score(y_test_xyz, pred1)

0.8442329715074339

In [36]:
roc_auc_score(y_test_seas, pred2)

0.7803794457669839

In [37]:
pred3 = svm1.decision_function(df_test[independent_xyz])
pred4 = svm2.decision_function(df_test[independent_seas])

In [38]:
pred3[:10]

array([-1.41969981, -1.51269633, -2.39643254, -4.04435407, -0.3099178 ,
       -1.0364174 , -3.8986043 , -3.25130456, -2.9604509 , -4.31664445])

In [39]:
pred4[:10]

array([-0.24731652, -1.27319068,  1.09128497, -0.26547761,  0.80705193,
        2.06523718, -0.29996805,  0.5781677 ,  0.41624001, -0.15476996])

In [40]:
low = pred3.min()
high = pred3.max()
for x in range(len(pred3)):
    if pred3[x] < 0:
        y = pred3[x]/low
        pred3[x] = (0.5 - (y * 0.5))
    else:
        y = pred3[x]/high
        pred3[x] = 0.5 + (y * 0.5)

In [41]:
low = pred4.min()
high = pred4.max()
for x in range(len(pred4)):
    if pred4[x] < 0:
        y = pred4[x]/low
        pred4[x] = (0.5 - (y * 0.5))
    else:
        y = pred4[x]/high
        pred4[x] = 0.5 + (y * 0.5)

In [42]:
pred3 = pd.Series(pred3)
pred4 = pd.Series(pred4)
submission = pd.concat([res_id, pred3, pred4], axis = 1)

In [43]:
submission.set_index('respondent_id', inplace= True)
submission

Unnamed: 0_level_0,0,1
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.354310,0.463172
26708,0.344767,0.310409
26709,0.254077,0.674066
26710,0.084967,0.460468
26711,0.468196,0.628730
...,...,...
53410,0.657318,0.627269
53411,0.390655,0.344911
53412,0.425162,0.354656
53413,0.151639,0.552201


In [44]:
submission.to_csv('submission_SVM.csv')