In [158]:
import numpy as np 
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
import matplotlib.pyplot as plt

In [111]:
X = pd.read_csv('/Users/stephen/Flatiron/Phase3/project_3/Data/training_set_features.csv', index_col='respondent_id')
y = pd.read_csv('/Users/stephen/Flatiron/Phase3/project_3/Data/training_set_labels.csv', index_col='respondent_id')
test_set = pd.read_csv('/Users/stephen/Flatiron/Phase3/project_3/Data/test_set_features.csv', index_col='respondent_id')

In [112]:
y = pd.Series(y['h1n1_vaccine'])


In [113]:
y.value_counts(normalize=True)

0    0.787546
1    0.212454
Name: h1n1_vaccine, dtype: float64

In [114]:
X.drop(columns=['hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation'], inplace=True)

In [115]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 26615 non-null  float64
 1   h1n1_knowledge               26591 non-null  float64
 2   behavioral_antiviral_meds    26636 non-null  float64
 3   behavioral_avoidance         26499 non-null  float64
 4   behavioral_face_mask         26688 non-null  float64
 5   behavioral_wash_hands        26665 non-null  float64
 6   behavioral_large_gatherings  26620 non-null  float64
 7   behavioral_outside_home      26625 non-null  float64
 8   behavioral_touch_face        26579 non-null  float64
 9   doctor_recc_h1n1             24547 non-null  float64
 10  doctor_recc_seasonal         24547 non-null  float64
 11  chronic_med_condition        25736 non-null  float64
 12  child_under_6_months         25887 non-null  float64
 13  health_worker   

In [116]:

num_cols = X.select_dtypes('number').columns

ord_cols = ['age_group', 'education',  'income_poverty', 'employment_status']

cat_cols = ['race', 'sex', 'marital_status', 'rent_or_own'] 

In [117]:
X['age_group'].value_counts(dropna=False)

65+ Years        6843
55 - 64 Years    5563
45 - 54 Years    5238
18 - 34 Years    5215
35 - 44 Years    3848
Name: age_group, dtype: int64

In [118]:
age_group_dict = {
np.NAN: -1, 
'18 - 34 Years': 1, 
'35 - 44 Years': 2, 
'45 - 54 Years': 3, 
'55 - 64 Years': 4, 
'65+ Years': 5, 
}

X['age_group_ordinal'] = X.age_group.map(age_group_dict).astype("category")
X['age_group_ordinal'].value_counts()

5    6843
4    5563
3    5238
1    5215
2    3848
Name: age_group_ordinal, dtype: int64

In [119]:
X['education'].value_counts(dropna=False)

College Graduate    10097
Some College         7043
12 Years             5797
< 12 Years           2363
NaN                  1407
Name: education, dtype: int64

In [120]:
education_dict = {
    np.NAN: -1,
    '< 12 Years': 1, 
    '12 Years': 2, 
    'Some College': 3, 
    'College Graduate': 4
}

X['education_ordinal'] = X.education.map(education_dict).astype("category")
X['education_ordinal'].value_counts(ascending=True)

-1     1407
1      2363
2      5797
3      7043
4     10097
Name: education_ordinal, dtype: int64

In [121]:
X['income_poverty'].value_counts()

<= $75,000, Above Poverty    12777
> $75,000                     6810
Below Poverty                 2697
Name: income_poverty, dtype: int64

In [122]:
income_dict = {
    np.NAN: -1,
    'Below Poverty': 1, 
    '<= $75,000, Above Poverty': 2, 
    '> $75,000': 3
}

X['income_poverty_ordinal'] = X.income_poverty.map(income_dict).astype("category")
X['income_poverty_ordinal'].value_counts()

2     12777
3      6810
-1     4423
1      2697
Name: income_poverty_ordinal, dtype: int64

In [123]:
X['employment_status'].value_counts()

Employed              13560
Not in Labor Force    10231
Unemployed             1453
Name: employment_status, dtype: int64

In [124]:
employment_dict = { 
    np.NAN: -1,
    'Unemployed': 1, 
    'Not in Labor Force': 2, 
    'Employed': 3
}

X['employment_status_ordinal'] = X.employment_status.map(employment_dict).astype("category")
X['employment_status_ordinal']

respondent_id
0        2
1        3
2        3
3        2
4        3
        ..
26702    2
26703    3
26704   -1
26705    3
26706    2
Name: employment_status_ordinal, Length: 26707, dtype: category
Categories (4, int64): [-1, 1, 2, 3]

In [125]:
X.shape

(26707, 35)

In [126]:
X.drop(columns=ord_cols, inplace=True)
X.shape

(26707, 31)

In [127]:
y.value_counts(normalize=True)

0    0.787546
1    0.212454
Name: h1n1_vaccine, dtype: float64

In [128]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   h1n1_concern                 26615 non-null  float64 
 1   h1n1_knowledge               26591 non-null  float64 
 2   behavioral_antiviral_meds    26636 non-null  float64 
 3   behavioral_avoidance         26499 non-null  float64 
 4   behavioral_face_mask         26688 non-null  float64 
 5   behavioral_wash_hands        26665 non-null  float64 
 6   behavioral_large_gatherings  26620 non-null  float64 
 7   behavioral_outside_home      26625 non-null  float64 
 8   behavioral_touch_face        26579 non-null  float64 
 9   doctor_recc_h1n1             24547 non-null  float64 
 10  doctor_recc_seasonal         24547 non-null  float64 
 11  chronic_med_condition        25736 non-null  float64 
 12  child_under_6_months         25887 non-null  float64 
 13  h

In [129]:
X

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,race,sex,marital_status,rent_or_own,household_adults,household_children,age_group_ordinal,education_ordinal,income_poverty_ordinal,employment_status_ordinal
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,White,Female,Not Married,Own,0.0,0.0,4,1,1,2
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,White,Male,Not Married,Rent,0.0,0.0,2,2,1,3
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,White,Male,Not Married,Own,2.0,0.0,1,4,2,3
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,White,Female,Not Married,Rent,0.0,0.0,5,2,1,2
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,White,Female,Married,Own,1.0,0.0,3,3,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,White,Female,Not Married,Own,0.0,0.0,5,3,2,2
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,White,Male,Not Married,Rent,1.0,0.0,1,4,2,3
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,White,Female,Not Married,Own,0.0,0.0,4,3,-1,-1
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,Hispanic,Female,Married,Rent,1.0,0.0,1,3,2,3


In [138]:
# https://www.drivendata.co/blog/predict-flu-vaccine-data-benchmark/

# chain preprocessing into a Pipeline object
# each step is a tuple of (name you chose, sklearn transformer)
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='median'))])

cat_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))])


# create the preprocessor stage of final pipeline
# each entry in the transformer list is a tuple of
# (name you choose, sklearn transformer, list of columns)
preprocessor = ColumnTransformer(
    transformers = [
        ('numeric', numeric_preprocessing_steps, num_cols), 
        ('category', cat_transformer, cat_cols)],
    remainder = "drop")

estimators = LogisticRegression(penalty="l2", C=1, class_weight='balanced')


full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),])

In [131]:
full_pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('simple_imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', '...
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
  

In [132]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True, 
    stratify=y, 
    random_state=42)

In [140]:
full_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('simple_imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', '...
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
       'household_children'],
      dtype='object')),
                                                 ('category',
                                                  Pipeline(steps=[('cat_imputer',
                         

In [141]:
preds = full_pipeline.predict_proba(X_test)
preds

array([[0.5074285 , 0.4925715 ],
       [0.08927742, 0.91072258],
       [0.79489421, 0.20510579],
       ...,
       [0.58730357, 0.41269643],
       [0.54162618, 0.45837382],
       [0.60220545, 0.39779455]])

In [160]:
full_pipeline.score(X_test, y_test)

0.7708551744795566

In [162]:
print('Test accuracy: %.3f' % full_pipeline.score(X_test, y_test))


Test accuracy: 0.771


In [161]:
output

{'fit_time': array([0.17627597, 0.12617207, 0.13519096]),
 'score_time': array([0.02079415, 0.01532793, 0.01456714]),
 'test_score': array([0.83510559, 0.83285907, 0.83523068])}