In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score, roc_curve, precision_score

import matplotlib.pyplot as plt

In [24]:
features = pd.read_csv('data/training_set_features.csv')
labels = pd.read_csv('data/training_set_labels.csv')

In [25]:
flu_df = pd.merge(features, labels, on='respondent_id', how='outer')

In [26]:
flu_df

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,,0,0
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,0,0
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,,0,1
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg,0,0


In [27]:
flu_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [28]:
X=flu_df.drop('seasonal_vaccine', axis=1)
y=flu_df['seasonal_vaccine']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=12)

In [30]:
numbers=['h1n1_concern', 'h1n1_knowledge','opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults', 'household_children']
onehots=['race', 'sex',  'marital_status', 'income_poverty', 'age_group', 'education', 'rent_or_own','employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']
binary= ['behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'h1n1_vaccine']

In [31]:
numbers_pipeline = Pipeline(steps=[
    ('impute_nums', SimpleImputer(strategy='most_frequent', add_indicator=True)),
    ('num', StandardScaler())])

In [32]:
onehots_pipeline = Pipeline(steps=[
    ('impute_ohe', SimpleImputer(strategy='most_frequent', add_indicator=True)),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))])

In [33]:
binary_pipeline = Pipeline(steps=[('impute_bin', SimpleImputer(strategy='most_frequent', add_indicator=True))])

In [34]:
col_tr = ColumnTransformer(transformers=[('num', numbers_pipeline, numbers),
                                        ('ohe', onehots_pipeline, onehots),
                                        ('bin', binary_pipeline, binary)],
                          remainder='drop', sparse_threshold=0)

In [35]:

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [36]:
dummy.score(X_train, y_train)

0.5326510234648028

In [37]:
tree_pipe = Pipeline(steps=[('ct', col_tr), 
                           ('dtree', DecisionTreeClassifier(random_state=12))])

In [38]:
tree_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(sparse_threshold=0,
                                   transformers=[('num',
                                                  Pipeline(steps=[('impute_nums',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='most_frequent')),
                                                                  ('num',
                                                                   StandardScaler())]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'opinion_h1n1_vacc_effective',
                                                   'opinion_h1n1_risk',
                                                   'opinion_h1n1_sick_from_vacc',
                                       

In [39]:
cv_results = cross_validate(estimator=tree_pipe, X=X_train, y=y_train, cv=5,
                            return_train_score=True)
cv_results

{'fit_time': array([1.3068912 , 1.29942298, 1.30274892, 1.31261587, 1.36961102]),
 'score_time': array([0.04183483, 0.04198718, 0.04232121, 0.04191899, 0.0460391 ]),
 'test_score': array([0.71018472, 0.70968547, 0.72241638, 0.71867199, 0.71767349]),
 'train_score': array([1., 1., 1., 1., 1.])}

In [40]:
tree_y_train = tree_pipe.predict(X_train)
tree_y_pred = tree_pipe.predict(X_test)

In [41]:
print(precision_score(y_train, tree_y_train))
print(precision_score(y_test, tree_y_pred))

1.0
0.6860133206470028


In [42]:
tree_pipe = Pipeline(steps=[('ct', col_tr), 
                           ('dtree', DecisionTreeClassifier(random_state=12, max_depth=7))])

In [43]:
tree_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(sparse_threshold=0,
                                   transformers=[('num',
                                                  Pipeline(steps=[('impute_nums',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='most_frequent')),
                                                                  ('num',
                                                                   StandardScaler())]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'opinion_h1n1_vacc_effective',
                                                   'opinion_h1n1_risk',
                                                   'opinion_h1n1_sick_from_vacc',
                                       

In [44]:
cv_results = cross_validate(estimator=tree_pipe, X=X_train, y=y_train, cv=5,
                            return_train_score=True)
cv_results

{'fit_time': array([1.20090795, 1.37408209, 1.31816721, 1.21902585, 1.17196703]),
 'score_time': array([0.04674506, 0.04922485, 0.05128384, 0.04324508, 0.04198503]),
 'test_score': array([0.77858213, 0.77333999, 0.78582127, 0.78032951, 0.78082876]),
 'train_score': array([0.80029955, 0.8007988 , 0.80054918, 0.80048677, 0.8007364 ])}

In [45]:
tree_y_train = tree_pipe.predict(X_train)
tree_y_pred = tree_pipe.predict(X_test)

In [46]:
print(precision_score(y_train, tree_y_train))
print(precision_score(y_test, tree_y_pred))

0.7993498486716736
0.7915804002760525


In [47]:
log_pipe = Pipeline(steps=[('ct', col_tr), 
                           ('log', LogisticRegression(penalty='l2', C=0.1, solver='lbfgs', max_iter=1000))])

In [48]:
log_pipe.fit(X_train, y_train)

cv_log_results = cross_validate(estimator=log_pipe, X=X_train, y=y_train, cv=5,
                            return_train_score=True)
cv_log_results


{'fit_time': array([1.92186689, 1.80630469, 1.80918813, 1.75900102, 1.78311205]),
 'score_time': array([0.0446341 , 0.04060006, 0.04227901, 0.03929281, 0.05919409]),
 'test_score': array([0.79131303, 0.79855217, 0.80354468, 0.79505741, 0.80004993]),
 'train_score': array([0.80416875, 0.8017973 , 0.79992511, 0.80292062, 0.80204693])}

In [49]:
log_y_train=log_pipe.predict(X_train)
log_y_pred = log_pipe.predict(X_test)

In [50]:
print(precision_score(y_train, log_y_train))
print(precision_score(y_test, log_y_pred))

0.804571687224171
0.8036264112213479


### Needs to be updated for pipeline

In [51]:
C_values = [0.001, 0.01, 0.1, 1, 10]

best_f1_score = 0
best_C = None

for C in C_values:
    model = log_pipe(C=C, max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    if f1 > best_f1_score:
        best_f1_score = f1
        best_C = C

TypeError: 'Pipeline' object is not callable

In [None]:
f1 = f1_score(y_test, y_pred)
f1

In [None]:
best_C

In [None]:
roc_auc = roc_auc_score(y_test, y_pred)


fpr, tpr, _ = roc_curve(y_test, y_pred)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, linewidth=2, label='ROC Curve (AUC = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()