# Predict Flu

# Imports and loading data

In [1]:
# Imports
from pathlib import Path
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score, plot_roc_curve

In [6]:
# Load in data
ft_df = pd.read_csv("./data/training_set_features.csv")
lbl_df = pd.read_csv("./data/training_set_labels.csv")

In [7]:
ft_df.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [8]:
ft_df.shape

(26707, 36)

In [9]:
ft_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [30]:
ft_df.tail()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg
26706,26706,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,mlyzmhmf,"MSA, Principle City",1.0,0.0,,


In [10]:
lbl_df.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [31]:
lbl_df.tail()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
26702,26702,0,0
26703,26703,0,0
26704,26704,0,1
26705,26705,0,0
26706,26706,0,0


- Since head and tail are the same, can assume redpondent_id can be merged on. will cry if there are issues

In [11]:
lbl_df.shape

(26707, 3)

In [12]:
# Assertion, found online
np.testing.assert_array_equal(ft_df.index.values, lbl_df.index.values)

In [18]:
h1_stats = lbl_df['h1n1_vaccine'].value_counts(normalize = True)

In [19]:
seas_stats = lbl_df['seasonal_vaccine'].value_counts(normalize = True)

In [25]:
print(f"Percent of those without h1n1 vaccine: {round(h1_stats[0]*100,2)}%")
print(f"Percent of those with h1n1vaccine: {round(h1_stats[1]*100,2)}%")
print("==================================================")
print(f"Percent of those without seasonal vaccine: {round(seas_stats[0]*100,2)}%")
print(f"Percent of those with seasonal vaccine: {round(seas_stats[1]*100,2)}%")

Percent of those without h1n1 vaccine: 78.75%
Percent of those with h1n1vaccine: 21.25%
Percent of those without seasonal vaccine: 53.44%
Percent of those with seasonal vaccine: 46.56%


In [26]:
# Found online
pd.crosstab(
    lbl_df["h1n1_vaccine"], 
    lbl_df["seasonal_vaccine"], 
    normalize=True
)

seasonal_vaccine,0,1
h1n1_vaccine,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.49781,0.289737
1,0.036582,0.175871


- Rare to see with h1n1 and withouth seasonal.
- about half of sample doesnt have either h1n1 or seasonal vaccine. 
- Those with both is about 17.6% of sample

In [28]:
lbl_df["h1n1_vaccine"].corr(lbl_df["seasonal_vaccine"], method="pearson")


0.37714265306144684

- There is some, but not strong correlation

### Will combine the df's from this point

In [32]:
df_merged = ft_df.merge(lbl_df)

In [33]:
df_merged.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0


In [34]:
df_merged['h1n1_concern'].value_counts(normalize = True)

2.0    0.397332
1.0    0.306331
3.0    0.172497
0.0    0.123840
Name: h1n1_concern, dtype: float64

In [35]:
df_merged['h1n1_knowledge'].value_counts(normalize = True)

1.0    0.548983
2.0    0.356775
0.0    0.094242
Name: h1n1_knowledge, dtype: float64

In [45]:
for col in df_merged.drop(['hhs_geo_region','respondent_id'], axis = 1):
    temp_arr = df_merged[col].value_counts(normalize = True)
    print(col)
    for row in temp_arr:
        print(row)
        #print(f"Val: {row[0]},         Percentage: {round(row[1]*100)}")
    
    

h1n1_concern
0.39733233139207214
0.3063310163441668
0.1724967123802367
0.12383993988352433
h1n1_knowledge
0.548982738520552
0.35677484863299613
0.09424241284645181
behavioral_antiviral_meds
0.9511563297792461
0.048843670220753865
behavioral_avoidance
0.7256122872561229
0.27438771274387713
behavioral_face_mask
0.9310176858513189
0.06898231414868106
behavioral_wash_hands
0.825614100881305
0.17438589911869493
behavioral_large_gatherings
0.6413598797896318
0.3586401202103681
behavioral_outside_home
0.6626854460093897
0.33731455399061033
behavioral_touch_face
0.6772640054178111
0.32273599458218893
doctor_recc_h1n1
0.7796879455737972
0.2203120544262028
doctor_recc_seasonal
0.6702652055240966
0.32973479447590337
chronic_med_condition
0.7167391980105688
0.28326080198943115
child_under_6_months
0.9174102831537065
0.0825897168462935
health_worker
0.8880824614909469
0.111917538509053
health_insurance
0.8797200859142243
0.12027991408577565
opinion_h1n1_vacc_effective
0.44395044839641284
0.27230582