In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

#Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt


#Modeling

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split


from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

#Evaluation
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


from scipy import stats

#Set figure size for all plots
plt.rc("figure", figsize = (16,16))

#Set fontsize for titles
plt.rc("font", size=14)

pd.set_option('display.max_columns', None)

# Project purpose

The goal of this project is to use features captured from survey data to predict how likely individuals are to receive their H1N1 and seasonal flu vaccines.

Specifically, I will be predicting two probabilities:
- h1n1_vaccine - Whether respondent received H1N1 flu vaccine.
- seasonal_vaccine - Whether respondent received seasonal flu vaccine.
> Both are binary variables: 0 = No; 1 = Yes. Some respondents didn't get either vaccine, others got only one, and some got both. This is formulated as a multilabel (and not multiclass) problem.

# Project Plan

1. Aquisition
  * Download data into local drive
2. Prepare
  * Read in data csv using pandas
  * Chekck data types and null values
  * Fill in nulls
  * Encode appropriately
  * Scale if needed
3. Explore
4. Modeling
5. Conclusions

# Acquire

- I downloaded the data from https://www.drivendata.org/competitions/66/flu-shot-learning/data/ into the same file where this analysis is being conducted
- Now I'll turn the data into a pandas dataframe for analysis and modeling

In [2]:
# Read in the csv using pandas
df = pd.read_csv('Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv', index_col=0)

Take a look at the data

In [3]:
df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,55 - 64 Years,< 12 Years,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,4.0,4.0,2.0,4.0,35 - 44 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,1.0,0.0,0.0,,3.0,1.0,1.0,4.0,1.0,2.0,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,3.0,3.0,5.0,5.0,4.0,1.0,65+ Years,12 Years,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,2.0,3.0,1.0,4.0,45 - 54 Years,Some College,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,1.0,1.0,5.0,2.0,2.0,65+ Years,Some College,White,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,4.0,2.0,2.0,5.0,1.0,1.0,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,4.0,4.0,2.0,5.0,4.0,2.0,55 - 64 Years,Some College,White,Female,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,2.0,2.0,1.0,2.0,18 - 34 Years,Some College,Hispanic,Female,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg


In [8]:
target_variable_df = pd.read_csv('Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv')

In [None]:
tar

### The initial datatframe has:
- 26,707 rows where each row is one person
- 35 columns where each column is a feature this person has indicated on their survey

Let's get some more info about the datframe. We'll use .info() to see how many non-null values we have in each column and what the data types are

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 35 columns):
h1n1_concern                   26615 non-null float64
h1n1_knowledge                 26591 non-null float64
behavioral_antiviral_meds      26636 non-null float64
behavioral_avoidance           26499 non-null float64
behavioral_face_mask           26688 non-null float64
behavioral_wash_hands          26665 non-null float64
behavioral_large_gatherings    26620 non-null float64
behavioral_outside_home        26625 non-null float64
behavioral_touch_face          26579 non-null float64
doctor_recc_h1n1               24547 non-null float64
doctor_recc_seasonal           24547 non-null float64
chronic_med_condition          25736 non-null float64
child_under_6_months           25887 non-null float64
health_worker                  25903 non-null float64
health_insurance               14433 non-null float64
opinion_h1n1_vacc_effective    26316 non-null float64
opinion_h1n1_risk          

### Initial thoughts for cleaning/prep
1. Several columns will need to be one hot encoded or label encoded
2. Nans in several columns needs to be handled
3. May need to drop certain columns
4. All the datatypes seem to be appropriate

# Prepare

I'll start by splitting the data into train and test to avoid exploring the test data which is meant to stay unseen. Then I'll perform the same clean and prep changes to each dataframe sequentially.
> There's enough data here to split this further into train and validate sets which will help to prevent overfitting by allowing signs of overfitting to be caught before applying the model to the test data. After the appropriate cleaning and preparation are complete, I will create a validate set.

In [6]:
# use the train test split function from Sklearn and add a random seed for reproducibility
train, test = train_test_split(df, random_state=123, train_size=.80, stratify=df.h1n1_vaccine)

AttributeError: 'DataFrame' object has no attribute 'h1n1_vaccine'

Now I'll check to see which columns have nans, how many there are, and explore the best ways to fill those nans

In [None]:
#Write a for loop to determine what percentage of each column are nans
def percent_nans(df):
    x = ['column','n_nans', 'percentage_nans']
    missing_data_df = pd.DataFrame(columns=x)
    columns = df.columns
    for col in columns:
        column_name = col
        missing_data = df[col].isnull().sum()
        missing_in_percentage = (df[col].isnull().sum()/df[col].shape[0])*100
        
        missing_data_df.loc[len(missing_data_df)] = [column_name, missing_data, missing_in_percentage]
    return missing_data_df.sort_values(by = 'percentage_nans')

In [None]:
percent_nans(df)

- It looks like only three columns have a significant number of values missing:
  - __health_insurance__
  - __employment_industry__
  - __employment_occupation__
- Look at the most common values in each feature and decide if there is a reasonable way to fill the null values

In [None]:
#What is the most common health insurance?
df.health_insurance.value_counts()

In [None]:
#What is the most common employment_industry?
df.employment_industry.value_counts()

In [None]:
#What is the most common employment occupation?
df.employment_occupation.value_counts()

It is evident that __employment_industry__ and __employment_occupation__ do not have a single most overwhelming industry or occupation that could be reasonably used to fill in the remaining half empty values. I will drop these columns and not include them in the analysis

On the other hand, __health_insurance__ has 12,697 observation recoded as having insurance of the total 14,433 observations with values recorded. It may be reasonable to simply fill in the remaining 45% missing values with the label for having insurance.

### Drop the employment_industry and employment_occupation columns from dataframes

In [None]:
train = train.drop(columns =['employment_industry', 'employment_occupation'] )
test = test.drop(columns =['employment_industry', 'employment_occupation'] )

Check to make sure the columns were dropped

In [None]:
train

In [None]:
test

It looks like the columns were dropped appropriately. Now let's fill in null values in the remaining columns

In [None]:
#Write a function to fill in the null values with the most common occurence
def fill_null_values(train, test):
    train = train.apply(lambda x:x.fillna(x.value_counts().index[0]))
    test = test.apply(lambda x:x.fillna(x.value_counts().index[0]))

    return train, test

In [None]:
train, test = fill_null_values(train, test)

Verify that there are no more null values in either dataframe

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

### Encoding

Next we need to encode the columns that have objects as values and turn them into integer representations for the purpose of the classification model to predict how likely people are to get their flu vaccines

Look at which columns need to be encoded

In [None]:
train

In [None]:
# def label_encode_columns(train, test):

#     encoder = LabelEncoder()
   
#     encode_list = ['rent_or_own', 'employment_status', 'marital_status', 'sex']
    
             
#     for column in encode_list:
#         train[column] = encoder.fit_transform(train[column])
#         test[column] = encoder.transform(test[column])

#         return train, test

In [None]:
# train, test = label_encode_columns(train, test)

In [None]:
def label_encode_columns(train, test):

    encoder = LabelEncoder()
   
    train['encoded_rent_or_own'] = encoder.fit_transform(train['rent_or_own'])
    train['encoded_marital_status'] = encoder.fit_transform(train['marital_status'])
    train['encoded_sex'] = encoder.fit_transform(train['sex'])

    test['encoded_rent_or_own'] = encoder.fit_transform(test['rent_or_own'])
    test['encoded_marital_status'] = encoder.fit_transform(test['marital_status'])
    test['encoded_sex'] = encoder.fit_transform(test['sex'])
    
    return train, test

In [None]:
train, test = label_encode_columns(train, test)

In [None]:
# categorical_cols = ['rent_or_own', 'employment_status', 'marital_status', 'sex']

# categorical_cols

# train[categorical_cols] = train[categorical_cols].apply(lambda col: encoder.fit_transform(col))

In [None]:
train

In [None]:
# def encode(train, test):
#     # creating instance of one-hot-encoder
#     enc = OneHotEncoder()
#     # passing bridge-types-cat column (label encoded values of bridge_types)
#     enc_df = pd.DataFrame(enc.fit_transform(train[['age_group', 'education', 'race', 'income_poverty']]).toarray())
#     # merge with main df bridge_df on key values
#     train = train.join(enc_df)

#     # passing bridge-types-cat column (label encoded values of bridge_types)
#     enc_df2 = pd.DataFrame(enc.fit_transform(test[['age_group', 'education', 'race', 'income_poverty']]).toarray())
#     # merge with main df bridge_df on key values
#     test = test.join(enc_df2)
    
#     return train, test

In [None]:
# col_name = ['age_group', 'education', 'race', 'income_poverty']

In [None]:
# train, test = encode(train, test)

Look at the dataframe to ensure all the label encoded columns were added correctly

In [None]:
train

Check the origional columns against the encoded ones to be clear which labels correspond to eachother

In [None]:
housing_df = train[['rent_or_own', 'encoded_rent_or_own']]
housing_df

#### Own == 0
#### Rent == 1

In [None]:
married_df = train[['marital_status', 'encoded_marital_status']]
married_df

#### Married == 0
#### Not Married == 1

In [None]:
gender_df = train[['sex', 'encoded_sex']]
gender_df

#### Female == 0 
#### Male == 1

### One Hot Encode Remaining Columns

In [None]:
# Encode age_group column

# Create encoder object
encoder = OneHotEncoder()

# Fit on the age_group column of the train df
encoder.fit(train[['age_group']])

# nice columns for display
cols = ['age_group_' + c for c in encoder.categories_[0]]

# Transform the column on train and test and concatenate new df onto train and test dfs
m = encoder.transform(train[['age_group']]).todense()
train = pd.concat([
    train,
    pd.DataFrame(m, columns=cols, index=train.index)
], axis=1)

m = encoder.transform(test[['age_group']]).todense()
test = pd.concat([
    test,
    pd.DataFrame(m, columns=cols, index=test.index)
], axis=1)

Check to make sure it worked

In [None]:
train

In [None]:
test

Excellent! It worked the way I wanted. Now I have a column with a 1 if the observation falls into that category and a zero if it does not. 

Now repeat for the remaining columns

In [None]:
# Encode education column

# Create encoder object
encoder = OneHotEncoder()

# Fit on the age_group column of the train df
encoder.fit(train[['education']])

# nice columns for display
cols = ['education_' + c for c in encoder.categories_[0]]

# Transform the column on train and test and concatenate new df onto train and test dfs
m = encoder.transform(train[['education']]).todense()
train = pd.concat([
    train,
    pd.DataFrame(m, columns=cols, index=train.index)
], axis=1)

m = encoder.transform(test[['education']]).todense()
test = pd.concat([
    test,
    pd.DataFrame(m, columns=cols, index=test.index)
], axis=1)

In [None]:
# Encode race column

# Create encoder object
encoder = OneHotEncoder()

# Fit on the age_group column of the train df
encoder.fit(train[['race']])

# nice columns for display
cols = ['race_' + c for c in encoder.categories_[0]]

# Transform the column on train and test and concatenate new df onto train and test dfs
m = encoder.transform(train[['race']]).todense()
train = pd.concat([
    train,
    pd.DataFrame(m, columns=cols, index=train.index)
], axis=1)

m = encoder.transform(test[['race']]).todense()
test = pd.concat([
    test,
    pd.DataFrame(m, columns=cols, index=test.index)
], axis=1)

In [None]:
# Encode income_poverty column

# Create encoder object
encoder = OneHotEncoder()

# Fit on the age_group column of the train df
encoder.fit(train[['income_poverty']])

# nice columns for display
cols = ['income_poverty_' + c for c in encoder.categories_[0]]

# Transform the column on train and test and concatenate new df onto train and test dfs
m = encoder.transform(train[['income_poverty']]).todense()
train = pd.concat([
    train,
    pd.DataFrame(m, columns=cols, index=train.index)
], axis=1)

m = encoder.transform(test[['income_poverty']]).todense()
test = pd.concat([
    test,
    pd.DataFrame(m, columns=cols, index=test.index)
], axis=1)

In [None]:
train

In [None]:
test

Turn these into functions and add to prepare.py. I'll just do one example here to test if it works then comment it out to make sure I don't have duplicate columns moving forward. The other functions will only be added to the .py file.

In [None]:
# def ohe_income_poverty(train,test):
#     # Encode income_poverty column

#     # Create encoder object
#     encoder = OneHotEncoder()

#     # Fit on the age_group column of the train df
#     encoder.fit(train[['income_poverty']])

#     # nice columns for display
#     cols = ['income_poverty_' + c for c in encoder.categories_[0]]

#     # Transform the column on train and test and concatenate new df onto train and test dfs
#     m = encoder.transform(train[['income_poverty']]).todense()
#     train = pd.concat([
#         train,
#         pd.DataFrame(m, columns=cols, index=train.index)
#     ], axis=1)

#     m = encoder.transform(test[['income_poverty']]).todense()
#     test = pd.concat([
#         test,
#         pd.DataFrame(m, columns=cols, index=test.index)
#     ], axis=1)

#     return train, test

### Scaling

- The following columns are either opions on a scale of 1-5 or a count of number of children or adults in a household:
  - h1n1_concern
  - h1n1_knowledge
  - opinion_h1n1_vacc_effective
  - opinion_h1n1_risk
  - opinion_h1n1_sick_from_vacc
  - opinion_seas_vacc_effective
  - opinion_seas_risk
  - opinion_seas_sick_from_vac
  - household_adults
  - household_children
- All other features are on a scale of 0-1. I will apply a MinMax Scaler to the above columns to get them also on a 0-1 scale to avoid weighting issues in the models to come. 

In [None]:
# Create a scaler object using SKlearn's MinMax Scaler
scaler = MinMaxScaler()

In [None]:
# Add scaled columns to train dataframe
train[['h1n1_concern','h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children'
      ]] = scaler.fit_transform(
    train[['h1n1_concern',
       'h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children']])

Check to make sure scaling worked appropriately

In [None]:
train

Excellent! The MinMax Scaler was applied correctly. Now repeat this process for the test dataframe and turn these transformations into functions

In [None]:
# Add scaled columns to train and test dataframes
test[['h1n1_concern','h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children'
      ]] = scaler.fit_transform(
    test[['h1n1_concern',
       'h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children']])

Verify changes

In [None]:
test