In [772]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
import sklearn, scipy
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.impute import SimpleImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

sub_format = pd.read_csv("###.csv")
training_features = pd.read_csv("###.csv"", index_col="respondent_id")
training_labels = pd.read_csv("###.csv", index_col="respondent_id")
test_features = pd.read_csv("###.csv", index_col="respondent_id")

In [773]:
tf_copy = training_features.copy().drop(columns = ["hhs_geo_region"])

In [774]:
test_copy = test_features.copy().drop(columns = ["hhs_geo_region"])

In [775]:
tf_copy.columns

Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'census_msa', 'household_adults',
       'household_children', 'employment_industry', 'employment_occupation'],
      dtype='object')

In [778]:
def remap_field(df, field_name, mapping):
    """
    Maps dataframe column values in field to those in a new dictionary
    field_name = string
    mapping = dictionary
    """
    df[field_name] =  df[field_name].replace(mapping)
    
    
def concat_fields(df, old_fields, new_field, fillna = "MissingData"):
    """
    joins multiple string-type dataframe columns into a single field. 
    old_fields = list of strings
    new_fields = string
    """ 
    df[new_field] = df[old_fields].fillna("missing").apply(lambda col: " ".join(col), axis=1)
    df.drop(columns = old_fields, inplace = True)

In [779]:
edu_map = dict(zip(['< 12 Years', '12 Years', 'Some College', 'College Graduate'], 
                   ["Dropout", "HS", "College", "Graduate"]))

updates = {
    "marital_status": {"Not Married": "Single"},
    "race": {"Other or Multiple": "Other"}, 
    "education": edu_map
          }
"""
for field, mapping in updates.items():
    remap_field(tf_copy, field, mapping)

concat_fields(tf_copy, old_fields = updates.keys(), new_field = "demographic") """

for df in [tf_copy, test_copy]:
    for field, mapping in updates.items():
        remap_field(df, field, mapping)
    concat_fields(df, old_fields = updates.keys(), new_field = "demographic") 
    df["age_group"] = df["age_group"].str[:2].astype("int64")
    df["employment_status"] = tf_copy["employment_status"]
    income_map = {"<= $75,000, Above Poverty": 70, "Below Poverty": 20, "> $75,000": 100}
    df["income_poverty"] = df.income_poverty.replace(income_map);

In [780]:
sns.set()

In [801]:
X = tf_copy
y = training_labels

text_cols = [col for col in X.columns if X[col].dtypes not in (["float64", "int64"])]
num_cols = [col for col in X.columns if X[col].dtypes in (["float64", "int64"])]

def combine_text_columns(df):#, to_drop=num_cols):#NUMERIC_COLUMNS + LABELS):
    """ converts all text in each row of data_frame to single vector """
    
    # Drop non-text columns that are in the df
    #to_drop = set(to_drop) & set(df.columns.tolist())
    #text_data = df.drop(to_drop, axis=1)
    
    text_cols = [col for col in X.columns if X[col].dtypes not in (["float64", "int64"])]
    text_data = df[text_cols]
    
    # Replace nans with blanks
    #text_data.fillna("Missing")#, inplace=True)
    # Join all text items in a row that have a space in between
    return text_data.fillna("MissingData").apply(lambda x: " ".join(x), axis=1)

In [814]:
get_text_data = FunctionTransformer(combine_text_columns, validate = False)#lambda df: df[text_cols], validate=False)
get_num_data = FunctionTransformer(lambda df: df[num_cols], validate=False)
to_dense = FunctionTransformer(lambda sparse: sparse.toarray())

#Aggregate text data and vectorize text information with a hashing function 
#(words will map to a dictionary of ints, saving computation time)
text_pipeline = Pipeline([
    ("selector", get_text_data),
    #("imputer", SimpleImputer(strategy = "constant", fill_value="Missing")), combine text_columns makes imputer not work
    ("vectorizer", HashingVectorizer(ngram_range = (1, 3)))
])

#Aggregate numeric data and impute missing value with column mean
num_pipeline = Pipeline([
    #no need for selector if using ColumnTransformer instead of FeatureUnion
    ("selector", get_num_data),
    ("imputer", SimpleImputer(strategy = "mean")),
    ("scaler", StandardScaler()) #added line for run 2
     ])

pl = Pipeline([
    ("union", FeatureUnion([
        ("numeric", num_pipeline), 
        ("text", text_pipeline)])),
    ("clf", OneVsRestClassifier(LogisticRegression()))
    ])

In [815]:
#from sklearn.compose import make_column_selector
X_train, X_test, y_train, y_test = train_test_split(X, y["h1n1_vaccine"], test_size = .2, random_state = 420)#, stratify = y["seasonal_vaccine"])

preprocessor = ColumnTransformer(transformers=[
    ('numeric', num_pipeline, num_cols)
    #,('text', text_pipeline, text_cols)# selector(dtype_include="object"))
])

#older version of pipeline, saved for archival
for i in range(5, 20):
    pl_preprocessing = Pipeline([
        ("preprocessing", preprocessor),
        ("clf", RandomForestClassifier(max_depth = i, n_estimators = 200, random_state = 420))
        ])

    model = pl_preprocessing.fit(X_train, y_train)
    model_predictions = model.predict(X_test)
    model_roc = roc_auc_score(y_test, model_predictions)
    print(f"Numeric-only, n = {i} has an accuarcy of: {model.score(X_test, y_test)} and an AUC_ROC of {model_roc}")

Numeric-only, n = 5 has an accuarcy of: 0.8393859977536503 and an AUC_ROC of 0.6509132452748754
Numeric-only, n = 6 has an accuarcy of: 0.8461250467989517 and an AUC_ROC of 0.6731689539518405
Numeric-only, n = 7 has an accuarcy of: 0.8489329839011606 and an AUC_ROC of 0.683466246088957
Numeric-only, n = 8 has an accuarcy of: 0.8538000748783228 and an AUC_ROC of 0.697911788551903
Numeric-only, n = 9 has an accuarcy of: 0.8539872706851367 and an AUC_ROC of 0.7043331237493109
Numeric-only, n = 10 has an accuarcy of: 0.855672032946462 and an AUC_ROC of 0.7098182755194461
Numeric-only, n = 11 has an accuarcy of: 0.8566080119805316 and an AUC_ROC of 0.7110449950568438
Numeric-only, n = 12 has an accuarcy of: 0.8577311868214152 and an AUC_ROC of 0.7161722370202501
Numeric-only, n = 13 has an accuarcy of: 0.8554848371396481 and an AUC_ROC of 0.7137952930040605
Numeric-only, n = 14 has an accuarcy of: 0.8575439910146013 and an AUC_ROC of 0.718258644926315
Numeric-only, n = 15 has an accuarcy of

In [804]:
#from sklearn.compose import make_column_selector
X_train, X_test, y_train, y_test = train_test_split(X, y["seasonal_vaccine"], test_size = .2, random_state = 420)#, stratify = y["seasonal_vaccine"])

preprocessor = ColumnTransformer(transformers=[
    ('numeric', num_pipeline, num_cols)
    #,('text', text_pipeline, text_cols)# selector(dtype_include="object"))
])

#older version of pipeline, saved for archival
for i in range(5, 20):
    pl_preprocessing = Pipeline([
        ("preprocessing", preprocessor),
        ("clf", RandomForestClassifier(max_depth = i, n_estimators = 200, random_state = 420))
        ])

    model = pl_preprocessing.fit(X_train, y_train)
    model_predictions = model.predict(X_test)
    model_roc = roc_auc_score(y_test, model_predictions)
    print(f"Numeric-only, n = {i} has an accuarcy of: {model.score(X_test, y_test)} and an AUC_ROC of {model_roc}")

Numeric-only, n = 5 has an accuarcy of: 0.7721827031074504 and an AUC_ROC of 0.7694019753487487
Numeric-only, n = 6 has an accuarcy of: 0.7751778360164733 and an AUC_ROC of 0.7719523770849619
Numeric-only, n = 7 has an accuarcy of: 0.7802321228004493 and an AUC_ROC of 0.7772244951366398
Numeric-only, n = 8 has an accuarcy of: 0.7817296892549607 and an AUC_ROC of 0.7790199847269175
Numeric-only, n = 9 has an accuarcy of: 0.7813552976413328 and an AUC_ROC of 0.778643374651872
Numeric-only, n = 10 has an accuarcy of: 0.7862223886184949 and an AUC_ROC of 0.7832502563373704
Numeric-only, n = 11 has an accuarcy of: 0.7847248221639835 and an AUC_ROC of 0.7818305308242163
Numeric-only, n = 12 has an accuarcy of: 0.7845376263571696 and an AUC_ROC of 0.7818012028962458
Numeric-only, n = 13 has an accuarcy of: 0.7819168850617746 and an AUC_ROC of 0.7790782175838977
Numeric-only, n = 14 has an accuarcy of: 0.7821040808685885 and an AUC_ROC of 0.779627834234039
Numeric-only, n = 15 has an accuarcy 

In [807]:
pl_preprocessing = Pipeline([
        ("preprocessing", preprocessor),
        ("clf", RandomForestClassifier(max_depth = 12, n_estimators = 200, random_state = 420))
        ])

X_train, X_test, y_train, y_test = train_test_split(X, y["h1n1_vaccine"], test_size = .2, random_state = 420)#, stratify = y["seasonal_vaccine"])

model = pl_preprocessing.fit(X_train, y_train)
h1n1_predictions = model.predict_proba(test_copy)

pl_preprocessing = Pipeline([
        ("preprocessing", preprocessor),
        ("clf", RandomForestClassifier(max_depth = 10, n_estimators = 200, random_state = 420))
        ])
X_train, X_test, y_train, y_test = train_test_split(X, y["seasonal_vaccine"], test_size = .2, random_state = 420)#, stratify = y["seasonal_vaccine"])
model = pl_preprocessing.fit(X_train, y_train)
seasonal_predictions = model.predict_proba(test_copy)


In [809]:
submission = test_features.copy()

In [810]:
submission["h1n1_vaccine"] = h1n1_predictions[:,1]
submission["seasonal_vaccine"] = seasonal_predictions[:,1]

In [811]:
submission[["h1n1_vaccine", "seasonal_vaccine"]].to_csv("Flu Shot Entry 12-6-20v2.csv")

Submission Results: 

AUC: ~81% 

Top 11.25%