In [None]:
# Core data libraries used throughout the notebook.
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
# Load the raw CSV; low_memory=False makes pandas parse the file in one pass
# instead of in chunks.
df = pd.read_csv('data/diabetic_data.csv', encoding = 'latin1',low_memory=False)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Seed shared by the split and the sub-samples so a fresh run reproduces the same rows.
SPLIT_SEED = 42

# Reload the raw data so this cell can be re-run on its own.
df = pd.read_csv('data/diabetic_data.csv', encoding = 'latin1',low_memory=False)

# Binary target: 'NO' -> 0 and '<30' -> 1; rows labelled '>30' are excluded.
# Filtering + map replaces the chained assignment `df.readmitted[mask] = ...`,
# which pandas does not guarantee to write back into the frame.
df = df[df['readmitted'] != '>30'].copy()
df['readmitted'] = df['readmitted'].map({'NO': 0, '<30': 1}).astype(int)

# Drop the identifier columns and the columns this analysis does not use.
df = df.drop(['encounter_id', 'patient_nbr', 'weight', 'payer_code', 'medical_specialty'], axis=1)
# Remove columns that are entirely missing (the result has to be re-assigned to take effect).
df = df.dropna(axis=1, how='all')

# Stratified 80/20 split keeps the class ratio identical in both parts.
df_train, df_test = train_test_split(
    df, test_size=0.2, stratify=df['readmitted'], random_state=SPLIT_SEED
)

# 10% sub-samples of each split for quick experiments.
sample_train = df_train.sample(frac=0.1, random_state=SPLIT_SEED)
sample_test = df_test.sample(frac=0.1, random_state=SPLIT_SEED)

y_train_sample = sample_train['readmitted']
y_test_sample = sample_test['readmitted']

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Columns copied unchanged into `<name>_int` features by the preprocessing loop.
int_categorical_headers = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']

# Columns cast to float and standardised by the preprocessing loop.
numeric_headers = [
    "time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications",
    "number_outpatient", "number_emergency", "number_inpatient", "number_diagnoses",
]

# String columns that are stripped and label-encoded into `<name>_int` features.
# The order matters: model inputs are created in this order.
categorical_headers = [
    'diag_3', 'diag_2', 'diag_1',
    'race', 'gender', 'age',
    'max_glu_serum', 'A1Cresult',
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'examide',
    'citoglipton', 'insulin',
    'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone',
    'change', 'diabetesMed',
]

# Treat the ' ?' placeholder as missing, drop incomplete rows and renumber the index.
# (The original called reset_index() without keeping its result, so it had no effect.)
# NOTE(review): the placeholder is matched WITH a leading space (' ?'); confirm this is
# how missing values are spelled in the CSV, otherwise nothing is replaced here.
df_train = df_train.replace(to_replace=' ?', value=np.nan).dropna().reset_index(drop=True)
df_test = df_test.replace(to_replace=' ?', value=np.nan).dropna().reset_index(drop=True)

# Every frame receives the same transformations; `df` is the one encoders/scalers are fit on.
split_frames = (df_train, df_test, sample_train, sample_test)
all_frames = (df,) + split_frames

# Fitted LabelEncoder per categorical column, kept for later inverse lookups.
encoders = dict()

for col in categorical_headers:
    # Strip surrounding whitespace so identical labels compare equal.
    for frame in all_frames:
        frame[col] = frame[col].str.strip()

    # Fit on the full frame so every label in any split has an integer code.
    encoders[col] = LabelEncoder()
    df[col+'_int'] = encoders[col].fit_transform(df[col])
    for frame in split_frames:
        frame[col+'_int'] = encoders[col].transform(frame[col])

# These columns are used as-is; only the `_int` alias is added.
for col in int_categorical_headers:
    for frame in all_frames:
        frame[col+'_int'] = frame[col]

for col in numeric_headers:
    # Built-in float replaces np.float, an alias that NumPy has removed.
    for frame in all_frames:
        frame[col] = frame[col].astype(float)

    # NOTE(review): the scaler is fit on `df`, which still contains the test rows;
    # consider fitting on df_train only to avoid leaking test statistics.
    ss = StandardScaler()
    # ravel() turns the (n, 1) scaler output back into a flat column.
    df[col] = ss.fit_transform(df[col].values.reshape(-1, 1)).ravel()
    for frame in split_frames:
        frame[col] = ss.transform(frame[col].values.reshape(-1, 1)).ravel()

df_train.head()

In [None]:
from sklearn import metrics as mt

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Input
from keras.layers import Embedding, Flatten, Merge, concatenate
from keras.models import Model

In [None]:
# Print the compute devices TensorFlow reports on this machine.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

## Starting work on the actual deep and wide network

## CV function

In [None]:
def run_network(IN_Train, IN_Test, deep=None):
    """Build and fit one wide-and-deep Keras model for a train/test pair.

    Relies on notebook globals: `categorical_headers`, `numeric_headers`,
    `int_categorical_headers`, `cross_columns`, `LabelEncoder` and the Keras
    layer names imported at the top of the notebook.

    Parameters
    ----------
    IN_Train : DataFrame
        Pre-processed rows the model is fitted on.
    IN_Test : DataFrame
        Pre-processed held-out rows. They are not used for fitting; they are
        needed so that the crossed-feature encoders and the embedding tables
        cover every code that appears at prediction time.
    deep : {None, "start", "final"}
        None    -> 50 -> 10 deep branch, single sigmoid output.
        "start" -> adds 40/30/20-unit ReLU layers inside the deep branch.
        "final" -> adds a 50-unit sigmoid layer after wide and deep are merged.

    Returns
    -------
    The fitted Keras `Model`. Its inputs are, in order: one integer array per
    entry of `cross_columns`, one per `<categorical>_int` column, then the
    numeric matrix (`numeric_headers + int_categorical_headers`).
    """
    import tensorflow as tf

    categorical_headers_ints = [x+'_int' for x in categorical_headers]
    master_numeric_headers = numeric_headers + int_categorical_headers
    X_train_num = IN_Train[master_numeric_headers].values
    # Built-in int replaces np.int, an alias that NumPy has removed.
    y_train = IN_Train['readmitted'].values.astype(int)

    with tf.device('/gpu:0'):
        X_ints_train = []
        all_inputs = []

        def add_embedding(name, train_codes, test_codes):
            """Register an integer input called `name` and return its flattened embedding."""
            X_ints_train.append(train_codes)
            # Size the table from BOTH splits: a code present only in the test
            # split would otherwise index past the end of the table in predict().
            N = int(max(train_codes.max(), test_codes.max())) + 1
            inputs = Input(shape=(1,),dtype='int32', name=name)
            all_inputs.append(inputs)
            x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
            return Flatten()(x)

        # Wide branch: one embedding per crossed feature pair.
        wide_outputs = []
        for cols in cross_columns:
            # Join the pair's string values with '_' to form one label per row.
            X_crossed_train = IN_Train[cols].apply(lambda x: '_'.join(x), axis=1)
            X_crossed_test = IN_Test[cols].apply(lambda x: '_'.join(x), axis=1)

            # Fit on both splits so every crossed label has an integer code.
            enc = LabelEncoder()
            enc.fit(np.hstack((X_crossed_train.values,  X_crossed_test.values)))
            wide_outputs.append(add_embedding('_'.join(cols),
                                              enc.transform(X_crossed_train),
                                              enc.transform(X_crossed_test)))
        wide_branch = concatenate(wide_outputs)

        # Deep branch: one embedding per label-encoded categorical column ...
        deep_outputs = [
            add_embedding(col, IN_Train[col].values, IN_Test[col].values)
            for col in categorical_headers_ints
        ]
        # ... plus a dense projection of the numeric features.
        all_inputs.append(Input(shape=(X_train_num.shape[1],),sparse=False,name='numeric_data'))
        deep_outputs.append(Dense(units=20, activation='relu')(all_inputs[-1]))

        deep_branch = concatenate(deep_outputs)
        deep_branch = Dense(units=50,activation='relu')(deep_branch)
        if deep == "start":
            deep_branch = Dense(units=40,activation='relu')(deep_branch)
            deep_branch = Dense(units=30,activation='relu')(deep_branch)
            deep_branch = Dense(units=20,activation='relu')(deep_branch)
        deep_branch = Dense(units=10,activation='relu')(deep_branch)

        final_branch = concatenate([wide_branch, deep_branch])
        if deep == "final":
            final_branch = Dense(units=50,activation='sigmoid')(final_branch)
        final_branch = Dense(units=1,activation='sigmoid')(final_branch)

        model = Model(inputs=all_inputs, outputs=final_branch)
        model.compile(optimizer='adagrad',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        model.fit(X_ints_train+ [X_train_num],
                y_train, epochs=1, batch_size=32, verbose=1)
        return model

# THIS SHIT WORKS
## Cross-validation for 1) Standard Configuration Wide and Deep Network

In [None]:
# Manual stratified k-fold cross-validation of the standard wide-and-deep network.
# NOTE: this cell reads `cross_columns`, which is assigned in a later cell of this
# notebook; that cell has to be executed before this one.
from sklearn.model_selection import StratifiedKFold  # the notebook-level import sits in a later cell

num_folds = 3

model_list1 = []    # fitted model per fold
yhat_list1 = []     # rounded test-fold predictions per fold
y_test_list1 = []   # true test-fold labels per fold

categorical_headers_ints = [x+'_int' for x in categorical_headers]
master_numeric_headers = numeric_headers + int_categorical_headers

skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
# Explicit integer labels for stratification.
cv_labels = df_train['readmitted'].astype(int)
for iter_num, (train_indices, test_indices) in enumerate(skf.split(df_train.drop(['readmitted'], axis = 1), cv_labels)):
    df_train_cv = df_train.iloc[train_indices]
    df_test_cv = df_train.iloc[test_indices]
    X_ints_train = []
    X_ints_test = []
    all_inputs = []
    all_branch_outputs = []

    X_train_num =  df_train_cv[master_numeric_headers].values
    X_test_num =  df_test_cv[master_numeric_headers].values
    # Built-in int replaces np.int, an alias that NumPy has removed.
    y_train = df_train_cv['readmitted'].values.astype(int)
    y_test = df_test_cv['readmitted'].values.astype(int)

    # Wide branch: one embedding per crossed feature pair.
    for cols in cross_columns:
        # Join the pair's string values with '_' to form one label per row.
        X_crossed_train = df_train_cv[cols].apply(lambda x: '_'.join(x), axis=1)
        X_crossed_test = df_test_cv[cols].apply(lambda x: '_'.join(x), axis=1)

        # Fit on both folds so every crossed label has an integer code.
        enc = LabelEncoder()
        enc.fit(np.hstack((X_crossed_train.values,  X_crossed_test.values)))
        X_ints_train.append( enc.transform(X_crossed_train) )
        X_ints_test.append( enc.transform(X_crossed_test) )

        # Size the embedding from BOTH folds: a code that only occurs in the test
        # fold would otherwise index past the end of the table during predict().
        N = int(max(X_ints_train[-1].max(), X_ints_test[-1].max())) + 1

        inputs = Input(shape=(1,),dtype='int32', name = '_'.join(cols))
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        all_branch_outputs.append(Flatten()(x))

    wide_branch = concatenate(all_branch_outputs)

    # Deep branch: one embedding per label-encoded categorical column ...
    all_branch_outputs = []
    for col in categorical_headers_ints:
        X_ints_train.append( df_train_cv[col].values )
        X_ints_test.append( df_test_cv[col].values )

        # Same sizing rule as above (codes come from an encoder fit on the full data).
        N = int(max(X_ints_train[-1].max(), X_ints_test[-1].max())) + 1

        inputs = Input(shape=(1,),dtype='int32', name=col)
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        all_branch_outputs.append(Flatten()(x))

    # ... plus a dense projection of the numeric features.
    all_inputs.append(Input(shape=(X_train_num.shape[1],),sparse=False,name='numeric_data'))
    all_branch_outputs.append(Dense(units=20, activation='relu')(all_inputs[-1]))

    deep_branch = concatenate(all_branch_outputs)
    deep_branch = Dense(units=50,activation='relu')(deep_branch)
    deep_branch = Dense(units=10,activation='relu')(deep_branch)

    # Merge wide and deep, then a single sigmoid unit for the binary target.
    final_branch = concatenate([wide_branch, deep_branch])
    final_branch = Dense(units=1,activation='sigmoid')(final_branch)

    modeltemp = Model(inputs=all_inputs, outputs=final_branch)
    modeltemp.compile(optimizer='adagrad',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    modeltemp.fit(X_ints_train+ [X_train_num],
            y_train, epochs=1, batch_size=32, verbose=1)

    model_list1.append(modeltemp)
    yhat_temp = np.round(modeltemp.predict(X_ints_test + [X_test_num]))
    yhat_list1.append(yhat_temp)
    y_test_list1.append(y_test)
    

In [None]:
# Per-fold confusion matrix and recall for configuration 1.
scores1 = []
for fold_truth, fold_pred in zip(y_test_list1, yhat_list1):
    print(mt.confusion_matrix(fold_truth, fold_pred), mt.recall_score(fold_truth, fold_pred))
    scores1.append(mt.recall_score(fold_truth, fold_pred))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Bar chart of the per-fold recall values in `scores1`; x is the 1-based fold number.
i = list(range(1,num_folds+1))
plt.bar(i,scores1)

## Cross-validation for 2) Deeper "Deep Network"

In [None]:
# Manual stratified k-fold cross-validation of the network with the deeper
# (50-40-30-20-10) deep branch.
# NOTE: this cell reads `cross_columns`, which is assigned in a later cell of this
# notebook; that cell has to be executed before this one.
from sklearn.model_selection import StratifiedKFold  # the notebook-level import sits in a later cell

model_list2 = []    # fitted model per fold
yhat_list2 = []     # rounded test-fold predictions per fold
y_test_list2 = []   # true test-fold labels per fold

categorical_headers_ints = [x+'_int' for x in categorical_headers]
master_numeric_headers = numeric_headers + int_categorical_headers

skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
# Explicit integer labels for stratification.
cv_labels = df_train['readmitted'].astype(int)
for iter_num, (train_indices, test_indices) in enumerate(skf.split(df_train.drop(['readmitted'], axis = 1), cv_labels)):
    df_train_cv = df_train.iloc[train_indices]
    df_test_cv = df_train.iloc[test_indices]
    X_ints_train = []
    X_ints_test = []
    all_inputs = []
    all_branch_outputs = []

    X_train_num =  df_train_cv[master_numeric_headers].values
    X_test_num =  df_test_cv[master_numeric_headers].values
    # Built-in int replaces np.int, an alias that NumPy has removed.
    y_train = df_train_cv['readmitted'].values.astype(int)
    y_test = df_test_cv['readmitted'].values.astype(int)

    # Wide branch: one embedding per crossed feature pair.
    for cols in cross_columns:
        # Join the pair's string values with '_' to form one label per row.
        X_crossed_train = df_train_cv[cols].apply(lambda x: '_'.join(x), axis=1)
        X_crossed_test = df_test_cv[cols].apply(lambda x: '_'.join(x), axis=1)

        # Fit on both folds so every crossed label has an integer code.
        enc = LabelEncoder()
        enc.fit(np.hstack((X_crossed_train.values,  X_crossed_test.values)))
        X_ints_train.append( enc.transform(X_crossed_train) )
        X_ints_test.append( enc.transform(X_crossed_test) )

        # Size the embedding from BOTH folds: a code that only occurs in the test
        # fold would otherwise index past the end of the table during predict().
        N = int(max(X_ints_train[-1].max(), X_ints_test[-1].max())) + 1

        inputs = Input(shape=(1,),dtype='int32', name = '_'.join(cols))
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        all_branch_outputs.append(Flatten()(x))

    wide_branch = concatenate(all_branch_outputs)

    # Deep branch: one embedding per label-encoded categorical column ...
    all_branch_outputs = []
    for col in categorical_headers_ints:
        X_ints_train.append( df_train_cv[col].values )
        X_ints_test.append( df_test_cv[col].values )

        # Same sizing rule as above (codes come from an encoder fit on the full data).
        N = int(max(X_ints_train[-1].max(), X_ints_test[-1].max())) + 1

        inputs = Input(shape=(1,),dtype='int32', name=col)
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        all_branch_outputs.append(Flatten()(x))

    # ... plus a dense projection of the numeric features.
    all_inputs.append(Input(shape=(X_train_num.shape[1],),sparse=False,name='numeric_data'))
    all_branch_outputs.append(Dense(units=20, activation='relu')(all_inputs[-1]))

    # Five stacked ReLU layers, narrowing from 50 to 10 units.
    deep_branch = concatenate(all_branch_outputs)
    for units in (50, 40, 30, 20, 10):
        deep_branch = Dense(units=units,activation='relu')(deep_branch)

    # Merge wide and deep, then a single sigmoid unit for the binary target.
    final_branch = concatenate([wide_branch, deep_branch])
    final_branch = Dense(units=1,activation='sigmoid')(final_branch)

    modeltemp = Model(inputs=all_inputs, outputs=final_branch)
    modeltemp.compile(optimizer='adagrad',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    modeltemp.fit(X_ints_train+ [X_train_num],
            y_train, epochs=1, batch_size=32, verbose=1)

    model_list2.append(modeltemp)
    yhat_temp = np.round(modeltemp.predict(X_ints_test + [X_test_num]))
    yhat_list2.append(yhat_temp)
    y_test_list2.append(y_test)

## Cross-validation for 3) Deeper Final Stage

In [None]:
# Manual stratified k-fold cross-validation of the network with an extra
# 50-unit layer after the wide and deep branches are merged.
# NOTE: this cell reads `cross_columns`, which is assigned in a later cell of this
# notebook; that cell has to be executed before this one.
from sklearn.model_selection import StratifiedKFold  # the notebook-level import sits in a later cell

model_list3 = []    # fitted model per fold
yhat_list3 = []     # rounded test-fold predictions per fold
y_test_list3 = []   # true test-fold labels per fold

categorical_headers_ints = [x+'_int' for x in categorical_headers]
master_numeric_headers = numeric_headers + int_categorical_headers

skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
# Explicit integer labels for stratification.
cv_labels = df_train['readmitted'].astype(int)
for iter_num, (train_indices, test_indices) in enumerate(skf.split(df_train.drop(['readmitted'], axis = 1), cv_labels)):
    df_train_cv = df_train.iloc[train_indices]
    df_test_cv = df_train.iloc[test_indices]
    X_ints_train = []
    X_ints_test = []
    all_inputs = []
    all_branch_outputs = []

    X_train_num =  df_train_cv[master_numeric_headers].values
    X_test_num =  df_test_cv[master_numeric_headers].values
    # Built-in int replaces np.int, an alias that NumPy has removed.
    y_train = df_train_cv['readmitted'].values.astype(int)
    y_test = df_test_cv['readmitted'].values.astype(int)

    # Wide branch: one embedding per crossed feature pair.
    for cols in cross_columns:
        # Join the pair's string values with '_' to form one label per row.
        X_crossed_train = df_train_cv[cols].apply(lambda x: '_'.join(x), axis=1)
        X_crossed_test = df_test_cv[cols].apply(lambda x: '_'.join(x), axis=1)

        # Fit on both folds so every crossed label has an integer code.
        enc = LabelEncoder()
        enc.fit(np.hstack((X_crossed_train.values,  X_crossed_test.values)))
        X_ints_train.append( enc.transform(X_crossed_train) )
        X_ints_test.append( enc.transform(X_crossed_test) )

        # Size the embedding from BOTH folds: a code that only occurs in the test
        # fold would otherwise index past the end of the table during predict().
        N = int(max(X_ints_train[-1].max(), X_ints_test[-1].max())) + 1

        inputs = Input(shape=(1,),dtype='int32', name = '_'.join(cols))
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        all_branch_outputs.append(Flatten()(x))

    wide_branch = concatenate(all_branch_outputs)

    # Deep branch: one embedding per label-encoded categorical column ...
    all_branch_outputs = []
    for col in categorical_headers_ints:
        X_ints_train.append( df_train_cv[col].values )
        X_ints_test.append( df_test_cv[col].values )

        # Same sizing rule as above (codes come from an encoder fit on the full data).
        N = int(max(X_ints_train[-1].max(), X_ints_test[-1].max())) + 1

        inputs = Input(shape=(1,),dtype='int32', name=col)
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        all_branch_outputs.append(Flatten()(x))

    # ... plus a dense projection of the numeric features.
    all_inputs.append(Input(shape=(X_train_num.shape[1],),sparse=False,name='numeric_data'))
    all_branch_outputs.append(Dense(units=20, activation='relu')(all_inputs[-1]))

    deep_branch = concatenate(all_branch_outputs)
    deep_branch = Dense(units=50,activation='relu')(deep_branch)
    deep_branch = Dense(units=10,activation='relu')(deep_branch)

    # Merge wide and deep, add the extra 50-unit sigmoid stage, then the output unit.
    final_branch = concatenate([wide_branch, deep_branch])
    final_branch = Dense(units=50,activation='sigmoid')(final_branch)
    final_branch = Dense(units=1,activation='sigmoid')(final_branch)

    modeltemp = Model(inputs=all_inputs, outputs=final_branch)
    modeltemp.compile(optimizer='adagrad',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    modeltemp.fit(X_ints_train+ [X_train_num],
            y_train, epochs=1, batch_size=32, verbose=1)

    model_list3.append(modeltemp)
    yhat_temp = np.round(modeltemp.predict(X_ints_test + [X_test_num]))
    yhat_list3.append(yhat_temp)
    y_test_list3.append(y_test)


In [None]:
# Recall per fold for configuration 1, rebuilt from the stored labels and predictions.
scores1 = []

for i in range(0, len(yhat_list1)):
    # Show the fold's confusion matrix next to its recall, then record the recall.
    print(mt.confusion_matrix(y_test_list1[i],yhat_list1[i]), mt.recall_score(y_test_list1[i],yhat_list1[i]))
    scores1.append(mt.recall_score(y_test_list1[i],yhat_list1[i]))
    
import matplotlib.pyplot as plt
%matplotlib inline

# Bar chart of recall by 1-based fold number (`i` is re-bound from loop index to x positions).
i = list(range(1,num_folds+1))
plt.bar(i,scores1)

In [None]:
# Recall per fold for configuration 2, built from the stored labels and predictions.
scores2 = []

for i in range(0, len(yhat_list2)):
    # Show the fold's confusion matrix next to its recall, then record the recall.
    print(mt.confusion_matrix(y_test_list2[i],yhat_list2[i]), mt.recall_score(y_test_list2[i],yhat_list2[i]))
    scores2.append(mt.recall_score(y_test_list2[i],yhat_list2[i]))
    
import matplotlib.pyplot as plt
%matplotlib inline

# Bar chart of recall by 1-based fold number (`i` is re-bound from loop index to x positions).
i = list(range(1,num_folds+1))
plt.bar(i,scores2)

In [None]:
# Recall per fold for configuration 3, built from the stored labels and predictions.
scores3 = []

for i in range(0, len(yhat_list3)):
    # Show the fold's confusion matrix next to its recall, then record the recall.
    print(mt.confusion_matrix(y_test_list3[i],yhat_list3[i]), mt.recall_score(y_test_list3[i],yhat_list3[i]))
    scores3.append(mt.recall_score(y_test_list3[i],yhat_list3[i]))
    
import matplotlib.pyplot as plt
%matplotlib inline

# Bar chart of recall by 1-based fold number (`i` is re-bound from loop index to x positions).
i = list(range(1,num_folds+1))
plt.bar(i,scores3)

In [None]:
from sklearn.neural_network import MLPClassifier
# Baseline: scikit-learn MLP on one-hot encoded features, scored with the same
# stratified k-fold scheme as the Keras models.
from sklearn.model_selection import StratifiedKFold  # the notebook-level import sits in a later cell

yhat_list_sk = []     # predicted labels per fold
y_test_list_sk = []   # true labels per fold

# One-hot encode ONCE on the whole training frame so every fold has identical columns.
df_dummies = pd.get_dummies(df_train, columns=categorical_headers)
X_sk = df_dummies.drop(['readmitted'], axis=1).values.astype(float)
y_sk = df_dummies['readmitted'].values.astype(int)

skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
for iter_num, (train_indices, test_indices) in enumerate(skf.split(X_sk, y_sk)):
    # Train on the CV training fold and score on the CV test fold. (The original
    # ignored the test fold and re-split the training fold with train_test_split.)
    clf = MLPClassifier()
    clf.fit(X=X_sk[train_indices], y=y_sk[train_indices])
    yhat_sk = clf.predict(X_sk[test_indices])
    y_test_sk = y_sk[test_indices]

    yhat_list_sk.append(yhat_sk.astype(int))
    y_test_list_sk.append(y_test_sk)
    print(mt.confusion_matrix(y_test_sk, yhat_sk), mt.recall_score(y_test_sk, yhat_sk))

In [None]:
# Recall per fold for the scikit-learn MLP baseline.
scores_sk = []

for i in range(0, len(yhat_list_sk)):
    # Show the fold's confusion matrix next to its recall, then record the recall.
    print(mt.confusion_matrix(y_test_list_sk[i],yhat_list_sk[i]), mt.recall_score(y_test_list_sk[i],yhat_list_sk[i]))
    scores_sk.append(mt.recall_score(y_test_list_sk[i],yhat_list_sk[i]))
    
import matplotlib.pyplot as plt
%matplotlib inline

# Bar chart of recall by 1-based fold number (`i` is re-bound from loop index to x positions).
i = list(range(1,num_folds+1))
plt.bar(i,scores_sk)

In [None]:
# Re-bind the fold count to 2 for the cells that follow.
num_folds=2

In [None]:
#CV, uses function above
#in theory, this should treat df_train as our dataset, and split it, but it breaks when we use the cross val to split it
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
#train_vals = np.array(df_train.values)
# Cross-validate with run_network(): one fitted model per fold, plus the row
# positions of each fold so the models can be evaluated afterwards.
model_list = []
indices_list = []
skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
# Explicit integer labels for stratification.
cv_labels = df_train['readmitted'].astype(int)
for iter_num, (train_indices, test_indices) in enumerate(skf.split(df_train.drop(['readmitted'], axis = 1), cv_labels)):
    # The original printed `train`/`test`, names that are never defined in this loop.
    print(train_indices, test_indices)
    indices_list.append([train_indices, test_indices])
    # skf.split yields positional indices, hence .iloc.
    model_list.append(run_network(df_train.iloc[train_indices], df_train.iloc[test_indices]))

In [None]:
# Evaluate every fold model on its own held-out fold.
# run_network() models take a LIST of inputs (crossed codes, categorical codes,
# numeric matrix, in that order), so the inputs are rebuilt here the same way
# run_network() builds them instead of passing raw DataFrame rows to predict().
for fold_model, (train_indices, test_indices) in zip(model_list, indices_list):
    fold_train = df_train.iloc[train_indices]
    fold_test = df_train.iloc[test_indices]

    fold_inputs = []
    for cols in cross_columns:
        crossed_train = fold_train[cols].apply(lambda x: '_'.join(x), axis=1)
        crossed_test = fold_test[cols].apply(lambda x: '_'.join(x), axis=1)
        # LabelEncoder sorts its classes, so refitting on the same two folds
        # reproduces the codes used while the model was trained.
        enc = LabelEncoder()
        enc.fit(np.hstack((crossed_train.values, crossed_test.values)))
        fold_inputs.append(enc.transform(crossed_test))
    for col in categorical_headers:
        fold_inputs.append(fold_test[col+'_int'].values)
    fold_inputs.append(fold_test[numeric_headers + int_categorical_headers].values)

    y_true = fold_test['readmitted'].values.astype(int)
    # Flatten the (n, 1) sigmoid output into hard 0/1 labels.
    yhat = np.round(fold_model.predict(fold_inputs)).astype(int).ravel()
    print(mt.confusion_matrix(y_true, yhat), mt.accuracy_score(y_true, yhat))

## End of CV work

In [None]:
# Column groups and NumPy arrays shared by the final train/test models below.
master_numeric_headers = numeric_headers + int_categorical_headers
categorical_headers_ints = [x+'_int' for x in categorical_headers]
df_num =  df[numeric_headers].values

# Full train/test split. Built-in int replaces np.int, an alias NumPy has removed.
X_train_num =  df_train[master_numeric_headers].values
X_test_num =  df_test[master_numeric_headers].values
y_train = df_train['readmitted'].values.astype(int)
y_test = df_test['readmitted'].values.astype(int)

# 10% sub-samples of each split.
X_train_num_sample =  sample_train[master_numeric_headers].values
X_test_num_sample =  sample_test[master_numeric_headers].values
y_train_sample = sample_train['readmitted'].values.astype(int)
y_test_sample = sample_test['readmitted'].values.astype(int)

In [None]:
# NOTE(review): `master_categorical_headers` is not assigned in any cell visible above;
# confirm where it is defined (or whether `categorical_headers_ints` was intended).
print(master_categorical_headers)

In [None]:
# Feature pairs whose values are joined with '_' and embedded in the wide branch.
cross_columns = [
    list(pair)
    for pair in (
        ('gender', 'race'),
        ('age', 'diag_1'),
        ('gender', 'diag_1'),
    )
]

## 1) Standard Configuration Wide and Deep Network

In [None]:
# we need to create separate sequential models for each embedding
import tensorflow as tf
with tf.device('/GPU:0'):
    # Standard wide-and-deep network trained on the full training split.
    # X_ints_train / X_ints_test hold one integer array per embedded input, in the
    # order the model's inputs are created (crossed pairs first, then categoricals).
    X_ints_train = []
    X_ints_test = []
    all_inputs = []
    all_branch_outputs = []

    # Wide branch: one embedding per crossed feature pair.
    for cols in cross_columns:
        # Join the pair's string values with '_' to form one label per row.
        X_crossed_train = df_train[cols].apply(lambda x: '_'.join(x), axis=1)
        X_crossed_test = df_test[cols].apply(lambda x: '_'.join(x), axis=1)

        # Fit on both splits so every crossed label has an integer code.
        enc = LabelEncoder()
        enc.fit(np.hstack((X_crossed_train.values,  X_crossed_test.values)))
        X_ints_train.append( enc.transform(X_crossed_train) )
        X_ints_test.append( enc.transform(X_crossed_test) )

        # Size the embedding from BOTH splits: df_test codes are passed to predict()
        # later, and a code seen only there would index past a train-sized table.
        N = int(max(X_ints_train[-1].max(), X_ints_test[-1].max())) + 1

        inputs = Input(shape=(1,),dtype='int32', name = '_'.join(cols))
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        all_branch_outputs.append(Flatten()(x))

    wide_branch = concatenate(all_branch_outputs)

    # Deep branch: one embedding per label-encoded categorical column ...
    all_branch_outputs = []
    for col in categorical_headers_ints:
        X_ints_train.append( df_train[col].values )
        X_ints_test.append( df_test[col].values )

        # Same sizing rule as above.
        N = int(max(X_ints_train[-1].max(), X_ints_test[-1].max())) + 1

        inputs = Input(shape=(1,),dtype='int32', name=col)
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        all_branch_outputs.append(Flatten()(x))

    # ... plus a dense projection of the numeric features.
    all_inputs.append(Input(shape=(X_train_num.shape[1],),sparse=False,name='numeric_data'))
    all_branch_outputs.append(Dense(units=20, activation='relu')(all_inputs[-1]))

    deep_branch = concatenate(all_branch_outputs)
    deep_branch = Dense(units=50,activation='relu')(deep_branch)
    deep_branch = Dense(units=10,activation='relu')(deep_branch)

    # Merge wide and deep, then a single sigmoid unit for the binary target.
    final_branch = concatenate([wide_branch, deep_branch])
    final_branch = Dense(units=1,activation='sigmoid')(final_branch)

    model1 = Model(inputs=all_inputs, outputs=final_branch)

    model1.compile(optimizer='adagrad',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model1.fit(X_ints_train+ [X_train_num],
            y_train, epochs=10, batch_size=32, verbose=1)

In [None]:
# Print and collect the weights of every model1 layer whose name contains 'embedding'.
weights_int = []
for layer in model1.layers:
    if 'embedding' not in layer.name:
        continue
    print(layer.name)
    layer_weights = layer.get_weights()
    print(layer_weights)
    weights_int.append(layer_weights)

In [None]:
# Round model1's sigmoid outputs on the test inputs to hard 0/1 predictions.
yhat = np.round(model1.predict(X_ints_test + [X_test_num]))
# Confusion matrix and accuracy against the test labels.
print(mt.confusion_matrix(y_test,yhat),mt.accuracy_score(y_test,yhat))
# Debug output: prints the integer-encoded test inputs in full.
print(X_ints_test) #+ [X_test_num])

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
# Use the ggplot style for the figures that follow.
plt.style.use('ggplot')

# NOTE(review): `per_fold_eval_criteria` is not assigned in any cell visible above;
# confirm where it comes from before relying on this plot.
# Bar chart with one bar per entry; the lower y-limit sits just below the minimum
# so that differences between bars are visible.
plt.bar(range(len(per_fold_eval_criteria)),per_fold_eval_criteria)
plt.ylim([min(per_fold_eval_criteria)-0.01,max(per_fold_eval_criteria)])

In [None]:
# Render model1's layer graph as an inline SVG.
# NOTE(review): `SVG` and `model_to_dot` are not imported in any cell visible above;
# confirm they are brought into scope before this cell runs.
SVG(model_to_dot(model1).create(prog='dot', format='svg'))

## 2) Deeper "Deep Network"

In [None]:
# we need to create separate sequential models for each embedding
import tensorflow as tf
with tf.device('/GPU:0'):
    # Wide-and-deep network with a deeper (50-40-30-20-10) deep branch, trained on
    # the full training split.
    # X_ints_train / X_ints_test hold one integer array per embedded input, in the
    # order the model's inputs are created (crossed pairs first, then categoricals).
    X_ints_train = []
    X_ints_test = []
    all_inputs = []
    all_branch_outputs = []

    # Wide branch: one embedding per crossed feature pair.
    for cols in cross_columns:
        # Join the pair's string values with '_' to form one label per row.
        X_crossed_train = df_train[cols].apply(lambda x: '_'.join(x), axis=1)
        X_crossed_test = df_test[cols].apply(lambda x: '_'.join(x), axis=1)

        # Fit on both splits so every crossed label has an integer code.
        enc = LabelEncoder()
        enc.fit(np.hstack((X_crossed_train.values,  X_crossed_test.values)))
        X_ints_train.append( enc.transform(X_crossed_train) )
        X_ints_test.append( enc.transform(X_crossed_test) )

        # Size the embedding from BOTH splits so the held-out codes can be passed to
        # predict() without indexing past a train-sized table.
        N = int(max(X_ints_train[-1].max(), X_ints_test[-1].max())) + 1

        inputs = Input(shape=(1,),dtype='int32', name = '_'.join(cols))
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        all_branch_outputs.append(Flatten()(x))

    wide_branch = concatenate(all_branch_outputs)

    # Deep branch: one embedding per label-encoded categorical column ...
    all_branch_outputs = []
    for col in categorical_headers_ints:
        X_ints_train.append( df_train[col].values )
        X_ints_test.append( df_test[col].values )

        # Same sizing rule as above.
        N = int(max(X_ints_train[-1].max(), X_ints_test[-1].max())) + 1

        inputs = Input(shape=(1,),dtype='int32', name=col)
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        all_branch_outputs.append(Flatten()(x))

    # ... plus a dense projection of the numeric features.
    all_inputs.append(Input(shape=(X_train_num.shape[1],),sparse=False,name='numeric_data'))
    all_branch_outputs.append(Dense(units=20, activation='relu')(all_inputs[-1]))

    # Five stacked ReLU layers, narrowing from 50 to 10 units.
    deep_branch = concatenate(all_branch_outputs)
    for units in (50, 40, 30, 20, 10):
        deep_branch = Dense(units=units,activation='relu')(deep_branch)

    # Merge wide and deep, then a single sigmoid unit for the binary target.
    final_branch = concatenate([wide_branch, deep_branch])
    final_branch = Dense(units=1,activation='sigmoid')(final_branch)

    model2 = Model(inputs=all_inputs, outputs=final_branch)

    model2.compile(optimizer='adagrad',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model2.fit(X_ints_train+ [X_train_num],
            y_train, epochs=10, batch_size=32, verbose=1)

## 3) Deeper Final Stage

In [None]:
# Model 3: wide-and-deep network with an extra 50-unit layer after the
# wide/deep merge ("deeper final stage").
# we need to create separate sequential models for each embedding
import tensorflow as tf
with tf.device('/GPU:0'):
    embed_branches = []
    X_ints_train = []
    X_ints_test = []
    all_inputs = []
    all_branch_outputs = []

    # ---- wide branch: one embedding per group of crossed columns ----
    for cols in cross_columns:
        # encode crossed columns as ints for the embedding
        enc = LabelEncoder()

        # create crossed labels: join each row's values of the crossed
        # columns with '_' so every value combination becomes one category
        X_crossed_train = df_train[cols].apply(lambda x: '_'.join(x), axis=1)
        X_crossed_test = df_test[cols].apply(lambda x: '_'.join(x), axis=1)

        # fit on train and test together so transform() never meets an unseen label
        enc.fit(np.hstack((X_crossed_train.values,  X_crossed_test.values)))
        X_crossed_train = enc.transform(X_crossed_train)
        X_crossed_test = enc.transform(X_crossed_test)
        X_ints_train.append( X_crossed_train )
        X_ints_test.append( X_crossed_test )

        # get the number of categories from the encoder itself: it was fit on
        # train + test, so codes that only occur in the test split are valid
        # embedding indices too (max(train codes) + 1 could be too small)
        N = len(enc.classes_)

        # create embedding branch from the number of categories
        inputs = Input(shape=(1,),dtype='int32', name = '_'.join(cols))
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        x = Flatten()(x)
        all_branch_outputs.append(x)

    # merge the branches together
    wide_branch = concatenate(all_branch_outputs)

    # ---- deep branch: per-column embeddings plus the numeric features ----
    # reset this input branch
    all_branch_outputs = []
    # add in the embeddings
    for col in categorical_headers_ints:
        # encode as ints for the embedding
        train_codes = df_train[col].values
        test_codes = df_test[col].values
        X_ints_train.append( train_codes )
        X_ints_test.append( test_codes )

        # get the number of categories; look at both splits so the largest
        # code in the test data is still inside the embedding table
        N = int(max(train_codes.max(), test_codes.max())) + 1

        # create embedding branch from the number of categories
        inputs = Input(shape=(1,),dtype='int32', name=col)
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        x = Flatten()(x)
        all_branch_outputs.append(x)

    # also get a dense branch of the numeric features
    all_inputs.append(Input(shape=(X_train_num.shape[1],),sparse=False,name='numeric_data'))
    x = Dense(units=20, activation='relu')(all_inputs[-1])
    all_branch_outputs.append( x )

    # merge the branches together
    deep_branch = concatenate(all_branch_outputs)
    deep_branch = Dense(units=50,activation='relu')(deep_branch)
    deep_branch = Dense(units=10,activation='relu')(deep_branch)

    # final stage: one extra hidden layer before the sigmoid output unit
    final_branch = concatenate([wide_branch, deep_branch])
    final_branch = Dense(units=50,activation='sigmoid')(final_branch)
    final_branch = Dense(units=1,activation='sigmoid')(final_branch)

    model3 = Model(inputs=all_inputs, outputs=final_branch)

    model3.compile(optimizer='Adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # inputs are passed in the order the Input layers were created:
    # crossed columns, then categorical columns, then the numeric matrix
    model3.fit(X_ints_train+ [X_train_num],
            y_train, epochs=10, batch_size=32, verbose=1)

In [None]:
# Evaluate model3 on the held-out test set. X_ints_test is the list built by
# the model3 cell, so its order matches model3's input layers; predicting with
# the older `model` here would score a different network than the one just trained.
# np.round thresholds the sigmoid outputs at 0.5 to get hard 0/1 predictions.
yhat = np.round(model3.predict(X_ints_test + [X_test_num]))
print(mt.confusion_matrix(y_test,yhat),mt.accuracy_score(y_test,yhat))

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Convert `model` to a Graphviz dot graph, then render it as an inline SVG
# (the SVG object is the cell's last expression, so the notebook displays it).
model_graph = model_to_dot(model)
SVG(model_graph.create(prog='dot', format='svg'))

In [None]:
# Same rendering as for `model`, this time for the deeper-final-stage network.
model3_graph = model_to_dot(model3)
SVG(model3_graph.create(prog='dot', format='svg'))

## SKLearn MLP 

In [None]:
from sklearn.neural_network import MLPClassifier

# Baseline: scikit-learn MLP on a one-hot encoding of the categorical columns.
df_dummies = pd.get_dummies(df, columns=categorical_headers)

# stratify on the label so both splits keep the same class balance
df_dummies_train, df_dummies_test = train_test_split(df_dummies, test_size=0.2, stratify=df_dummies['readmitted'])
y_train_sk = df_dummies_train['readmitted']
y_test_sk = df_dummies_test['readmitted']
# drop the label from the feature frames without mutating the split slices in place
df_dummies_train = df_dummies_train.drop(columns=['readmitted'])
df_dummies_test = df_dummies_test.drop(columns=['readmitted'])

# Train on integer class labels directly; the previous byte-string ("|S6")
# round trip only existed to be cast back to int for every metric call.
clf = MLPClassifier()
clf.fit(X=df_dummies_train.values, y=y_train_sk.values.astype(int))
yhat_sk = clf.predict(df_dummies_test.values)

# confusion matrix and recall on the held-out split
y_true_sk = y_test_sk.values.astype(int)
print(mt.confusion_matrix(y_true_sk, yhat_sk), mt.recall_score(y_true_sk, yhat_sk))

## Visualizing some weights (exploratory)
We'll see how this goes. This section is an informal experiment rather than a finished analysis.

In [None]:
# Draw a random 10% subsample of each split (train first, then test) so the
# weight-inspection experiment below trains quickly.
# NOTE(review): no random_state is passed, so every run draws different rows;
# anything derived from an earlier sample must be rebuilt after this cell.
sample_train, sample_test = (split.sample(frac=0.1) for split in (df_train, df_test))

# matching label columns for the sampled rows
y_train_sample, y_test_sample = sample_train['readmitted'], sample_test['readmitted']

In [None]:
# Model 4: the basic wide-and-deep network trained on the 10% samples, keeping
# handles on the wide embedding layers so their weights can be inspected.
# we need to create separate sequential models for each embedding
import tensorflow as tf
with tf.device('/GPU:0'):
    embed_branches = []  # Embedding layers of the wide branch, kept for weight inspection
    X_ints_train = []
    X_ints_test = []
    all_inputs = []
    all_branch_outputs = []

    # ---- wide branch: one embedding per group of crossed columns ----
    for cols in cross_columns:
        # encode crossed columns as ints for the embedding
        enc = LabelEncoder()

        # create crossed labels: join each row's values of the crossed
        # columns with '_' so every value combination becomes one category
        X_crossed_train = sample_train[cols].apply(lambda x: '_'.join(x), axis=1)
        X_crossed_test = sample_test[cols].apply(lambda x: '_'.join(x), axis=1)

        # fit on train and test together so transform() never meets an unseen label
        enc.fit(np.hstack((X_crossed_train.values,  X_crossed_test.values)))
        X_crossed_train = enc.transform(X_crossed_train)
        X_crossed_test = enc.transform(X_crossed_test)
        X_ints_train.append( X_crossed_train )
        X_ints_test.append( X_crossed_test )

        # get the number of categories from the encoder itself: it was fit on
        # train + test, so codes that only occur in the test sample are valid
        # embedding indices too (max(train codes) + 1 could be too small)
        N = len(enc.classes_)

        # create embedding branch from the number of categories
        inputs = Input(shape=(1,),dtype='int32', name = '_'.join(cols))
        all_inputs.append(inputs)
        embedding = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)
        embed_branches.append(embedding)
        x = embedding(inputs)
        x = Flatten()(x)
        all_branch_outputs.append(x)

    # merge the branches together
    wide_branch = concatenate(all_branch_outputs)
    # wide_branch is a tensor and has no get_weights(); the weights live on the
    # Embedding layers. fit() has not run yet, so these are the initial values --
    # evaluate the same expression after training to see the learned weights.
    print([layer.get_weights() for layer in embed_branches])

    # ---- deep branch: per-column embeddings plus the numeric features ----
    # reset this input branch
    all_branch_outputs = []
    # add in the embeddings
    for col in categorical_headers_ints:
        # encode as ints for the embedding
        train_codes = sample_train[col].values
        test_codes = sample_test[col].values
        X_ints_train.append( train_codes )
        X_ints_test.append( test_codes )

        # get the number of categories; look at both samples so the largest
        # code in the test data is still inside the embedding table
        N = int(max(train_codes.max(), test_codes.max())) + 1

        # create embedding branch from the number of categories
        inputs = Input(shape=(1,),dtype='int32', name=col)
        all_inputs.append(inputs)
        x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
        x = Flatten()(x)
        all_branch_outputs.append(x)

    # also get a dense branch of the numeric features
    # NOTE(review): X_train_num_sample is not built in this cell; it must hold
    # the numeric features of exactly the rows in sample_train -- confirm.
    all_inputs.append(Input(shape=(X_train_num_sample.shape[1],),sparse=False,name='numeric_data'))
    x = Dense(units=20, activation='relu')(all_inputs[-1])
    all_branch_outputs.append( x )

    # merge the branches together
    deep_branch = concatenate(all_branch_outputs)
    deep_branch = Dense(units=50,activation='relu')(deep_branch)
    deep_branch = Dense(units=10,activation='relu')(deep_branch)

    # combine wide and deep branches into a single sigmoid output unit
    final_branch = concatenate([wide_branch, deep_branch])
    final_branch = Dense(units=1,activation='sigmoid')(final_branch)

    model4 = Model(inputs=all_inputs, outputs=final_branch)

    model4.compile(optimizer='adagrad',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # inputs are passed in the order the Input layers were created:
    # crossed columns, then categorical columns, then the numeric matrix
    model4.fit(X_ints_train+ [X_train_num_sample],
            y_train_sample, epochs=10, batch_size=32, verbose=1)