In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
df = pd.read_csv('data/diabetic_data.csv', encoding = 'latin1',low_memory=False)

In [2]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('data/diabetic_data.csv', encoding = 'latin1',low_memory=False)
df.readmitted[df.readmitted == 'NO' ] = 0
df.readmitted[df.readmitted == '<30' ] = 1
df = df.drop(df[df.readmitted == '>30'].index)
df.drop(['encounter_id', 'patient_nbr', 'weight', 'payer_code', 'medical_specialty'], axis=1, inplace=True)
df.dropna(axis=1, how='all')
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['readmitted'])
#X = pd.get_dummies(X)

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
numeric_headers = ["time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications", "number_outpatient", "number_emergency", "number_inpatient", "number_diagnoses"]
categorical_headers = ['race',
 'gender',
 'age',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed']

df_train.replace(to_replace=' ?',value=np.nan, inplace=True)
df_train.dropna(inplace=True)
df_train.reset_index()

df_test.replace(to_replace=' ?',value=np.nan, inplace=True)
df_test.dropna(inplace=True)
df_test.reset_index()

encoders = dict()

int_categorical_headers = [
'admission_type_id',
'discharge_disposition_id',
'admission_source_id',
'diag_1',
'diag_2',
'diag_3',
]

for col in categorical_headers:
    df[col] = df[col].str.strip()
    df_train[col] = df_train[col].str.strip()
    df_test[col] = df_test[col].str.strip()
    

    encoders[col] = LabelEncoder()
    df[col+'_int'] = encoders[col].fit_transform(df[col])
    df_train[col+'_int'] = encoders[col].transform(df_train[col])
    df_test[col+'_int'] = encoders[col].transform(df_test[col])
    
    
    
for col in int_categorical_headers:
    df[col+'_int'] = df[col]
    df_train[col+'_int'] = df_train[col]
    df_test[col+'_int'] = df_test[col]

for col in numeric_headers:
    df_train[col] = df_train[col].astype(np.float)
    df_test[col] = df_test[col].astype(np.float)
    df[col] = df[col].astype(np.float)
    ss = StandardScaler()
    df[col] = ss.fit_transform(df[col].values.reshape(-1, 1))
    df_train[col] = ss.transform(df_train[col].values.reshape(-1, 1))
    df_test[col] = ss.transform(df_test[col].values.reshape(-1, 1))

    
df_train.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,metformin-rosiglitazone_int,metformin-pioglitazone_int,change_int,diabetesMed_int,admission_type_id_int,discharge_disposition_id_int,admission_source_id_int,diag_1_int,diag_2_int,diag_3_int
96868,Caucasian,Female,[80-90),1,1,7,-0.114867,0.826678,-0.805627,-0.224435,...,0,0,1,0,1,1,7,560,202,250.8
28590,Caucasian,Male,[70-80),2,1,1,-1.120853,-0.542491,2.096259,-0.701502,...,0,0,1,1,2,1,1,996,427,585.0
12694,Caucasian,Male,[70-80),1,1,7,-0.114867,0.623838,0.355127,-0.105168,...,0,0,0,1,1,1,7,185,599,599.0
13483,Caucasian,Male,[40-50),1,1,7,-0.785524,0.370288,-0.22525,-0.582235,...,0,0,1,0,1,1,7,813,342,250.0
68496,AfricanAmerican,Male,[40-50),1,1,7,-0.785524,-1.50598,-0.805627,0.014099,...,0,0,0,1,1,1,7,428,427,305.0


In [5]:
from sklearn import metrics as mt

In [6]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Input
from keras.layers import Embedding, Flatten, Merge, concatenate
from keras.models import Model

Using TensorFlow backend.


In [7]:
#from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())

## Starting work on the actual deep and wide network

In [8]:
categorical_headers_ints = [x+'_int' for x in categorical_headers]
df_num =  df[numeric_headers].values
y_train = df_train['readmitted'].values.astype(np.int)
y_test = df_test['readmitted'].values.astype(np.int)

In [9]:
# we need to create separate sequential models for each embedding
embed_branches = []
X_ints_train = []
X_ints_test = []
all_inputs = []
all_branch_outputs = []
X_train_num =  df_train[numeric_headers].values
X_test_num =  df_test[numeric_headers].values

for col in categorical_headers_ints:
    # encode as ints for the embedding
    X_ints_train.append( df_train[col].values )
    X_ints_test.append( df_test[col].values )

    
    # get the number of categories
    N = max(X_ints_train[-1]+1) # same as the max(df_train[col])
    
    # create embedding branch from the number of categories
    inputs = Input(shape=(1,),dtype='int32')
    all_inputs.append(inputs)
    x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
    x = Flatten()(x)
    all_branch_outputs.append(x)

# also get a dense branch of the numeric features
all_inputs.append(Input(shape=(X_train_num.shape[1],),sparse=False))
x = Dense(units=20, activation='relu')(all_inputs[-1])
all_branch_outputs.append( Dense(units=10,activation='relu')(x) )

# merge the branches together
final_branch = concatenate(all_branch_outputs)
final_branch = Dense(units=1,activation='sigmoid')(final_branch)

model = Model(inputs=all_inputs, outputs=final_branch)

model.compile(optimizer='sgd',
              loss='mean_squared_error',
              metrics=['accuracy'])

model.fit(X_ints_train + [X_train_num],
        y_train, epochs=10, batch_size=32, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ab3b9a6e80>

In [10]:
X_test_num =  df_test[numeric_headers].values
yhat = np.round(model.predict(X_ints_test + [X_test_num]))
print(mt.confusion_matrix(y_test,yhat),mt.accuracy_score(y_test,yhat))

[[10872   101]
 [ 2060   212]] 0.83684409211


In [11]:
cross_columns = [['gender','race'],
                 ['age', 'diag_1'],
                ['gender', 'diag_1']]

# we need to create separate sequential models for each embedding
embed_branches = []
X_ints_train = []
X_ints_test = []
all_inputs = []
all_branch_outputs = []

for cols in cross_columns:
    # encode as ints for the embedding
    enc = LabelEncoder()
    
    # create crossed labels
    X_crossed_train = df_train[cols].apply(lambda x: '_'.join(x), axis=1)
    X_crossed_test = df_test[cols].apply(lambda x: '_'.join(x), axis=1)
    
    enc.fit(np.hstack((X_crossed_train.values,  X_crossed_test.values)))
    X_crossed_train = enc.transform(X_crossed_train)
    X_crossed_test = enc.transform(X_crossed_test)
    print(X_crossed_train)
    X_ints_train.append( X_crossed_train )
    X_ints_test.append( X_crossed_test )
    
    # get the number of categories
    N = max(X_ints_train[-1]+1) # same as the max(df_train[col])
    
    # create embedding branch from the number of categories
    inputs = Input(shape=(1,),dtype='int32')
    all_inputs.append(inputs)
    x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
    x = Flatten()(x)
    all_branch_outputs.append(x)
    


[3 9 9 ..., 3 3 9]
[2751 2494 2071 ..., 1930 1406 2657]
[ 335 1157  632 ...,  471  416  816]


In [12]:
# merge the branches together
final_branch = concatenate(all_branch_outputs)
final_branch = Dense(units=1,activation='sigmoid')(final_branch)


model = Model(inputs=all_inputs, outputs=final_branch)

model.compile(optimizer='sgd',
              loss='mean_squared_error',
              metrics=['accuracy'])

model.fit(X_ints_train,
        y_train.astype(np.int), epochs=10, batch_size=32, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1accefafe10>

In [13]:

yhat = np.round(model.predict(X_ints_test))
print(mt.confusion_matrix(y_test.astype(np.int),yhat),mt.accuracy_score(y_test.astype(np.int),yhat))

[[10973     0]
 [ 2272     0]] 0.828463571159
