In [1]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn import tree
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
import time
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from tensorflow import keras
import keras_tuner
from keras.models import Model
from keras import layers
from keras import Input
from keras.layers import Dense, LeakyReLU, ReLU, Conv1D
from tensorflow.keras.utils import plot_model 
from imblearn.over_sampling import SMOTE,SVMSMOTE


2024-04-11 16:08:01.296877: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def prep_demo_data(y_train, y_test):
    X = pd.read_csv('train/train_demos.csv')
    X = X.set_index('patient_id')
    cat = ['gender', 'insurance', 'marital_status', 'ethnicity']
    # ONE HOT
    enc = OneHotEncoder()
    X_encoded = enc.fit_transform(X[cat])
    X_encoded = pd.DataFrame.sparse.from_spmatrix(X_encoded)
    X_encoded.index = X.index

    # LABEL
    # encoder = LabelEncoder()
    # X_encoded = X.copy()
    # for var in cat:
    #     X_encoded[var] = encoder.fit_transform(X_encoded[var])

    
    X = pd.concat([X.drop(cat, axis=1), X_encoded], axis=1)
    X['admittime'] = X.apply(lambda x: time.mktime(pd.Timestamp(x['admittime']).timetuple()), axis=1)
    X['admittime'] = X['admittime'] - X['admittime'].min()

    X_train = split(X, y_train.index)
    X_test = split(X, y_test.index)

    scaler = StandardScaler()
    scaler.fit(X_train[['age', 'admittime']])
    X_train[['age', 'admittime']] = scaler.transform(X_train[['age', 'admittime']])
    X_test[['age', 'admittime']] = scaler.transform(X_test[['age', 'admittime']])

    X_train.columns = X_train.columns.astype('str')
    X_test.columns = X_test.columns.astype('str')
    
    return X_train, X_test

def prep_signs_data(y_train, y_test):
    # CURRENTLY DROPS TIME COL
    signs = pd.read_csv('train/train_signs.csv')
    signs['charttime'] = pd.to_datetime(signs['charttime'])
    
    
    first_time_row = signs.groupby('patient_id')['charttime'].first()
    signs['firsttime'] = signs['patient_id'].map(first_time_row)
    # Sets the index as the time from the first reading so all patients start at 0 and go toward 24 hours
    signs['timediff'] = pd.to_numeric(signs['charttime'] - signs['firsttime'])
    signs = signs.drop(['charttime','firsttime'],axis=1)
    
    aggs = signs.groupby('patient_id').agg(['mean', 'min', 'max', 'first', 'last'])
    X_train = split(aggs, y_train.index)
    X_test = split(aggs, y_test.index)

    scaler = StandardScaler()
    features = X_train.columns
    id = X_train.index
    id_test = X_test.index

    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    X_train.columns = ['_'.join(x) for x in features]
    X_train.index = id
    X_test.columns = ['_'.join(x) for x in features]
    X_test.index = id_test

    # columns with more than 10% null values, drop these (10 columns, 2 metrics)
    drop_cols = X_train.columns[X_train.isna().sum() / X_train.shape[0] > .1] # should be just train set
    X_train = X_train.drop(columns=drop_cols)
    X_test = X_test.drop(columns=drop_cols)

    # is this the best way to do it??
    X_train = X_train.fillna(X_train.mean())
    X_test = X_test.fillna(X_train.mean()) # note train mean

    return X_train, X_test

def prep_radiology_data(y_train, y_test):
    df = pd.read_csv('train/train_radiology.csv')
    df = df.groupby('patient_id').agg({'text': ['sum']})

    
    X_train = split(df, y_train.index)
    X_test = split(df, y_test.index)    
    
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=5, max_features=100, stop_words="english")
    vec_train = vectorizer.fit_transform(X_train['text']['sum'])
    vec_test = vectorizer.transform(X_test['text']['sum'])
    X_train = pd.concat([X_train.drop(columns=['text']), pd.DataFrame(vec_train.toarray(), index=y_train.index)], axis=1)
    X_test = pd.concat([X_test.drop(columns=['text']), pd.DataFrame(vec_test.toarray(), index=y_test.index)], axis=1)
    
    # X_train[('charttime','first')] = pd.to_datetime(X_train[('charttime','first')])
    # X_train[('charttime','last')] = pd.to_datetime(X_train[('charttime','last')])
    # X_test[('charttime','first')] = pd.to_datetime(X_test[('charttime','first')])
    # X_test[('charttime','last')] = pd.to_datetime(X_test[('charttime','last')])

    scaler = StandardScaler()
    features = X_train.columns
    id = X_train.index
    id_test = X_test.index

    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    X_train.columns = features
    X_train.index = id
    X_test.columns = features
    X_test.index = id_test

    return X_train, X_test
def train_test(balance_train=False):
    # returns the train test split labels
    y = pd.read_csv('train/train_labels.csv')
    y = y.set_index('patient_id')
    y_train = y.sample(n=int(y.shape[0] * .8))
    y_test = y.drop(y_train.index)
    if balance_train:
        # RUN THIS CELL IF YOU WANT AN EVEN DATA CLASS BALANCE
        keep_ids = y_train[y_train['label'] == 0].sample(n=y_train['label'].sum()).index
        ys = y_train[y_train['label'] == 1].index
        y_train = y_train.loc[[*keep_ids, *ys]]
    return y_train, y_test

def split(X, index):
    return X.loc[index]

In [3]:
y_train, y_test = train_test(balance_train=True)

In [4]:
rad_train, rad_test = prep_radiology_data(y_train, y_test)
demo_train, demo_test = prep_demo_data(y_train, y_test)
sign_train,sign_test = prep_signs_data(y_train,y_test)

X_train = pd.concat([rad_train,sign_train, demo_train], axis=1)
X_test = pd.concat([rad_test,sign_test,  demo_test], axis=1)
X_just_rad_train = rad_train
X_jst_rad_test = rad_test
X_train.columns = X_train.columns.astype('str')
X_test.columns = X_test.columns.astype('str')
print(X_train.shape,X_test.shape)

(2090, 265) (2742, 265)


In [20]:
# Function for running a random search over a keras model, acts as a first pass for model design
auc = keras.metrics.AUC()
loss = keras.losses.BinaryCrossentropy()
def build_random_model(hp):
    
    model = keras.Sequential()
    j = 0
    for i in range(1, hp.Int("num_layers", 2, 4)):
        model.add(
            keras.layers.Dense(
                units=hp.Choice("units_" + str(i),[100,1000]),
                kernel_regularizer=keras.regularizers.L1L2(l1=1e-2,l2=1e-2),
                activation="relu",
                name='Hidden-Layer-'+str(i))
            )
        model.add(keras.layers.Dropout(0.5)) 
    
    
    model.add(keras.layers.Dense(1, activation='sigmoid',name='Output'))
    
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(optimizer=opt, loss=loss,metrics=[auc])
    return model

In [21]:
tuner = keras_tuner.RandomSearch(
    build_random_model,
    objective=keras_tuner.Objective('val_loss','min'),
    max_trials=300,
    overwrite=True,
    directory="random_search",
    project_name="v1"
)

tuner.search(X_train, y_train, epochs=200, validation_split=0.1,batch_size=128)

Trial 28 Complete [00h 00m 27s]
val_loss: 0.8213372230529785

Best val_loss So Far: 0.6912960410118103
Total elapsed time: 00h 18m 28s


In [5]:
# Second pass grid search centered on best params found in random search
auc = keras.metrics.AUC()
loss = keras.losses.BinaryCrossentropy()
best_params = tuner.get_best_hyperparameters()[0].values
print(best_params)
def build_grid_model(hp):
    
    model = keras.Sequential()
    
    for i in range(1, best_params['num_layers']+1):
        units = "units_" + str(i)
        name = 'Hidden-Layer-'+str(i)
        model.add(
            keras.layers.Dense(
                units=hp.Int(units, min_value=32, max_value=1024, step=32, default=best_params[units]),
                kernel_regularizer=keras.regularizers.L1L2(l1=1e-4,l2=1e-3),
                activation="relu",
                name=name,
            ))
        
        # Add dropout to all layers except last one
        if i != 5:
            model.add(keras.layers.Dropout(0.5))
    
    
    model.add(keras.layers.Dense(1, activation='sigmoid',name='Output'))
    
    model.compile(optimizer='adam', loss=loss,metrics=[auc])
    return model

NameError: name 'tuner' is not defined

In [None]:
# Tune binary cross entropy directly as that is the loss function in the model and training AUC may lead to overfitting
tuner = keras_tuner.GridSearch(
    build_grid_model,
    objective=keras_tuner.Objective('val_'+loss.name,'min'),
    max_trials=300,
    overwrite=True,
    directory="grid_search",
    project_name="v1"
)

tuner.search(X_train, y_train, epochs=15, validation_split=0.2)

In [None]:
# Random search result :0.873 val AUC
# {'num_layers': 5, 'units_1': 512, 'l1': 0.0001, 'l2': 0.001, 'dropout_1': 0.1, 'units_2': 288, 'dropout_2': 0.2, 'units_3': 320, 'dropout_3': 0.1, 'units_4': 96, 'dropout_4': 0.1, 'units_5': 128, 'dropout_5': 0.0}
best_params = tuner.get_best_hyperparameters()

for params in best_params:
    print(params.values)