In [1]:
# BERT imports
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import BertTokenizer
import pandas as pd
import numpy as np
import string
import time
import re

In [2]:
import nltk                                # Natural Language Toolkit
from nltk.corpus import stopwords          # module for stop words that come with NLTK
import tensorflow as tf
import tensorflow.keras.losses

In [3]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve
from sklearn.metrics import roc_auc_score

In [4]:
# !pip install tensorflow_text

In [5]:
import tensorflow_hub as hub
import tensorflow_text as text

In [6]:
# Read the label table and the note-text table; the 'Unnamed: 0' column of each CSV is dropped
labels_all = pd.read_csv("data/labels_all.csv").drop(columns=['Unnamed: 0'])
text_all = pd.read_csv('data/text_all.csv').drop(columns=['Unnamed: 0'])

# Disease names. The position of a name in this list is the numeric suffix used for its
# "This_Disease_<i>" indicator column.
disease_types = [
    'Hypertriglyceridemia', 'Venous Insufficiency', 'Asthma', 'Gout',
    'OSA', 'PVD', 'Gallstones', 'OA',
    'GERD', 'Depression', 'Obesity', 'CHF',
    'Hypercholesterolemia', 'CAD', 'Diabetes', 'Hypertension',
]

In [7]:
# Make the outcome variables (disease types) binary
def make_binary(labels_df, diseases=None):
    """Turn long-format (id, disease, label) rows into one indicator row per id.

    For the i-th disease name a column "This_Disease_<i>" is created that is 1 on rows
    where `disease` equals that name AND `label` equals "Y", and 0 otherwise. The
    `disease` and `label` columns are then removed, exact duplicate rows are dropped,
    and the remaining columns are summed per `id`.

    Parameters
    ----------
    labels_df : pandas.DataFrame
        Must contain the columns "id", "disease" and "label". It is NOT modified;
        the function works on a copy so the calling cell can be re-run safely.
    diseases : sequence of str, optional
        Disease names, in column order. Defaults to the notebook-level
        `disease_types` list.

    Returns
    -------
    pandas.DataFrame
        One row per id, with "id" as the first column followed by the
        "This_Disease_<i>" columns.
    """
    if diseases is None:
        diseases = disease_types

    # Work on a copy: the original version dropped columns from the caller's frame in place.
    flags = labels_df.copy()

    # A disease counts as present only when the label is exactly "Y".
    is_present = flags["label"] == "Y"
    for i, disease in enumerate(diseases):
        flags["This_Disease_" + str(i)] = ((flags["disease"] == disease) & is_present).astype(int)

    # Everything useful from "disease" and "label" now lives in the indicator columns.
    flags = flags.drop(columns=["label", "disease"])
    # Rows that became identical (e.g. all-zero rows from an id's other diseases) are
    # collapsed so they are not counted more than once by the sum below.
    flags = flags.drop_duplicates()
    # Aggregate the indicator rows into a single row per id.
    return flags.groupby("id").aggregate("sum").reset_index()

In [8]:
# Collapse the long-format labels into one row of disease indicator columns per id
binary_labels = make_binary(labels_all)

# Per-disease prevalence: column totals divided by 1237, skipping the leading 'id' entry
# NOTE(review): 1237 is presumably the total number of patients -- confirm against the data
prevalences = binary_labels.sum().div(1237).iloc[1:]

# Attach the indicator columns to the notes, keeping every row of text_all
df = pd.merge(text_all, binary_labels, how="left", on="id")

# The intermediate frames are not needed beyond this point
del labels_all, text_all, binary_labels

In [10]:
# Fetch the NLTK stop-word corpus (NLTK reports when it is already up to date)
nltk.download("stopwords")

# English stop-word list shipped with NLTK; used by the text preprocessing step
stopwords_english = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def preprocessing(text, stop_words=None):
    """Normalise one document into lower-case, space-separated word tokens.

    Steps, in order:
      1. delete every character listed in `string.punctuation`;
      2. lower-case the text;
      3. drop whitespace-separated tokens that are stop words;
      4. replace remaining non-word characters, digit runs and line breaks with spaces;
      5. drop tokens of length <= 2.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : iterable of str, optional
        Tokens to remove in step 3. Defaults to the notebook-level
        `stopwords_english` list.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, with no leading or trailing
        whitespace ("" when nothing survives).
    """
    if stop_words is None:
        stop_words = stopwords_english
    stop_words = set(stop_words)  # constant-time membership tests instead of list scans

    # Remove punctuation, then convert to lower case
    text = text.translate(str.maketrans("", "", string.punctuation)).lower()
    # Remove stop words (tokens are dropped rather than blanked out)
    text = " ".join(word for word in text.split() if word not in stop_words)
    # Replace non-word characters, digits and line breaks with spaces.
    # Raw string: "\W" and "\d" are invalid escape sequences in a normal string literal.
    text = re.sub(r"(\W|\d+|\n)", " ", text)
    # Remove short words; filtering (instead of substituting '') avoids leaving
    # double or trailing spaces in the result
    return " ".join(word for word in text.split() if len(word) > 2)

In [12]:
# Clean every note in place with the preprocessing function
df['text'] = df['text'].map(preprocessing)

### Model:

In [14]:
# TF Hub handle of the BERT encoder (uncased English, L-12 / H-768 / A-12 per the handle name).
# The matching preprocessing model (bert_en_uncased_preprocess/3) is loaded separately via hub.load.
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

In [15]:
# Wrap the TF Hub encoder as a Keras layer; trainable=False keeps its weights frozen
bert_encoder = hub.KerasLayer(handle=encoder_url, trainable=False)

INFO:absl:Using C:\Users\Hanna\AppData\Local\Temp\tfhub_modules to cache modules.


In [17]:
# TF Hub preprocessing model for the uncased English BERT encoder; its `tokenize` and
# `bert_pack_inputs` functions are used when the classifier is assembled
preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
preprocessor = hub.load(preprocess_url)

In [22]:
# Define a function to evaluate the model performance in terms of F1 score
def evaluate(model, X, y):
    """Score a fitted binary classifier on (X, y).

    Parameters
    ----------
    model : object with a `predict(X)` method returning one score per sample
        (a Keras model with a single sigmoid output in this notebook).
    X : model inputs, passed to `model.predict` unchanged.
    y : array-like of 0/1 ground-truth labels, one per sample.

    Returns
    -------
    tuple
        (auc_roc, auc_pr, acc, precision, recall, specificity, f1).
        The two AUC values are rounded to 4 decimals. The threshold-based metrics
        use a 0.5 cut-off; a ratio whose denominator is zero (e.g. precision when
        nothing is predicted positive) is NaN.
    """
    # Predict once and reuse the scores -- the forward pass is the expensive part.
    y_score = np.asarray(model.predict(X)).ravel()
    y_true = np.asarray(y).ravel()

    # Hard labels at the 0.5 threshold
    pred = (y_score >= 0.5).astype(int)
    acc = np.sum(y_true == pred) / len(y_true)

    # labels=[0, 1] forces a 2x2 matrix even if only one class shows up in y_true and
    # pred; without it the 4-way unpacking below fails on a 1x1 matrix.
    tn, fp, fn, tp = confusion_matrix(y_true, pred, labels=[0, 1]).ravel()

    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    specificity = _safe_ratio(tn, tn + fp)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    # Threshold-free metrics computed from the raw scores
    auc_roc = round(roc_auc_score(y_true, y_score), 4)
    pre, rec, _ = precision_recall_curve(y_true, y_score)
    auc_pr = round(auc(rec, pre), 4)
    return (auc_roc, auc_pr, acc, precision, recall, specificity, f1)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")

In [25]:
# Run the model: for every disease, train and evaluate a BERT-based classifier on
# 10 stratified train/test splits and append the metrics to a CSV log.
X = df['text']

epochs = 3
max_length = 512
batch_size = 32
results_path = 'other/CNN_Paper_no_embedding_BERT.csv'  # append-only results log


def build_bert_classifier(seq_length):
    """Assemble and compile a fresh text -> probability model on top of the shared BERT encoder.

    The model takes raw strings, tokenizes and packs them to `seq_length` tokens with
    the TF Hub preprocessor, runs the shared `bert_encoder`, and feeds the pooled
    output through dropout -> Dense(128, relu) -> dropout -> Dense(1, sigmoid).
    The head layers are created anew on every call; `bert_encoder` is reused.

    Parameters
    ----------
    seq_length : int
        Sequence length passed to the preprocessor's `bert_pack_inputs`.

    Returns
    -------
    tf.keras.Model
        Compiled with the Adam optimizer, binary cross-entropy loss and the AUC metric.
    """
    # Tokenize batches of text inputs.
    text_inputs = [tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')]
    tokenize = hub.KerasLayer(preprocessor.tokenize)
    tokenized_inputs = [tokenize(segment) for segment in text_inputs]

    # Pack input sequences for the Transformer encoder.
    bert_pack_inputs = hub.KerasLayer(
        preprocessor.bert_pack_inputs,
        arguments=dict(seq_length=seq_length))
    encoder_inputs = bert_pack_inputs(tokenized_inputs)

    # BERT encoder followed by a small feed-forward classification head.
    outputs = bert_encoder(encoder_inputs)
    x = tf.keras.layers.Dropout(0.3, name="dropout")(outputs['pooled_output'])
    x = tf.keras.layers.Dense(128, activation='relu', name="dense")(x)
    x = tf.keras.layers.Dropout(0.3, name="dropout2")(x)
    y = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(x)

    model = tf.keras.Model(inputs=[text_inputs], outputs=[y])
    # Keep the encoder frozen. Addressing it by name instead of by position in
    # model.layers does not depend on the order Keras lists the layers in.
    bert_encoder.trainable = False
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return model


time_1 = time.time()

for i, dis_type in enumerate(disease_types):  # Go through each disease type

    event_categorical = df['This_Disease_' + str(i)]

    time_start = time.time()

    # Split the data 10 times into train and test sets with stratification
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=0)

    for j, (train_index, test_index) in enumerate(sss.split(X, event_categorical), start=1):

        # split() yields positional indices, so select by position (.iloc); plain
        # [] indexing is a label lookup and is only right for a default RangeIndex.
        x_train, x_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = event_categorical.iloc[train_index], event_categorical.iloc[test_index]

        iteration = "iter" + str(j)

        mymodel = build_bert_classifier(max_length)

        # Time only the training step
        time_s = time.time()
        mymodel.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
        time_e = time.time() - time_s

        # Collect and log evaluation metrics
        auc_roc, auc_pr, acc, precision, recall, specificity, f1 = evaluate(mymodel, x_test, y_test)

        with open(results_path, 'a') as fd:
            fd.write(f'{dis_type},{iteration},{auc_roc},{auc_pr},{acc},{precision},{recall},{specificity},{f1},{time_e}\n')

        # Drop references to this split's data and model before the next iteration
        del x_train, x_test, y_train, y_test, mymodel

    # Log the wall-clock time spent on all splits of this disease
    running_time = time.time() - time_start

    with open(results_path, 'a') as fd:
        fd.write(f'{dis_type}, 10 iteration running time, {running_time}\n')

Epoch 1/3


KeyboardInterrupt: 

In [None]:
# Seconds elapsed since time_1 was taken, appended as a final row of the results log.
# The row is tagged with dis_type, i.e. whichever disease the training loop assigned last.
time_2 = time.time() - time_1
total_row = f'{dis_type}, Total running time, {time_2}\n'
with open('other/CNN_Paper_no_embedding_BERT.csv', mode='a') as log_file:
    log_file.write(total_row)