## <div style="text-align: center"> A Comparative Study of Text Classification of Unstructured Medical Notes with Various Levels of Class Imbalance  </div> 
### <div style="text-align: center"> Using Convolutional Neural Networks and Typical Sequence Neural Networks </div>
<div style="text-align: right"> Hongxia Lu, Louis Ehwerhemuepha, Cyril Rakovski </div>
<div style="text-align: right"> Date: 8/20/2021 </div>

## Table of contents
* Part 1: Import Data
* Part II: NLP Preprocessing
* Part III: Modelling

## Part I: Import Data
### About the data: 

* Extracted from: https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/
* Each ID has a text document (discharge summary)
* Each text document was annotated with the presence of one or more disease (16 types of diseases)
* Here we focus on the presence/absence of one disease condition at a time as a binary classification problem

### 1. Import Libraries

In [187]:
import warnings
warnings.filterwarnings('ignore')

# For importing .xml files
import xml.etree.ElementTree as ET

# For handeling dataframes
import pandas as pd
import numpy as np
import re                                  # library for regular expression operations
import string                              # for string operations

import matplotlib.pyplot as plt            # for plotting

# For text preprocessing
import nltk                                # Natural Language Toolkit
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer as ps  # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
from wordcloud import WordCloud

# For building neural netwrok models
import sklearn
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers, losses
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout, Bidirectional
from keras.optimizers import SGD, Adam

# For model evaluation
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve
from sklearn.metrics import roc_auc_score
import time

In [189]:
# fix random seed for reproducibility
rd_seed = 1
np.random.seed(rd_seed)
tf.random.set_seed(rd_seed)

### 2. Import text data
The datasets on the Harvard website come as multiple datasets in .xml format, and the text data and labels data are stored in separate files. After importing all the text data and the label data, we combine them into one single dataset.

In [190]:
# Read the 1st set of text data
file = 'data/obesity_patient_records_training.xml'
root = ET.parse(file).getroot()

text = []
ids = []
for i, child in enumerate(root):
    for _, subchild in enumerate(child):
        temp = [element.text for element in subchild]
        text.append(temp[0])
        ids.append(subchild.attrib["id"])

# Create a dataframe 
text_1 = pd.DataFrame(list(zip(ids, text)), columns=["id", "text"])

In [191]:
# Read the 2nd set of text data
file = 'data/obesity_patient_records_training2.xml'
root = ET.parse(file).getroot()

text = []
ids = []
for i, child in enumerate(root):
    for _, subchild in enumerate(child):
        temp = [element.text for element in subchild]
        text.append(temp[0])
        ids.append(subchild.attrib["id"])

# Create a dataframe 
text_2 = pd.DataFrame(list(zip(ids, text)), columns=["id", "text"])

In [192]:
# Read the 3rd set of text data
file = 'data/obesity_patient_records_test.xml'
root_test = ET.parse(file).getroot()

text = []
ids = []
for i, child in enumerate(root_test):
    for _, subchild in enumerate(child):
        temp = [element.text for element in subchild]
        text.append(temp[0])
        ids.append(subchild.attrib["id"])

# Create a dataframe 
text_3 = pd.DataFrame(list(zip(ids, text)), columns=["id", "text"])

In [193]:
# Combine all three text datasets
text_all = text_1.append(text_2.append(text_3))

In [194]:
# Check for duplicates
np.sum(text_all.duplicated('id'))

0

### 3. Import labels data

In [195]:
# Read the 1st set of the labels data
file = 'data/obesity_standoff_intuitive_annotations_training.xml'
root_truth = ET.parse(file).getroot()

disease = []
label = []
ids = []
for i, child in enumerate(root_truth):
    for _, subchild in enumerate(child):       
        for _, element in enumerate(subchild):     
            disease.append(subchild.attrib['name'])
            label.append(element.attrib['judgment'])
            ids.append(element.attrib["id"])

# Create a dataframe 
labels_1 = pd.DataFrame(list(zip(ids, label, disease)), columns=["id", "label", "disease"])

In [196]:
# Read the 2nd set of the labels data
file = 'data/obesity_standoff_annotations_training_addendum3.xml'
root_test = ET.parse(file).getroot()

disease = []
label = []
ids = []
for i, child in enumerate(root_truth):
    for _, subchild in enumerate(child):       
        for _, element in enumerate(subchild):     
            disease.append(subchild.attrib['name'])
            label.append(element.attrib['judgment'])
            ids.append(element.attrib["id"])

# Create a dataframe 
labels_2 = pd.DataFrame(list(zip(ids, label, disease)), columns=["id", "label", "disease"])

In [197]:
# Read the 3rd set of the labels data
file = 'data/obesity_standoff_annotations_test_intuitive.xml'
root_truth = ET.parse(file).getroot()

disease = []
label = []
ids = []
for i, child in enumerate(root_truth):
    for _, subchild in enumerate(child):       
        for _, element in enumerate(subchild):     
            disease.append(subchild.attrib['name'])
            label.append(element.attrib['judgment'])
            ids.append(element.attrib["id"])

# Create a dataframe 
labels_3 = pd.DataFrame(list(zip(ids, label, disease)), columns=["id", "label", "disease"])

In [198]:
# Combine the 3 labels datasets
labels_all = labels_3.append(labels_1.append(labels_2))

In [199]:
# One patient can have multiple diseases
labels_all["disease"].value_counts()

Gout                    1696
OSA                     1689
Gallstones              1677
Hypertriglyceridemia    1660
Depression              1641
Diabetes                1623
Asthma                  1615
OA                      1594
PVD                     1579
CAD                     1562
Obesity                 1555
Hypertension            1508
Venous Insufficiency    1479
Hypercholesterolemia    1437
GERD                    1402
CHF                      924
Name: disease, dtype: int64

In [200]:
labels_all.shape

(24641, 3)

### 4. Create a dataset from the labels data that only shows the presence of the disease of interest
There are 16 disease conditions in the original labels data. We work with one disease at a time.

In [201]:
# In the disease column, code the disease of interest as 1 and other disease types as 0 to indicate 
# whether this row is about this particular disease
labels_all["disease"] = [1 if x=="Venous Insufficiency" else 0 for x in labels_all["disease"]] 

# In the label column, code Y as 1 and everything else as 0 to indicate whether a disease is present
# (it may or may not be about this particular disease)
labels_all["label"] = [1 if x=="Y" else 0 for x in labels_all["label"]] 

# Create a "This_Disease" column that indicates whether a patient has this particular disease or not
# A patient is coded as having this disease only when (disease is 1) AND (label is 1)
labels_all["This_Disease"] = labels_all["disease"] * labels_all["label"] 

In [202]:
# Now that all useful info in "disease" and "Label" are combined in column "This_Disease", we no longer need disease and label
labels_all = labels_all.iloc[:, [0,3]]
# There are duplicates after removing disease and label because there are 0's in the "disease" column if the patient had other disease info
cad_labels = labels_all.drop_duplicates(inplace = True)

### 5. Combine the text data and the labels data

In [203]:
# Combine the x_test and y_test sets
df = text_all.merge(labels_all, on="id", how="left")

In [204]:
# Some text data don't have labels
df.isnull().sum()

# Remove the data that don't have labels
df.dropna(axis=0, inplace=True)

In [205]:
# In column "This_Disease", if one id has both a 0 and a 1, take 1 and delete 0
# This happened when creating column "This_Disease", there is always a 0 for the ids that had other disease info
# Those disease condistions that are not of interest would result in a 0 in column "This_Disease".
df["this_disease"] = df.groupby(["id", "text"], squeeze=True)["This_Disease"].transform(lambda x: np.max(x))

# Remove the duplicates and the columns that are no longer needed
df = df.iloc[:,[0,1,3]].drop_duplicates()

In [206]:
df.shape

(1116, 3)

In [207]:
df.head(3)

Unnamed: 0,id,text,this_disease
0,1,\n490646815 | WMC | 31530471 | | 9629480 | 11/...,0.0
1,2,\n159644670 | VH | 60656526 | | 6334749 | 11/2...,0.0
2,4,\n368346277 | EMH | 64927307 | | 815098 | 3/29...,0.0


In [208]:
# Disease prevalence (this also shows how balanced/imblanced the two classes are in the data)
np.sum(df.this_disease)/len(df)

0.06541218637992832

### 6. Descriptive statistics before cleaning

In [209]:
# Descriptive statistics of the number of words after cleaning
print("Number of words before cleaning")
print("Quantiles: ", np.round(np.quantile([len(x.split()) for x in df["text"]], q = [0, 0.25, 0.5, 0.75, 1])))
print("Mean: ", round(np.mean([len(x.split()) for x in df["text"]])))
print("Standard deviation: ", round(np.std([len(x.split()) for x in df["text"]])))

Number of words before cleaning
Quantiles:  [ 146.  809. 1070. 1416. 4280.]
Mean:  1157
Standard deviation:  506


In [212]:
# Descriptive statistics of the number of characters after cleaning
print("Number of characters before cleaning")
print("Quantiles: ", np.round(np.quantile([len(x) for x in df["text"]], q = [0, 0.25, 0.5, 0.75, 1])))
print("Mean: ", round(np.mean([len(x) for x in df["text"]])))
print("Standard deviation: ", round(np.std([len(x) for x in df["text"]])))

Number of characters before cleaning
Quantiles:  [  903.  4741.  6286.  8354. 25842.]
Mean:  6790
Standard deviation:  2957


## Part II: NLP Preprocessing

### 1. Convert to lower case and remove noises

In [171]:
# Convert all words in text column to lower case in both the train and test sets
df['text'] = [str(x).lower() for x in df['text']]

In [172]:
# Remove noise
df['text']  = [re.sub("(\W|\d+|\n)", " ", elem).strip() for elem in df['text']] # remove spaces and digits and line breaks

In [173]:
# Remove punctuations
def remove_punctuations(text):
    return ' '.join(['' if (elem in string.punctuation) else elem for elem in text.split()])

df['text'] = [remove_punctuations(elem) for elem in df['text']]

### 2. Remove stop words and short words

Stop words are the most common words in any natural language which do not add much value in NLP modelling. They include words such as "the", "is", "in", "for", "where", "when", "to", "at".
Based on the domain, customized stop words can also be removed.

In [174]:
# Download the stopwords from NLTK
nltk.download('stopwords')

# Import the standard English stop words list from NLTK
stopwords_english = stopwords.words('english') 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [175]:
# Remove standard English stop words
def remove_stopwords(text):
    return ' '.join(['' if (elem in stopwords_english) else elem for elem in text.split()])

df['text'] = [remove_stopwords(elem) for elem in df['text']]

In [176]:
# Remove words with a length of one or two
def remove_short_words(text):
    return ' '.join(['' if (len(elem) <= 2) else elem for elem in text.split()])

df['text'] = [remove_short_words(elem) for elem in df['text']]

### 3. Descriptive statistics after cleaning

In [186]:
# Descriptive statistics of the number of words after cleaning
print("Number of words before cleaning")
print("Quantiles: ", np.round(np.quantile([len(x.split()) for x in df["text"]], q = [0, 0.25, 0.5, 0.75, 1])))
print("Mean: ", round(np.mean([len(x.split()) for x in df["text"]])))
print("Standard deviation: ", round(np.std([len(x.split()) for x in df["text"]])))

Number of words before cleaning
Quantiles:  [  66.  414.  549.  730. 2212.]
Mean:  591
Standard deviation:  254


In [185]:
# Descriptive statistics of the number of characters after cleaning
print("Number of characters before cleaning")
print("Quantiles: ", np.round(np.quantile([len(x) for x in df["text"]], q = [0, 0.25, 0.5, 0.75, 1])))
print("Mean: ", round(np.mean([len(x) for x in df["text"]])))
print("Standard deviation: ", round(np.std([len(x) for x in df["text"]])))

Number of characters before cleaning
Quantiles:  [  542.  3262.  4327.  5785. 18134.]
Mean:  4686
Standard deviation:  2023


### 4. Split the data into training and test sets

In [268]:
# Dummify the label variable event
event_categorical = to_categorical(df['this_disease'])

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(df['text'], event_categorical, test_size=0.25, random_state=rd_seed)

### 5. Tokenize the text data
What the Tokenizer does is:
* First, it creates a word-index dictionary based on word frequency, so that every word gets an integer value as the index (an integer between 1 and the maximum number of unique words in the texts. 0 is reserved for padding.)  
* Then, it transforms each text to a sequence of integers. It basically takes each word in the text, looks it up in the word-index dictionary, and replaces it with its corresponding index. 

In [269]:
# Tokenize the text
tokenizer = Tokenizer(num_words=5000) # get the frequency of all tokens and use the 5000 most common ones
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
vocab_size = len(tokenizer.word_index) + 1 # plus the reserved index 0

### 6. Pad the word sequences to make each sequence the same length
Most machine learning models (if not all) require the input data to be the same length. Text data often do not have the same length. However, we can pad the shorter text sequences with 0's to make their lengths the same. We can choose to pad the 0's at the beginning of a sequence (by setting padding="pre"), or at the end (by setting padding="post"). 

In [270]:
# Selected quantiles of the number of words in the texts to get an idea about the length of the text sequences
np.quantile([len(x) for x in x_train], q = [0, 0.5, 0.75, 0.9, 1])

array([ 104. ,  513. ,  692. ,  874.8, 2141. ])

In [271]:
np.quantile([len(x) for x in x_train], q = [0, 0.25, 0.5, 0.75, 1])

array([ 104.,  385.,  513.,  692., 2141.])

In [176]:
# Pad the sequences to make them to have the same length
maxlen = 525 # assumes the first 522 words are the most important, and make all sequences with length 522
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen) 
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

In [177]:
x_train.shape

(837, 525)

## Part III: Modeling

### 1. Define a function to evaluate model performance interms of Precision, Recall, and F1 Score
* Precision answers the question: what proportion of positive identifications was actually correct?
* Recall answers the question: what proportion of actual positives was identified correctly?
* F1 score, also called the F Score or the F Measure, conveys the balance between the precision and the recall

In [178]:
# Define a function to evaluate the model performance in terms of F1 score
def evaluate(model, X, y):
    pred = model.predict_classes(X)
    acc = np.sum(y == pred)/len(pred)

    tn, fp, fn, tp = confusion_matrix(y, pred).ravel()
    print("tn  fp fn tp")
    print(tn, fp, fn, tp)

    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    specificity = tn/(tn + fp)
    f1 = (2*precision*recall)/(precision + recall)
    print("\nAccuracy: " + str(round(acc, 3)) + "\nPrecision: " + str(round(precision,3)) + 
          "\nRecall: " + str(round(recall,3)) + 
          "\nSpecificity: " + str(round(specificity,3)) + "\nF1 Score: " + str(round(f1,3)))

### 2. CNN (Convolutional Neural Network) Model

* The batch size is a hyperparameter that defines the number of samples to work through before updating the internal model parameters (weights).
* The number of epochs is a hyperparameter that defines the number times that the learning algorithm will work through the entire training dataset.

In [179]:
# Set the number of epochs and the batch size
epochs = 20
batch_size = 32

In [180]:
cnnmodel = Sequential()
cnnmodel.add(layers.Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen))
cnnmodel.add(Conv1D(8, kernel_size=8, activation="relu"))
cnnmodel.add(MaxPooling1D(pool_size=2))
cnnmodel.add(Dropout(0.5))
cnnmodel.add(Flatten())
cnnmodel.add(Dense(units=2, activation='sigmoid'))
opt = keras.optimizers.Adam(learning_rate=0.003)
cnnmodel.compile(optimizer=opt, loss='binary_crossentropy', metrics=['AUC'])
cnnmodel.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 525, 64)           1519680   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 518, 8)            4104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 259, 8)            0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 259, 8)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2072)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 4146      
Total params: 1,527,930
Trainable params: 1,527,930
Non-trainable params: 0
____________________________________________

In [181]:
time_start = time.time()
cnnmodel.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = cnnmodel.evaluate(x_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = cnnmodel.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
time_end = time.time()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training Accuracy: 1.0000
Testing Accuracy:  0.9666


In [182]:
print("Running Time:", time_end - time_start)

Running Time: 36.179983615875244


In [183]:
# Predicted probabilities on test data
y_pred = cnnmodel.predict(x_test)

# AUC-ROC (area under the Receiver Operating Characteristic curve)
auc_roc = round(roc_auc_score(y_test[:,1], y_pred[:,1]),4)

# AUC-PR (area under the Precision Recall Curve)
precision, recall, thresholds = precision_recall_curve(y_test[:,1], y_pred[:,1])
auc_precision_recall = round(auc(recall, precision),4)

print("AUC_ROC: " + str(auc_roc))
print("AUC_PR: " + str(auc_precision_recall))

AUC_ROC: 0.8037
AUC_PR: 0.0995


In [184]:
# Accuracy and F1 Score
evaluate(cnnmodel, x_test, y_test[:,1])

tn  fp fn tp
269 0 10 0

Accuracy: 0.964
Precision: nan
Recall: 0.0
Specificity: 1.0
F1 Score: nan


### 3. RNN (Recurrent Neural Network) Model

In [None]:
# RNN model
rnnmodel = Sequential()
rnnmodel.add(layers.Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen))
rnnmodel.add(layers.SimpleRNN(units=8)) 
rnnmodel.add(Dropout(0.5))
rnnmodel.add(layers.Dense(2, activation='sigmoid'))
opt = keras.optimizers.Adam(learning_rate=0.003)
rnnmodel.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['AUC'])
rnnmodel.summary()

In [None]:
time_start = time.time()
rnnmodel.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = rnnmodel.evaluate(x_train, y_train)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = rnnmodel.evaluate(x_test, y_test)
print("Testing Accuracy:  {:.4f}".format(accuracy))
time_end = time.time()

In [None]:
print("Running Time:", time_end - time_start)

In [None]:
# Predicted probabilities on test data
y_pred = rnnmodel.predict(x_test)

# AUC-ROC (area under the Receiver Operating Characteristic curve)
auc_roc = round(roc_auc_score(y_test[:,1], y_pred[:,1]),4)

# AUC-PR (area under the Precision Recall Curve)
precision, recall, thresholds = precision_recall_curve(y_test[:,1], y_pred[:,1])
auc_precision_recall = round(auc(recall, precision),4)

print("AUC_ROC: " + str(auc_roc))
print("AUC_PR: " + str(auc_precision_recall))

In [None]:
# Accuracy and F1 Score
evaluate(rnnmodel, x_test, y_test[:,1])

### 3. GRU (Gated Recurrent Unit) Model

In [None]:
# GRU model
grumodel = Sequential()
grumodel.add(layers.Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen))
grumodel.add(layers.GRU(units=8)) 
grumodel.add(Dropout(0.5))
grumodel.add(layers.Dense(2, activation='sigmoid'))
opt = keras.optimizers.Adam(learning_rate=0.003)
grumodel.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['AUC'])
grumodel.summary()

In [None]:
time_start = time.time()
grumodel.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = grumodel.evaluate(x_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = grumodel.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
time_end = time.time()

In [None]:
print("Running Time:", time_end - time_start)

In [None]:
# Predicted probabilities on test data
y_pred = grumodel.predict(x_test)

# AUC-ROC (area under the Receiver Operating Characteristic curve)
auc_roc = round(roc_auc_score(y_test[:,1], y_pred[:,1]),4)

# AUC-PR (area under the Precision Recall Curve)
precision, recall, thresholds = precision_recall_curve(y_test[:,1], y_pred[:,1])
auc_precision_recall = round(auc(recall, precision),4)

print("AUC_ROC: " + str(auc_roc))
print("AUC_PR: " + str(auc_precision_recall))

In [None]:
# Accuracy and F1 Score
evaluate(grumodel, x_test, y_test[:,1])

### 4. LSTM (Long Short Term Memory) Model

In [None]:
# LSTM model
lstm_model = Sequential()
lstm_model.add(layers.Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen))
lstm_model.add(layers.LSTM(units=8)) 
lstm_model.add(Dropout(0.5))
lstm_model.add(layers.Dense(2, activation='sigmoid'))
opt = keras.optimizers.Adam(learning_rate=0.003)
lstm_model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])
lstm_model.summary()

In [None]:
time_start = time.time()
lstm_model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = lstm_model.evaluate(x_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = lstm_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
time_end = time.time()

In [None]:
print("Running Time:", time_end - time_start)

In [None]:
# Predicted probabilities on test data
y_pred = lstm_model.predict(x_test)

# AUC-ROC (area under the Receiver Operating Characteristic curve)
auc_roc = round(roc_auc_score(y_test[:,1], y_pred[:,1]),4)

# AUC-PR (area under the Precision Recall Curve)
precision, recall, thresholds = precision_recall_curve(y_test[:,1], y_pred[:,1])
auc_precision_recall = round(auc(recall, precision),4)

print("AUC_ROC: " + str(auc_roc))
print("AUC_PR: " + str(auc_precision_recall))

In [None]:
# Accuracy and F1 Score
evaluate(lstm_model, x_test, y_test[:,1])

## Bi-Directional LSTM

In [None]:
# Bi-directional LSTM model
bi_lstm_model = Sequential()
bi_lstm_model.add(layers.Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen))
bi_lstm_model.add(layers.Bidirectional(layers.LSTM(8)))
bi_lstm_model.add(Dropout(0.5))
bi_lstm_model.add(layers.Dense(2, activation='sigmoid'))
opt = keras.optimizers.Adam(learning_rate=0.003)
bi_lstm_model.compile(optimizer=opt,
              loss=losses.categorical_crossentropy,
              metrics=['AUC'])
bi_lstm_model.summary()

In [None]:
time_start = time.time()
bi_lstm_model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = bi_lstm_model.evaluate(x_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = bi_lstm_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
time_end = time.time()

In [None]:
print("Running Time:", time_end - time_start)

In [None]:
# Predicted probabilities on test data
y_pred = bi_lstm_model.predict(x_test)

# AUC-ROC (area under the Receiver Operating Characteristic curve)
auc_roc = round(roc_auc_score(y_test[:,1], y_pred[:,1]),4)

# AUC-PR (area under the Precision Recall Curve)
precision, recall, thresholds = precision_recall_curve(y_test[:,1], y_pred[:,1])
auc_precision_recall = round(auc(recall, precision),4)

print("AUC_ROC: " + str(auc_roc))
print("AUC_PR: " + str(auc_precision_recall))

In [None]:
# Accuracy and F1 Score
evaluate(bi_lstm_model, x_test, y_test[:,1])