## LSTM Model 1 - base model

The code was adapted from a blog post by Aishwarya Singh (2019), 'A Hands-On Introduction to Time Series Classification (with Python Code)'.

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.preprocessing import sequence
from keras.optimizers import Adam
from keras.models import load_model
from keras.callbacks import ModelCheckpoint

# Fix NumPy's global random seed for reproducibility.
# NOTE(review): only NumPy is seeded in this cell; the Keras/TensorFlow backend
# is not, so model results may still differ between runs -- confirm whether a
# backend seed is needed as well.
RANDOM_SEED = 7
np.random.seed(RANDOM_SEED)

Using TensorFlow backend.


In [2]:
# Load the sequence data (the targets are loaded separately from Seq_target.csv)
df = pd.read_csv('Seq_data_2.csv')

In [3]:
# Dimensions of the loaded table as (rows, columns)
df.shape

(216563, 77)

In [4]:
# Load target data (the 'relapse_in_24M' column is extracted from it later)
targets = pd.read_csv('Seq_target.csv')

In [5]:
# Dimensions of the target table as (rows, columns)
targets.shape

(11976, 1)

In [6]:
# Set the sequence length: the number of rows kept per patient after
# padding/truncation. 9 equals the median (50%) per-patient sequence length
# reported by the describe() output further down.
seq_len = 9

In [7]:
# Generate the sequences using Patient IDs: one 2-D array (all rows, all
# columns) per BrcId.
# A single groupby pass replaces the original per-patient boolean filter, which
# re-scanned the whole frame once for every unique BrcId.
# sort=False yields patients in order of first appearance -- the same order as
# df['BrcId'].unique() -- and each group keeps its rows in their original order.
patient_list = df['BrcId'].unique()
sequences = [patient_rows.values for _, patient_rows in df.groupby('BrcId', sort=False)]

# groupby skips rows whose BrcId is missing, whereas unique() reports NaN as a
# value; fail loudly if that ever makes the two disagree.
assert len(sequences) == len(patient_list), "BrcId contains missing values"

In [8]:
# Show the sequence stats: distribution of the number of rows per patient
len_sequences = list(map(len, sequences))
pd.Series(len_sequences).describe()

count    11976.000000
mean        18.083083
std         25.519482
min          2.000000
25%          6.000000
50%          9.000000
75%         19.000000
max        687.000000
dtype: float64

In [9]:
# Force every patient sequence to exactly seq_len rows:
#   - truncating='pre' drops rows from the start of longer sequences,
#     so the last seq_len rows are kept
#   - padding='pre' fills the start of shorter sequences with rows of -1
# NOTE(review): the models in this notebook have no Masking layer, so the -1
# padding rows are fed to the LSTM as ordinary inputs -- confirm this is intended.
sequences = sequence.pad_sequences(
    sequences,
    maxlen=seq_len,
    dtype='float',
    padding='pre',
    truncating='pre',
    value=-1,
)

In [10]:
# After padding/truncation every sequence should have the same length (seq_len)
len_sequences = list(map(len, sequences))
pd.Series(len_sequences).describe()

count    11976.0
mean         9.0
std          0.0
min          9.0
25%          9.0
50%          9.0
75%          9.0
max          9.0
dtype: float64

In [11]:
# Set the sequence data to df and targets to the target field.
# NOTE(review): this cell is not idempotent -- once it has run, targets is a
# NumPy array, so running it again fails on targets['relapse_in_24M'];
# re-run from the load cells instead.
df = sequences
targets = np.array(targets['relapse_in_24M'])

# The split below pairs df[i] with targets[i] purely by position, so the two
# must at least have the same length.
# NOTE(review): this also assumes the rows of Seq_target.csv are in the same
# patient order as df['BrcId'].unique() -- confirm.
assert len(df) == len(targets), (
    "sequence/target count mismatch: %d sequences vs %d targets" % (len(df), len(targets))
)

In [12]:
# Split training and testing - 70/30, by position (no shuffling is done here).
# The test count is the remainder rather than round(.3*n): rounding 0.7*n and
# 0.3*n independently does not always add up to n, which either drops a patient
# or indexes past the end of the arrays.
num_train_patients = round(.7*targets.shape[0])
num_test_patients = targets.shape[0] - num_train_patients

In [13]:
# Assign training and testing to contiguous index ranges: the first
# num_train_patients entries train, the following num_test_patients entries test.
# df and targets are NumPy arrays at this point, so plain slicing replaces the
# original per-index list comprehensions.
split_end = num_train_patients + num_test_patients
X_train = df[:num_train_patients]
X_test = df[num_train_patients:split_end]
y_train = targets[:num_train_patients]
y_test = targets[num_train_patients:split_end]

In [14]:
# Materialise each split as its own NumPy array
X_train, X_test, y_train, y_test = (
    np.array(split) for split in (X_train, X_test, y_train, y_test)
)

In [15]:
# Training split dimensions: (patients, seq_len, columns) -- BrcId is still included here
X_train.shape

(8383, 9, 77)

In [16]:
# Remove BrcID: drop column 0 along the feature axis of both splits.
# NOTE(review): assumes BrcId is the first column of Seq_data_2.csv -- confirm.
# NOTE(review): not idempotent -- every re-run of this cell removes one more column.
X_train, X_test = (np.delete(split, 0, axis=2) for split in (X_train, X_test))

In [17]:
# Training split dimensions after the ID column was removed (one column fewer)
X_train.shape

(8383, 9, 76)

### Grid Search
This code was adapted from a blog post by Jason Brownlee (2019) called 'How to Grid Search Hyperparameters for Deep Learning Models in Python With Keras'.

In [None]:
# Use scikit-learn to grid search the batch size and epochs
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# Function to create model, required for KerasClassifier
def create_model():
    """Build and compile the baseline classifier: LSTM(100) -> Dense(1, sigmoid).

    Reads the notebook-level ``seq_len`` and ``X_train`` to size the input as
    (seq_len, number of feature columns).

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    recurrent = LSTM(100, input_shape=(seq_len, X_train.shape[2]))
    classifier_head = Dense(1, activation='sigmoid')
    net = Sequential([recurrent, classifier_head])

    # Binary classification -> binary_crossentropy loss; optimised with Adam
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

# Wrap the builder so scikit-learn can drive it as an estimator
model = KerasClassifier(build_fn=create_model, verbose=0)

# Candidate values explored by the grid search
batch_size = [1, 10, 20]
epochs = [2, 5, 10, 20]
param_grid = {'batch_size': batch_size, 'epochs': epochs}

# Cross-validated search over every batch_size/epochs combination
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_train, y_train)

# Summarize results: the winning combination first, then every combination tried
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for idx in range(len(params)):
    print("%f (%f) with: %r" % (means[idx], stds[idx], params[idx]))

In [None]:
# Use scikit-learn to grid search the dropout rate
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# Function to create model, required for KerasClassifier
def create_model(dropout=0.0, recurrent_dropout=0.0):
    """Build and compile LSTM(100) -> Dense(1, sigmoid) with configurable dropout.

    Reads the notebook-level ``seq_len`` and ``X_train`` to size the input as
    (seq_len, number of feature columns).

    Args:
        dropout: Passed to the LSTM layer's ``dropout`` argument.
        recurrent_dropout: Passed to the LSTM layer's ``recurrent_dropout`` argument.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    recurrent = LSTM(
        100,
        input_shape=(seq_len, X_train.shape[2]),
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
    )
    classifier_head = Dense(1, activation='sigmoid')
    net = Sequential([recurrent, classifier_head])

    # Binary classification -> binary_crossentropy loss; optimised with Adam
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

# Carry the batch size and epoch count forward instead of the original `?`
# placeholders (which are not valid Python and stop the cell from running).
# Lookup order: the settings the previous KerasClassifier was built with,
# overridden by the winner of the previous grid search. On the first run after
# the batch-size/epochs search the values therefore come from best_params_; if
# this cell is re-run, they come from the already-configured estimator.
# Both are read BEFORE model / grid_result are rebound below.
carried = dict(model.get_params())
carried.update(grid_result.best_params_)
tuned_epochs, tuned_batch_size = carried['epochs'], carried['batch_size']

# create model
model = KerasClassifier(build_fn=create_model, verbose=0, epochs=tuned_epochs, batch_size=tuned_batch_size)

# define the grid search parameters
dropout = [0.0, 0.2, 0.4, 0.6, 0.8]
recurrent_dropout = [0.0, 0.2, 0.4, 0.6, 0.8]

param_grid = dict(dropout=dropout, recurrent_dropout=recurrent_dropout)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_train, y_train)
# summarize results: the winning combination first, then every combination tried
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Use scikit-learn to grid search the number of hidden units
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# Function to create model, required for KerasClassifier
def create_model(units=10):
    """Build and compile LSTM(units) -> Dense(1, sigmoid).

    Reads the notebook-level ``seq_len`` and ``X_train`` to size the input as
    (seq_len, number of feature columns).

    Args:
        units: Number of hidden units in the LSTM layer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    recurrent = LSTM(units, input_shape=(seq_len, X_train.shape[2]))
    classifier_head = Dense(1, activation='sigmoid')
    net = Sequential([recurrent, classifier_head])

    # Binary classification -> binary_crossentropy loss; optimised with Adam
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

# Carry the batch size and epoch count forward instead of the original `?`
# placeholders (which are not valid Python and stop the cell from running).
# Lookup order: the settings the previous KerasClassifier was built with,
# overridden by anything the previous grid search selected. Both are read
# BEFORE model / grid_result are rebound below.
carried = dict(model.get_params())
carried.update(grid_result.best_params_)
tuned_epochs, tuned_batch_size = carried['epochs'], carried['batch_size']

# create model
model = KerasClassifier(build_fn=create_model, verbose=0, epochs=tuned_epochs, batch_size=tuned_batch_size)

# define the grid search parameters
units = [10, 20, 30, 50, 100]

param_grid = dict(units=units)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_train, y_train)
# summarize results: the winning value first, then every value tried
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

### Build Model with best hyperparameters from Grid Search

In [None]:
# Create model with LSTM layer.
# The number of hidden units is taken from the hidden-unit grid search (the
# most recent grid_result) in place of the original `?` placeholder, which is
# not valid Python.
# NOTE(review): no dropout / recurrent_dropout is passed here even though a
# dropout grid search was run earlier -- confirm that is intended for the base model.
best_units = grid_result.best_params_['units']
best_model = Sequential()
best_model.add(LSTM(best_units, input_shape=(seq_len, X_train.shape[2])))
best_model.add(Dense(1, activation='sigmoid'))

# Loss function is binary_crossentropy as it's binary classification, ADAM optimization algorithm used
best_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Model summary: print the layers of best_model with their output shapes and parameter counts
best_model.summary()

In [None]:
# Fit model
# epochs and batch_size are read from the KerasClassifier used in the last grid
# search (`model`), replacing the original `?` placeholders, which are not
# valid Python.
# NOTE(review): the test split is also used as validation data, so the
# validation curves are not independent of the final test metrics -- confirm
# this is acceptable.
search_settings = model.get_params()
history = best_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=search_settings['epochs'],
    batch_size=search_settings['batch_size'],
)

In [None]:
# Plot training and validation loss
# Code provided by Chollet (2018, p. 74)
import matplotlib.pyplot as plt

# history.history maps each tracked quantity to a list with one value per epoch
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
# 1-based epoch numbers for the x axis. This rebinds the `epochs` name used as
# a candidate list in the grid search; the accuracy plot cell reads it as well.
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, loss_values, 'bo', label='Training Loss')
plt.plot(epochs, val_loss_values, 'b', label='Validation Loss')
# Title spelling corrected ("Validaton" -> "Validation")
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Plot training and validation accuracy
# Code provided by Chollet (2018, p. 75)
# Uses history_dict and epochs from the loss-plot cell.
plt.clf()
# Depending on the Keras version, metrics=['accuracy'] is recorded under
# 'acc'/'val_acc' or 'accuracy'/'val_accuracy'; accept either instead of
# raising KeyError.
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
acc = history_dict[acc_key]
val_acc = history_dict['val_' + acc_key]
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
# Title spelling corrected ("Validaton" -> "Validation")
plt.title('Training and Validation Acc')
plt.xlabel('Epochs')
plt.ylabel('Acc')
plt.legend()
plt.show()

In [None]:
# Predict the probability for each test record (output of the model's sigmoid unit)
# Code provided by Jason Brownlee (2019) in 'How to Calculate Precision, Recall, F1, and More for Deep Learning Models'
yhat_prob = best_model.predict(X_test)

In [None]:
# Display the predicted probabilities
# Code provided by Jason Brownlee (2019) in 'How to Calculate Precision, Recall, F1, and More for Deep Learning Models'
yhat_prob

In [None]:
# Predict classes for test set
# Code provided by Jason Brownlee (2019) in 'How to Calculate Precision, Recall, F1, and More for Deep Learning Models'
# Sequential.predict_classes is deprecated and absent from newer Keras
# releases. For a single sigmoid output it is a 0.5 threshold on predict(),
# which is spelled out here so the cell works on either version.
yhat_classes = (best_model.predict(X_test) > 0.5).astype('int32')

In [None]:
# Calculate additional metrics on the test split
# Code provided by Jason Brownlee (2019) in 'How to Calculate Precision, Recall, F1, and More for Deep Learning Models'
from sklearn import metrics

accuracy = metrics.accuracy_score(y_test, yhat_classes)
# precision: tp / (tp + fp)
precision = metrics.precision_score(y_test, yhat_classes)
# recall: tp / (tp + fn)
recall = metrics.recall_score(y_test, yhat_classes)
# f1: 2 tp / (2 tp + fp + fn)
f1 = metrics.f1_score(y_test, yhat_classes)

# Report the scores in the same order they were computed
for template, score in (
    ('Accuracy: %f', accuracy),
    ('Precision: %f', precision),
    ('Recall: %f', recall),
    ('F1 score: %f', f1),
):
    print(template % score)

In [None]:
# Confusion matrix of true test labels against predicted classes
# (scikit-learn convention: rows are true classes, columns are predicted classes)
# Code provided by Scikit-learn API Reference (2019) 
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, yhat_classes)

In [None]:
# True negatives, False positives, False negatives, True positives
# ravel() flattens the matrix row by row, which gives (tn, fp, fn, tp) for a 2x2 binary matrix
# NOTE(review): the 4-way unpack assumes both classes occur in y_test / yhat_classes -- confirm
# Code provided by Scikit-learn API Reference (2019) 
tn, fp, fn, tp = confusion_matrix(y_test, yhat_classes).ravel()
tn, fp, fn, tp

In [None]:
# Area Under ROC curve (AUC)
# Code provided by Scikit-learn API Reference (2019)
# The curve is built from the predicted probabilities (yhat_prob) rather than
# the thresholded class labels: hard 0/1 predictions yield a single operating
# point, so the resulting "AUC" describes only the 0.5 cut-off, not the ranking
# quality of the model. ravel() flattens the (n, 1) prediction array to 1-D.
fpr, tpr, thresholds = metrics.roc_curve(y_test, yhat_prob.ravel(), pos_label=1)
metrics.auc(fpr, tpr)