the objective: use a LSTM model for prediction of sepsis shock using physionet2019 dataset


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

import time
# import custom libraries
import sys
sys.path.append("C:\\DATA\\Tasks\\lib\\hk")
import hk_psql
import dash_bootstrap_components

import plotly.express as px  # (version 4.7.0 or higher)
import plotly.graph_objects as go
from plotly.subplots import make_subplots


import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence


ADD_DATA = "C:\\DATA\\data\\raw\\physionet_challenge_2019\\"

%load_ext autoreload
%autoreload 2




## putting al .psv files into one .csv file

In [2]:
files = [f for f in os.listdir(ADD_DATA+'training_setA') ]
li=[]
for i,f in enumerate(files[:10000]):
    df = pd.read_csv(ADD_DATA+'training_setA\\'+f, delimiter='|')
    df['id'] = i
    li.append(df)
df_A = pd.concat(li, axis=0, ignore_index=True)
df_A.to_csv(ADD_DATA+'df_AA.csv', index=False)

In [3]:
files = [f for f in os.listdir(ADD_DATA+'training_setB') ]
li=[]
for i,f in enumerate(files[:10000]):
    df = pd.read_csv(ADD_DATA+'training_setB\\'+f, delimiter='|')
    df['id'] = i
    li.append(df)

df_B = pd.concat(li, axis=0, ignore_index=True)
df_B.to_csv(ADD_DATA+'df_BB.csv', index=False)

## Data preprocessing for LSTM

In [None]:
### Imputation

In [40]:
df_train = pd.read_csv(ADD_DATA+'df_AA.csv')
df_test = pd.read_csv(ADD_DATA+'df_BB.csv')
df_train.columns


Index(['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel', 'id'],
      dtype='object')

In [41]:
# a link for imputation tutorial
# https://towardsdatascience.com/how-to-fill-missing-data-with-pandas-8cb875362a0d

cols = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets']


df_train[cols] = df_train.groupby('id')[cols].ffill().bfill()
df_test[cols] = df_train.groupby('id')[cols].ffill().bfill()

df_train[cols] = df_train[cols].fillna( df_train[cols].mean() )
df_train[cols] = df_train[cols].fillna( 0 )

## Load train and test data

In [43]:
# Train data
# read csv and put data in a list of tensors (each patient)


train_data={'X':[], 'y':[]}
for id in df_train['id'].unique():
    train_data['X'].append(df_train[df_train['id']==id][cols].values)
    train_data['y'].append(df_train[df_train['id']==id]['SepsisLabel'].values)

print( train_data['X'][0].shape)   

(54, 34)


In [44]:
# Test data
# read csv and put data in a list of tensors (each patient)


test_data={'X':[], 'y':[]}
for id in df_test['id'].unique():
    test_data['X'].append(df_test[df_test['id']==id][cols].values)
    test_data['y'].append(df_test[df_test['id']==id]['SepsisLabel'].values)

print( test_data['X'][0].shape)  

(24, 34)


In [45]:
print(train_data['X'][0].shape, train_data['y'][0].shape)

(54, 34) (54,)


In [46]:
print('hi')

hi


In [47]:
aa=1

## Build LSTM model

### Params

In [48]:
# LSTM settings
DEBUG=True

# Hyper-parameters 
# input_size = 784 # 28x28
num_classes = 2
num_epochs = 100
batch_size = 8
learning_rate = 0.001

input_size = train_data['X'][0].shape[-1]
# sequence_length = 28 # it is variable in our example
hidden_size = 5
num_layers = 1

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device='cpu'
print(device)

cuda


### DataLoader


In [66]:
# create a DataLoader object

class MinimalDataset(Dataset):
    def __init__(self, X, y):
        super().__init__()

        self.X = list(map(lambda a: torch.tensor(a).float(), X))
        self.y = list(map(lambda a: torch.tensor(a).long(), y))

    def __getitem__(self, idx):
        
        X = self.X[idx]
        y = self.y[idx]
        
        sample = {"X": X, "y": y}

        return X, y
        

    def __len__(self):
        return len(self.X)

def collate_fn(batch):
    # for padding
    # https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html
    #X = [i[0] for i in batch]
    #y = [i[1] for i in batch]

    (xx, yy) = zip(*batch)
    x_lens = [len(x) for x in xx]
    y_lens = [len(y) for y in yy]

    xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)
    yy_pad = pad_sequence(yy, batch_first=True, padding_value=0)

    return xx_pad, yy_pad, x_lens, y_lens

train_dataset = MinimalDataset(train_data['X'], train_data['y'])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

test_dataset = MinimalDataset(test_data['X'], test_data['y'])
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [50]:
print(len(train_loader))
#print(train_dataset[0])

#print(train_dataset[0])
for i, (X,y) in enumerate(train_loader):
    print(type(X))
    print(len(X))
    print(X[0].shape)
    print(y[0].shape)
    term

13
<class 'list'>
8
torch.Size([54, 34])
torch.Size([54])


NameError: name 'term' is not defined

### LSTM structure

In [79]:


class myLSTM(nn.Module):
    """
    input_size - will be 1 in this example since we have only 1 predictor (a sequence of previous values)
    hidden_size - Can be chosen to dictate how much hidden "long term memory" the network will have
    output_size - This will be equal to the prediciton_periods input to get_x_y_pairs
    """
    def __init__(self, input_size, hidden_size, output_size=2):
        super(myLSTM, self).__init__()
        self.hidden_size = hidden_size
        
        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, batch_first=True)
        
        self.linear = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
        self.hidden=None
    def forward(self, x, x_lens):
        
        # initialize the hidden state.
        """
        x: (batch, seq_len, input_size)

        out: (batch, seq_len, hidden_size)
        h_n, c_n: (num_layers * num_directions, batch, hidden_size)
        """
        
        states = (self.h_n, self.c_n)

        if DEBUG: print('input.shape = ',x.shape, x.get_device())
        if DEBUG: print('states[0].shape = ', states[0].shape, states[0].get_device())

        # if variable length: do padding

        #x = pad_sequence(x, batch_first=True)
        #x_lens = list(map(len, x))
        #x = pack_padded_sequence(x, x_lens, batch_first=True, enforce_sorted=False)
        x_padded_packed = pack_padded_sequence(x, x_lens, batch_first=True, enforce_sorted=False)

        x_padded_packed, (self.h_n, self.c_n) = self.lstm(x_padded_packed, states)
        output_padded, output_lengths = pad_packed_sequence(x_padded_packed, batch_first=True)
        if DEBUG: print('output_lengths',output_lengths)
        if DEBUG: print('lstm_out.shape = ',output_padded.shape)
        #if DEBUG: print('lstm_out.view(len(x), -1).shape = ',lstm_out.view(len(x), -1).shape)

        # if variable length: unpadd
        #lstm_out = pad_packed_sequence(lstm_out, batch_first=True)


        
        out = self.linear(output_padded)
        if DEBUG: print('fc_out.shape = ',out.shape)

        #predictions = nn.functional.log_softmax(out)
        #predictions = torch.swapaxes(predictions, 1, 2)
        #predictions = predictions.transpose(1,2)
        return out

    def init_states(self, batch_size, device='cpu'):
        '''
        Initiate hidden states.
        '''
        # Shape for hidden state and cell state: num_layers * num_directions, batch, hidden_size
        
        #h_0 = torch.randn(1, self.batch_size, self.hidden_dim)
        #c_0 = torch.randn(1, self.batch_size, self.hidden_dim)

        # The Variable API is now semi-deprecated, so we use nn.Parameter instead.
        # Note: For Variable API requires_grad=False by default;
        # For Parameter API requires_grad=True by default.
        
        #h_0 = nn.Parameter(h_0, requires_grad=True)
        #c_0 = nn.Parameter(c_0, requires_grad=True)

        """initialize the hidden and cell states"""
        self.h_n = torch.zeros(1, batch_size, self.hidden_size).float().to(device)
        self.c_n = torch.zeros(1, batch_size, self.hidden_size).float().to(device)
        
        




        

In [53]:
# https://pypi.org/project/pytorch-model-summary/


from pytorch_model_summary import summary

model = myLSTM(input_size, hidden_size)
model.init_states(batch_size)


print(summary(model, torch.zeros((batch_size, 24, input_size)), show_input=False, show_hierarchical=True))


input.shape =  torch.Size([8, 24, 34]) -1
states[0].shape =  torch.Size([1, 8, 5]) -1
lstm_out.shape =  torch.Size([8, 24, 5])
fc_out.shape =  torch.Size([8, 24, 2])
----------------------------------------------------------------------------------------
      Layer (type)                         Output Shape         Param #     Tr. Param #
            LSTM-1     [8, 24, 5], [1, 8, 5], [1, 8, 5]             820             820
          Linear-2                           [8, 24, 2]              12              12
Total params: 832
Trainable params: 832
Non-trainable params: 0
----------------------------------------------------------------------------------------



myLSTM(
  (lstm): LSTM(34, 5, batch_first=True), 820 params
  (linear): Linear(in_features=5, out_features=2, bias=True), 12 params
  (sigmoid): Sigmoid(), 0 params
), 832 params





  predictions = nn.functional.log_softmax(out)


### Train function

In [88]:

def train(model, train_loader, criterion, optimizer, num_epochs=100):
    model.train()
    train_loss = 0
    
    n_total_steps = len(train_loader)
    for epoch in range(num_epochs):
        
        #model.init_states(batch_size, device=device)

        for i, (X_pad, y_pad, X_lens, y_lens) in enumerate(train_loader):

            model.init_states(batch_size, device=device)
            
            
            X_padded = X_pad.to(device)
            y_padded = y_pad.to(device)

            print('X_padded.shape = ', X_padded.shape)
            print('y_padded.shape = ', y_padded.shape)
            print('len(lens) = ', (X_lens))

            optimizer.zero_grad()
            
            
            # Forward pass
            y_pred = model(X_padded, x_lens=X_lens)
            print('y_pred.shape = ', y_pred.shape)

            y_pred = torch.swapaxes(y_pred, 1, 2)
            loss = criterion(y_pred, y_padded)
            print('loss',loss)
            # y_pred [N, C, d1,d2], y_target = [N,d1,d2]

            # Backward and optimize
            

            

            torch.autograd.set_detect_anomaly(True)
            
            loss.backward(retain_graph=True)
            optimizer.step()

            if (i+1) % 10 == 0:
                print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
            train_loss += loss.item() / len(train_loader)
        
    return train_loss


            

### Test function

In [72]:
def test(model, test_loader):
    """Return the accuracy on the input data set"""
    # Set the network to evalution mode
    model.eval()
    model.init_states(batch_size)
    
    # Total number of correctly classified samples
    n_correct = 0
    
    with torch.no_grad():
        for i, (X, y) in enumerate(train_loader):
            X_padded = pad_sequence(X, batch_first=True)
            y_padded = pad_sequence(y, batch_first=True)

            y_pred = model(X_padded)
           
            # Compute the accuracy on the mini-batch.
            # Update n_correct which count the total
            # number of correctly classified samples.
            #
            # (3 lines of code)
            # YOUR CODE HERE
            max_val,arg_val = torch.max(y_pred,axis=1)
            n_corr_batch = torch.eq(y_padded, arg_val).sum().item()
            n_correct +=n_corr_batch
            
    return float(n_correct)/float(len(test_loader.dataset))

## Training

In [89]:
model = myLSTM(input_size, hidden_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

epochs = []
train_losses = []
test_accuracies = []
beg = time.perf_counter()
print(f"Start training for {num_epochs} epochs")

for epoch in range(num_epochs):
    
    # Perform training
    train_loss = train(model, train_loader, criterion, optimizer)
    epochs.append(epoch)
    train_losses.append(train_loss)
    # Test on validation set
    accuracy = test(model, test_loader)
    test_accuracies.append(accuracy)
    print(f"epoch {epoch} loss {train_loss:.03f} accuracy {accuracy:.03f}")
print(f"Training took {time.perf_counter()-beg:.2f} seconds")    

Start training for 100 epochs
X_padded.shape =  torch.Size([8, 54, 34])
y_padded.shape =  torch.Size([8, 54])
len(lens) =  [54, 23, 48, 29, 48, 17, 45, 40]
input.shape =  torch.Size([8, 54, 34]) 0
states[0].shape =  torch.Size([1, 8, 5]) 0
output_lengths tensor([54, 23, 48, 29, 48, 17, 45, 40])
lstm_out.shape =  torch.Size([8, 54, 5])
fc_out.shape =  torch.Size([8, 54, 2])
y_pred.shape =  torch.Size([8, 54, 2])
loss tensor(0.7947, device='cuda:0', grad_fn=<NllLoss2DBackward0>)
X_padded.shape =  torch.Size([8, 258, 34])
y_padded.shape =  torch.Size([8, 258])
len(lens) =  [258, 23, 34, 21, 39, 42, 15, 19]
input.shape =  torch.Size([8, 258, 34]) 0
states[0].shape =  torch.Size([1, 8, 5]) 0
output_lengths tensor([258,  23,  34,  21,  39,  42,  15,  19])
lstm_out.shape =  torch.Size([8, 258, 5])
fc_out.shape =  torch.Size([8, 258, 2])
y_pred.shape =  torch.Size([8, 258, 2])
loss tensor(0.6851, device='cuda:0', grad_fn=<NllLoss2DBackward0>)


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [20, 34]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

In [None]:
N, C = 5, 4
loss = nn.NLLLoss()
data = torch.randn(N, 16, 10, 10)
conv = nn.Conv2d(16, C, (3, 3))
m = nn.LogSoftmax(dim=1)
target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C)
pred = m(conv(data))
output = loss(pred, target)
print(pred.shape, target.shape)

In [None]:
def collate_fn(train_data):
    train_data.sort(key=lambda data: len(data), reverse=True)
    data_length = [len(data) for data in train_data]
    train_data =  torch.nn.utils.rnn.pad_sequence(train_data, batch_first=True, padding_value=0)
    return train_data, data_length

In [None]:
train_dataloader = DataLoader(train_data, batch_size=2, collate_fn=collate_fn)

for data, length in train_dataloader:
    print(data)
    print(length)

In [None]:
for data, length in train_dataloader:
    data = rnn_utils.pack_padded_sequence(data, length, batch_first=True)
    print(data)

In [None]:
net = nn.LSTM(1, 5, batch_first=True)


In [None]:
train_data = MyData(train_x)
train_dataloader = DataLoader(train_data, batch_size=2, collate_fn=collate_fn)

flag = 0
for data, length in train_dataloader:
    data = torch.nn.utils.rnn.pack_padded_sequence(data, length, batch_first=True)
    output, hidden = net(data)
    if flag == 0:
        print(output)
        flag = 1