## Import modules

In [5]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import torch.nn as nn
import math
import numpy as np
from sklearn.model_selection import KFold

In [6]:
RANDOM_SEED = 42

# Seed both NumPy's global RNG and PyTorch's RNG: the model's initial weights
# come from PyTorch, so seeding NumPy alone does not make training repeatable.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

## Dataset

In [3]:
from torch.utils.data import Dataset
from enum import Enum
import pandas as pd
import numpy as np
import os


class VitalsDataset(Dataset):
    """Patient vital-sign time series paired with per-patient labels.

    The dataset reads three files below ``processed_dir``:

    * ``<split>_idxs.npy`` - array of patient ids; position ``i`` in this
      array is sample ``i`` of the dataset,
    * ``<split>/vitals.csv`` - indexed here by ``('pat_id', 'hours_in')``,
    * ``<split>/labels.csv`` - indexed here by ``'pat_id'``,

    where ``<split>`` is ``train`` or ``test``.

    Args:
        processed_dir: directory holding the processed files listed above.
        train: ``True`` to load the training split, ``False`` for the test
            split.

    Raises:
        FileNotFoundError: if any of the three files is missing.
    """

    def __init__(self, processed_dir: str, train: bool):
        self.processed_dir = processed_dir

        split = 'train' if train else 'test'
        self.data_path = os.path.join(self.processed_dir, f'{split}/')
        self.index_path = os.path.join(self.processed_dir, f'{split}_idxs.npy')

        try:
            self.idxs = np.load(self.index_path)
            vitals = pd.read_csv(os.path.join(self.data_path, 'vitals.csv'))
            labels = pd.read_csv(os.path.join(self.data_path, 'labels.csv'))
        except FileNotFoundError as e:
            # Fail here with the actionable hint instead of printing it and
            # crashing a few lines later on a missing attribute.
            raise FileNotFoundError(
                f"Make sure data has been processed: {e}") from e

        self.vitals = vitals.set_index(['pat_id', 'hours_in'])
        self.labels = labels.set_index('pat_id')

        self.num_feats = self.vitals.shape[1]
        self.num_classes = self.labels.shape[1]

    def __len__(self):
        """Return the number of patient ids in the loaded index array."""
        return len(self.idxs)

    def __getitem__(self, idx):
        """Return ``(vitals, labels)`` tensors for one position or an array of positions.

        ``idx`` indexes into ``self.idxs`` (not directly into the frames); the
        resulting patient id(s) are then looked up in both frames. The vitals
        are returned as a float tensor; when several patients are requested
        their sequences are zero-padded to a common length.
        """
        pat_id = self.idxs[idx]

        vit = self._format_ts_batch(self.vitals.loc[pat_id])
        lbl = self.labels.loc[pat_id].values

        vit = torch.from_numpy(vit).float()
        lbl = torch.from_numpy(lbl)

        return vit, lbl

    def _format_ts_batch(self, batch_ts_df):
        """Convert a vitals selection into a dense array.

        A frame with a single-level index (one patient) is returned as its raw
        values. A frame that still carries the patient level is split per
        patient and each sequence is padded at the end with all-zero rows up to
        the longest sequence in the batch, giving an array of shape
        ``(patients, max_seq_len, num_feats)``.
        """
        if batch_ts_df.index.nlevels == 1:
            return batch_ts_df.values

        # groupby sorts group keys by default, which would reorder patients
        # relative to the labels looked up in __getitem__ whenever the
        # requested ids are not ascending; sort=False keeps the order in which
        # patients first appear in the frame.
        sequences = [
            group.values
            for _, group in batch_ts_df.groupby(level=0, sort=False)
        ]
        max_seq_len = max(seq.shape[0] for seq in sequences)

        padded = []
        for seq in sequences:
            null_rows = np.zeros((max_seq_len - seq.shape[0], self.num_feats))
            padded.append(np.vstack([seq, null_rows]))

        return np.array(padded)

## Model

In [119]:
class TimeSeriesModel(nn.Module):
    """LSTM encoder followed by a stack of fully connected layers.

    Args:
        input_size: number of features per time step fed to the LSTM.
        hidden_size: LSTM hidden size; also the input width of the first
            dense layer.
        lstm_layers: number of stacked LSTM layers.
        dense_units: output widths of the dense layers, in order. Any
            sequence of ints (list or tuple) is accepted.
        activation: optional zero-argument factory (for example ``nn.ReLU``)
            producing a module that is inserted between consecutive dense
            layers. The default ``None`` keeps the dense stack purely linear,
            which is the original behaviour.
    """

    def __init__(self, input_size, hidden_size, lstm_layers, dense_units,
                 activation=None):
        super().__init__()

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=lstm_layers, batch_first=True)

        # Unpacking (rather than list concatenation) lets callers pass a
        # tuple as well as a list.
        widths = [hidden_size, *dense_units]

        layers = []
        for layer_idx, (inp_shape, out_shape) in enumerate(zip(widths[:-1], widths[1:])):
            if activation is not None and layer_idx > 0:
                layers.append(activation())
            layers.append(nn.Linear(inp_shape, out_shape))
        self.dense_layers = nn.Sequential(*layers)

    def forward(self, x):
        """Run the LSTM over ``x`` and apply the dense stack to every time step.

        The LSTM is built with ``batch_first=True`` and its full output
        sequence (element 0 of the returned tuple) is passed on, so the result
        keeps the time dimension: one dense output per time step.

        NOTE(review): ``nn.CrossEntropyLoss`` treats dimension 1 of its input
        as the class dimension; with this output layout that is the time
        axis - confirm this is what the training code intends.
        """
        x = self.lstm(x)[0]
        x = self.dense_layers(x)

        return x

## Train the model

Load the data

In [12]:
# Both splits are read from the same processed-data directory.
PROCESSED_DIR = '../data/processed'

train_ds = VitalsDataset(PROCESSED_DIR, train=True)
test_ds = VitalsDataset(PROCESSED_DIR, train=False)

Select the device

In [98]:
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

Define the model

In [134]:
HIDDEN_SIZE = 32
NUM_LAYERS = 2

# The LSTM input width is the number of vitals columns; the dense head
# narrows the hidden state through 16 and 8 units down to 2 outputs.
model = TimeSeriesModel(
    input_size=train_ds.vitals.shape[1],
    hidden_size=HIDDEN_SIZE,
    lstm_layers=NUM_LAYERS,
    dense_units=[16, 8, 2],
)
model = model.to(device)

Define constants and utilities for training the model

In [135]:
# Training schedule.
NUM_EPOCHS = 5    # number of passes made by the training loop
BATCH_SIZE = 32   # samples per optimisation step
WRITE_FREQ = 50   # batches between two training-loss reports

# Logging, objective and optimiser shared by the train/validation helpers.
writer = SummaryWriter()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

Determine indices for each fold

In [136]:
KFOLDS = 7

# Positions into train_ds (0 .. len-1); the folds partition these positions.
train_idxs = np.arange(len(train_ds))

# Pin random_state so the fold assignment does not depend on how much of the
# global RNG stream earlier cells have consumed, or on cell execution order.
kfolder = KFold(n_splits=KFOLDS, shuffle=True, random_state=RANDOM_SEED)
split_idxs = list(kfolder.split(train_idxs))

A single training epoch

In [137]:
def train_epoch(train_idxs, epoch_idx):
    """Run one optimisation pass over ``train_idxs`` and return a training loss.

    The indices are consumed in order, ``BATCH_SIZE`` at a time. Every
    ``WRITE_FREQ`` batches the mean loss of that window is printed and written
    to TensorBoard under ``Loss/Train``.

    Args:
        train_idxs: positions into ``train_ds`` to train on.
        epoch_idx: zero-based epoch number, used to compute the global step
            for TensorBoard.

    Returns:
        The mean loss of the last completed ``WRITE_FREQ``-batch window. When
        the epoch has fewer than ``WRITE_FREQ`` batches no window completes,
        so the mean over all batches is returned instead; ``0.0`` if there are
        no batches at all.
    """
    num_batches = math.ceil(len(train_idxs) / BATCH_SIZE)

    running_loss, last_loss = 0.0, 0.0
    windows_logged = 0

    # Loop through each batch
    for batch_idx in range(num_batches):
        # Slicing clamps at the end of the array, so the final (possibly
        # shorter) batch needs no special casing.
        start = batch_idx * BATCH_SIZE
        idxs = train_idxs[start:start + BATCH_SIZE]
        inputs, labels = train_ds[idxs]
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if batch_idx % WRITE_FREQ == WRITE_FREQ - 1:
            last_loss = running_loss / WRITE_FREQ
            print(f"\t batch {batch_idx+1} loss: {last_loss}")
            running_loss = 0.0
            windows_logged += 1
            # Global step: batches seen across all epochs so far.
            log_idx = (epoch_idx * num_batches) + batch_idx + 1
            writer.add_scalar('Loss/Train', last_loss, log_idx)

    # Without this fallback a short epoch would report a loss of 0.
    if windows_logged == 0 and num_batches > 0:
        last_loss = running_loss / num_batches

    return last_loss

A single validation epoch

In [138]:
def val_epoch(val_idxs, epoch_idx):
    """Evaluate the model on ``val_idxs`` and return the mean batch loss.

    The model is switched to evaluation mode and left there; the caller is
    responsible for switching back to training mode. Validation samples are
    read from ``train_ds`` because ``val_idxs`` are positions into the
    training dataset.

    Args:
        val_idxs: positions into ``train_ds`` held out for validation.
        epoch_idx: zero-based epoch number; the loss is logged to TensorBoard
            under ``Loss/Valid`` at step ``epoch_idx + 1``.

    Returns:
        The mean of the per-batch losses as a Python float.

    Raises:
        ValueError: if ``val_idxs`` is empty.
    """
    num_batches = math.ceil(len(val_idxs) / BATCH_SIZE)
    if num_batches == 0:
        raise ValueError("val_idxs is empty; cannot compute a validation loss")

    model.eval()

    running_vloss = 0.0

    with torch.no_grad():
        for batch_idx in range(num_batches):
            start = batch_idx * BATCH_SIZE
            idxs = val_idxs[start:start + BATCH_SIZE]
            vinputs, vlabels = train_ds[idxs]
            vinputs, vlabels = vinputs.to(device), vlabels.to(device)

            voutputs = model(vinputs)
            # .item() keeps the accumulator a plain float, matching how
            # train_epoch accumulates its loss.
            running_vloss += loss_fn(voutputs, vlabels).item()

    avg_vloss = running_vloss / num_batches
    writer.add_scalar('Loss/Valid', avg_vloss, epoch_idx + 1)

    return avg_vloss

Training loop

In [139]:
# Lowest validation loss seen so far; a checkpoint is written only when an
# epoch improves on it.
best_vloss = float('inf')

for epoch_idx in range(NUM_EPOCHS):
    print(f"Epoch {epoch_idx+1}")

    # Each epoch uses a different fold as its validation set.
    # NOTE(review): the same model is trained across epochs, so the samples
    # validated on in one epoch belong to the training folds of the other
    # epochs - confirm this rotation is intended.
    kfold_idx = epoch_idx % KFOLDS
    train_idxs, val_idxs = split_idxs[kfold_idx]

    # val_epoch leaves the model in eval mode, so re-enable training mode.
    model.train(True)
    avg_loss = train_epoch(train_idxs, epoch_idx)

    avg_vloss = val_epoch(val_idxs, epoch_idx)
    print(f'LOSS train {avg_loss} valid {avg_vloss}')

    writer.add_scalars('Training vs Validation Loss', {'Training': avg_loss, 'Validation': avg_vloss}, epoch_idx+1)
    writer.flush()

    if avg_vloss < best_vloss:
        # Record the new best so later, worse epochs do not overwrite the
        # notion of "best" and trigger another checkpoint.
        best_vloss = float(avg_vloss)
        model_path = f"model_{epoch_idx}"
        torch.save(model.state_dict(), model_path)

writer.close()

Epoch 1
	 batch 50 loss: 4.274215307235718
	 batch 100 loss: 0.8940871798992157
	 batch 150 loss: 0.665357985496521
	 batch 200 loss: 0.4936798173189163
	 batch 250 loss: 0.34500348269939424
	 batch 300 loss: 0.3088918437063694
	 batch 350 loss: 0.33345985144376755
	 batch 400 loss: 0.3173807245492935
	 batch 450 loss: 0.29585755854845047
	 batch 500 loss: 0.3209942053258419
	 batch 550 loss: 0.35591398358345033
	 batch 600 loss: 0.31334610626101495
	 batch 650 loss: 0.3096887290477753
	 batch 700 loss: 0.3063186648488045
LOSS train 0.3063186648488045 valid 0.31509333848953247
Epoch 2
	 batch 50 loss: 0.32112288117408755
	 batch 100 loss: 0.3003802926838398
	 batch 150 loss: 0.31788292616605757
	 batch 200 loss: 0.29329873085021974
	 batch 250 loss: 0.3240022474527359
	 batch 300 loss: 0.30495404794812203
	 batch 350 loss: 0.3206598922610283
	 batch 400 loss: 0.30912388265132906
	 batch 450 loss: 0.2967716883122921
	 batch 500 loss: 0.3163670578598976
	 batch 550 loss: 0.35370143204927