# Tests:

In [1]:
import yaml
from tqdm import tqdm 
from datasets import  load_from_disk
from torch.utils.data import DataLoader, Dataset
from PatientTrajectoryForecasting.utils.utils import (
    load_data,
    get_paths,
)

from torch import optim
import torch.nn as nn
import torch
import os

In [7]:
class Encoder(nn.Module):
    def __init__(self, source_vocab_size, embedding_dim = 2000, hidden_dim = 2000, n_layers = 1, dropout_prob = 0.5):
        super().__init__()
 
        self.embedding = nn.Embedding(source_vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=dropout_prob, batch_first = True)
 
        self.dropout = nn.Dropout(dropout_prob)
 
    def forward(self, input_batch):
        embed = self.dropout(self.embedding(input_batch))
        outputs, hidden = self.rnn(embed)
 
        return outputs, hidden

In [8]:
class OneStepDecoder(nn.Module):
    def __init__(self, target_vocab_size, embedding_dim = 2000, hidden_dim = 2000, n_layers = 1, dropout_prob = 0.5):
        super().__init__()
        # self.target_vocab_size will be used later
        self.target_vocab_size = target_vocab_size
 
        self.embedding = nn.Embedding(target_vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=dropout_prob, batch_first = True)
        self.fc = nn.Linear(hidden_dim, target_vocab_size)
        self.dropout = nn.Dropout(dropout_prob)
 
    def forward(self, target_token, hidden):
        embedding_layer = self.dropout(self.embedding(target_token))
        embedding_layer = embedding_layer.unsqueeze(1) if embedding_layer.ndim == 2 else embedding_layer
        output, hidden = self.rnn(embedding_layer, hidden)
 
        linear = self.fc(output.squeeze(0))
 
        return linear, hidden

In [9]:
class Decoder(nn.Module):
    def __init__(self, one_step_decoder, device):
        super().__init__()
        self.one_step_decoder = one_step_decoder
        self.device = device
 
    def forward(self, target, hidden):
        batch_size, target_len  = target.shape[0], target.shape[1]
        target_vocab_size = self.one_step_decoder.target_vocab_size
        # Store the predictions in an array for loss calculations
        predictions = torch.zeros(batch_size, target_len, target_vocab_size).to(self.device)
        # Take the very first word token, which will be sos
        seq = target[:, 0]
        
        # Loop through all the time steps
        for t in range(target_len):
            predict, hidden = self.one_step_decoder(seq, hidden)
 
            predictions[:,t] = predict.squeeze(1)
            seq = predict.argmax(-1).squeeze(1)      
        
        return predictions

In [10]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
 
        self.encoder = encoder
        self.decoder = decoder        
 
    def forward(self, source, target):
        
        outputs, hidden = self.encoder(source)
        outputs= self.decoder(target, hidden)
                    
        return outputs

In [11]:
class ForcastWithNotes(Dataset):
    def __init__(self, source_sequences, target_sequences, hospital_ids, tokenized_notes):
        self.source_sequences = source_sequences
        self.target_sequences = target_sequences
        self.hospital_ids = hospital_ids
        self.tokenized_notes = load_from_disk(tokenized_notes)
    def __len__(self):
        return len(self.source_sequences)
    def __getitem__(self, idx):
        hospital_ids = self.hospital_ids[idx]
        hospital_ids_lens = len(hospital_ids)

        return  {'source_sequences':torch.tensor(self.source_sequences[idx]),
                 'target_sequences': torch.tensor(self.target_sequences[idx]),
                 'tokenized_notes':self.tokenized_notes[hospital_ids],
                 'hospital_ids_lens': hospital_ids_lens}

def custom_collate_fn(batch):
    source_sequences = [item['source_sequences'] for item in batch]
    target_sequences = [item['target_sequences'] for item in batch]
    
    source_sequences = torch.stack(source_sequences, dim=0)
    target_sequences = torch.stack(target_sequences, dim=0)

    return {
        'source_sequences': source_sequences,
        'target_sequences': target_sequences,
    }

In [12]:
with open('PatientTrajectoryForecasting/paths.yaml', 'r') as file:
        path_config = yaml.safe_load(file)

train_data_path = get_paths(path_config,
                        'SDP',
                        False,
                        False,
                        train = True,
                        processed_data = True,
                        with_notes = True)


source_sequences, target_sequences, source_tokens_to_ids, target_tokens_to_ids, _, __, hospital_ids_source = load_data(train_data_path['processed_data_path'],
                                                                                                                   processed_data = True, reindexed = True)
# Load the datasets
train_dataset = torch.load('final_dataset/train_dataset.pth')
val_dataset = torch.load('final_dataset/val_dataset.pth')
test_dataset = torch.load('final_dataset/test_dataset.pth')

old_to_new_ids_source file not availble, mapping is the same as the old one


In [13]:
def create_model(source_tokens_to_ids, target_tokens_to_ids):
    # Define the required dimensions and hyper parameters
    embedding_dim = 2000
    hidden_dim = 2000
    dropout = 0.5
 
    # Instanciate the models
    encoder = Encoder(len(source_tokens_to_ids), embedding_dim, hidden_dim, n_layers=2, dropout_prob=dropout)
    one_step_decoder = OneStepDecoder(len(target_tokens_to_ids), embedding_dim, hidden_dim, n_layers=2, dropout_prob=dropout)
    decoder = Decoder(one_step_decoder, DEVICE)
 
    model = EncoderDecoder(encoder, decoder)
 
    model = model.to(DEVICE)
 
    # Define the optimizer
    optimizer = optim.Adadelta(model.parameters())
 
    # Makes sure the CrossEntropyLoss ignores the padding tokens.
    TARGET_PAD_IDX = target_tokens_to_ids['PAD'] # target pad token
    criterion = nn.CrossEntropyLoss(ignore_index=TARGET_PAD_IDX)
 
    return model, optimizer, criterion

In [15]:
batch_size = 384

train_dataloader = DataLoader(train_dataset,
                                  shuffle = True,
                                  batch_size = batch_size,
                                  num_workers = int(os.environ["SLURM_CPUS_PER_TASK"]),
                                  pin_memory = True,
                                  collate_fn = custom_collate_fn)

val_dataloader = DataLoader(val_dataset,
                            shuffle = False,
                            batch_size = batch_size * 2,
                            num_workers = int(os.environ["SLURM_CPUS_PER_TASK"]),
                            pin_memory = True,
                            collate_fn = custom_collate_fn)


test_dataloader = DataLoader(test_dataset,
                             shuffle = False,
                             batch_size = batch_size * 2,
                             num_workers = int(os.environ["SLURM_CPUS_PER_TASK"]),
                             pin_memory = True,
                             collate_fn = custom_collate_fn)

In [16]:
def train(train_iterator, valid_iterator, source_tokens_to_ids, target_tokens_to_ids, epochs=20):
    model, optimizer, criterion = create_model(source_tokens_to_ids, target_tokens_to_ids)
 
 
    for epoch in range(1, epochs + 1):
        pbar = tqdm(total=len(train_iterator), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}', unit=' batches', ncols=200)
 
        training_loss = []
        # set training mode
        model.train()
 
        # Loop through the training batch
        for i, batch in enumerate(train_iterator):
            # Get the source and target tokens
            src = batch['source_sequences'].to(DEVICE)
            trg = batch['target_sequences'].to(DEVICE)
 
            optimizer.zero_grad()
 
            # Forward pass
            output = model(src, trg)
 
            # reshape the output
            output_dim = output.shape[-1]
 
            # Discard the first token as this will always be 0
            output = output[:,1:].contiguous().view(-1, output.size(-1))
 
            # Discard the bos token from target
            trg = trg[:,1:].contiguous().view(-1)
 
            # Calculate the loss
            loss = criterion(output, trg)
 
            # back propagation
            loss.backward()
 
            optimizer.step()
 
            training_loss.append(loss.item())
 
            pbar.set_postfix(
                epoch=f" {epoch}, train loss= {round(sum(training_loss) / len(training_loss), 4)}", refresh=True)
            pbar.update()
 
        with torch.inference_mode():
            # Set the model to eval
            model.eval()
 
            validation_loss = []
 
            # Loop through the validation batch
            for i, batch in enumerate(valid_iterator):
                src = batch['source_sequences'].to(DEVICE)
                trg = batch['target_sequences'].to(DEVICE)
 
                # Forward pass
                output = model(src, trg)
 
                output = output[:,1:].contiguous().view(-1, output.size(-1))
                trg = trg[:,1:].contiguous().view(-1)
 
                # Calculate Loss
                loss = criterion(output, trg)
 
                validation_loss.append(loss.item())
 
        pbar.set_postfix(
            epoch=f" {epoch}, train loss= {round(sum(training_loss) / len(training_loss), 4)}, val loss= {round(sum(validation_loss) / len(validation_loss), 4)}",
            refresh=False)
 
        pbar.close()
 
    return model

In [17]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = train(train_dataloader, test_dataloader, source_tokens_to_ids, target_tokens_to_ids, epochs=20)

100%|██████████| 69/69 [02:04<00:00,  1.80s/ batches, epoch=1, train loss= 5.0685, val loss= 5.2653]                                                                                                    
100%|██████████| 69/69 [02:02<00:00,  1.77s/ batches, epoch=2, train loss= 4.7469, val loss= 5.1992]                                                                                                    
100%|██████████| 69/69 [02:01<00:00,  1.76s/ batches, epoch=3, train loss= 4.7225, val loss= 5.3681]                                                                                                    
100%|██████████| 69/69 [02:01<00:00,  1.76s/ batches, epoch=4, train loss= 4.718, val loss= 5.1544]                                                                                                     
100%|██████████| 69/69 [02:01<00:00,  1.76s/ batches, epoch=5, train loss= 4.7057, val loss= 5.4226]                                                                                                

In [19]:
from typing import Dict

In [20]:
def get_sequences_doctor_ai(model, dataloader : torch.utils.data.dataloader.DataLoader,
                            target_tokens_to_ids : Dict[str, int] =  None, max_len : int = 150,
                            DEVICE : str ='cuda:0'):
    model.eval()
    pred_trgs = []
    targets = []
    with torch.inference_mode():
        for batch in tqdm(test_dataloader, desc='scoring'):
            batch_pred_trgs = []
            batch_targets = []
            source_input_ids, target_input_ids = batch['source_sequences'].to(DEVICE),batch['target_sequences'].to(DEVICE)
            outputsDecoder, hidden = model.encoder(source_input_ids)
            next_token = torch.tensor(target_tokens_to_ids['BOS'], device= DEVICE).repeat(source_input_ids.size(0)).unsqueeze(1)
            pred_trg = torch.tensor(target_tokens_to_ids['BOS'], device= DEVICE).repeat(source_input_ids.size(0)).unsqueeze(1)
            for i in range(max_len):
                embedding_layer = model.decoder.one_step_decoder.dropout(model.decoder.one_step_decoder.embedding(next_token))
                outputs, hidden = model.decoder.one_step_decoder.rnn(embedding_layer, hidden)
        
                linear = model.decoder.one_step_decoder.fc(outputs.squeeze(0))
                next_token = torch.argmax(linear, dim=-1)
                pred_trg = torch.cat((pred_trg, next_token), dim=1)
                eov_mask = next_token == target_tokens_to_ids['EOV']
                if eov_mask.any():
                    # extend with sequences that have reached EOV
                    batch_pred_trgs.extend(pred_trg[eov_mask.squeeze(-1)].tolist())
                    batch_targets.extend(target_input_ids[eov_mask.squeeze(-1)].tolist())
                    # break if all have reached EOV
                    if eov_mask.all():
                        break  
                    # edit corresponding target sequences
                    target_input_ids = target_input_ids[~eov_mask.squeeze(-1)]
                    pred_trg = pred_trg[~eov_mask.squeeze(-1)]
                    hidden = hidden[:,~eov_mask.squeeze(-1),:]
                    next_token = [~eov_mask.squeeze(-1)]
            if source_input_ids.size(0) != len(batch_pred_trgs):
                    batch_pred_trgs.extend(pred_trg.tolist())
                    batch_targets.extend(target_input_ids.tolist())
            pred_trgs.extend(batch_pred_trgs)
            targets.extend(batch_targets)
    return pred_trgs, targets

In [26]:
pred_trgs, targets = get_sequences_doctor_ai(model, test_dataloader, target_tokens_to_ids, max_len=96)

scoring: 100%|██████████| 10/10 [00:08<00:00,  1.23it/s]


In [29]:
cd PatientTrajectoryForecasting

/home/sifal.klioui/PatientTrajectoryForecasting


In [31]:
from utils.eval import mapk, recallTop


ks = [20, 40, 60]

test_mapk = {f"test_map@{k}": mapk(targets, pred_trgs, k) for k in ks}
test_recallk = {f"test_recall@{k}": recallTop(targets, pred_trgs, rank = [k])[0] for k in ks}
print(test_mapk, test_recallk)

{'test_map@20': 0.15164410490104094, 'test_map@40': 0.15204619577357814, 'test_map@60': 0.15204619577357814} {'test_recall@20': 0.21991012781971214, 'test_recall@40': 0.21991012781971214, 'test_recall@60': 0.21991012781971214}
