In [24]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
#Nathan's path
%cd drive/MyDrive/work/S2/NLP/medal

#Someone else's path
#%cd /content/drive/MyDrive/Dauphine/M2/S2/NLP/medal

[Errno 2] No such file or directory: 'drive/MyDrive/work/S2/NLP/medal'
/content/drive/MyDrive/work/S2/NLP/medal


In [26]:
# Install transformers and fasttext into the kernel's environment (-q keeps pip quiet).
# TODO(review): versions are not pinned; pin them if the run must be reproducible.
%pip install transformers fasttext -q

## Pre-training

In [None]:
# Settings for the pre-training run. Later cells read them through ARGS.get(...)
# and dump them verbatim (in this order) into configs.txt.
ARGS = dict(
    # Where results go and which model/data to use
    savedir="./csv_logs",
    model="electra",
    data_dir="./data/pretraining",
    data_filename="medal.csv",
    adam_path="./toy_data/valid_adam.txt",
    embs_path="./data",
    # Optimisation
    use_scheduler=True,
    lr=2e-6,
    clip=0,
    dropout=0.1,
    epochs=10,
    accum_num=1,
    # Checkpoint / evaluation cadence and batching
    save_every=1,
    eval_every=200000,
    batchsize=8,
    # Architecture sizes
    hidden_size=512,
    rnn_layers=3,
    da_layers=1,
    # Checkpoint path
    pretrained_model="./models/electra.pt",
)

In [None]:
import argparse
import os
import time
import pandas as pd

import torch
import torch.optim as optim
from torch import nn

from models.rnn import RNN
from models.lstm_sa import RNNAtt
from models.electra import Electra
from transformers import ElectraTokenizer
from utils import load_dataframes, load_model, train_loop
from models.tokenizer_and_dataset import \
    FastTextTokenizer, EmbeddingsDataset, HuggingfaceDataset

from torch.utils.tensorboard import SummaryWriter

# Pull the run settings out of ARGS into module-level constants.
EXPERIMENT_DIR = ARGS.get('savedir')
N_EPOCHS = ARGS.get('epochs')  # the previous lookup was malformed and did not parse
BATCH_SIZE = ARGS.get('batchsize')
N_CPU_CORES = ARGS.get('ncpu')  # dict.get returns None when ARGS has no 'ncpu' entry
MODEL_TYPE = ARGS.get('model')
USE_PRETRAIN = True

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Data --------------------------------------------------------------
# load_dataframes hands back the three splits together with label_to_ix.
train, valid, test, label_to_ix = load_dataframes(
    data_dir=ARGS.get('data_dir'),
    data_filename=ARGS.get('data_filename'),
    adam_path=ARGS.get('adam_path'),
)
print("Data loaded")

# --- Tokenizer and the Dataset class that goes with it -------------------
if MODEL_TYPE == "electra":
    tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
    dataset_cls = HuggingfaceDataset
else:
    # Build the word index from every split plus the label names,
    # then load the Fasttext embedding matrix from embs_path.
    tokenizer = FastTextTokenizer(verbose=True)
    tokenizer.build_word_index(train.TEXT, valid.TEXT, test.TEXT, list(label_to_ix.keys()))
    tokenizer.build_embedding_matrix(ARGS.get('embs_path'))
    dataset_cls = EmbeddingsDataset

# --- torch Dataset objects (the test split is not wrapped here) ----------
train_data = dataset_cls(train, tokenizer=tokenizer, device=DEVICE)
valid_data = dataset_cls(valid, tokenizer=tokenizer, device=DEVICE)
print("Dataset created")

# --- Network -------------------------------------------------------------
# One output unit per entry of label_to_ix.
n_labels = len(label_to_ix)
net = Electra(output_size=n_labels, device=DEVICE)
print(f'model: {net}')

# Wrap the network for data-parallel execution when several GPUs are visible.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    net.to(DEVICE)
    print("Using", gpu_count, "GPUs")
    net = nn.DataParallel(net)

# --- Loss, optimizer, optional LR scheduler ------------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), ARGS.get('lr'))
if ARGS.get('use_scheduler'):
    # Multiply the LR by 0.2 after 8 scheduler steps without a lower monitored value.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=8)
else:
    scheduler = None

# Create a per-run output directory named after the start time (month-day-hour-minute).
time_stamp = time.strftime("%m-%d-%H-%M", time.localtime())
save_dir = os.path.join(EXPERIMENT_DIR, time_stamp)
# exist_ok=True replaces the check-then-create pattern: it has no race window
# and still tolerates a re-run that lands in the same minute.
os.makedirs(save_dir, exist_ok=True)

# Save configs: one "key: value" line per ARGS entry, in ARGS order.
model_desc_output = [f"{k}: {v}" for k, v in ARGS.items()]
with open(os.path.join(save_dir, 'configs.txt'), 'w') as file:
    # The payload is a single joined string, so write() is the appropriate call
    # (writelines() would iterate over it character by character).
    file.write("\n".join(model_desc_output))

# Set up tensorboard: events go to runs/<model>-<timestamp>.
writer = SummaryWriter(f"runs/{MODEL_TYPE}-{time_stamp}")

# Train network
# The writer is closed in `finally` so pending events are flushed and the event
# file is released even when training is interrupted or raises.
try:
    net, logs = train_loop(
        net, MODEL_TYPE, optimizer, criterion, train_data, valid_data,
        save_dir=save_dir, n_epochs=N_EPOCHS, batch_size=BATCH_SIZE, verbose=True,
        scheduler=scheduler, save_every=ARGS.get('save_every'),
        eval_every=ARGS.get('eval_every'), clip=ARGS.get('clip'),
        writer=writer, accum_num=ARGS.get('accum_num'),
    )
finally:
    writer.close()

# Save Model
# NOTE(review): this pickles the whole module object (including the DataParallel
# wrapper when several GPUs are in use) rather than a state_dict -- confirm that
# is what the consumer of model.pt expects.
torch.save(net, os.path.join(save_dir, 'model.pt'))

# Save Logs
log_df = pd.DataFrame(logs)
log_df.to_csv(os.path.join(save_dir, 'logs.csv'))


## Mortality prediction

In [47]:
import multiprocessing
# Number of CPU cores reported for this machine; stored in ARGS as "ncpu".
CORES = multiprocessing.cpu_count()

# Settings for the downstream MIMIC mortality run. Later cells read them through
# ARGS.get(...) and dump them verbatim (in this order) into configs.txt.
ARGS = dict(
    # Where results go and which model/data/task to use
    savedir="./csv_logs",
    model="electra",
    data_dir="./data/downstream",
    data_filename="mimic.csv",
    adam_path="./toy_data/valid_adam.txt",
    embs_path="./data",
    task="mimic-mortality",
    # Optimisation
    use_scheduler=True,
    lr=2e-6,
    clip=0,
    dropout=0.1,
    epochs=3,
    # Checkpoint / evaluation cadence and batching
    save_every=1,
    eval_every=10000,
    batchsize=8,
    # Architecture sizes
    hidden_size=512,
    da_layers=1,
    # CPU threads
    ncpu=CORES,
)
# To start from an existing checkpoint instead of training from scratch,
# also pass: pretrained_model="./models/electra.pt"

In [None]:
import os
import time
import sys

import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import DataLoader

from transformers import ElectraTokenizer
from downstream.utils import load_mimic_mortality, load_mimic_diagnosis, load_model, predict, evaluate, train_loop
from downstream.electra import Electra
import pandas as pd
import numpy as np

from downstream.tokenizer_and_dataset import FastTextTokenizer, MimicDataset, HuggingfaceDataset

from torch.utils.tensorboard import SummaryWriter


# Pull the run settings out of ARGS into module-level constants.
EXPERIMENT_DIR = ARGS.get('savedir')
N_EPOCHS = ARGS.get('epochs')
BATCH_SIZE = ARGS.get('batchsize')
N_CPU_CORES = ARGS.get('ncpu')
MODEL_TYPE = ARGS.get('model')
TASK = ARGS.get('task')
TEST = ARGS.get('test')  # a missing 'test' key gives None, i.e. training mode
USE_PRETRAIN = bool(ARGS.get('pretrained_model'))

# Test mode evaluates an existing checkpoint, so one has to be supplied.
if TEST and not USE_PRETRAIN:
    raise Exception("no model provided for testing")

if not USE_PRETRAIN:
    print("No pretrained model provided. Will train from scratch.")

# Prelim
# Only override torch's thread count when 'ncpu' was configured:
# set_num_threads needs an int and would reject the None of a missing key.
if N_CPU_CORES is not None:
    torch.set_num_threads(N_CPU_CORES)
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load data
# Each task has its own loader; the diagnosis loader additionally returns diag_to_idx.
if TASK == 'mimic-mortality':
    train, valid, test = load_mimic_mortality(ARGS.get('data_dir'), ARGS.get('data_filename'))
elif TASK == 'mimic-diagnosis':
    train, valid, test, diag_to_idx = \
        load_mimic_diagnosis(ARGS.get('data_dir'), ARGS.get('data_filename'), ARGS.get('diag_to_idx_path'))
else:
    # Fail here with a clear message instead of printing "Data loaded" and
    # hitting a NameError on the missing splits further down.
    raise ValueError(
        f"Unsupported task: {TASK!r} (expected 'mimic-mortality' or 'mimic-diagnosis')"
    )
print("Data loaded")

# Pick the tokenizer that goes with the configured model type.
if MODEL_TYPE != "electra":
    # Every non-ELECTRA model type: build the word index from all three splits,
    # then load the Fasttext embedding matrix from embs_path.
    tokenizer = FastTextTokenizer(verbose=True)
    tokenizer.build_word_index(train.TEXT, valid.TEXT, test.TEXT)
    tokenizer.build_embedding_matrix(ARGS.get('embs_path'))
else:
    # ELECTRA: reuse the tokenizer published with the small discriminator checkpoint.
    tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

# Task-specific output width and the dataframe column holding the labels.
if TASK == 'mimic-mortality':
    # Single output unit.
    output_size, label_col = 1, 'LABEL_NUM'
elif TASK == 'mimic-diagnosis':
    # One output unit per entry of diag_to_idx.
    output_size, label_col = len(diag_to_idx), 'DIAG'

# Build the torch Dataset objects.
# The "rnnsoft"/"rnn" model types use MimicDataset; every other type uses HuggingfaceDataset.
dataset_cls = MimicDataset if MODEL_TYPE in ["rnnsoft", "rnn"] else HuggingfaceDataset

# Keyword arguments shared by every split.
dataset_kwargs = dict(
    tokenizer=tokenizer,
    task=TASK,
    label_col=label_col,
    output_size=output_size,
    device=DEVICE,
)

if TEST:
    # Test mode only needs the test split.
    test_data = dataset_cls(test, **dataset_kwargs)
else:
    # Training mode needs the train and validation splits.
    train_data = dataset_cls(train, **dataset_kwargs)
    valid_data = dataset_cls(valid, **dataset_kwargs)
print("Dataset created")

# --- Network -------------------------------------------------------------
net = Electra(output_size=output_size, device=DEVICE)
if USE_PRETRAIN:
    # Hand the fresh network and the configured checkpoint path to load_model.
    net = load_model(net, ARGS.get('pretrained_model'), DEVICE)
print(f'model: {net}')

# --- Loss ----------------------------------------------------------------
# Both supported tasks use binary cross-entropy.
if TASK in ('mimic-mortality', 'mimic-diagnosis'):
    criterion = nn.BCELoss()

# --- Optimizer and optional LR scheduler (training mode only) ------------
if not TEST:
    optimizer = optim.Adam(net.parameters(), ARGS.get('lr'))
    if ARGS.get('use_scheduler'):
        # Multiply the LR by 0.2 after 8 scheduler steps without a lower monitored value.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=8)
    else:
        scheduler = None

# Create a per-run output directory named after the start time (month-day-hour-minute).
time_stamp = time.strftime("%m-%d-%H-%M", time.localtime())
save_dir = os.path.join(EXPERIMENT_DIR, time_stamp)
# exist_ok=True replaces the check-then-create pattern: it has no race window
# and still tolerates a re-run that lands in the same minute.
os.makedirs(save_dir, exist_ok=True)

# Save configs: one "key: value" line per ARGS entry, in ARGS order.
model_desc_output = [f"{k}: {v}" for k, v in ARGS.items()]
with open(os.path.join(save_dir, 'configs.txt'), 'w') as file:
    # The payload is a single joined string, so write() is the appropriate call
    # (writelines() would iterate over it character by character).
    file.write("\n".join(model_desc_output))

if not TEST:
    # Training mode: tensorboard events go to runs/<task>/<model>-<timestamp>.
    writer = SummaryWriter(f"runs/{TASK}/{MODEL_TYPE}-{time_stamp}")
    # Train network
    # The writer is closed in `finally` so pending events are flushed and the
    # event file is released even when training is interrupted or raises.
    try:
        net, logs = train_loop(
            net, optimizer, criterion, train_data, valid_data, save_dir=save_dir, task=TASK,
            n_epochs=N_EPOCHS, batch_size=BATCH_SIZE, verbose=True, scheduler=scheduler,
            save_every=ARGS.get('save_every'), eval_every=ARGS.get('eval_every'), writer=writer,
        )
    finally:
        writer.close()
else:
    # Test
    is_diagnosis = TASK == 'mimic-diagnosis'

    # One list per logged quantity; the diagnosis task logs two extra recalls.
    logs = {k: [] for k in ['test_loss', 'test_metric']}
    if is_diagnosis:
        logs['test_top_5_recall'] = []
        logs['test_top_30_recall'] = []

    # The loader iterates over row indices of the test split, in order;
    # predict/evaluate receive it together with test_data.
    test_loader = DataLoader(
        range(len(test)),
        shuffle=False,
        batch_size=BATCH_SIZE
    )

    if TASK == 'mimic-mortality':
        # Keep the raw predictions on disk in addition to the aggregate metric.
        test_preds = predict(net, test_loader, test_data, verbose=True).cpu().numpy()
        np.save(os.path.join(save_dir, 'test_preds.npy'), test_preds)
        test_loss, test_metric = evaluate(net, test_loader, test_data, criterion, verbose=True, task=TASK)
    elif is_diagnosis:
        # evaluate returns a dict of metrics here; top-10 recall is the headline metric.
        test_loss, test_metrics = evaluate(net, test_loader, test_data, criterion, verbose=True, task=TASK)
        test_metric = test_metrics['top_10_recall']

    print(f"Test Loss: {test_loss:.4f} \tTest Metric:{test_metric:.4f}")
    if is_diagnosis:
        print(f"Test Top 5 Recall: {test_metrics['top_5_recall']:.4f} \tTest Top 30 Recall:{test_metrics['top_30_recall']:.4f}")
    print("="*50)

    logs['test_loss'].append(test_loss)
    logs['test_metric'].append(test_metric)
    if is_diagnosis:
        logs['test_top_5_recall'].append(test_metrics['top_5_recall'])
        logs['test_top_30_recall'].append(test_metrics['top_30_recall'])

# Persist the network weights as a state_dict (training mode only).
if not TEST:
    model_path = os.path.join(save_dir, 'model.pt')
    torch.save(net.state_dict(), model_path)

# Persist the collected logs as CSV in the same run directory.
logs_path = os.path.join(save_dir, 'logs.csv')
log_df = pd.DataFrame(logs)
log_df.to_csv(logs_path)

No pretrained model provided. Will train from scratch.
Data loaded
Dataset created


Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model: Electra(
  (dropout): Dropout(p=0.1, inplace=False)
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
        

  0%|          | 0/25602 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  0%|          | 124/25602 [00:29<1:36:52,  4.38it/s]