In [17]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

from mmvae_hub.mimic.utils import filter_labels

In [18]:
# Checkpoint name shared by the tokenizer and the sequence classifier.
MODEL_NAME = 'distilbert-base-uncased'
# Batch size / worker count for the DataLoaders and epoch count for the
# hand-written training loop.
BATCH_SIZE, DL_WORKERS, NUM_EPOCHS = 10, 1, 50

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

cuda


In [19]:
# Load the fast DistilBERT tokenizer from the MODEL_NAME checkpoint.
# MimicFindings reads this module-level object to tokenize the report texts.
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# Create Dataset

In [20]:
class MimicFindings(Dataset):
    """
    Custom Dataset for loading the uni-modal mimic text data.

    Reads ``{split}_findings.csv`` and ``{split}_labels.csv`` from the dataset
    directory, passes the label table through ``filter_labels``, merges both
    tables and tokenizes the ``findings`` column once, up front, with the
    module-level ``tokenizer``.

    Each item is a dict with one tensor per tokenizer output field plus a
    ``labels`` entry (int64 scalar tensor) taken from the ``Finding`` column.
    """

    # Directory the csv files are read from when no directory is passed.
    DEFAULT_DIR = Path('/mnt/data/hendrik/mimic_scratch/files_small_128')

    def __init__(self, split: str, dir_dataset=None, max_length: int = 256):
        """
        split: string, either train, eval or test
        dir_dataset: directory holding the csv files; defaults to DEFAULT_DIR
        max_length: maximum number of tokens kept per report (longer reports
            are truncated)
        """
        str_label = ['Finding']
        dir_dataset = self.DEFAULT_DIR if dir_dataset is None else Path(dir_dataset)
        findings = pd.read_csv(dir_dataset / f'{split}_findings.csv')
        raw_labels = pd.read_csv(dir_dataset / f'{split}_labels.csv').fillna(0)
        # NOTE(review): the last argument is the constant 'train' for every
        # split; filter_labels is defined outside this notebook, so confirm
        # that this is intended and should not be `split`.
        labels = filter_labels(raw_labels, str_label, False, 'train')

        # No `on=` is given, so pandas joins on every column the frames share.
        self.df = labels.merge(findings)

        # tokenize findings
        self.encodings = tokenizer(
            self.df['findings'].tolist(),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        self.labels = self.df['Finding'].tolist()

    def __getitem__(self, idx):
        """Return the tokenizer outputs and the label of sample ``idx``."""
        # The encodings are already tensors (return_tensors="pt"). Copy the
        # selected row with clone().detach() instead of torch.tensor(), which
        # emits a UserWarning when it is handed a tensor.
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).long()
        return item

    def __len__(self):
        """Number of samples, i.e. rows of the merged table."""
        return len(self.df)

In [21]:
# Build the train and eval datasets (in that order), then preview the merged
# training table.
train_ds, eval_ds = (MimicFindings(split_name) for split_name in ('train', 'eval'))
train_ds.df.head()

Unnamed: 0.1,Unnamed: 0,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices,uid,Finding,No Finding,findings
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014_1...,0.0,True,"There is no focal consolidation, pleural effus..."
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab_e...,0.0,True,"The cardiac, mediastinal and hilar contours ar..."
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b75df1bd-0f22d631-52d73526-2ae7b85a-d843b39d_8...,0.0,True,As compared to the prior examination dated ___...
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b75df1bd-0f22d631-52d73526-2ae7b85a-d843b39d_9...,0.0,True,As compared to the prior examination dated ___...
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2a280266-c8bae121-54d75383-cac046f4-ca37aa16_0...,0.0,True,PA and lateral views of the chest provided. ...


Load the model from a pretrained checkpoint.

In [22]:
# Class balance of the training labels; the number of distinct label values
# also sets the size of the classification head.
unique_labels, counts = np.unique(train_ds.df["Finding"], return_counts=True)
print(unique_labels, counts)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(unique_labels)).to(DEVICE)

# Two learning rates: a small one for the pretrained encoder and a larger one
# for every parameter outside it. Selecting the head by exclusion makes sure
# the `pre_classifier` layer (which is not part of `model.classifier`) is
# optimised as well instead of being silently left out.
encoder_params = list(model.distilbert.parameters())
encoder_param_ids = {id(param) for param in encoder_params}
head_params = [param for param in model.parameters() if id(param) not in encoder_param_ids]

optimizer = torch.optim.Adam([
    {'params': encoder_params, 'lr': 1e-5},
    {'params': head_params, 'lr': 1e-3}
])

[0. 1.] [47218 14529]


In [23]:
# Mini-batch loaders for the hand-written training and evaluation loops.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=DL_WORKERS)
# Evaluation gains nothing from shuffling; a fixed order keeps runs repeatable.
eval_loader = DataLoader(eval_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=DL_WORKERS)

The `Trainer`-based fine-tuning code below is taken from https://huggingface.co/transformers/custom_datasets.html

In [24]:
# Fine-tuning configuration for the Hugging Face Trainer. These values are
# independent of BATCH_SIZE / NUM_EPOCHS, which only affect the manual loop.
training_args = TrainingArguments(
    output_dir='./results',          # output directory (checkpoints are saved here)
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size per device during evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # report the training loss every 10 steps
)

# No `optimizers=` argument is passed, so the Trainer sets up its own optimizer
# and schedule from `training_args`; the two-learning-rate `optimizer` defined
# earlier is not used here.
trainer = Trainer(
    model=model,             # the instantiated Transformers model to be trained
    args=training_args,      # training arguments, defined above
    train_dataset=train_ds,  # training dataset
    eval_dataset=eval_ds,    # evaluation dataset
)

trainer.train()

[20:14:37 CEST] Enabling eager execution
[20:14:37 CEST] Enabling v2 tensorshape
[20:14:37 CEST] Enabling resource variables
[20:14:37 CEST] Enabling tensor equality
[20:14:37 CEST] Enabling control flow v2
***** Running training *****
  Num examples = 61747
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 4830
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,0.7028
20,0.6815
30,0.6361
40,0.5804
50,0.5337
60,0.5344
70,0.4834
80,0.492
90,0.4516
100,0.4314


Saving model checkpoint to ./results/checkpoint-500
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/checkpoint-1000
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/checkpoint-1500
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/checkpoint-2000
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/checkpoint-2500
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/checkpoint-3000
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/checkpoint-3500
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/checkpoint-4000
  item = {key: torch.tensor(val

TrainOutput(global_step=4830, training_loss=0.1719891672857553, metrics={'train_runtime': 4033.3668, 'train_samples_per_second': 153.09, 'train_steps_per_second': 1.198, 'total_flos': 6.35024025979392e+16, 'train_loss': 0.1719891672857553, 'epoch': 10.0})

In [25]:
# Run the Trainer's evaluation on its eval_dataset (eval_ds); the returned
# metrics dict is displayed as the cell output.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 487
  Batch size = 128
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.8109534978866577,
 'eval_runtime': 0.8584,
 'eval_samples_per_second': 567.35,
 'eval_steps_per_second': 4.66,
 'epoch': 10.0}

In [26]:
# Re-bind `model` to the model instance held by the Trainer so the cells below
# operate on the network that `trainer.train()` updated.
model = trainer.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [None]:
# Hand-written alternative to `trainer.train()` that uses the custom `optimizer`.
for epoch in tqdm(range(NUM_EPOCHS)):
    model.train()
    # MimicFindings items are dicts, so every batch is a dict of stacked
    # tensors ('input_ids', 'attention_mask', 'labels'), not an
    # (inputs, labels) pair.
    for batch in tqdm(train_loader, total=len(train_loader)):
        # All tensors must live on the same device as the model.
        batch = {key: tensor.to(DEVICE) for key, tensor in batch.items()}

        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss with the logits.
        output = model(**batch)
        loss = output.loss

        loss.backward()
        optimizer.step()

In [42]:
# Collect predicted and true class ids over the whole evaluation set.
predictions, targets = [], []
model.eval()

with torch.no_grad():
    for batch in eval_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        # Only the logits are needed, so the labels stay on the CPU and no
        # loss is computed.
        logits = model(input_ids, attention_mask=attention_mask).logits
        # take the argmax of the logits
        predictions.extend(logits.argmax(dim=1).tolist())
        # Store plain Python ints (not 0-d tensors), matching `predictions`.
        targets.extend(batch['labels'].tolist())

accuracy = metrics.accuracy_score(targets, predictions)
print("accuracy", accuracy)
classification_report = metrics.classification_report(targets, predictions)
print(classification_report)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


accuracy 0.8275154004106776
              precision    recall  f1-score   support

           0       0.88      0.90      0.89       372
           1       0.64      0.61      0.62       115

    accuracy                           0.83       487
   macro avg       0.76      0.75      0.76       487
weighted avg       0.82      0.83      0.83       487

