# Environmental Setup

In [1]:
!pip install -U --quiet evaluate transformers accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!gdown --id 1Q9aBywii_NVhaB3NObe4niEvkafGNxUE # MACCROBAT2020.zip
!unzip  /content/MACCROBAT2020.zip -d /content/MACCROBAT2020

# !gdown --id 1n0bqRvYDMwWtl6XWmO3GD99_3pAcVoS1 # MACCROBAT2018.zip
# !unzip /content/MACCROBAT2018.zip -d /content/MACCROBAT2018

Downloading...
From: https://drive.google.com/uc?id=1Q9aBywii_NVhaB3NObe4niEvkafGNxUE
To: /content/MACCROBAT2020.zip
100% 1.07M/1.07M [00:00<00:00, 124MB/s]
Archive:  /content/MACCROBAT2020.zip
  inflating: /content/MACCROBAT2020/26530965.ann  
  inflating: /content/MACCROBAT2020/25410883.ann  
  inflating: /content/MACCROBAT2020/23864579.ann  
  inflating: /content/MACCROBAT2020/23468586.ann  
  inflating: /content/MACCROBAT2020/23155491.ann  
  inflating: /content/MACCROBAT2020/23124805.ann  
  inflating: /content/MACCROBAT2020/22520024.ann  
  inflating: /content/MACCROBAT2020/19610147.ann  
  inflating: /content/MACCROBAT2020/19307547.ann  
  inflating: /content/MACCROBAT2020/19816630.ann  
  inflating: /content/MACCROBAT2020/21672201.ann  
  inflating: /content/MACCROBAT2020/25572898.ann  
  inflating: /content/MACCROBAT2020/23033875.ann  
  inflating: /content/MACCROBAT2020/23033875.txt  
  inflating: /content/MACCROBAT2020/21129213.ann  
  inflating: /content/MACCROBAT2020/28154

# Experiment

In [3]:
model_id = "d4data/biomedical-ner-all"
dataset_folder = '/content/MACCROBAT2020'

## Preprocessing

In [4]:
import os

from typing import List, Dict, Tuple

class DS_Preprocessing:
    def __init__(self, dataset_folder, tokenizer):
        self.file_ids = [f.split(".")[0] for f in os.listdir(dataset_folder) if f.endswith('.txt')]

        self.text_files = [f + ".txt" for f in self.file_ids]
        self.anno_files = [f + ".ann" for f in self.file_ids]

        self.n_samples = len(self.file_ids)
        self.texts: List[str]= []

        for i in range(self.n_samples):
            file_path = os.path.join(dataset_folder, self.text_files[i])
            with open(file_path, 'r') as f:
                self.texts.append(f.read())

        self.tags: List[Dict[str, str]] = []
        for i in range(self.n_samples):
            file_path = os.path.join(dataset_folder, self.anno_files[i])
            with open(file_path, 'r') as f:
                text_bound_ann = [t.split('\t') for t in f.read().split('\n') if t.startswith("T")]
                text_bound_list = []
                for text_b in text_bound_ann:
                    label = text_b[1].split(" ")
                    try:
                        tag = {
                            "text": text_b[-1],
                            "label": label[0],
                            "start": int(label[1]),
                            "end": int(label[2])
                        }
                        text_bound_list.append(tag)
                    except:
                        pass
                self.tags.append(text_bound_list)

            self.tokenizer = tokenizer

    def process(self) -> Tuple[List[List[str]], List[List[str]]]:
        input_texts = []
        input_labels = []

        for idx in range(self.n_samples):
            full_text = self.texts[idx]
            tags = self.tags[idx]

            label_offset = []
            continuous_label_offset = []

            for tag in tags:
               offset = list(range(int(tag['start']), int(tag['end']) + 1))
               label_offset.append(offset)
               continuous_label_offset.append(offset)

            all_offset = list(range(len(full_text)))
            zero_offset = [offset for offset in all_offset if offset not in continuous_label_offset]
            zero_offset = DS_Preprocessing.find_continuous_ranges(zero_offset)

            self.tokens = []
            self.labels = []
            self._merge_offset(full_text, tags, zero_offset, label_offset)
            assert len(self.tokens) == len(self.labels), f"Token's length and labels' length are not equal."

            input_texts.append(self.tokens)
            input_labels.append(self.labels)

        return input_texts, input_labels

    def _merge_offset(self, full_text, tags, zero_offset, label_offset):
        i = j = 0
        while i < len(zero_offset) and j < len(label_offset):
            if zero_offset[i][0] < label_offset[j][0]:
                self._add_zero(full_text, zero_offset, i)
                i += 1
            else:
                self._add_label(full_text, label_offset, j, tags)
                j += 1

        while i < len(zero_offset):
            self._add_zero(full_text, zero_offset, i)
            i += 1

        while j < len(label_offset):
            self._add_label(full_text, label_offset, j, tags)
            j += 1

    def _add_zero(self, full_text, offset, index):
        start , *_ ,end = offset[index] if len(offset[index]) > 1 else (offset[index][0], offset[index][0]+1)
        text = full_text[start: end]
        text_tokens = self.tokenizer.tokenize(text)
        self.tokens.extend(text_tokens)
        self.labels.extend(["0"]*len(text_tokens))

    def _add_label(self, full_text, offset, index, tags):
        start , *_ ,end = offset[index] if len(offset[index]) > 1 else (offset[index][0], offset[index][0]+1)
        text = full_text[start: end]
        text_tokens = self.tokenizer.tokenize(text)
        self.tokens.extend(text_tokens)
        self.labels.extend(
            [f"B-{tags[index]['label']}"] + [f"I-{tags[index]['label']}"]*(len(text_tokens) - 1)
        )

    @staticmethod
    def find_continuous_ranges(data: List[int]):
        if not data:
            return []
        ranges = []
        start = data[0]
        prev = data[0]
        for num in data[1: ]:
            if num != prev +1:
                ranges.append(list(range(start, prev + 1)))
                start = num
            prev = num
        ranges.append(list(range(start, prev + 1)))
        return ranges

In [5]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification

tokenizer = AutoTokenizer.from_pretrained(model_id)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [6]:
ds_builder = DS_Preprocessing(dataset_folder, tokenizer)
input_texts, input_labels = ds_builder.process()

Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors


In [7]:
input_labels[0]

['0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0'

In [8]:
def build_label2id ( tokens : List [ List [str ]]):
  label2id = {}
  id_counter = 0
  for token in [ token for sublist in tokens for token in sublist ]:
    if token not in label2id :
      label2id [ token ] = id_counter
      id_counter += 1
  return label2id

label2id = build_label2id(input_labels)
id2label = {v: k for k, v in label2id.items()}

In [9]:
label2id

{'0': 0,
 'B-Age': 1,
 'I-Age': 2,
 'B-History': 3,
 'I-History': 4,
 'B-Medication': 5,
 'I-Medication': 6,
 'B-Duration': 7,
 'I-Duration': 8,
 'B-Clinical_event': 9,
 'B-Date': 10,
 'I-Date': 11,
 'B-Sign_symptom': 12,
 'B-Detailed_description': 13,
 'I-Detailed_description': 14,
 'B-Biological_structure': 15,
 'I-Biological_structure': 16,
 'B-Diagnostic_procedure': 17,
 'I-Diagnostic_procedure': 18,
 'B-Lab_value': 19,
 'B-Coreference': 20,
 'I-Coreference': 21,
 'I-Sign_symptom': 22,
 'B-Frequency': 23,
 'I-Frequency': 24,
 'I-Lab_value': 25,
 'B-Disease_disorder': 26,
 'I-Disease_disorder': 27,
 'B-Distance': 28,
 'I-Distance': 29,
 'B-Severity': 30,
 'B-Dosage': 31,
 'I-Dosage': 32,
 'B-Sex': 33,
 'I-Clinical_event': 34,
 'B-Texture': 35,
 'I-Texture': 36,
 'B-Time': 37,
 'I-Time': 38,
 'B-Other_entity': 39,
 'B-Administration': 40,
 'B-Therapeutic_procedure': 41,
 'I-Therapeutic_procedure': 42,
 'B-Shape': 43,
 'B-Volume': 44,
 'I-Volume': 45,
 'I-Shape': 46,
 'B-Color': 47,
 

In [10]:
len(label2id)

83

In [11]:
from sklearn.model_selection import train_test_split

inputs_train, inputs_val, labels_train, labels_val = train_test_split(input_texts,
                                                                      input_labels,
                                                                      test_size=0.2,
                                                                      random_state=42)

## Dataload

In [12]:
import torch

from torch.utils.data import Dataset

MAX_LEN = 512

class NER_MRB(Dataset):
    def __init__(self, input_texts, input_labels, tokenizer, label2id, max_len=MAX_LEN):
        super().__init__()
        self.tokens = input_texts
        self.labels = input_labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        input_token = self.tokens[idx]
        label_token = [self.label2id[label] for label in self.labels[idx]]

        input_token = self.tokenizer.convert_tokens_to_ids(input_token)
        attention_mask = [1] * len(input_token)

        input_ids = self.pad_and_truncate(input_token, pad_id=self.tokenizer.pad_token_id)
        labels = self.pad_and_truncate(label_token, pad_id=label2id['0'])
        attention_mask = self.pad_and_truncate(attention_mask, pad_id=0)

        return {
            "input_ids": torch.as_tensor(input_ids),
            "labels": torch.as_tensor(labels),
            "attention_mask": torch.as_tensor(attention_mask)
        }

    def pad_and_truncate(self, inputs, pad_id: int):
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id]*(self.max_len - len(inputs))
        else:
            padded_inputs = inputs[: self.max_len]
        return padded_inputs

    def label2id(self, labels):
        result = [self.label2id[label] for label in labels]
        return result

In [13]:
from torch.utils.data import DataLoader

# ===== PYTORCH DATASET =====
train_ds = NER_MRB(inputs_train, labels_train, tokenizer, label2id)
val_ds = NER_MRB(inputs_val, labels_val, tokenizer, label2id)


# ===== PYTORCH DATALOADER =====
train_dl = DataLoader(
    train_ds,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
val_dl = DataLoader(
    val_ds, collate_fn=data_collator, batch_size=8
)

In [14]:
next(iter(train_dl))

{'input_ids': tensor([[ 2023,  6421,  1011,  ...,  2102, 26261,  3372],
        [ 1037,  4601,  1011,  ...,     0,     0,     0],
        [ 1037,  3438,  1011,  ..., 12314, 12859,  1998],
        ...,
        [ 1037,  4261,  1011,  ...,  1996,  5729,  3255],
        [ 1037,  2450,  1999,  ...,  2053,  2569,  2828],
        [ 1037,  2654,  1011,  ..., 12565,  9413, 21906]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ..., 13, 14, 14],
        [ 0,  0,  0,  ..., 18, 17, 18]])}

## Model Implementation

In [15]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_id,
                                                        label2id=label2id,
                                                        id2label=id2label,
                                                        ignore_mismatched_sizes=True,
                                                        num_labels=len(label2id))

config.json:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at d4data/biomedical-ner-all and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([84]) in the checkpoint and torch.Size([83]) in the model instantiated
- classifier.weight: found shape torch.Size([84, 768]) in the checkpoint and torch.Size([83, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# !rm -rf /content/log

## Metrics

In [17]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    mask = labels != 0
    predictions = np.argmax(predictions, axis=-1)
    result = accuracy.compute(predictions=predictions[mask],
                              references=labels[mask])
    return result


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

## Training

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="/content/log",
    learning_rate=1e-4,
    num_train_epochs=20,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    optim='adamw_torch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


trainer.train()

In [20]:
test_sentence = """A 76-year old woman presented with a 24-month history of enlarging mass involving the back history of trauma.
Physical examination showed a mass of an 3x4 cm in diameter, localized in the right inter-scapular region.
The mass was ulcerative helophytic, grayish in colour, hard in consistency and easily bleeding on manipulation.
The remainder of the examination was unremarkable; no lymphadenopathy and no abdominal masses were felt.
After resection, the histological examinations of the specimens have concluded for basal cell carcinoma.
A local recurrence was observed 18 months later; the patient was admitted to our institution for Lumpectomy (Figure 1).
Histopathological examination revealed a syringomatous pattern infiltrating the dermis (Figure 2, Figure 3), subcutis and skeletal muscle.
The neoplastic epithelial cells were arranged in interconnecting cords with microcystic areas.
Nests, cords, and tubules of the tumour extended into the dermis and into the adjacent muscle.
Many lobules showed squamous differentiation.
Sclerosis of stroma around the cords was present.
Tumour cells were not connected to the epidermis.
The immunohistochemical analysis showed positivity for anti-CK7 (Figure 4), AE1/AE3 and negativity for anti CEA and anti CK20.
Based upon her histological and immunohistochemical presentation, the diagnosis of syringomatous eccrine carcinoma was established.
Radiotherapy of the involved area was performed (70 Gy, 35 sessions)

"""

# tokenization
input = torch.as_tensor ([tokenizer.convert_tokens_to_ids(test_sentence.split())])

input = input.to("cuda")

# prediction
outputs = model(input)
_, preds = torch.max(outputs.logits, -1)
preds = preds[0].cpu().numpy()

# decode
for token, pred in zip(test_sentence.split(), preds):
    print(f"{token}\t{id2label[pred]}")

A	0
76-year	0
old	0
woman	0
presented	0
with	0
a	0
24-month	0
history	0
of	0
enlarging	0
mass	0
involving	0
the	0
back	0
history	0
of	0
trauma.	0
Physical	0
examination	0
showed	0
a	0
mass	0
of	0
an	0
3x4	0
cm	0
in	0
diameter,	0
localized	0
in	0
the	0
right	0
inter-scapular	0
region.	0
The	0
mass	0
was	0
ulcerative	0
helophytic,	0
grayish	0
in	0
colour,	0
hard	0
in	0
consistency	0
and	0
easily	0
bleeding	0
on	0
manipulation.	0
The	0
remainder	0
of	0
the	0
examination	0
was	0
unremarkable;	0
no	0
lymphadenopathy	0
and	0
no	0
abdominal	0
masses	0
were	0
felt.	0
After	0
resection,	0
the	0
histological	0
examinations	0
of	0
the	0
specimens	0
have	0
concluded	0
for	0
basal	0
cell	0
carcinoma.	0
A	0
local	0
recurrence	0
was	0
observed	0
18	0
months	0
later;	0
the	0
patient	0
was	0
admitted	0
to	0
our	0
institution	0
for	0
Lumpectomy	0
(Figure	0
1).	0
Histopathological	0
examination	0
revealed	0
a	0
syringomatous	0
pattern	0
infiltrating	0
the	0
dermis	0
(Figure	0
2,	0
Figure	0
3),	0
subcutis