In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


# **Libraries**

In [2]:
import os
from typing import List, Dict, Tuple
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import evaluate
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset

-------------------------------
# **Data Sample**

In [3]:
data_dir = '/kaggle/input/maccrobat2018'
for file in os.listdir(data_dir):
    if file.endswith('.txt'):
        file_path = os.path.join(data_dir, file)
        with open(file_path, 'r') as f:
            print(f.read())

A 53 year old female without significant past medical history developed severe viral pneumonia, with rapid, progressive deterioration in her respiratory status.
She developed ARDS and mechanical ventilatory management using ARDS protocol were unable to maintain adequate oxygenation.
As a result, bedside VV-ECMO was planned.
Transesophageal echocardiography (TEE) was performed to visualize proper positioning of the guidewire and cannula.
Using the Seldinger technique, the right internal jugular vein was accessed and a guide wire was placed.
Placement of the guidewire into the IVC proved difficult due to repeated migration of the guidewire into the right ventricle.
After multiple attempts, the guidewire was visualized to course properly from the SVC to the IVC.
After a bolus dose of 5000 units of intravenous heparin was given, the right internal jugular venous access site was dilated.
Just as the final dilatation was completed and upon dilator exchange with simultaneous advancement of th

In [4]:
class Preprocessing_Maccrobat2018:
    def __init__(self, dataset_dir, tokenizer):
        self.file_ids = []
        for file in os.listdir(dataset_dir):
            if file.endswith('.txt'):
                file_name = file.split(".")[0] # Split after "." | Take 1st Index
                self.file_ids.append(file_name)
                
        self.text_files = []
        self.anno_files = []
        for file in self.file_ids:
            text_files = file +".txt"
            anno_files = file + ".ann"
            self.text_files.append(text_files)
            self.anno_files.append(anno_files)
        
        self.num_samples = len(self.file_ids)
        self.texts: List[str] = []

        for i in range(self.num_samples):
            file_path = os.path.join(dataset_dir, self.text_files[i])
            with open(file_path, 'r') as f:
                self.texts.append(f.read())

        self.tags: List[Dict[str, str]] = []
        for i in range(self.num_samples):
            file_path = os.path.join(dataset_dir, self.anno_files[i])
            with open(file_path, 'r') as f:
                text_bound_ann = [
                    txt.split("\t") for txt in f.read().split("\n") if txt.startswith("T")
                ]
                text_bound_lst = []
                for text_b in text_bound_ann:
                    label = text_b[1].split(" ")
                    try:
                        _ = int(label[1])
                        _ = int(label[2])
                        tag = {
                            "text": text_b[-1],
                            "label": label[0],
                            "start": label[1],
                            "end": label[2]
                        }
                        text_bound_lst.append(tag)
                    except:
                        pass
                self.tags.append(text_bound_lst)
        self.tokenizer = tokenizer
        
    def process(self) -> Tuple[List[List[str]], List[List[str]]]:
        inputs_texts = []
        inputs_labels = []

        for idx in range(self.num_samples):
            full_text = self.texts[idx]
            tags = self.tags[idx]

            label_offset = []
            continuous_label_offset = []
            for tag in tags:
                offset = list(
                    range(
                        int(
                            tag['start']
                        ), int(
                            tag['end']
                        ) + 1
                    )
                )
                label_offset.append(offset)
                continuous_label_offset.extend(offset)
            all_offset = list(
                range(
                    len(full_text)
                )
            )
            zero_offset = [
                offset for offset in all_offset if offset not in continuous_label_offset
            ]
            zero_offset = Preprocessing_Maccrobat2018.find_continuous_ranges(zero_offset)
            self.tokens = []
            self.labels = []
            self._merge_offset(full_text, tags, zero_offset, label_offset)
            inputs_texts.append(self.tokens)
            inputs_labels.append(self.labels)
            
        return inputs_texts, inputs_labels

    def _merge_offset(self, full_text, tags, zero_offset, label_offset):
        i = j = 0
        while i < len(zero_offset) and j < len(label_offset):
            if zero_offset[i][0] < label_offset[j][0]:
                self._add_zero(full_text, zero_offset, i)
                i += 1
            else:
                self._add_label(full_text, label_offset, j, tags)
                j += 1
        while i < len(zero_offset):
            self._add_zero(full_text, zero_offset, i)
        while j < len(label_offset):
            self._add_label(full_text, label_offset, j, tags)
            j += 1

    def _add_zero(self, full_text, offset, index):
        start, *_, end = offset[index] if len(offset[index]) > 1 else (offset[index][0], offset[index][0] + 1)
        text = full_text[start:end]
        text_tokens = self.tokenizer.tokenize(text)

        self.tokens.extend(text_tokens)
        self.labels.extend(["O"] * len(text_tokens))

    def _add_label(self, full_text, offset, index, tags):
        start, *_, end = offset[index] if len(offset[index]) > 1 else (offset[index][0], offset[index][0] + 1)
        text = full_text[start:end]
        text_tokens = self.tokenizer.tokenize(text)

        self.tokens.extend(text_tokens)
        self.labels.extend(
            [f"B-{tags[index]['label']}"] + [f"I-{tags[index]['label']}"] * (len(text_tokens) - 1)
        )

    @staticmethod
    def build_label2id(tokens: List[List[str]]):
        label2id = {}
        id_counter = 0

        for token in [token for sublist in tokens for token in sublist]:
            if token not in label2id:
                label2id[token] = id_counter
                id_counter += 1

        return label2id

    @staticmethod
    def find_continuous_ranges(data: List[int]):
        if not data:
            return []
        ranges = []
        start = data[0]
        prev = data[0]
        for number in data[1:]:
            if number != prev + 1:
                ranges.append(list(
                    range(start, prev + 1)
                ))
            prev = number
        ranges.append(
            list(
                range(start, prev+ 1)
            )
        )
        return ranges

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    "d4data/biomedical-ner-all"
)
dataset_dir = "/kaggle/input/maccrobat2018"

Maccrobat_builder = Preprocessing_Maccrobat2018(dataset_dir, tokenizer)
input_texts, input_labels = Maccrobat_builder.process()

label2id = Preprocessing_Maccrobat2018.build_label2id(input_labels)
id2label = {v: k for k, v in label2id.items()}

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


-----------------------
# **Train/Test Split**

In [6]:
inputs_train, inputs_val, labels_train, labels_val = train_test_split(
    input_texts,
    input_labels,
    test_size = .2,
    random_state = 42
)

------------------------------------
# **DataLoader**

In [7]:
max_len = 512
class NER_Dataset(Dataset):
    def __init__(
        self, input_texts, input_labels, tokenizer, label2id,
        max_len = max_len
    ):
        super().__init__()
        self.tokens = input_texts
        self.labels = input_labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        input_token = self.tokens[idx]
        label_tokens = [
            self.label2id[label] for label in self.labels[idx]
        ]
        input_token = self.tokenizer.convert_tokens_to_ids(input_token)
        attention_mask = [1] * len(input_token)

        return{
            "input_ids": self.pad_and_truncate(input_token,
                                             pad_id = self.tokenizer.pad_token_id),
            "labels": self.pad_and_truncate(label_tokens, pad_id = 0),
            "attention_mask": self.pad_and_truncate(attention_mask, pad_id = 0)
        }

    # Padding/Truncate to sentences that don't have the same length as max_len
    def pad_and_truncate(self, inputs, pad_id):
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id]*(self.max_len - len(inputs))
        else:
            padded_inputs = inputs[:self.max_len] # Take til max_len reached
        return torch.as_tensor(padded_inputs)

    def label2id(self, labels):
        return[self.label2id[label] for label in labels]

In [8]:
train_set = NER_Dataset(inputs_train, labels_train, tokenizer, label2id)
val_set = NER_Dataset(inputs_val, labels_val, tokenizer, label2id)

-----------------
# **Model**

In [9]:
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(
    "d4data/biomedical-ner-all",
    label2id = label2id,
    id2label = id2label,
    ignore_mismatched_sizes = True
)

config.json:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at d4data/biomedical-ner-all and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([84]) in the checkpoint and torch.Size([83]) in the model instantiated
- classifier.weight: found shape torch.Size([84, 768]) in the checkpoint and torch.Size([83, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--------------------------------
# **Compute Metric**

In [10]:
accuracy = evaluate.load('accuracy')

ignore_label = len(label2id)

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    mask = labels != ignore_label
    predictions = np.argmax(preds, axis = -1)
    return accuracy.compute(predictions = predictions[mask], references = labels[mask])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

-----------------------
# **Train**

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = 'out_dir',
    learning_rate = 1e-5,
    logging_steps = 1,
    logging_dir = './logs',
    logging_strategy = 'epoch',
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 10,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    optim = 'adamw_torch',
    report_to = 'none'
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_set,
    eval_dataset = val_set,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,3.8321,2.548877,0.999463
2,1.6717,0.674094,1.0
3,0.3995,0.109539,1.0
4,0.0804,0.023753,1.0
5,0.0249,0.009994,1.0
6,0.0141,0.006726,1.0
7,0.0109,0.005526,1.0
8,0.0097,0.004999,1.0
9,0.0091,0.004752,1.0
10,0.0089,0.004676,1.0


TrainOutput(global_step=100, training_loss=0.6061166539788246, metrics={'train_runtime': 3287.4285, 'train_samples_per_second': 0.487, 'train_steps_per_second': 0.03, 'total_flos': 209351122944000.0, 'train_loss': 0.6061166539788246, 'epoch': 10.0})

----------------------
# **Infer**

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_sentence = """
A 20 year-old male patient presented with testicular cancer and constipation.
Diagnosis of invasive species.
"""

inpt = torch.as_tensor([tokenizer.convert_tokens_to_ids
                         (test_sentence.split())])
inpt = inpt.to(device)

outputs = model(inpt)
_, preds = torch.max(outputs.logits, -1)
preds = preds[0].cpu().numpy()

# Decode
for token, pred in zip(test_sentence.split(), preds):
    print(
        f"{token}\t{id2label[pred]}"
    )

A	O
20	O
year-old	O
male	O
patient	O
presented	O
with	O
testicular	O
cancer	O
and	O
constipation.	O
Diagnosis	O
of	O
invasive	O
species.	O
