In [None]:
# Pinned to the versions this notebook was last executed with (they match the
# version report printed near the end of the notebook). %pip installs into the
# environment of the running kernel.
%pip install -q evaluate==0.4.6 transformers==4.57.1 accelerate==1.11.0



In [None]:
!nvidia-smi


Sun Nov 16 08:36:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   66C    P0             33W /   70W |    3606MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import os
from typing import List, Dict, Tuple

class Preprocessing_Maccrobat:
    """Turn a folder of ``<id>.txt`` / ``<id>.ann`` pairs into BIO-tagged tokens.

    For every ``.txt`` file in ``dataset_folder`` the matching ``.ann`` file is
    read and its contiguous text-bound annotations ("T" lines) are kept.
    ``process()`` then walks each document left to right, tokenizing unlabelled
    stretches as "O" and annotated stretches as "B-<label>" / "I-<label>".

    Annotation offsets are treated as half-open character ranges
    ``[start, end)``, i.e. the annotated text is ``full_text[start:end]``.

    Example of what ``process()`` yields for one document::

        tokens = ["Patient", "has", "head", "##ache", "and", "fe", "##ver", "."]
        labels = ["O", "O", "B-Symptom", "I-Symptom", "O", "B-Symptom", "I-Symptom", "O"]
    """

    def __init__(self, dataset_folder, tokenizer):
        """Load every document and its annotations into memory.

        Args:
            dataset_folder: Directory holding the ``.txt`` / ``.ann`` pairs.
            tokenizer: Object exposing ``tokenize(text) -> list[str]``.

        Raises:
            OSError: If a ``.txt`` file has no readable ``.ann`` counterpart.
        """
        # Document ids are the .txt file names without their extension. They are
        # sorted because os.listdir() returns entries in arbitrary order, which
        # would make any later "seeded" train/validation split irreproducible.
        self.file_ids = sorted(
            os.path.splitext(name)[0]
            for name in os.listdir(dataset_folder)
            if name.endswith(".txt")
        )

        self.text_files = [file_id + ".txt" for file_id in self.file_ids]
        self.anno_files = [file_id + ".ann" for file_id in self.file_ids]

        # Number of documents to process.
        self.num_samples = len(self.file_ids)

        # Full raw text of every document, in file_ids order.
        self.texts: List[str] = []
        for text_file in self.text_files:
            with open(os.path.join(dataset_folder, text_file), "r", encoding="utf-8") as f:
                self.texts.append(f.read())

        # Per document: a list of {"text", "label", "start", "end"} dicts
        # (start/end are kept as the strings found in the file).
        self.tags: List[List[Dict[str, str]]] = []
        for anno_file in self.anno_files:
            with open(os.path.join(dataset_folder, anno_file), "r", encoding="utf-8") as f:
                self.tags.append(self._parse_text_bound(f.read()))

        self.tokenizer = tokenizer

    @staticmethod
    def _parse_text_bound(content):
        """Extract the contiguous text-bound ("T") annotations from ``.ann`` content.

        Lines whose offsets are not two plain integers (for example
        discontinuous spans written as ``"10 20;25 30"``) or that lack the
        expected tab/space-separated fields are skipped on purpose.
        """
        tags = []
        for line in content.split("\n"):
            if not line.startswith("T"):
                continue
            fields = line.split("\t")
            try:
                label, start, end = fields[1].split(" ")[:3]
                int(start)
                int(end)
            except (IndexError, ValueError):
                # Malformed or discontinuous annotation: best-effort skip.
                continue
            tags.append({"text": fields[-1], "label": label, "start": start, "end": end})
        return tags

    def process(self) -> Tuple[List[List[str]], List[List[str]]]:
        """Tokenize every document and attach one BIO label per token.

        Steps per document:
          1. take the full text and its annotations,
          2. build the character positions covered by each annotation,
          3. group the remaining positions into contiguous unlabelled runs,
          4. merge both kinds of runs in text order, emitting "O" for
             unlabelled runs and "B-"/"I-" labels for annotated ones.

        Returns:
            ``(input_texts, input_labels)``: two parallel lists with one token
            list and one label list per document.
        """
        input_texts = []
        input_labels = []

        for idx in range(self.num_samples):
            full_text = self.texts[idx]

            # The merge below assumes annotations are ordered by start offset,
            # which the .ann file does not guarantee. Empty spans are dropped.
            # NOTE(review): overlapping annotations are still emitted one after
            # another, so their shared text appears more than once.
            tags = sorted(
                (tag for tag in self.tags[idx] if int(tag["end"]) > int(tag["start"])),
                key=lambda tag: int(tag["start"]),
            )

            # Exactly the annotated character positions, [start, end).
            label_offset = [
                list(range(int(tag["start"]), int(tag["end"]))) for tag in tags
            ]
            labelled_positions = set()
            for span in label_offset:
                labelled_positions.update(span)

            # Every position not covered by an annotation, grouped into runs.
            zero_positions = [
                pos for pos in range(len(full_text)) if pos not in labelled_positions
            ]
            zero_offset = Preprocessing_Maccrobat.find_continuous_range(zero_positions)

            # Per-document accumulators filled by _add_zero / _add_label.
            self.tokens = []
            self.labels = []
            self._merge_offset(full_text, tags, zero_offset, label_offset)

            input_texts.append(self.tokens)
            input_labels.append(self.labels)

        return input_texts, input_labels

    def _merge_offset(self, full_text, tags, zero_offset, label_offset):
        """Emit unlabelled and labelled runs in the order they occur in the text.

        Both ``zero_offset`` and ``label_offset`` are lists of position lists
        sorted by their first position; ``tags[j]`` describes ``label_offset[j]``.
        """
        i = j = 0
        # Two-pointer merge: whichever run starts earlier is emitted first.
        while i < len(zero_offset) and j < len(label_offset):
            if zero_offset[i][0] < label_offset[j][0]:
                self._add_zero(full_text, zero_offset, i)
                i += 1
            else:
                self._add_label(full_text, label_offset, j, tags)
                j += 1

        # Whatever is left on either side.
        while i < len(zero_offset):
            self._add_zero(full_text, zero_offset, i)
            i += 1

        while j < len(label_offset):
            self._add_label(full_text, label_offset, j, tags)
            j += 1

    def _add_zero(self, full_text, offset, index):
        """Tokenize the unlabelled run ``offset[index]`` and label every token "O"."""
        span = offset[index]
        # span holds inclusive positions, so the slice end is last position + 1.
        text = full_text[span[0]:span[-1] + 1]
        text_tokens = self.tokenizer.tokenize(text)

        self.tokens.extend(text_tokens)
        self.labels.extend(["O"] * len(text_tokens))

    def _add_label(self, full_text, offset, index, tags):
        """Tokenize the annotated run ``offset[index]`` and label it B-/I-.

        The first token gets ``B-<label>`` and the rest ``I-<label>``, e.g.
        "headache" -> ["head", "##ache"] -> ["B-Symptom", "I-Symptom"].
        """
        span = offset[index]
        text = full_text[span[0]:span[-1] + 1]
        text_tokens = self.tokenizer.tokenize(text)
        if not text_tokens:
            # Nothing to label (e.g. whitespace-only span); adding a lone "B-"
            # label here would misalign tokens and labels.
            return

        label = tags[index]["label"]
        self.tokens.extend(text_tokens)
        self.labels.extend(["B-" + label] + ["I-" + label] * (len(text_tokens) - 1))

    @staticmethod
    def build_label2id(tokens: List[List[str]]):
        """Assign consecutive integer ids to the distinct strings in ``tokens``.

        Despite the parameter name, the notebook passes the per-document label
        lists here. Ids follow order of first appearance, starting at 0.
        """
        label2id = {}
        for sequence in tokens:
            for label in sequence:
                if label not in label2id:
                    label2id[label] = len(label2id)
        return label2id

    @staticmethod
    def find_continuous_range(data):
        """Split a sorted list of integers into runs of consecutive values.

        Example: ``[0, 1, 2, 6, 7]`` -> ``[[0, 1, 2], [6, 7]]``.
        Returns an empty list for empty input.
        """
        if not data:
            return []
        ranges = []
        start = data[0]
        prev = data[0]

        for number in data[1:]:
            if number != prev + 1:  # the run is broken here
                ranges.append(list(range(start, prev + 1)))
                start = number
            prev = number
        ranges.append(list(range(start, prev + 1)))
        return ranges

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")

In [None]:
# NOTE(review): absolute Google Drive path — presumably Drive must already be mounted
# at /content/drive (no mount call is visible in this notebook); confirm before running.
dataset_folder = r"/content/drive/MyDrive/Cybersoft/NLP & LLM/NLP_03/Resource/MACCROBAT2020"

# Read every .txt/.ann pair, then build one token list and one BIO label list per document.
Maccrobat_builder = Preprocessing_Maccrobat(dataset_folder, tokenizer)
input_texts, input_labels = Maccrobat_builder.process()

In [None]:
# Map every distinct BIO label string to an integer id (ids follow order of first
# appearance), then invert the mapping so predicted ids can be decoded to labels.
label2id = Preprocessing_Maccrobat.build_label2id(input_labels)
id2label = {v: k for k, v in label2id.items()}

## Dataset and train/validation split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for validation; the split is per document
# (each element is one document's token/label list) and seeded for repeatability.
input_train, input_val, labels_train, labels_val = train_test_split(
    input_texts,
    input_labels,
    test_size=0.2,
    random_state=42
)

In [None]:
import torch
from torch.utils.data import Dataset

MAX_LEN = 512  # default fixed sequence length produced by NER_Dataset

class NER_Dataset(Dataset):
    """Fixed-length token-classification samples built from pre-tokenized documents.

    Each item is a dict of three 1-D tensors of length ``max_len``:
    ``input_ids``, ``labels`` and ``attention_mask``. Sequences shorter than
    ``max_len`` are right-padded; longer ones are truncated, so tokens beyond
    ``max_len`` are dropped.

    NOTE(review): no special tokens ([CLS]/[SEP]) are added here, whereas a
    direct ``tokenizer(text)`` call typically adds them — confirm this
    difference between training and inference inputs is intended.
    """

    def __init__(self, input_texts, input_labels, tokenizer, label2id, max_len=MAX_LEN, label_pad_id=0):
        """
        Args:
            input_texts: One list of token strings per sample.
            input_labels: One list of label strings per sample, parallel to
                ``input_texts``.
            tokenizer: Object exposing ``convert_tokens_to_ids`` and
                ``pad_token_id``.
            label2id: Mapping from label string to integer id.
            max_len: Length every returned sequence is padded/truncated to.
            label_pad_id: Value written into ``labels`` at padded positions.
                Defaults to 0 for backward compatibility, which is
                indistinguishable from the real class with id 0; pass -100
                (the default ``ignore_index`` of PyTorch's cross-entropy loss)
                to exclude padding from the loss.
        """
        super().__init__()
        self.tokens = input_texts
        self.labels = input_labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len
        self.label_pad_id = label_pad_id

    def __len__(self):
        """Number of samples (documents)."""
        return len(self.tokens)

    def __getitem__(self, idx):
        """Return the padded/truncated tensors for sample ``idx``.

        Raises:
            KeyError: If a label of the sample is missing from ``label2id``.
        """
        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokens[idx])
        label_ids = self.encode_labels(self.labels[idx])
        # Every real token is attended to; padding added below is masked out.
        attention_mask = [1] * len(token_ids)

        return {
            "input_ids": torch.as_tensor(
                self.pad_and_truncate(token_ids, pad_id=self.tokenizer.pad_token_id)
            ),
            "labels": torch.as_tensor(
                self.pad_and_truncate(label_ids, pad_id=self.label_pad_id)
            ),
            "attention_mask": torch.as_tensor(
                self.pad_and_truncate(attention_mask, pad_id=0)
            ),
        }

    def pad_and_truncate(self, inputs, pad_id):
        """Return ``inputs`` cut to ``max_len`` items and right-padded with ``pad_id``."""
        clipped = inputs[:self.max_len]
        return clipped + [pad_id] * (self.max_len - len(clipped))

    def encode_labels(self, labels):
        """Convert label strings to ids using ``self.label2id``.

        This replaces a method that was itself named ``label2id``: it was
        shadowed by the ``self.label2id`` dict assigned in ``__init__`` and
        therefore could never be called on an instance.
        """
        return [self.label2id[label] for label in labels]

In [None]:
train_set = NER_Dataset(input_train, labels_train, tokenizer, label2id)
val_set = NER_Dataset(input_val, labels_val, tokenizer, label2id)

## Model

In [None]:
from transformers import AutoModelForTokenClassification

# Start from the pretrained biomedical NER checkpoint but with this notebook's label set.
# ignore_mismatched_sizes=True lets loading proceed even though the checkpoint's classifier
# head has a different number of outputs (84 vs. 83 here, per the load warning); the
# mismatched classifier weights are newly initialized and must be fine-tuned.
model = AutoModelForTokenClassification.from_pretrained(
    "d4data/biomedical-ner-all",
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at d4data/biomedical-ner-all and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([84]) in the checkpoint and torch.Size([83]) in the model instantiated
- classifier.weight: found shape torch.Size([84, 768]) in the checkpoint and torch.Size([83, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
   

## Fine-tuning

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Token-level accuracy over the positions that carry a scored label.

    Args:
        eval_pred: ``(predictions, labels)`` pair; ``predictions`` holds
            per-class scores on the last axis, ``labels`` the reference ids.

    Returns:
        Whatever ``accuracy.compute`` returns for the selected positions.

    Positions whose reference id is 0 are excluded, as before. In this
    notebook 0 is the label padding value, and it is also the id of whichever
    label was seen first when the label map was built, so that class is not
    scored either. Positions labelled -100 (the conventional ignore index)
    are excluded as well, so the metric stays meaningful if labels are padded
    that way.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=-1)
    mask = (labels != 0) & (labels != -100)
    return accuracy.compute(predictions=predicted_ids[mask], references=labels[mask])

In [None]:
from transformers import TrainingArguments, Trainer

# Evaluate, log and checkpoint once per epoch; the best checkpoint is reloaded when
# training ends (load_best_model_at_end).
training_args = TrainingArguments(
    output_dir="ner-biomedical-maccrobat2020",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    # Keep only the most recent checkpoints instead of one per epoch (20 in total);
    # the best checkpoint is retained when load_best_model_at_end is set.
    save_total_limit=2,
    # Explicitly disable external loggers; this is the supported replacement for
    # the deprecated WANDB_DISABLED environment variable.
    report_to="none",
    optim="adamw_torch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    # `tokenizer=` is deprecated for Trainer.__init__ and slated for removal in v5;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,2.6383,1.434759,0.437846
2,1.2953,0.880317,0.625139
3,0.8477,0.680408,0.721555
4,0.6207,0.590564,0.760344
5,0.4705,0.543355,0.783524
6,0.3675,0.54744,0.783986
7,0.302,0.551066,0.796269
8,0.2492,0.547319,0.796731
9,0.2108,0.546332,0.80218
10,0.1777,0.563673,0.805597


TrainOutput(global_step=200, training_loss=0.41233884513378144, metrics={'train_runtime': 766.9392, 'train_samples_per_second': 4.172, 'train_steps_per_second': 0.261, 'total_flos': 418702245888000.0, 'train_loss': 0.41233884513378144, 'epoch': 20.0})

In [None]:


def inference(sentence, model, tokenizer, device=None):
    """Predict one label per token of a single sentence.

    Args:
        sentence: Text to tag (a single string).
        model: Token-classification model; called with ``input_ids`` and
            ``attention_mask`` and expected to return an object with
            ``.logits``, and to expose ``config.id2label``.
        tokenizer: Tokenizer callable that returns "pt" tensors and exposes
            ``convert_ids_to_tokens``.
        device: Device the inputs are moved to. Defaults to the device the
            model's parameters are on, so the call also works when the model
            lives on CPU (the previous hard-coded "cuda" default did not).

    Returns:
        ``(tokens, preds_labels)``: the tokenizer's tokens for the encoded
        input (including any special tokens it adds) and the predicted label
        string for each of them.
    """
    if device is None:
        device = next(model.parameters()).device

    # 1. Encode the sentence (input_ids + attention_mask).
    encoding = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # 2. Forward pass without gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # 3. Highest-scoring class per position; drop the batch dimension -> [seq_len].
    preds = torch.argmax(logits, dim=-1).squeeze(0)

    # 4. Class ids -> label strings.
    preds_labels = [model.config.id2label[p.item()] for p in preds]

    # 5. Recover the token strings that were actually fed to the model.
    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0).tolist())

    return tokens, preds_labels


In [None]:
def merge_entity(tokens, preds_labels):
    """Group consecutive tokens that belong to the same predicted entity.

    Args:
        tokens: list[str] - tokenizer output (may contain "##" subword pieces).
        preds_labels: list[str] - predicted labels such as "B-Symptom",
            "I-Symptom" or "O", parallel to ``tokens``.

    Returns:
        list[tuple(entity_type, text)] in input order. Every "O" token is
        returned on its own as ``("O", token)``; each entity is returned once
        with its tokens joined by spaces and " ##" continuations glued back
        onto the preceding piece.

    A "B-" label always opens a new entity, so two adjacent entities of the
    same type stay separate instead of being fused into one.
    """
    merged_list = []
    temp_tokens = []
    current_label = None

    def flush():
        """Emit the entity collected so far (if any) and reset the buffer."""
        nonlocal temp_tokens, current_label
        if temp_tokens:
            merged_list.append((current_label, " ".join(temp_tokens).replace(" ##", "")))
        temp_tokens = []
        current_label = None

    for token, label in zip(tokens, preds_labels):
        # Split the BIO prefix from the entity type; labels without a prefix
        # are treated as a continuation marker of their own type.
        if label[:2] in ("B-", "I-"):
            prefix, type_label = label[0], label[2:]
        else:
            prefix, type_label = "", label

        if type_label == "O":
            flush()
            merged_list.append((type_label, token.replace("##", "")))
        elif prefix != "B" and temp_tokens and current_label == type_label:
            temp_tokens.append(token)
        else:
            # New entity: explicit "B-", a change of type, or nothing open.
            flush()
            temp_tokens = [token]
            current_label = type_label

    flush()
    return merged_list


In [None]:
sentence = """A 48 year - old female presented with vaginal bleeding and abnormal Pap smears .
Upon diagnosis of invasive non - keratinizing SCC of the cervix ,
she underwent a radical hysterectomy with salpingo - oophorectomy
which demonstrated positive spread to the pelvic lymph nodes and the parametrium .
Pathological examination revealed that the tumour also extensively involved the lower uterine segment .
"""
tokens, preds_labels = inference(sentence, model, tokenizer)
results = merge_entity(tokens, preds_labels)

In [None]:
results

[('O', '[CLS]'),
 ('O', 'a'),
 ('Age', '48 year - old'),
 ('Sex', 'female'),
 ('Clinical_event', 'presented'),
 ('O', 'with'),
 ('Biological_structure', 'vaginal'),
 ('Sign_symptom', 'bleeding'),
 ('O', 'and'),
 ('Lab_value', 'abnormal'),
 ('Diagnostic_procedure', 'pa'),
 ('Sign_symptom', '##p'),
 ('O', 'sm'),
 ('O', 'ears'),
 ('O', '.'),
 ('O', 'upon'),
 ('O', 'diagnosis'),
 ('O', 'of'),
 ('Detailed_description', 'invasive non - keratinizing'),
 ('Disease_disorder', 'scc'),
 ('O', 'of'),
 ('O', 'the'),
 ('Biological_structure', 'cervi'),
 ('O', 'x'),
 ('O', ','),
 ('O', 'she'),
 ('O', 'underwent'),
 ('O', 'a'),
 ('Detailed_description', 'radical'),
 ('Therapeutic_procedure', 'hysterectomy'),
 ('O', 'with'),
 ('Therapeutic_procedure', 'salpingo - oophorectomy'),
 ('O', 'which'),
 ('O', 'demonstrated'),
 ('Lab_value', 'positive'),
 ('O', 'spread'),
 ('O', 'to'),
 ('O', 'the'),
 ('Biological_structure', 'pelvic lymph nodes'),
 ('O', 'and'),
 ('O', 'the'),
 ('Biological_structure', 'param

In [None]:
import torch
import transformers
import evaluate
import accelerate
import sklearn

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("evaluate:", evaluate.__version__)
print("accelerate:", accelerate.__version__)
print("scikit-learn:", sklearn.__version__)


torch: 2.8.0+cu126
transformers: 4.57.1
evaluate: 0.4.6
accelerate: 1.11.0
scikit-learn: 1.6.1


In [None]:
# Save the fine-tuned model and the tokenizer into the same directory so that
# both can later be reloaded from it with from_pretrained(model_path).
model_path = "./ner-biomedical-maccrobat2020-final"

trainer.save_model(model_path)   # writes the trainer's model to model_path
tokenizer.save_pretrained(model_path)  # writes the tokenizer files (see the returned paths)


('./ner-biomedical-maccrobat2020-final/tokenizer_config.json',
 './ner-biomedical-maccrobat2020-final/special_tokens_map.json',
 './ner-biomedical-maccrobat2020-final/vocab.txt',
 './ner-biomedical-maccrobat2020-final/added_tokens.json',
 './ner-biomedical-maccrobat2020-final/tokenizer.json')

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Reload the saved model/tokenizer from disk to check the export is self-contained.
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Use the GPU when one is available instead of assuming CUDA unconditionally.
device = "cuda" if torch.cuda.is_available() else "cpu"

model.eval()
model.to(device)


DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
   

In [None]:
sentence = """A 48 year - old female presented with vaginal bleeding and abnormal Pap smears .
Upon diagnosis of invasive non - keratinizing SCC of the cervix ,
she underwent a radical hysterectomy with salpingo - oophorectomy
which demonstrated positive spread to the pelvic lymph nodes and the parametrium .
Pathological examination revealed that the tumour also extensively involved the lower uterine segment .
"""
tokens, preds_labels = inference(sentence, model, tokenizer)
results = merge_entity(tokens, preds_labels)

In [None]:
results

[('O', '[CLS]'),
 ('O', 'a'),
 ('Age', '48 year - old'),
 ('Sex', 'female'),
 ('Clinical_event', 'presented'),
 ('O', 'with'),
 ('Biological_structure', 'vaginal'),
 ('Sign_symptom', 'bleeding'),
 ('O', 'and'),
 ('Lab_value', 'abnormal'),
 ('Diagnostic_procedure', 'pa'),
 ('Sign_symptom', '##p'),
 ('O', 'sm'),
 ('O', 'ears'),
 ('O', '.'),
 ('O', 'upon'),
 ('O', 'diagnosis'),
 ('O', 'of'),
 ('Detailed_description', 'invasive non - keratinizing'),
 ('Disease_disorder', 'scc'),
 ('O', 'of'),
 ('O', 'the'),
 ('Biological_structure', 'cervi'),
 ('O', 'x'),
 ('O', ','),
 ('O', 'she'),
 ('O', 'underwent'),
 ('O', 'a'),
 ('Detailed_description', 'radical'),
 ('Therapeutic_procedure', 'hysterectomy'),
 ('O', 'with'),
 ('Therapeutic_procedure', 'salpingo - oophorectomy'),
 ('O', 'which'),
 ('O', 'demonstrated'),
 ('Lab_value', 'positive'),
 ('O', 'spread'),
 ('O', 'to'),
 ('O', 'the'),
 ('Biological_structure', 'pelvic lymph nodes'),
 ('O', 'and'),
 ('O', 'the'),
 ('Biological_structure', 'param

In [None]:
from google.colab import files
import shutil

# Compress the saved model directory into ner-biomedical-maccrobat2020-final.zip
shutil.make_archive("ner-biomedical-maccrobat2020-final", 'zip', "./ner-biomedical-maccrobat2020-final")

# Download the zip file to the local machine (Colab-only helper)
files.download("ner-biomedical-maccrobat2020-final.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>