In [42]:
import pandas as pd
from dataset import read_ner_file
import torch 
import torch.nn as nn 
from torch.utils.data import DataLoader, Dataset
import numpy as np 
import os
import matplotlib.pyplot as plt 
from transformers import AdamW
from tqdm import tqdm 
import torcheval 

In [43]:
# Turn off parallelism inside the tokenizers library (commonly set to avoid
# fork-related warnings once DataLoader worker processes are started).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [44]:
# Label vocabulary: one B-/I- pair per entity type, with the outside tag 'O' last.
LABEL_2_ID = {
    'B-PATIENT_ID': 0, 'I-PATIENT_ID': 1,
    'B-NAME': 2, 'I-NAME': 3,
    'B-AGE': 4, 'I-AGE': 5,
    'B-GENDER': 6, 'I-GENDER': 7,
    'B-JOB': 8, 'I-JOB': 9,
    'B-LOCATION': 10, 'I-LOCATION': 11,
    'B-ORGANIZATION': 12, 'I-ORGANIZATION': 13,
    'B-SYMPTOM_AND_DISEASE': 14, 'I-SYMPTOM_AND_DISEASE': 15,
    'B-TRANSPORTATION': 16, 'I-TRANSPORTATION': 17,
    'B-DATE': 18, 'I-DATE': 19,
    'O': 20,
}

# Inverse lookup (id -> label), built from LABEL_2_ID so both tables always agree.
ID_2_LABEL = {label_id: label for label, label_id in LABEL_2_ID.items()}

In [45]:
# Parse the training split (CoNLL file under data/syllable) with the project's reader.
# The result is wrapped in a DataFrame in the next cell.
df_train = read_ner_file("./data/syllable/train_syllable.conll")
# Test split loading is currently disabled:
# df_test = read_ner_file("./data/syllable/test_syllable.conll")

In [46]:
# Wrap the parsed records in a DataFrame and let pandas infer NA-aware dtypes
# for every column in a single chained expression.
df_train = pd.DataFrame(data=df_train).convert_dtypes()

In [47]:
# Rich-display the training frame for a visual check of the `words` / `tokens` columns.
df_train

Unnamed: 0,words,tokens
0,"[Đồng, thời, ,, bệnh, viện, tiếp, tục, thực, h...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"["", Số, bệnh, viện, có, thể, tiếp, nhận, bệnh,...","[O, O, O, O, O, O, O, O, O, O, O, B-SYMPTOM_AN..."
2,"[Ngoài, ra, ,, những, người, tiếp, xúc, gián, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[Bà, này, khi, trở, về, quá, cảnh, Doha, (, Qa...","[O, O, O, O, O, O, O, B-LOCATION, O, B-LOCATIO..."
4,"["", Bệnh, nhân, 523, "", và, chồng, là, "", bệnh...","[O, O, O, B-PATIENT_ID, O, O, O, O, O, O, O, B..."
...,...,...
5023,"[Liên, quan, đến, Bệnh, viện, Bạch, Mai, ,, ôn...","[O, O, O, B-LOCATION, I-LOCATION, I-LOCATION, ..."
5024,"[Mẫu, lần, hai, ngày, 22/7, kết, quả, sàng, lọ...","[O, O, O, O, B-DATE, O, O, O, O, O, O, O]"
5025,"[Đây, là, 5, trường, hợp, dương, tính, được, B...","[O, O, O, O, O, O, O, O, B-ORGANIZATION, I-ORG..."
5026,"[Lúc, 17h, ngày, 7, -, 3, ,, Viện, Vệ, sinh, D...","[O, O, O, B-DATE, I-DATE, I-DATE, O, B-ORGANIZ..."


In [48]:
# Inspect the tag sequence of the first sentence and the Python type it is stored as.
a = df_train.loc[0, "tokens"]
print(a, type(a), sep="\n")

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O']
<class 'list'>


In [49]:
# Per-sentence tag sequences (still label strings at this point in the notebook).
tokens = df_train["tokens"]

def get_token_type_count(tokens: pd.Series, classname):
    """Count the sequences in ``tokens`` that contain ``classname`` at least once.

    Every element of ``tokens`` is tested with ``classname in element``, so a
    sequence that holds the label several times still contributes exactly one
    to the total: the result is a per-sequence count, not a per-token count.

    Args:
        tokens: Series whose elements are containers of labels.
        classname: Label to look for.

    Returns:
        Number of elements of ``tokens`` for which the membership test is true.
    """
    has_label = tokens.apply(lambda sequence: classname in sequence)
    matching = has_label[has_label.eq(True)]
    return matching.count()


# For every label, report how many training sequences contain it
# (get_token_type_count counts sequences, not individual tokens).
for key in LABEL_2_ID:
    key_count = get_token_type_count(tokens=tokens, classname=key)
    print(key)
    print(f"Token type: {key} has {key_count} occurences")

B-PATIENT_ID
Token type: B-PATIENT_ID has 1960 occurences
I-PATIENT_ID
Token type: I-PATIENT_ID has 6 occurences
B-NAME
Token type: B-NAME has 288 occurences
I-NAME
Token type: I-NAME has 44 occurences
B-AGE
Token type: B-AGE has 611 occurences
I-AGE
Token type: I-AGE has 2 occurences
B-GENDER
Token type: B-GENDER has 503 occurences
I-GENDER
Token type: I-GENDER has 13 occurences
B-JOB
Token type: B-JOB has 196 occurences
I-JOB
Token type: I-JOB has 194 occurences
B-LOCATION
Token type: B-LOCATION has 2926 occurences
I-LOCATION
Token type: I-LOCATION has 2851 occurences
B-ORGANIZATION
Token type: B-ORGANIZATION has 983 occurences
I-ORGANIZATION
Token type: I-ORGANIZATION has 974 occurences
B-SYMPTOM_AND_DISEASE
Token type: B-SYMPTOM_AND_DISEASE has 618 occurences
I-SYMPTOM_AND_DISEASE
Token type: I-SYMPTOM_AND_DISEASE has 536 occurences
B-TRANSPORTATION
Token type: B-TRANSPORTATION has 213 occurences
I-TRANSPORTATION
Token type: I-TRANSPORTATION has 54 occurences
B-DATE
Token type: B-D

# Model 

In [50]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the pretrained "uitnlp/visobert" checkpoint together with its masked-LM head.
# NOTE(review): a later cell replaces this head's decoder with a label-sized linear
# layer to repurpose the model for tagging; a dedicated token-classification head
# may be the more conventional choice — confirm this design is intentional.
model = AutoModelForMaskedLM.from_pretrained("uitnlp/visobert")

In [51]:
# Show the module tree of the freshly loaded model.
model

XLMRobertaForMaskedLM(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(15004, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)

In [52]:
# Swap the vocabulary-sized output projection for one that emits one score per
# NER label. The input width is read from the layer being replaced rather than
# hard-coded, so the cell keeps working with checkpoints of a different hidden
# size and stays correct when re-run (the replacement has the same in_features).
model.lm_head.decoder = nn.Linear(
    in_features=model.lm_head.decoder.in_features,
    out_features=len(ID_2_LABEL),
    bias=True,
)

In [53]:
# Module.to() moves the parameters in place and returns the same module object,
# so no rebinding is needed before printing the updated architecture.
model.to(device)
print(model)

XLMRobertaForMaskedLM(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(15004, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)

In [54]:
# Freeze every parameter that belongs to `model.base_model`; parameters outside
# it are left untouched and therefore remain trainable.
for encoder_param in model.base_model.parameters():
    encoder_param.requires_grad_(False)

# Tokenizer

In [55]:
# Load the tokenizer published with the same "uitnlp/visobert" checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("uitnlp/visobert")

In [56]:
# Print the tokenizer configuration (vocabulary size, special tokens, padding side).
print(tokenizer)

XLMRobertaTokenizerFast(name_or_path='uitnlp/visobert', vocab_size=15002, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	15001: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}


# Train

In [57]:
def converter(tokens):
    """Translate a sequence of label strings into their integer ids.

    Each label is looked up in ``LABEL_2_ID``; a label that is not a key of
    that table raises ``KeyError``.

    Args:
        tokens: Iterable of label strings.

    Returns:
        A new list with the id of every label, in the same order.
    """
    return [LABEL_2_ID[label] for label in tokens]

In [58]:
# Replace the label strings in the `tokens` column with their integer ids.
# NOTE: this overwrites the column in place, so the cell is not re-runnable:
# a second run passes ints to `converter`, which are not keys of LABEL_2_ID (KeyError).
df_train["tokens"] = df_train["tokens"].apply(func=converter)
# Non-null row count, shown as the cell output.
df_train["tokens"].count()

5028

In [59]:
# Display the converted column to confirm the labels are now integer ids.
df_train["tokens"]

0       [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 2...
1       [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 1...
2       [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 2...
3       [20, 20, 20, 20, 20, 20, 20, 10, 20, 10, 20, 2...
4       [20, 20, 20, 0, 20, 20, 20, 20, 20, 20, 20, 0,...
                              ...                        
5023    [20, 20, 20, 10, 11, 11, 11, 20, 20, 20, 20, 2...
5024     [20, 20, 20, 20, 18, 20, 20, 20, 20, 20, 20, 20]
5025    [20, 20, 20, 20, 20, 20, 20, 20, 12, 13, 13, 2...
5026    [20, 20, 20, 18, 19, 19, 20, 12, 13, 13, 13, 1...
5027    [20, 18, 20, 20, 20, 20, 20, 20, 20, 20, 20, 1...
Name: tokens, Length: 5028, dtype: object

In [60]:
# Non-null row count of `words`; compare with the `tokens` count shown above.
df_train["words"].count()

5028

In [61]:
# The tokenizer reports no usable model_max_length, so `truncation=True` on its
# own is a no-op (it only emits a warning). Give it an explicit limit instead.
# 512 assumes the model's 514-row position table reserves two rows for the
# padding offset, as RoBERTa-style models do — TODO confirm against model.config.
MAX_SEQ_LEN = 512

# Inputs are already split into words; padding=True pads to the longest sequence
# in the batch, so shorter corpora are not padded all the way to MAX_SEQ_LEN.
train_tokens = tokenizer(
    df_train["words"].to_list(),
    is_split_into_words=True,
    truncation=True,
    max_length=MAX_SEQ_LEN,
    padding=True,
    return_tensors="pt",
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [62]:
# Sanity check on sentence 0: expand its word-level label ids to one entry per
# sub-token that maps back to a word (positions whose word id is None are skipped).
word_ids = train_tokens.word_ids(batch_index=0)
print(word_ids)
print(train_tokens["input_ids"][0])

first_sentence_labels = df_train["tokens"].loc[0]
reconstruct = [
    first_sentence_labels[word_idx]
    for word_idx in word_ids
    if word_idx is not None
]

print(reconstruct)
print(first_sentence_labels)

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, N

In [63]:
# Both tensors should share the same (num_sentences, padded_length) shape.
for field in ("input_ids", "attention_mask"):
    print(train_tokens[field].shape)

# Padded sequence length = size of the last dimension of input_ids.
max_len = train_tokens["input_ids"].size(-1)
print(max_len)

torch.Size([5028, 241])
torch.Size([5028, 241])
241


In [64]:
def align_tokens(tokens, label_all_tokens=True, word_labels=None):
    """Expand word-level label ids to one label per sub-token position.

    Args:
        tokens: Tokenizer output exposing ``word_ids(batch_index=i)`` and item
            assignment. It is modified in place: a ``"labels"`` entry is added.
        label_all_tokens: When True, every sub-token of a word receives the
            word's label; when False, only the first sub-token does and the
            remaining ones receive -100.
        word_labels: One sequence of label ids per sentence, in the same order
            as the sequences in ``tokens``. Defaults to the notebook-level
            ``df_train["tokens"]`` so existing calls keep working.

    Returns:
        The same ``tokens`` object, now carrying ``tokens["labels"]`` as a list
        of per-position label lists. Positions with no source word (word id
        ``None``) are labelled -100.
    """
    if word_labels is None:
        # Backward-compatible default: the label column of the training frame.
        word_labels = df_train["tokens"]

    labels = []
    for i, label in enumerate(word_labels):
        word_ids = tokens.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                # No source word for this position.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a word always carries the word's label.
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-token: word label or -100, per label_all_tokens.
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokens["labels"] = labels
    return tokens

In [65]:
# Attach per-sub-token labels to the tokenizer output (adds the "labels" entry).
train_tokens = align_tokens(train_tokens)

In [66]:
# Pull out the aligned labels produced by align_tokens.
labels = train_tokens["labels"]

# Number of label sequences; compare with the sentence count shown earlier.
print(len(labels))

5028


In [67]:
class VisoDataset(Dataset):
    """Map-style dataset pairing tokenizer output with per-position label ids.

    Args:
        tokens: Mapping-like object exposing ``"input_ids"`` and
            ``"attention_mask"``, each indexable by sample position
            (for example the tokenizer's batch output).
        labels: One sequence of integer label ids per sample, in the same
            order as the rows of ``tokens["input_ids"]``.

    Raises:
        ValueError: If the number of label rows or attention-mask rows differs
            from the number of input rows. Failing here is cheaper to debug
            than an IndexError raised later from inside a DataLoader worker.
    """

    def __init__(self, tokens, labels: list):
        self.labels = labels
        self.input_ids = tokens["input_ids"]
        self.attention_mask = tokens["attention_mask"]

        self.length = len(self.input_ids)

        if len(self.labels) != self.length or len(self.attention_mask) != self.length:
            raise ValueError(
                "input_ids, attention_mask and labels must have the same number of rows, "
                f"got {self.length}, {len(self.attention_mask)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return self.length

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of ``labels``, ``input_ids`` and ``attention_mask``."""
        return {
            # Labels are stored as Python lists; convert on access to int64 tensors.
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
        }

In [68]:
# Build the training dataset from the tokenizer output and its aligned labels.
train_dataset = VisoDataset(tokens=train_tokens, labels=train_tokens["labels"])

In [69]:
# The label tensors of the first two samples should have the same (padded) length.
for sample_idx in (0, 1):
    print(train_dataset[sample_idx]["labels"].shape)

torch.Size([241])
torch.Size([241])


In [70]:
# Shuffled mini-batches of 128 sentences.
# Worker count is capped at the CPUs the host reports (up to the original 16),
# and pinned memory is only requested when a CUDA device exists to copy to.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=min(16, os.cpu_count() or 1),
    pin_memory=torch.cuda.is_available(),
)

In [71]:
# Print the ids and labels of the first batch only, then stop iterating.
for input_dict in train_loader:
    for field in ("input_ids", "labels"):
        print(f"{field}: {input_dict[field]}")

    break

input_ids: tensor([[   0, 2589,  136,  ...,    1,    1,    1],
        [   0,  483,   77,  ...,    1,    1,    1],
        [   0, 1654,   29,  ...,    1,    1,    1],
        ...,
        [   0, 1272, 1213,  ...,    1,    1,    1],
        [   0, 2367,  136,  ...,    1,    1,    1],
        [   0, 1201, 2186,  ...,    1,    1,    1]])
labels: tensor([[-100,   20,   20,  ..., -100, -100, -100],
        [-100,   20,   20,  ..., -100, -100, -100],
        [-100,   20,   20,  ..., -100, -100, -100],
        ...,
        [-100,   20,   20,  ..., -100, -100, -100],
        [-100,   20,   20,  ..., -100, -100, -100],
        [-100,   20,   10,  ..., -100, -100, -100]])


In [72]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. `eps` and
# `weight_decay` are set explicitly to the defaults of the transformers class,
# because the PyTorch defaults differ. The update is numerically close to the
# old optimizer but not bit-identical.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Positions labelled -100 (special tokens and padding) are excluded from the loss.
criterion = nn.CrossEntropyLoss(reduction="mean", ignore_index=-100)

# Per-epoch history: output_metrics[split][metric] is a list with one value per epoch.
METRIC_NAMES = ("loss", "accuracy", "precision", "recall", "f1")
output_metrics = {
    split: {metric_name: [] for metric_name in METRIC_NAMES}
    for split in ("train", "eval")
}



In [73]:
import torcheval.metrics


epochs = 15

# Metric name -> torcheval functional implementation. The names match the keys
# of each split in `output_metrics`.
# NOTE(review): with average="micro" on single-label predictions these four
# values are expected to coincide; consider macro averaging if per-class
# behaviour matters — confirm before changing.
_METRIC_FNS = {
    "accuracy": torcheval.metrics.functional.multiclass_accuracy,
    "precision": torcheval.metrics.functional.multiclass_precision,
    "recall": torcheval.metrics.functional.multiclass_recall,
    "f1": torcheval.metrics.functional.multiclass_f1_score,
}


def _run_epoch(loader, training, desc):
    """Run one pass over ``loader`` and return the mean per-batch loss and metrics.

    Args:
        loader: Iterable of batches with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        training: When True the model is put in train mode and the optimizer
            is stepped on every batch; when False the model is put in eval
            mode and gradients are disabled.
        desc: Progress-bar description.

    Returns:
        Dict with keys ``loss``, ``accuracy``, ``precision``, ``recall`` and
        ``f1``; each value is the sum over batches divided by ``len(loader)``.
    """
    model.train(mode=training)
    totals = {name: 0.0 for name in ("loss", *_METRIC_FNS)}
    total_steps = len(loader)

    with torch.set_grad_enabled(training):
        for input_dict in tqdm(loader, desc=desc, total=total_steps):
            input_ids = input_dict["input_ids"].to(device)
            attention_mask = input_dict["attention_mask"].to(device)
            # Flatten to one label per (sentence, position) pair.
            labels = input_dict["labels"].to(device).view(-1)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)["logits"]
            # Class count is taken from the logits instead of a hard-coded 21.
            logits = logits.view(-1, logits.size(-1))

            loss = criterion(logits, labels)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            totals["loss"] += loss.item()

            with torch.no_grad():
                # Score only the positions the loss also scores. Leaving the
                # ignored (-100) positions in would count every special/padding
                # position as a wrong prediction and deflate all metrics.
                scored = labels != criterion.ignore_index
                scored_logits = logits[scored]
                scored_labels = labels[scored]
                for name, metric_fn in _METRIC_FNS.items():
                    totals[name] += metric_fn(
                        input=scored_logits, target=scored_labels, average="micro"
                    ).item()

    return {name: total / total_steps for name, total in totals.items()}


# NOTE(review): no held-out split is loaded in this notebook, so the "eval" pass
# re-scores the training data in eval mode. Point this at a validation loader
# once one exists.
eval_loader = train_loader

for i in tqdm(range(epochs), desc="Epochs", total=epochs):
    epoch_results = {
        "train": _run_epoch(train_loader, training=True, desc=f"Training batches epoch {i}"),
        "eval": _run_epoch(eval_loader, training=False, desc=f"Testing batches epoch {i}"),
    }

    for split, results in epoch_results.items():
        for name, value in results.items():
            output_metrics[split][name].append(value)

Epochs: 100%|██████████| 3/3 [07:36<00:00, 152.17s/it]


In [None]:
# Show the per-epoch metric history collected by the training cell.
output_metrics