### Bài 3

In [1]:
!pip install datasets seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=728fc5f0d3847ea540b804dc34da969a5f5e780e2b27da26d2be3b8eaf240ff9
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import numpy as np
from seqeval.metrics import f1_score as ner_f1_score
from seqeval.metrics import classification_report as ner_report
from datasets import disable_progress_bar
from datasets import utils

# Keep the cell output clean: no `datasets` progress bars, only error-level logs.
utils.logging.set_verbosity_error()
disable_progress_bar()

# Word-level PhoNER_COVID19 splits, fetched as JSON straight from GitHub.
data_files = dict(
    train="https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/word/train_word.json",
    validation="https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/word/dev_word.json",
    test="https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/word/test_word.json",
)

dataset = load_dataset("json", data_files=data_files)

# Token sequences for each split (the "words" column).
train_sentences, dev_sentences, test_sentences = (
    dataset[split]["words"] for split in ("train", "validation", "test")
)

# Matching string tag sequences for each split (the "tags" column).
train_tags_raw, dev_tags_raw, test_tags_raw = (
    dataset[split]["tags"] for split in ("train", "validation", "test")
)

# The tag inventory is taken from the training split only; sorting gives every
# tag a stable integer id.
unique_tags = {tag for sentence_tags in train_tags_raw for tag in sentence_tags}
tag_names = sorted(unique_tags)
num_tags = len(tag_names)

tag2id = dict(zip(tag_names, range(num_tags)))

def convert_tags_to_ids(tags_list_raw, tag_map=None):
    """Map every string tag in a list of tag sequences to its integer id.

    Args:
        tags_list_raw: iterable of sequences of tag strings, one sequence per sentence.
        tag_map: optional dict from tag string to integer id. When omitted, the
            notebook-level ``tag2id`` mapping is used, as before.

    Returns:
        A list of lists of integer ids with the same nesting as the input.

    Raises:
        KeyError: if a tag is not present in the mapping.
    """
    # Resolve the mapping at call time so the default still follows the global.
    mapping = tag2id if tag_map is None else tag_map
    return [[mapping[tag] for tag in tags] for tags in tags_list_raw]

# Encode the string tags of all three splits as integer ids.
train_tags, dev_tags, test_tags = map(
    convert_tags_to_ids,
    (train_tags_raw, dev_tags_raw, test_tags_raw),
)

# Report the size and contents of the tag inventory.
print(f"Số lượng nhãn: {num_tags}")
print(f"Danh sách nhãn: {tag_names}")

Số lượng nhãn: 20
Danh sách nhãn: ['B-AGE', 'B-DATE', 'B-GENDER', 'B-JOB', 'B-LOCATION', 'B-NAME', 'B-ORGANIZATION', 'B-PATIENT_ID', 'B-SYMPTOM_AND_DISEASE', 'B-TRANSPORTATION', 'I-AGE', 'I-DATE', 'I-JOB', 'I-LOCATION', 'I-NAME', 'I-ORGANIZATION', 'I-PATIENT_ID', 'I-SYMPTOM_AND_DISEASE', 'I-TRANSPORTATION', 'O']


In [11]:
# Collect every distinct word that occurs in the training sentences.
word_set = {word for sentence in train_sentences for word in sentence}

# Assign ids from a *sorted* word list. Iterating the raw set would give a
# different word -> id mapping on each kernel restart (string hashing is
# randomised per process), which makes runs and saved weights irreproducible.
# Ids 0 and 1 are reserved for the special tokens below.
vocab = {word: idx for idx, word in enumerate(sorted(word_set), start=2)}
vocab["<PAD>"] = 0  # padding token; matches padding_idx=0 in the embedding layer
vocab["<UNK>"] = 1  # fallback id for words not seen in training
vocab_size = len(vocab)
print(f"Kích thước từ điển: {vocab_size}")

class NerDataset(Dataset):
    """Fixed-length view over parallel lists of sentences and tag-id sequences.

    Each item is a pair of tensors of length ``max_len``: the word ids and the
    tag ids. Short sentences are right-padded (``<PAD>`` for words, ``-100``
    for tags); long sentences are cut to ``max_len``.
    """

    def __init__(self, sentences, tags, vocab, max_len=100):
        """Store the data; nothing is encoded until an item is requested.

        Args:
            sentences: sequence of word lists.
            tags: sequence of tag-id lists, parallel to ``sentences``.
            vocab: dict from word to id; must contain ``"<PAD>"`` and ``"<UNK>"``.
            max_len: length every returned sequence is padded or cut to.
        """
        self.sentences = sentences
        self.tags = tags
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(token_ids, tag_ids)`` tensors for sentence ``idx``."""
        words = self.sentences[idx]
        labels = self.tags[idx]

        # Words missing from the vocabulary fall back to the <UNK> id.
        ids = [self.vocab.get(word, self.vocab["<UNK>"]) for word in words]

        n_missing = self.max_len - len(ids)
        if n_missing <= 0:
            # Sentence is at or over the limit: keep only the first max_len positions.
            ids = ids[:self.max_len]
            labels = labels[:self.max_len]
        else:
            # Right-pad; -100 marks tag positions that do not belong to a real token.
            ids = ids + [self.vocab["<PAD>"]] * n_missing
            labels = labels + [-100] * n_missing

        return torch.tensor(ids), torch.tensor(labels)

MAX_LEN = 100     # every sentence is padded or cut to this many tokens
BATCH_SIZE = 64

# One fixed-length dataset per split, all sharing the training vocabulary.
train_ds, dev_ds, test_ds = (
    NerDataset(split_sentences, split_tags, vocab, MAX_LEN)
    for split_sentences, split_tags in (
        (train_sentences, train_tags),
        (dev_sentences, dev_tags),
        (test_sentences, test_tags),
    )
)

# Only the training loader shuffles; dev and test keep the dataset order.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
dev_loader, test_loader = (
    DataLoader(split_ds, batch_size=BATCH_SIZE) for split_ds in (dev_ds, test_ds)
)

Kích thước từ điển: 5243


In [12]:
# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fix the PyTorch random seed so that weight initialisation, dropout masks and
# DataLoader shuffling are repeatable after a kernel restart. Some GPU kernels
# can still be non-deterministic, so treat this as best-effort on CUDA.
SEED = 42
torch.manual_seed(SEED)

# Model and optimisation hyperparameters.
EMBEDDING_DIM = 128   # size of each word embedding vector
HIDDEN_SIZE = 256     # LSTM hidden size per direction
N_LAYERS = 5          # number of stacked LSTM layers
LR = 0.001            # Adam learning rate
EPOCHS = 20           # number of passes over the training set

class BiLSTM_NER(nn.Module):
    """Per-token tagger: embedding -> stacked bidirectional LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_size: LSTM hidden size per direction.
        n_layers: number of stacked LSTM layers.
        n_tags: number of output classes per token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers, n_tags):
        super().__init__()
        # Index 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=n_layers,
            dropout=0.2,
            bidirectional=True,
            batch_first=True,
        )
        # Both directions are concatenated, hence 2 * hidden_size input features.
        self.classifier = nn.Linear(2 * hidden_size, n_tags)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Return logits for a batch of token-id sequences `x`, one score per tag per position."""
        hidden_states, _ = self.lstm(self.embedding(x))
        return self.classifier(self.dropout(hidden_states))

# Build the tagger from the hyperparameters above and place it on the chosen device.
model = BiLSTM_NER(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    hidden_size=HIDDEN_SIZE,
    n_layers=N_LAYERS,
    n_tags=num_tags,
).to(device)

# Targets equal to -100 (the tag padding value) are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = optim.Adam(model.parameters(), lr=LR)

def train_and_evaluate_ner(model, train_loader, dev_loader, epochs):
    """Train `model` and print the dev-set seqeval F1 after every epoch.

    Uses the notebook-level `optimizer`, `criterion`, `device`, `num_tags` and
    `tag_names`; they must be defined before this function is called.

    Args:
        model: network mapping a batch of token ids to per-token tag logits.
        train_loader: iterable of (tokens, tags) batches used for optimisation.
        dev_loader: iterable of (tokens, tags) batches used for the F1 report.
        epochs: number of full passes over `train_loader`.

    Returns:
        None. Progress is reported through `print`.
    """
    for epoch in range(epochs):
        # ---- optimisation pass -------------------------------------------
        model.train()
        running_loss = 0

        for batch_tokens, batch_tags in train_loader:
            batch_tokens = batch_tokens.to(device)
            batch_tags = batch_tags.to(device)

            optimizer.zero_grad()
            logits = model(batch_tokens)

            # Flatten batch and time dimensions so each token is one loss sample.
            loss = criterion(logits.view(-1, num_tags), batch_tags.view(-1))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)

        # ---- dev-set pass ------------------------------------------------
        model.eval()
        true_labels, pred_labels = [], []

        with torch.no_grad():
            for batch_tokens, batch_tags in dev_loader:
                batch_tokens = batch_tokens.to(device)
                batch_tags = batch_tags.to(device)

                pred_ids = model(batch_tokens).argmax(dim=2).cpu().numpy()
                gold_ids = batch_tags.cpu().numpy()

                for gold_row, pred_row in zip(gold_ids, pred_ids):
                    # Drop positions whose gold tag is the -100 padding marker.
                    keep = gold_row != -100
                    true_labels.append([tag_names[t] for t in gold_row[keep]])
                    pred_labels.append([tag_names[p] for p in pred_row[keep]])

        dev_f1 = ner_f1_score(true_labels, pred_labels)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {avg_loss:.4f}")
        print(f"F1: {dev_f1:.12f}")
        print("-" * 30)

# Train for EPOCHS epochs, printing the training loss and dev-set F1 after each one.
train_and_evaluate_ner(model, train_loader, dev_loader, epochs=EPOCHS)

Epoch 1/20
Train Loss: 1.0896
F1: 0.000000000000
------------------------------
Epoch 2/20
Train Loss: 0.5650
F1: 0.539579762585
------------------------------
Epoch 3/20
Train Loss: 0.2106
F1: 0.760666893501
------------------------------
Epoch 4/20
Train Loss: 0.1228
F1: 0.826672076824
------------------------------
Epoch 5/20
Train Loss: 0.0832
F1: 0.848814025438
------------------------------
Epoch 6/20
Train Loss: 0.0614
F1: 0.855394656333
------------------------------
Epoch 7/20
Train Loss: 0.0510
F1: 0.860454147295
------------------------------
Epoch 8/20
Train Loss: 0.0380
F1: 0.861620540180
------------------------------
Epoch 9/20
Train Loss: 0.0318
F1: 0.876703134439
------------------------------
Epoch 10/20
Train Loss: 0.0235
F1: 0.869196608801
------------------------------
Epoch 11/20
Train Loss: 0.0202
F1: 0.875763747454
------------------------------
Epoch 12/20
Train Loss: 0.0168
F1: 0.872163439143
------------------------------
Epoch 13/20
Train Loss: 0.0167
F1: 0.

In [13]:
def evaluate_ner(model, test_loader, tag_names):
    """Print the seqeval F1 score and classification report of `model` on `test_loader`.

    Uses the notebook-level `device` to place each batch.

    Args:
        model: network mapping a batch of token ids to per-token tag logits.
        test_loader: iterable of (tokens, tags) batches; tag value -100 marks padding.
        tag_names: list mapping a tag id to its string label.

    Returns:
        None. Results are reported through `print`.
    """
    model.eval()
    print("Đánh giá trên tập test:")

    true_labels, pred_labels = [], []
    with torch.no_grad():
        for batch_tokens, batch_tags in test_loader:
            batch_tokens = batch_tokens.to(device)
            batch_tags = batch_tags.to(device)

            pred_ids = model(batch_tokens).argmax(dim=2).cpu().numpy()
            gold_ids = batch_tags.cpu().numpy()

            for gold_row, pred_row in zip(gold_ids, pred_ids):
                # Keep only positions that carry a real gold tag (not the -100 padding marker).
                keep = gold_row != -100
                true_labels.append([tag_names[t] for t in gold_row[keep]])
                pred_labels.append([tag_names[p] for p in pred_row[keep]])

    f1 = ner_f1_score(true_labels, pred_labels)
    print(f"F1-Score: {f1:.4f}")
    print(ner_report(true_labels, pred_labels, digits=4))

# Score the model currently in memory on the test split and print the per-entity report.
evaluate_ner(model, test_loader, tag_names)

Đánh giá trên tập test:
F1-Score: 0.8792
                     precision    recall  f1-score   support

                AGE     0.9354    0.9322    0.9338       575
               DATE     0.9509    0.9624    0.9566      1650
             GENDER     0.9251    0.9231    0.9241       455
                JOB     0.4962    0.3757    0.4276       173
           LOCATION     0.8879    0.8713    0.8795      4435
               NAME     0.8629    0.5346    0.6602       318
       ORGANIZATION     0.7938    0.7938    0.7938       771
         PATIENT_ID     0.9545    0.9286    0.9414      1988
SYMPTOM_AND_DISEASE     0.8250    0.7386    0.7794      1136
     TRANSPORTATION     0.9118    0.8031    0.8540       193

          micro avg     0.8958    0.8633    0.8792     11694
          macro avg     0.8543    0.7863    0.8150     11694
       weighted avg     0.8935    0.8633    0.8768     11694

