# lab6_rnn_for_ner
- Task 1: Tải và Tiền xử lý Dữ liệu
- Task 2: Tạo PyTorch Dataset và DataLoader
- Task 3: Xây dựng Mô hình RNN
- Task 4: Huấn luyện Mô hình
- Task 5: Đánh giá Mô hình

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from datasets import load_dataset
from collections import Counter
import numpy as np
import random

# Fix every random number generator up front so that runs are reproducible.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Pick the compute device; the CUDA generators are seeded only when a GPU exists.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


**Task 1: Tải và Tiền xử lý Dữ liệu**

In [2]:
# 1. Load the data from Hugging Face (dataset id "conll2003").
# NOTE(review): the recorded output of this cell warns that passing
# `trust_remote_code=True` will become mandatory for this dataset in the next
# major `datasets` release — confirm against the installed version before re-running.
dataset = load_dataset("conll2003")
print(dataset)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [3]:
# 2. Extract sentences and labels

# Read the list of label names from the `ner_tags` feature of the train split;
# the position of a name in this list is the integer id stored in the dataset.
label_names = dataset["train"].features["ner_tags"].feature.names
print(f"Label names: {label_names}")

def extract_data(split_name):
    """Return the token lists and string NER tags of one dataset split.

    Reads the notebook-level ``dataset`` (indexed by split name, then by the
    ``"tokens"`` / ``"ner_tags"`` columns) and ``label_names`` (indexed by
    integer tag id).

    Args:
        split_name: key of the split inside ``dataset``, e.g. ``"train"``.

    Returns:
        A ``(sentences, tags)`` pair: the ``"tokens"`` column as stored, and a
        list holding, for every sentence, the label name of each tag id.
    """
    split = dataset[split_name]
    sentences = split["tokens"]

    # Translate every integer tag id into its label name, sentence by sentence.
    string_tags = []
    for tag_ids in split["ner_tags"]:
        string_tags.append([label_names[tag_id] for tag_id in tag_ids])

    return sentences, string_tags

# Materialise the three splits as (token lists, string-tag lists).
train_sentences, train_tags = extract_data("train")
val_sentences, val_tags = extract_data("validation")
test_sentences, test_tags = extract_data("test")

# Report the split sizes and show the first training example as a sanity check.
print(f"Train size: {len(train_sentences)}")
print(f"Validation size: {len(val_sentences)}")
print(f"Test size: {len(test_sentences)}")
print(f"Example sentence: {train_sentences[0]}")
print(f"Example tags: {train_tags[0]}")

Label names: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Train size: 14041
Validation size: 3250
Test size: 3453
Example sentence: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
Example tags: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
Train size: 14041
Validation size: 3250
Test size: 3453
Example sentence: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
Example tags: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [4]:
# 3. Build the vocabularies
def build_vocab(sentences, tags_list, min_freq=1):
    """Build the word and tag lookup tables from tokenised, tagged sentences.

    Args:
        sentences: iterable of token sequences.
        tags_list: iterable of tag sequences.
        min_freq: minimum number of occurrences a word needs in ``sentences``
            to receive its own index; rarer words are left out of the table.

    Returns:
        A ``(word_to_ix, tag_to_ix)`` pair of dicts.
        ``word_to_ix`` reserves 0 for ``"<PAD>"`` and 1 for ``"<UNK>"``; the
        remaining words are numbered in order of first appearance.
        ``tag_to_ix`` reserves 0 for ``"<PAD>"`` (used to pad label sequences
        and meant to be ignored by the loss); the remaining tags are numbered
        in sorted order.
    """
    # Count how often each word occurs.
    word_counter = Counter()
    for sent in sentences:
        word_counter.update(sent)

    # Collect the set of distinct tags.
    tag_set = set()
    for tags in tags_list:
        tag_set.update(tags)

    # Word vocabulary.
    word_to_ix = {"<PAD>": 0, "<UNK>": 1}
    for word, count in word_counter.items():
        # A corpus token that happens to be spelled like a special token must
        # not overwrite its reserved index: the dict would not grow, so the
        # next word would be handed the very same index.
        if count >= min_freq and word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

    # Tag vocabulary (same guard for a tag literally named "<PAD>").
    tag_to_ix = {"<PAD>": 0}
    for tag in sorted(tag_set):
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

    return word_to_ix, tag_to_ix

# Build both vocabularies from the training split only; words seen only in the
# validation/test splits are mapped to <UNK> at lookup time.
word_to_ix, tag_to_ix = build_vocab(train_sentences, train_tags)
print(f"Word vocab size: {len(word_to_ix)}")
print(f"Tag vocab size: {len(tag_to_ix)}")
print(f"Tag mapping: {tag_to_ix}")

Word vocab size: 23625
Tag vocab size: 10
Tag mapping: {'<PAD>': 0, 'B-LOC': 1, 'B-MISC': 2, 'B-ORG': 3, 'B-PER': 4, 'I-LOC': 5, 'I-MISC': 6, 'I-ORG': 7, 'I-PER': 8, 'O': 9}


**Task 2: Tạo PyTorch Dataset và DataLoader**

In [5]:
class NERDataset(Dataset):
    """Map-style dataset that serves one tagged sentence as index tensors.

    Args:
        sentences: sequence of token sequences.
        tags: sequence of tag sequences, aligned with ``sentences``.
        word_to_ix: mapping from word to index; must contain ``"<UNK>"``.
        tag_to_ix: mapping from tag to index.
    """

    def __init__(self, sentences, tags, word_to_ix, tag_to_ix):
        self.sentences, self.tags = sentences, tags
        self.word_to_ix, self.tag_to_ix = word_to_ix, tag_to_ix

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` as two 1-D ``torch.long`` tensors."""
        vocab = self.word_to_ix
        # Out-of-vocabulary words fall back to the <UNK> index.
        word_ids = [vocab.get(token, vocab["<UNK>"]) for token in self.sentences[idx]]
        # Tags are looked up directly, without a fallback.
        tag_ids = [self.tag_to_ix[tag] for tag in self.tags[idx]]

        as_long = dict(dtype=torch.long)
        return torch.tensor(word_ids, **as_long), torch.tensor(tag_ids, **as_long)

def collate_fn(batch, word_pad_idx=None, tag_pad_idx=None):
    """Pad a list of ``(word_ids, tag_ids)`` samples into batch tensors.

    Args:
        batch: list of ``(word_ids, tag_ids)`` pairs of 1-D tensors, as
            produced by ``NERDataset.__getitem__``.
        word_pad_idx: value used to pad the word-index tensors. When ``None``
            (the default, and what ``DataLoader`` uses) it is read from the
            notebook-level ``word_to_ix["<PAD>"]``.
        tag_pad_idx: value used to pad the tag-index tensors. When ``None`` it
            is read from the notebook-level ``tag_to_ix["<PAD>"]``. The loss
            must ignore this index.

    Returns:
        ``(padded_sentences, padded_tags, lengths)`` where the first two are
        ``[batch_size, max_len]`` tensors and ``lengths`` holds the unpadded
        length of every sentence (needed by ``pack_padded_sequence``).
    """
    # Fall back to the notebook vocabularies only when no explicit padding
    # index is given, so the function can also be used on its own.
    if word_pad_idx is None:
        word_pad_idx = word_to_ix["<PAD>"]
    if tag_pad_idx is None:
        tag_pad_idx = tag_to_ix["<PAD>"]

    # batch is a list of (sentence_indices, tag_indices) tuples.
    sentences, tags = zip(*batch)

    # True length of every sentence, before padding.
    lengths = torch.tensor([len(s) for s in sentences], dtype=torch.long)

    padded_sentences = pad_sequence(sentences, batch_first=True, padding_value=word_pad_idx)
    padded_tags = pad_sequence(tags, batch_first=True, padding_value=tag_pad_idx)

    return padded_sentences, padded_tags, lengths

In [6]:
# Number of sentences per mini-batch, shared by all three loaders.
BATCH_SIZE = 128

# Wrap every split in a NERDataset; all of them share the same vocabularies.
train_dataset = NERDataset(train_sentences, train_tags, word_to_ix, tag_to_ix)
val_dataset = NERDataset(val_sentences, val_tags, word_to_ix, tag_to_ix)
test_dataset = NERDataset(test_sentences, test_tags, word_to_ix, tag_to_ix)

# Only the training loader shuffles; collate_fn pads each batch and returns the true lengths.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(f"Number of batches in train_loader: {len(train_loader)}")

Number of batches in train_loader: 110


**Task 3: Xây dựng Mô hình RNN**

In [7]:
class SimpleRNNForNER(nn.Module):
    """Token tagger: embedding -> bidirectional LSTM -> per-token linear layer.

    Despite the class name, the recurrent layer is an ``nn.LSTM`` (chosen over
    a plain RNN for better results).

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word embedding.
        hidden_dim: LSTM hidden size per direction.
        output_size: number of tag classes produced for every token.
        pad_idx: word index used for padding (passed as ``padding_idx``).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, pad_idx):
        super(SimpleRNNForNER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        # Both directions are concatenated, hence hidden_dim * 2 input features.
        self.fc = nn.Linear(hidden_dim * 2, output_size)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x, lengths):
        """Compute per-token tag logits.

        Args:
            x: ``[batch_size, seq_len]`` tensor of word indices.
            lengths: 1-D tensor with the unpadded length of every sequence;
                every length must be at least 1.

        Returns:
            ``[batch_size, seq_len, output_size]`` tensor of logits, with the
            same ``seq_len`` as ``x``.
        """
        embedded = self.dropout(self.embedding(x))

        # pack_padded_sequence requires the lengths to live on the CPU.
        packed_embedded = pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_output, _ = self.rnn(packed_embedded)

        # Without total_length the output is only max(lengths) steps long, so
        # it would no longer line up with the tag tensor whenever x is padded
        # wider than its longest sequence.
        output, _ = pad_packed_sequence(
            packed_output, batch_first=True, total_length=x.size(1)
        )

        # output: [batch_size, seq_len, hidden_dim * 2]
        return self.fc(self.dropout(output))

**Task 4: Huấn luyện Mô hình**

In [8]:
# Instantiate the model
VOCAB_SIZE = len(word_to_ix)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
# tag_to_ix also contains the <PAD> tag, so the output layer has one logit for it too.
OUTPUT_SIZE = len(tag_to_ix)
PAD_IDX = word_to_ix["<PAD>"]

model = SimpleRNNForNER(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_SIZE, PAD_IDX)
model = model.to(device)

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# ignore_index is the index of the <PAD> tag, so padded positions contribute no loss
TAG_PAD_IDX = tag_to_ix["<PAD>"]
criterion = nn.CrossEntropyLoss(ignore_index=TAG_PAD_IDX)

print(model)

SimpleRNNForNER(
  (embedding): Embedding(23625, 100, padding_idx=0)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=10, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [9]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch.

    Args:
        model: module called as ``model(text, lengths)`` and returning
            ``[batch_size, seq_len, num_tags]`` logits.
        iterator: iterable of ``(text, tags, lengths)`` batches.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss taking ``(logits[N, num_tags], targets[N])``. Its
            ``ignore_index`` attribute (``-100`` when absent) marks the
            padding positions that are excluded from the accuracy.

    Returns:
        ``(mean_loss, accuracy)``: the average of the per-batch losses and the
        fraction of non-padding tokens tagged correctly. Both are ``0.0``
        when there is nothing to average over.
    """
    model.train()

    # Take the device and the padding tag from the objects passed in rather
    # than from notebook globals, so the accuracy mask always agrees with the
    # positions the loss ignores.
    model_device = next(model.parameters()).device
    pad_tag = getattr(criterion, "ignore_index", -100)

    epoch_loss = 0.0
    num_batches = 0
    correct_preds = 0
    total_preds = 0

    for text, tags, lengths in iterator:
        text = text.to(model_device)
        tags = tags.to(model_device)

        optimizer.zero_grad()

        # [batch_size, seq_len, num_tags] -> [batch_size * seq_len, num_tags]
        predictions = model(text, lengths)
        predictions = predictions.reshape(-1, predictions.shape[-1])
        tags = tags.reshape(-1)

        loss = criterion(predictions, tags)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

        # Accuracy over real (non-padding) tokens only.
        predicted_tags = predictions.argmax(dim=-1)
        mask = tags != pad_tag
        correct_preds += ((predicted_tags == tags) & mask).sum().item()
        total_preds += mask.sum().item()

    # Guard the averages: an empty iterator or an all-padding epoch must not
    # raise ZeroDivisionError.
    mean_loss = epoch_loss / num_batches if num_batches else 0.0
    accuracy = correct_preds / total_preds if total_preds else 0.0
    return mean_loss, accuracy

# Number of full passes over the training set.
EPOCHS = 10

# Train for a fixed number of epochs; only the training loss and accuracy are
# reported here — no validation is run inside this loop.
for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Accuracy: {train_acc*100:.2f}%')

Epoch: 01 | Train Loss: 0.819 | Train Accuracy: 82.22%
Epoch: 02 | Train Loss: 0.521 | Train Accuracy: 86.07%
Epoch: 02 | Train Loss: 0.521 | Train Accuracy: 86.07%
Epoch: 03 | Train Loss: 0.399 | Train Accuracy: 88.46%
Epoch: 03 | Train Loss: 0.399 | Train Accuracy: 88.46%
Epoch: 04 | Train Loss: 0.316 | Train Accuracy: 90.67%
Epoch: 04 | Train Loss: 0.316 | Train Accuracy: 90.67%
Epoch: 05 | Train Loss: 0.260 | Train Accuracy: 92.21%
Epoch: 05 | Train Loss: 0.260 | Train Accuracy: 92.21%
Epoch: 06 | Train Loss: 0.219 | Train Accuracy: 93.36%
Epoch: 06 | Train Loss: 0.219 | Train Accuracy: 93.36%
Epoch: 07 | Train Loss: 0.186 | Train Accuracy: 94.39%
Epoch: 07 | Train Loss: 0.186 | Train Accuracy: 94.39%
Epoch: 08 | Train Loss: 0.161 | Train Accuracy: 95.04%
Epoch: 08 | Train Loss: 0.161 | Train Accuracy: 95.04%
Epoch: 09 | Train Loss: 0.140 | Train Accuracy: 95.68%
Epoch: 09 | Train Loss: 0.140 | Train Accuracy: 95.68%
Epoch: 10 | Train Loss: 0.121 | Train Accuracy: 96.21%
Epoch: 10 

**Task 5: Đánh giá Mô hình**

In [10]:
def evaluate(model, iterator, criterion):
    """Evaluate the model on a set of batches without updating it.

    Args:
        model: module called as ``model(text, lengths)`` and returning
            ``[batch_size, seq_len, num_tags]`` logits.
        iterator: iterable of ``(text, tags, lengths)`` batches.
        criterion: loss taking ``(logits[N, num_tags], targets[N])``. Its
            ``ignore_index`` attribute (``-100`` when absent) marks the
            padding positions that are excluded from the accuracy.

    Returns:
        ``(mean_loss, accuracy)``: the average of the per-batch losses and the
        fraction of non-padding tokens tagged correctly. Both are ``0.0``
        when there is nothing to average over.
    """
    model.eval()

    # Take the device and the padding tag from the objects passed in rather
    # than from notebook globals; a parameter-free model is treated as CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    pad_tag = getattr(criterion, "ignore_index", -100)

    epoch_loss = 0.0
    num_batches = 0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for text, tags, lengths in iterator:
            text = text.to(model_device)
            tags = tags.to(model_device)

            predictions = model(text, lengths)

            # Loss over the flattened [batch_size * seq_len] positions.
            predictions_flat = predictions.reshape(-1, predictions.shape[-1])
            tags_flat = tags.reshape(-1)
            epoch_loss += criterion(predictions_flat, tags_flat).item()
            num_batches += 1

            # Accuracy: highest-scoring tag per token, padding masked out.
            predicted_tags = predictions.argmax(dim=-1)
            mask = tags != pad_tag
            correct_preds += ((predicted_tags == tags) & mask).sum().item()
            total_preds += mask.sum().item()

    # Guard the averages: an empty iterator or all-padding input must not
    # raise ZeroDivisionError.
    mean_loss = epoch_loss / num_batches if num_batches else 0.0
    accuracy = correct_preds / total_preds if total_preds else 0.0
    return mean_loss, accuracy

In [11]:

# Score the trained model on the validation split.
val_loss, val_acc = evaluate(model, val_loader, criterion)
print(f'Validation Loss: {val_loss:.3f} | Validation Accuracy: {val_acc*100:.2f}%')

Validation Loss: 0.242 | Validation Accuracy: 94.45%


In [12]:
# Evaluate on the test split
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Accuracy: {test_acc*100:.2f}%')

Test Loss: 0.349 | Test Accuracy: 92.60%


In [15]:
# Prediction helper for a new sentence
def predict_sentence(model, sentence, word_to_ix, tag_to_ix):
    """Tag every whitespace-separated token of ``sentence``.

    Args:
        model: module called as ``model(token_tensor, lengths)`` and returning
            ``[1, seq_len, num_tags]`` logits. It is switched to eval mode.
        sentence: raw text; tokenised with ``str.split()``.
        word_to_ix: mapping from word to index; must contain ``"<UNK>"``.
        tag_to_ix: mapping from tag to index; inverted to decode predictions.

    Returns:
        A list of ``(token, predicted_tag)`` pairs, empty when ``sentence``
        contains no tokens.
    """
    model.eval()

    # Simple tokenisation (split on whitespace).
    tokens = sentence.split()
    if not tokens:
        # A zero-length sequence cannot be packed by pack_padded_sequence, so
        # an empty or whitespace-only sentence is answered directly.
        return []

    # Unknown words fall back to the <UNK> index.
    unk_ix = word_to_ix["<UNK>"]
    token_indices = [word_to_ix.get(tok, unk_ix) for tok in tokens]

    # Put the input on the model's own device instead of relying on a
    # notebook-level `device`; a parameter-free model is treated as CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # Add the batch dimension: (1, seq_len). The lengths stay on the CPU.
    token_tensor = torch.tensor([token_indices], dtype=torch.long, device=model_device)
    lengths = torch.tensor([len(token_indices)], dtype=torch.long)

    with torch.no_grad():
        # predictions: [1, seq_len, output_size]
        predictions = model(token_tensor, lengths)

    # Keep the highest-scoring tag index for every token.
    predicted_indices = torch.argmax(predictions, dim=-1).squeeze(0).tolist()

    # Map the indices back to tag names.
    ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}
    predicted_tags = [ix_to_tag[ix] for ix in predicted_indices]

    return list(zip(tokens, predicted_tags))

In [16]:
# Try the tagger on an example sentence
sentence = "VNU University is located in Hanoi"
predictions = predict_sentence(model, sentence, word_to_ix, tag_to_ix)

# Print one "token: tag" line per token.
print("Sentence:", sentence)
print("Predictions:")
for word, tag in predictions:
    print(f"{word}: {tag}")

Sentence: VNU University is located in Hanoi
Predictions:
VNU: B-ORG
University: I-ORG
is: O
located: O
in: O
Hanoi: O
