# Clone repository từ GitHub

In [14]:
!git clone https://github.com/KhoiDang-KHDL/DS201_Lab03-RNN


fatal: destination path 'DS201_Lab03-RNN' already exists and is not an empty directory.


In [15]:
import os
import sys

# Path of the cloned lab repository (single definition instead of repeating
# the literal in every statement of this cell).
REPO_DIR = '/kaggle/working/DS201_Lab03-RNN'

# Make the repository's modules importable. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

# Sanity check: list the repository contents.
print(os.listdir(REPO_DIR))

['seq2seq.py', '__pycache__', 'UIT-VSFC-20251121T013658Z-1-001', 'uit_vsfc.py', 'PhoNER', 'PhoNer.py', 'gru.py', 'lstm.py', '.git', 'main.ipynb']


# Thư viện

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import os
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
import uit_vsfc

from uit_vsfc import Vocab, UIT_VSFC, collate_fn
from lstm import LSTMModel
from gru import GRUModel

# Cấu hình cho bài 1,2

In [17]:
# Settings shared by the LSTM (Bài 1) and GRU (Bài 2) runs.

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Data / optimisation settings.
BATCH_SIZE = 64        # batch_size handed to every DataLoader
EPOCHS = 10            # number of passes requested from train()
LEARNING_RATE = 1e-4   # lr handed to optim.Adam

# Model settings.
HIDDEN_SIZE = 128      # hidden_size argument of LSTMModel / GRUModel
N_LAYERS = 2           # n_layers argument of LSTMModel / GRUModel

# Train, Eval bài 1,2

In [18]:
def evaluate(model, dataloader, criterion):
    """Score a classifier on a dataloader without tracking gradients.

    Args:
        model: module called as ``model(input_ids)``; its output is passed to
            ``criterion`` and the index of the maximum along dim 1 is taken
            as the predicted label.
        dataloader: iterable with ``len()`` that yields dict batches holding
            ``'input_ids'`` and ``'label'`` tensors.
        criterion: loss called as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(avg_loss, accuracy, macro_f1)`` where ``avg_loss`` is the sum
        of per-batch losses divided by the number of batches.
    """
    model.eval()
    loss_sum = 0
    y_pred, y_true = [], []

    with torch.no_grad():
        for batch in dataloader:
            tokens = batch['input_ids'].to(DEVICE)
            targets = batch['label'].to(DEVICE)

            logits = model(tokens)
            loss_sum += criterion(logits, targets).item()

            # torch.max over dim 1 returns (values, indices); keep the indices.
            best_class = torch.max(logits, 1)[1]

            # Move to CPU / NumPy so scikit-learn can consume the values.
            y_pred.extend(best_class.cpu().numpy())
            y_true.extend(targets.cpu().numpy())

    return (
        loss_sum / len(dataloader),
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
    )

In [19]:
def train(model, train_loader, dev_loader, optimizer, criterion, epochs):
    """Train a classifier for ``epochs`` passes, validating after each one.

    Args:
        model: module called as ``model(input_ids)``.
        train_loader: iterable with ``len()`` yielding dict batches with
            ``'input_ids'`` and ``'label'`` tensors.
        dev_loader: loader passed to ``evaluate`` at the end of every epoch.
        optimizer: optimiser stepped once per training batch.
        criterion: loss called as ``criterion(outputs, labels)``.
        epochs: number of passes over ``train_loader``.

    Prints the batch loss every 10 steps and, per epoch, the mean training
    loss together with the dev loss / accuracy / macro F1. Returns nothing.
    """
    print(f"Bắt đầu huấn luyện trên thiết bị: {DEVICE}")

    for epoch_no in range(1, epochs + 1):
        model.train()
        loss_accum = 0.0

        for step, batch in enumerate(train_loader, start=1):
            tokens = batch['input_ids'].to(DEVICE)
            targets = batch['label'].to(DEVICE)

            # Standard update: clear grads, forward, backward, step.
            optimizer.zero_grad()
            batch_loss = criterion(model(tokens), targets)
            batch_loss.backward()
            optimizer.step()

            loss_accum += batch_loss.item()

            # Progress line every 10th step.
            if step % 10 == 0:
                print(f"Epoch [{epoch_no}/{epochs}], Step [{step}/{len(train_loader)}], Loss: {batch_loss.item():.4f}")

        # End-of-epoch validation report.
        dev_loss, dev_acc, dev_f1 = evaluate(model, dev_loader, criterion)
        print(f"--- Hết Epoch {epoch_no} ---")
        print(f"Train Loss: {loss_accum/len(train_loader):.4f}")
        print(f"Dev Loss: {dev_loss:.4f} | Dev Acc: {dev_acc*100:.2f}% | Dev F1: {dev_f1*100:.2f}%")
        print("-" * 30)

# Bài 1: main

In [20]:
def main():
    """Bài 1: build the UIT-VSFC pipeline, train an LSTMModel, report test metrics."""
    data_root = r'/kaggle/working/DS201_Lab03-RNN/UIT-VSFC-20251121T013658Z-1-001/UIT-VSFC'
    split_paths = {
        'train': os.path.join(data_root, 'UIT-VSFC-train.json'),
        'dev': os.path.join(data_root, 'UIT-VSFC-dev.json'),
        'test': os.path.join(data_root, 'UIT-VSFC-test.json'),
    }

    print("Đang xây dựng bộ từ điển (Vocab):")
    vocab = Vocab(data_root)
    print(f"Kích thước từ điển: {vocab.len}")

    print("Đang tải dữ liệu...")
    # Datasets are built in train -> dev -> test order (dicts keep insertion order).
    datasets = {split: UIT_VSFC(path, vocab) for split, path in split_paths.items()}

    # Only the training split is shuffled.
    loaders = {
        split: DataLoader(
            dataset,
            batch_size=BATCH_SIZE,
            shuffle=(split == 'train'),
            collate_fn=collate_fn,
        )
        for split, dataset in datasets.items()
    }

    # --- Model ---
    classifier = LSTMModel(
        vocab_size=vocab.len,
        hidden_size=HIDDEN_SIZE,
        embedding_dim=128,
        n_layers=N_LAYERS,
        n_labels=vocab.n_labels,
        padding_idx=vocab.w2i[vocab.pad],
    ).to(DEVICE)

    # --- Loss and optimiser ---
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(classifier.parameters(), lr=LEARNING_RATE)

    train(classifier, loaders['train'], loaders['dev'], adam, loss_fn, EPOCHS)

    # --- Final evaluation on the held-out test split ---
    print("\nĐang đánh giá trên tập Test...")
    test_loss, test_acc, test_f1 = evaluate(classifier, loaders['test'], loss_fn)
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_acc*100:.2f}%")
    print(f"Test F1-Score: {test_f1*100:.2f}%")


if __name__ == '__main__':
    main()

Đang xây dựng bộ từ điển (Vocab):
Kích thước từ điển: 2848
Đang tải dữ liệu...
Bắt đầu huấn luyện trên thiết bị: cuda
Epoch [1/10], Step [10/179], Loss: 1.3549
Epoch [1/10], Step [20/179], Loss: 1.2920
Epoch [1/10], Step [30/179], Loss: 1.2046
Epoch [1/10], Step [40/179], Loss: 1.1101
Epoch [1/10], Step [50/179], Loss: 0.9714
Epoch [1/10], Step [60/179], Loss: 0.7937
Epoch [1/10], Step [70/179], Loss: 0.6979
Epoch [1/10], Step [80/179], Loss: 0.7561
Epoch [1/10], Step [90/179], Loss: 0.7496
Epoch [1/10], Step [100/179], Loss: 0.8525
Epoch [1/10], Step [110/179], Loss: 0.6517
Epoch [1/10], Step [120/179], Loss: 0.7445
Epoch [1/10], Step [130/179], Loss: 0.5005
Epoch [1/10], Step [140/179], Loss: 0.5608
Epoch [1/10], Step [150/179], Loss: 0.5385
Epoch [1/10], Step [160/179], Loss: 0.5815
Epoch [1/10], Step [170/179], Loss: 0.5770
--- Hết Epoch 1 ---
Train Loss: 0.8384
Dev Loss: 0.5777 | Dev Acc: 78.71% | Dev F1: 36.39%
------------------------------
Epoch [2/10], Step [10/179], Loss: 0.5

# Bài 2: main

In [21]:
def main():
    """Bài 2: build the UIT-VSFC pipeline, train a GRUModel, report test metrics."""
    data_root = r'/kaggle/working/DS201_Lab03-RNN/UIT-VSFC-20251121T013658Z-1-001/UIT-VSFC'
    split_paths = {
        'train': os.path.join(data_root, 'UIT-VSFC-train.json'),
        'dev': os.path.join(data_root, 'UIT-VSFC-dev.json'),
        'test': os.path.join(data_root, 'UIT-VSFC-test.json'),
    }

    print("Đang xây dựng bộ từ điển (Vocab)...")
    vocab = Vocab(data_root)
    print(f"Kích thước từ điển: {vocab.len}")

    print("Đang tải dữ liệu...")
    # Datasets are built in train -> dev -> test order (dicts keep insertion order).
    datasets = {split: UIT_VSFC(path, vocab) for split, path in split_paths.items()}

    # Only the training split is shuffled.
    loaders = {
        split: DataLoader(
            dataset,
            batch_size=BATCH_SIZE,
            shuffle=(split == 'train'),
            collate_fn=collate_fn,
        )
        for split, dataset in datasets.items()
    }

    # --- Model ---
    classifier = GRUModel(
        vocab_size=vocab.len,
        embedding_dim=128,
        hidden_size=HIDDEN_SIZE,
        n_layers=N_LAYERS,
        n_labels=vocab.n_labels,
        padding_idx=vocab.w2i[vocab.pad],
    ).to(DEVICE)

    # --- Loss and optimiser ---
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(classifier.parameters(), lr=LEARNING_RATE)

    train(classifier, loaders['train'], loaders['dev'], adam, loss_fn, EPOCHS)

    # --- Final evaluation on the held-out test split ---
    print("\nĐang đánh giá trên tập Test...")
    test_loss, test_acc, test_f1 = evaluate(classifier, loaders['test'], loss_fn)
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_acc*100:.2f}%")
    print(f"Test F1-Score: {test_f1*100:.2f}%")


if __name__ == '__main__':
    main()

Đang xây dựng bộ từ điển (Vocab)...
Kích thước từ điển: 2848
Đang tải dữ liệu...
Bắt đầu huấn luyện trên thiết bị: cuda
Epoch [1/10], Step [10/179], Loss: 1.1843
Epoch [1/10], Step [20/179], Loss: 1.0253
Epoch [1/10], Step [30/179], Loss: 0.8681
Epoch [1/10], Step [40/179], Loss: 0.9069
Epoch [1/10], Step [50/179], Loss: 0.8451
Epoch [1/10], Step [60/179], Loss: 0.8173
Epoch [1/10], Step [70/179], Loss: 0.7996
Epoch [1/10], Step [80/179], Loss: 0.7197
Epoch [1/10], Step [90/179], Loss: 0.9797
Epoch [1/10], Step [100/179], Loss: 0.8058
Epoch [1/10], Step [110/179], Loss: 0.6664
Epoch [1/10], Step [120/179], Loss: 0.7911
Epoch [1/10], Step [130/179], Loss: 0.5435
Epoch [1/10], Step [140/179], Loss: 0.5836
Epoch [1/10], Step [150/179], Loss: 0.4921
Epoch [1/10], Step [160/179], Loss: 0.7148
Epoch [1/10], Step [170/179], Loss: 0.4905
--- Hết Epoch 1 ---
Train Loss: 0.7551
Dev Loss: 0.5495 | Dev Acc: 80.04% | Dev F1: 37.70%
------------------------------
Epoch [2/10], Step [10/179], Loss: 0

# Bài 3

## Import module

In [22]:
from PhoNer import Vocab, PhoNER, collate_fn 
from seq2seq import Encoder, Decoder, Seq2Seq

## Cấu hình

In [23]:
# --- CONFIG (Bài 3) ---
# Note: BATCH_SIZE, N_LAYERS, EPOCHS, LEARNING_RATE and DEVICE are rebound
# here, replacing the values assigned by the Bài 1/2 configuration cell.

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Data / optimisation settings.
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 1e-5

# Values passed positionally to Encoder(...) and Decoder(...) in main().
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HIDDEN_DIM = 256
N_LAYERS = 5
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5


## Train, Eval bài 3

In [24]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    """Score the seq2seq tagger on ``iterator`` without tracking gradients.

    Args:
        model: module called as ``model(src, trg, 0)``.
        iterator: iterable with ``len()`` yielding dict batches with
            ``'input_ids'`` and ``'label'`` tensors.
        criterion: loss applied to the flattened scores and targets.
        tag_pad_idx: target value marking positions to leave out of the
            F1 computation.

    Returns:
        Tuple ``(avg_loss, macro_f1)``; ``avg_loss`` is the sum of per-batch
        losses divided by the number of batches.
    """
    model.eval()
    loss_total = 0
    pred_tags, gold_tags = [], []

    with torch.no_grad():
        for batch in iterator:
            src = batch['input_ids'].to(DEVICE)
            trg = batch['label'].to(DEVICE)

            # The third positional argument is fixed to 0 for evaluation
            # (presumably the teacher-forcing ratio; confirm in seq2seq.Seq2Seq).
            scores = model(src, trg, 0)

            # Drop index 0 along dim 1 on both tensors, then flatten so the
            # criterion sees (N, n_classes) scores against (N,) targets.
            n_classes = scores.shape[-1]
            flat_scores = scores[:, 1:].reshape(-1, n_classes)
            flat_trg = trg[:, 1:].reshape(-1)

            loss_total += criterion(flat_scores, flat_trg).item()

            pred_np = flat_scores.argmax(dim=1).cpu().numpy()
            trg_np = flat_trg.cpu().numpy()

            # Keep only positions whose target differs from tag_pad_idx.
            keep = trg_np != tag_pad_idx
            pred_tags.extend(pred_np[keep])
            gold_tags.extend(trg_np[keep])

    return loss_total / len(iterator), f1_score(gold_tags, pred_tags, average='macro')

In [25]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training pass of the seq2seq tagger over ``iterator``.

    Args:
        model: module called as ``model(src, trg)``.
        iterator: iterable with ``len()`` yielding dict batches with
            ``'input_ids'`` and ``'label'`` tensors.
        optimizer: optimiser stepped once per batch.
        criterion: loss applied to the flattened scores and targets.
        clip: max norm handed to ``clip_grad_norm_``.

    Returns:
        Sum of per-batch losses divided by the number of batches.
    """
    model.train()
    loss_total = 0

    for step, batch in enumerate(iterator, start=1):
        src = batch['input_ids'].to(DEVICE)
        trg = batch['label'].to(DEVICE)

        optimizer.zero_grad()
        scores = model(src, trg)

        # Drop index 0 along dim 1 on both tensors, then flatten so the
        # criterion sees (N, n_classes) scores against (N,) targets.
        n_classes = scores.shape[-1]
        batch_loss = criterion(
            scores[:, 1:].reshape(-1, n_classes),
            trg[:, 1:].reshape(-1),
        )
        batch_loss.backward()

        # Clip the gradient norm to guard against exploding gradients
        # in the multi-layer recurrent stack.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        loss_total += batch_loss.item()

        # Progress line every 10th step.
        if step % 10 == 0:
             print(f"Step {step}/{len(iterator)} | Loss: {batch_loss.item():.4f}")

    return loss_total / len(iterator)

## Main bài 3

In [26]:
def main():
    """Bài 3: train the Encoder/Decoder Seq2Seq tagger on PhoNER and report test loss / F1.

    Steps: build the vocabulary, load the train/dev/test splits, build and
    initialise the model, train for ``EPOCHS`` epochs while checkpointing the
    weights with the lowest dev loss, then reload that checkpoint and
    evaluate it on the test split.
    """
    base_path = r'/kaggle/working/DS201_Lab03-RNN/PhoNER'

    print("Đang đọc dữ liệu...")
    vocab = Vocab(base_path)
    print(f"Vocab size: {vocab.len} | Tags: {vocab.n_labels}")

    train_data = PhoNER(os.path.join(base_path, 'train_word.json'), vocab)
    dev_data = PhoNER(os.path.join(base_path, 'dev_word.json'), vocab)
    test_data = PhoNER(os.path.join(base_path, 'test_word.json'), vocab)

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    # Model
    input_dim = vocab.len
    output_dim = vocab.n_labels

    enc = Encoder(input_dim, ENC_EMB_DIM, HIDDEN_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(output_dim, DEC_EMB_DIM, HIDDEN_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, DEVICE).to(DEVICE)

    # Weight init: draw every parameter from U(-0.08, 0.08) exactly once.
    # The previous `model.apply(fn)` form invoked `fn` on every submodule while
    # `fn` itself walked `named_parameters()` recursively, so each parameter
    # was re-drawn once per ancestor module. The resulting distribution is the
    # same; only the redundant passes (and their RNG draws) are removed.
    for param in model.parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

    print(f'Mô hình có {sum(p.numel() for p in model.parameters() if p.requires_grad):,} tham số train được.')

    # 3. Optimizer & Loss
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Targets equal to -100 are excluded from the loss.
    # NOTE(review): this only masks padding if collate_fn pads the label
    # tensor with -100 -- confirm against PhoNer.collate_fn. If labels are
    # padded with the vocabulary's pad tag instead, padding is being scored.
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    # 4. Training
    print("Bắt đầu huấn luyện...")
    best_valid_loss = float('inf')

    for epoch in range(EPOCHS):
        train_loss = train(model, train_loader, optimizer, criterion, clip=1)
        valid_loss, valid_f1 = evaluate(model, dev_loader, criterion, tag_pad_idx=-100)

        print(f'Epoch: {epoch+1:02}')
        print(f'\tTrain Loss: {train_loss:.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. F1: {valid_f1*100:.2f}%')

        # Checkpoint whenever the dev loss improves.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'phoner_seq2seq.pt')

    # 5. Test: reload the best checkpoint. map_location pins the loaded
    # tensors to the device currently in use.
    print("\nĐang đánh giá trên tập Test...")
    model.load_state_dict(torch.load('phoner_seq2seq.pt', map_location=DEVICE))
    test_loss, test_f1 = evaluate(model, test_loader, criterion, tag_pad_idx=-100)
    print(f'Test Loss: {test_loss:.3f} | Test F1: {test_f1*100:.2f}%')


if __name__ == '__main__':
    main()

Đang đọc dữ liệu...
--- Đang quét dữ liệu tại: /kaggle/working/DS201_Lab03-RNN/PhoNER ---
-> Đã xây dựng Vocab: 7306 từ, 21 nhãn (bao gồm pad).
Vocab size: 7306 | Tags: 21
-> Đã tải 5027 mẫu từ train_word.json
-> Đã tải 2000 mẫu từ dev_word.json
-> Đã tải 3000 mẫu từ test_word.json
Mô hình có 5,944,469 tham số train được.
Bắt đầu huấn luyện...
Step 10/158 | Loss: 3.0565
Step 20/158 | Loss: 3.0404
Step 30/158 | Loss: 3.0316
Step 40/158 | Loss: 3.0199
Step 50/158 | Loss: 3.0102
Step 60/158 | Loss: 2.9976
Step 70/158 | Loss: 2.9809
Step 80/158 | Loss: 2.9689
Step 90/158 | Loss: 2.9561
Step 100/158 | Loss: 2.9326
Step 110/158 | Loss: 2.9091
Step 120/158 | Loss: 2.8794
Step 130/158 | Loss: 2.8510
Step 140/158 | Loss: 2.8027
Step 150/158 | Loss: 2.7039
Epoch: 01
	Train Loss: 2.941
	 Val. Loss: 2.613 |  Val. F1: 5.55%
Step 10/158 | Loss: 2.4800
Step 20/158 | Loss: 2.2043
Step 30/158 | Loss: 1.9478
Step 40/158 | Loss: 1.6772
Step 50/158 | Loss: 1.4575
Step 60/158 | Loss: 1.4087
Step 70/158 | L