In [1]:
import os
import json
import re
from pathlib import Path
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import gdown
import numpy as np


In [2]:
def set_seed(seed=42):
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)


In [8]:
# Tải bộ dữ liệu từ Google Drive
folder_id = '1Lu9axyLkw7dMx80uLRgvCnZsmNzhJWAa'
os.system(f'gdown --folder https://drive.google.com/drive/folders/{folder_id}')
os.chdir('UIT-ViOCD')

In [9]:
import os

print("Current dir:", os.getcwd())
print("Files:", os.listdir("."))


Current dir: /content/UIT-ViOCD
Files: ['dev.json', 'train.json', 'test.json']


In [10]:
import json

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # dataset có thể là list hoặc dict
    if isinstance(data, dict):
        return list(data.values())
    return data

train_data = load_json("train.json")
dev_data   = load_json("dev.json")
test_data  = load_json("test.json")

len(train_data), len(dev_data), len(test_data)


(4387, 548, 549)

In [11]:
train_data[0]


{'review': 'gói hàng cẩn thận . chơi pubg với liên quân mượt với giá như này thì quá tốt',
 'label': 'non-complaint',
 'domain': 'mobile'}

In [12]:
def get_text(item):
    return item.get("review", item.get("text", ""))

def get_label(item):
    return item["domain"]


In [13]:
domains = sorted({get_label(x) for x in train_data})
domain2idx = {d:i for i,d in enumerate(domains)}
idx2domain = {i:d for d,i in domain2idx.items()}

domains, domain2idx


(['app', 'cosmetic', 'fashion', 'mobile'],
 {'app': 0, 'cosmetic': 1, 'fashion': 2, 'mobile': 3})

In [14]:
import re

def tokenize(text):
    text = text.lower()
    return re.findall(r"[a-zA-ZÀ-ỹ0-9]+", text)


In [15]:
from collections import Counter

PAD = "<pad>"
UNK = "<unk>"

counter = Counter()
for x in train_data:
    counter.update(tokenize(get_text(x)))

idx2word = [PAD, UNK] + list(counter.keys())
word2idx = {w:i for i,w in enumerate(idx2word)}

vocab_size = len(idx2word)
vocab_size


4404

In [16]:
import torch
from torch.utils.data import Dataset, DataLoader

MAX_LEN = 128

class DomainDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        item = self.data[i]
        tokens = tokenize(get_text(item))[:MAX_LEN]
        ids = [word2idx.get(t, word2idx[UNK]) for t in tokens]
        label = domain2idx[get_label(item)]
        return ids, label

def collate_fn(batch):
    pad_id = word2idx[PAD]
    max_len = max(len(x[0]) for x in batch)

    input_ids, masks, labels = [], [], []
    for ids, y in batch:
        pad_len = max_len - len(ids)
        input_ids.append(ids + [pad_id]*pad_len)
        masks.append([1]*len(ids) + [0]*pad_len)
        labels.append(y)

    return (
        torch.tensor(input_ids),
        torch.tensor(masks),
        torch.tensor(labels)
    )


In [17]:
train_loader = DataLoader(DomainDataset(train_data), batch_size=64, shuffle=True, collate_fn=collate_fn)
dev_loader   = DataLoader(DomainDataset(dev_data), batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(DomainDataset(test_data), batch_size=64, shuffle=False, collate_fn=collate_fn)

len(train_loader), len(dev_loader), len(test_loader)


(69, 9, 9)

In [18]:
import math
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=512):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_len, d_model)              # [T, D]
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)                            # [1, T, D]
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: [B, T, D]
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


In [19]:
class TransformerEncoderClassifier(nn.Module):
    def __init__(self, vocab_size, num_classes,
                 d_model=256, nhead=8, dim_ff=1024, num_layers=3,
                 dropout=0.1, pad_id=0, max_len=512):
        super().__init__()
        self.pad_id = pad_id
        self.d_model = d_model

        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.pos   = PositionalEncoding(d_model, dropout=dropout, max_len=max_len)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, input_ids, attention_mask):
        # input_ids: [B,T] ; attention_mask: [B,T] (1 token thật, 0 pad)
        x = self.embed(input_ids) * math.sqrt(self.d_model)
        x = self.pos(x)

        # True ở vị trí pad
        src_key_padding_mask = (attention_mask == 0)
        x = self.encoder(x, src_key_padding_mask=src_key_padding_mask)

        # pooling: lấy mean theo token thật (masked mean)
        mask = attention_mask.unsqueeze(-1).type_as(x)       # [B,T,1]
        x_sum = (x * mask).sum(dim=1)                        # [B,D]
        denom = mask.sum(dim=1).clamp(min=1e-9)              # [B,1]
        pooled = x_sum / denom                               # [B,D]

        logits = self.fc(self.dropout(pooled))
        return logits


In [20]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [22]:
num_classes = len(domains)
model = TransformerEncoderClassifier(
    vocab_size=vocab_size,
    num_classes=num_classes,
    d_model=256,
    nhead=8,
    dim_ff=1024,
    num_layers=3,
    dropout=0.1,
    pad_id=word2idx[PAD],
    max_len=512
).to(DEVICE)

sum(p.numel() for p in model.parameters())/1e6

3.497732

In [23]:
LR = 3e-4
EPOCHS = 30
PATIENCE = 3

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()
best_path = "best_encoder3.pt"


In [24]:
def train_one_epoch(model, loader):
    model.train()
    total_loss = 0.0
    total = 0

    for input_ids, attn_mask, labels in loader:
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()
        logits = model(input_ids, attn_mask)
        loss = criterion(logits, labels)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        total += labels.size(0)

    return total_loss / max(1, total)


@torch.no_grad()
def evaluate(model, loader):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total = 0

    for input_ids, attn_mask, labels in loader:
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)
        labels = labels.to(DEVICE)

        logits = model(input_ids, attn_mask)
        loss = criterion(logits, labels)

        total_loss += loss.item() * labels.size(0)
        total_correct += (logits.argmax(-1) == labels).sum().item()
        total += labels.size(0)

    return total_loss / max(1, total), total_correct / max(1, total)


In [25]:
best_val_loss = float("inf")
patience_left = PATIENCE

for epoch in range(1, EPOCHS + 1):
    train_loss = train_one_epoch(model, train_loader)
    val_loss, val_acc = evaluate(model, dev_loader)

    improved = val_loss < best_val_loss - 1e-4

    print(
        f"Epoch {epoch:02d} | "
        f"train_loss={train_loss:.4f} | "
        f"val_loss={val_loss:.4f} | "
        f"val_acc={val_acc:.4f} | "
        f"{'improved' if improved else 'no-improve'} | "
        f"patience_left={patience_left}"
    )

    if improved:
        best_val_loss = val_loss
        patience_left = PATIENCE
        torch.save(model.state_dict(), best_path)
    else:
        patience_left -= 1
        if patience_left == 0:
            print("Early stopping triggered!")
            break

print("Best val loss:", best_val_loss)
print("Saved:", best_path)


  output = torch._nested_tensor_from_mask(


Epoch 01 | train_loss=0.7381 | val_loss=0.4887 | val_acc=0.8303 | improved | patience_left=3
Epoch 02 | train_loss=0.4661 | val_loss=0.3952 | val_acc=0.8577 | improved | patience_left=3
Epoch 03 | train_loss=0.3589 | val_loss=0.4091 | val_acc=0.8595 | no-improve | patience_left=3
Epoch 04 | train_loss=0.2833 | val_loss=0.4193 | val_acc=0.8631 | no-improve | patience_left=2
Epoch 05 | train_loss=0.2563 | val_loss=0.3556 | val_acc=0.8832 | improved | patience_left=1
Epoch 06 | train_loss=0.2094 | val_loss=0.4149 | val_acc=0.8759 | no-improve | patience_left=3
Epoch 07 | train_loss=0.1934 | val_loss=0.4268 | val_acc=0.8832 | no-improve | patience_left=2
Epoch 08 | train_loss=0.1875 | val_loss=0.4446 | val_acc=0.8741 | no-improve | patience_left=1
Early stopping triggered!
Best val loss: 0.35560231617767446
Saved: best_encoder3.pt


In [28]:
model.load_state_dict(torch.load(best_path, map_location=DEVICE))
test_loss, test_acc = evaluate(model, test_loader)

print("TEST loss:", test_loss)
print("TEST acc :", test_acc)


TEST loss: 0.31018261521891816
TEST acc : 0.9180327868852459


In [29]:
import numpy as np

@torch.no_grad()
def macro_f1_eval(model, loader, num_classes):
    model.eval()
    preds_all = []
    labels_all = []
    for input_ids, attn_mask, labels in loader:
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)

        logits = model(input_ids, attn_mask)
        preds = logits.argmax(-1).cpu().numpy()

        preds_all.append(preds)
        labels_all.append(labels.numpy())

    preds_all = np.concatenate(preds_all)
    labels_all = np.concatenate(labels_all)

    f1s = []
    for c in range(num_classes):
        tp = np.sum((preds_all == c) & (labels_all == c))
        fp = np.sum((preds_all == c) & (labels_all != c))
        fn = np.sum((preds_all != c) & (labels_all == c))
        prec = tp / (tp + fp + 1e-9)
        rec  = tp / (tp + fn + 1e-9)
        f1 = 2*prec*rec / (prec + rec + 1e-9)
        f1s.append(f1)

    return float(np.mean(f1s))

test_macro_f1 = macro_f1_eval(model, test_loader, num_classes=num_classes)
print("TEST macro-F1:", test_macro_f1)


TEST macro-F1: 0.8879501120434231


In [30]:
import numpy as np

@torch.no_grad()
def classification_report_per_label(model, loader, idx2domain, num_classes):
    model.eval()
    all_preds = []
    all_labels = []

    for input_ids, attn_mask, labels in loader:
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)

        logits = model(input_ids, attn_mask)
        preds = logits.argmax(dim=-1).cpu().numpy()

        all_preds.append(preds)
        all_labels.append(labels.numpy())

    y_pred = np.concatenate(all_preds)
    y_true = np.concatenate(all_labels)

    print("="*70)
    print(f"{'Label':<15} {'Precision':>10} {'Recall':>10} {'F1':>10} {'Support':>10}")
    print("-"*70)

    f1_macro = []
    total_correct = 0

    for c in range(num_classes):
        tp = np.sum((y_pred == c) & (y_true == c))
        fp = np.sum((y_pred == c) & (y_true != c))
        fn = np.sum((y_pred != c) & (y_true == c))
        support = np.sum(y_true == c)

        precision = tp / (tp + fp + 1e-9)
        recall    = tp / (tp + fn + 1e-9)
        f1        = 2 * precision * recall / (precision + recall + 1e-9)

        f1_macro.append(f1)
        total_correct += tp

        print(f"{idx2domain[c]:<15} {precision:10.4f} {recall:10.4f} {f1:10.4f} {support:10d}")

    acc = total_correct / len(y_true)
    macro_f1 = float(np.mean(f1_macro))

    print("-"*70)
    print(f"{'Accuracy':<15} {acc:>10.4f}")
    print(f"{'Macro-F1':<15} {macro_f1:>10.4f}")
    print("="*70)

    return acc, macro_f1


In [31]:
test_acc, test_macro_f1 = classification_report_per_label(
    model,
    test_loader,
    idx2domain,
    num_classes=num_classes
)


Label            Precision     Recall         F1    Support
----------------------------------------------------------------------
app                 0.9635     0.9536     0.9585        194
cosmetic            0.9241     0.8933     0.9085        150
fashion             0.8935     0.9497     0.9207        159
mobile              0.7907     0.7391     0.7640         46
----------------------------------------------------------------------
Accuracy            0.9180
Macro-F1            0.8880
