In [None]:
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, f1_score

## 1. C·∫•u h√¨nh thi·∫øt b·ªã v√† seed

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Tr√°nh c·∫£nh b√°o deadlock khi load d·ªØ li·ªáu
seed = 42
torch.manual_seed(seed)

# Ki·ªÉm tra GPU
if torch.cuda.is_available():
    print(f"Using {torch.cuda.device_count()} GPUs!")
else:
    print("No GPU found.")

import torch

print("-" * 30)
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    gpu_name = torch.cuda.get_device_name(0)
    
    print(f"H·ªá th·ªëng c√≥: {num_gpus} x {gpu_name}")
else:
    print("Kh√¥ng t√¨m th·∫•y GPU n√†o!")
print("-" * 30)

No GPU found.
------------------------------
Kh√¥ng t√¨m th·∫•y GPU n√†o!
------------------------------


## 2. Chu·∫©n b·ªã d·ªØ li·ªáu


In [17]:
# Load dataset
dataset_id = "hihihohohehe/vifactcheck-normalized"
print(f"Loading dataset: {dataset_id}")
dataset = load_dataset(dataset_id)

# X√°c ƒë·ªãnh s·ªë l∆∞·ª£ng nh√£n (labels)
unique_labels = sorted(list(set(dataset['train']['New Topic 2'])))
num_labels = len(unique_labels)
print(f"Detected {num_labels} labels: {unique_labels}")

# Map label id sang t√™n 
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

Loading dataset: hihihohohehe/vifactcheck-normalized
Detected 8 labels: ['CH√çNH TR·ªä - PH√ÅP LU·∫¨T - QU√ÇN S·ª∞', 'DU L·ªäCH', 'GI√ÅO D·ª§C - KHOA H·ªåC C√îNG NGH·ªÜ', 'KINH T·∫æ - B·∫§T ƒê·ªòNG S·∫¢N', 'S·ª®C KH·ªéE - X√É H·ªòI ƒê·ªúI S·ªêNG', 'TH·∫æ GI·ªöI - TH·ªúI S·ª∞', 'TH·ªÇ THAO', 'VƒÇN H√ìA - GI·∫¢I TR√ç - GI·ªöI TR·∫∫']


## 3. Tokenizer & Preprocessing


In [22]:
# S·ª≠ d·ª•ng PhoBERT Base V2 (b·∫£n V2 t·ªët h∆°n v√† vocab g·ªçn h∆°n b·∫£n V1)
model_ckpt = "vinai/phobert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def preprocess_function(examples):
    return tokenizer(
        examples["Statement"], 
        truncation=True, 
        padding="max_length", 
        max_length=256
    )

print("Tokenizing dataset...")
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Tokenizing dataset...


Map:   0%|          | 0/5062 [00:00<?, ? examples/s]

Map:   0%|          | 0/723 [00:00<?, ? examples/s]

Map:   0%|          | 0/1447 [00:00<?, ? examples/s]

## 4. ƒê·ªãnh nghƒ©a h√†m t√≠nh Metrics


In [24]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    acc = accuracy_score(labels, predictions)
    # Average='weighted' ƒë·ªÉ c√¢n b·∫±ng n·∫øu d·ªØ li·ªáu b·ªã l·ªách (imbalanced)
    f1 = f1_score(labels, predictions, average='weighted')
    
    return {
        "accuracy": acc,
        "f1": f1
    }

## 5. Kh·ªüi t·∫°o Model

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, 
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

## 6. C·∫•u h√¨nh Training Arguments

In [35]:
# ---------------------------------------------------------
# Batch size: V100 16GB ch·ªãu ƒë∆∞·ª£c kho·∫£ng 32-48 samples/GPU v·ªõi max_len=256
# T·ªïng batch size = per_device_batch_size * s·ªë GPU
per_device_batch_size = 32 

training_args = TrainingArguments(
    output_dir="./draft/finetuned-phobert",
    learning_rate=2e-5,              # Learning rate chu·∫©n cho BERT
    per_device_train_batch_size=per_device_batch_size, 
    per_device_eval_batch_size=64,   # Eval nh·∫π h∆°n n√™n c√≥ th·ªÉ tƒÉng g·∫•p ƒë√¥i
    num_train_epochs=5,              # Train 5 epochs
    weight_decay=0.01,
    eval_strategy="epoch",     # ƒê√°nh gi√° sau m·ªói epoch
    save_strategy="epoch",           # L∆∞u model sau m·ªói epoch
    load_best_model_at_end=True,     # Load model t·ªët nh·∫•t khi train xong
    metric_for_best_model="f1",      # D√πng F1 ƒë·ªÉ ch·ªçn model t·ªët nh·∫•t
    fp16=True,                       # QUAN TR·ªåNG: K√≠ch ho·∫°t Mixed Precision cho V100 (tƒÉng t·ªëc 2-3x)
    dataloader_num_workers=4,        # Load d·ªØ li·ªáu ƒëa lu·ªìng
    group_by_length=True,            # Gom c√°c c√¢u c√πng ƒë·ªô d√†i ƒë·ªÉ train nhanh h∆°n
    logging_dir='./logs',
    logging_steps=50,
    report_to="none"                 # T·∫Øt report l√™n wandb n·∫øu kh√¥ng c·∫ßn
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

## 7. Train


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [9]:
print("Starting training...")
trainer.train()

Starting training...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5572,0.24305,0.927353,0.90982
2,0.2158,0.212448,0.943146,0.94028
3,0.174,0.189966,0.951358,0.950252
4,0.1395,0.200623,0.948831,0.947236
5,0.1296,0.200555,0.947568,0.946131




TrainOutput(global_step=450, training_loss=0.22099125014411078, metrics={'train_runtime': 262.7779, 'train_samples_per_second': 217.408, 'train_steps_per_second': 1.712, 'total_flos': 7515834777400320.0, 'train_loss': 0.22099125014411078, 'epoch': 5.0})

## 8. Suy lu·∫≠n v·ªõi m·ªôt v√†i test case

In [None]:
# 1. Load Model & Tokenizer ƒë√£ l∆∞u
model_path = "./draft/final_phobert_feedback_model"
print(f"Loading model from {model_path}...")

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Chuy·ªÉn model sang GPU (n·∫øu c√≥)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# 2. H√†m d·ª± ƒëo√°n nhanh
def predict_feedback(text):
    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256, padding=True).to(device)
    
    # Inference
    with torch.no_grad():
        outputs = model(**inputs)
        pred_idx = torch.argmax(outputs.logits, dim=-1).item()
    
    # L·∫•y label t·ª´ config model
    label_name = model.config.id2label[pred_idx]
    return label_name

# 3. Ch·∫°y th·ª≠ v·ªõi d·ªØ li·ªáu m·∫´u
samples = [
    'B·ªô x∆∞∆°ng b·∫£n sao ƒë∆∞·ª£c ƒë∆∞a ƒë·∫øn Lodon b·∫±ng c√°ch ngh√©p th√†nh con ho√†n ch·ªânh r·ªìi v·∫≠n chuy·ªÉn ƒë·∫øn.',
    'Ph·∫ßn v·ªè b√°nh ƒë∆∞·ª£c l√†m r·∫•t c√¥ng phu, t·ª´ giai ƒëo·∫°n xay g·∫°o ƒë·∫øn cu·ªën b√°nh h·∫øt t·ªïng 4 ti·∫øng ƒë·ªìng h·ªì.',
    'ƒê·ªëi t∆∞·ª£ng nh·∫≠n t·ªôi sau khi l·ª±c l∆∞·ª£ng c√¥ng an ƒë√≥n d·ª´ng xe ƒë·ªÉ ki·ªÉm tra.'
]

print("-" * 50)
print(f"D·ª± ƒëo√°n tr√™n thi·∫øt b·ªã: {device.upper()}\n")

for txt in samples:
    pred = predict_feedback(txt)
    print(f"üìù Text: {txt}")
    print(f"üè∑Ô∏è Label: {pred}\n")
print("-" * 50)

Loading model from ./final_phobert_feedback_model...
--------------------------------------------------
D·ª± ƒëo√°n tr√™n thi·∫øt b·ªã: CUDA

üìù Text: Th·∫ßy d·∫°y r·∫•t nhi·ªát t√¨nh, b√†i gi·∫£ng d·ªÖ hi·ªÉu, em r·∫•t th√≠ch m√¥n n√†y.
üè∑Ô∏è Label: LABEL_2

üìù Text: Ph√≤ng h·ªçc qu√° n√≥ng, ƒëi·ªÅu h√≤a h·ªèng su·ªët, ch·∫•t l∆∞·ª£ng √¢m thanh k√©m.
üè∑Ô∏è Label: LABEL_0

üìù Text: Gi·∫£ng vi√™n l√™n l·ªõp ƒë√∫ng gi·ªù nh∆∞ng b√†i t·∫≠p v·ªÅ nh√† h∆°i kh√≥ v√† nhi·ªÅu.
üè∑Ô∏è Label: LABEL_0

üìù Text: Canteen b√°n ƒë·ªì ƒÉn kh√¥ng ngon m√† l·∫°i ƒë·∫Øt.
üè∑Ô∏è Label: LABEL_0

--------------------------------------------------


## 9. ƒê√°nh gi√° tr√™n t·∫≠p Test


In [20]:
print("Evaluating on Test set...")
test_results = trainer.evaluate(tokenized_datasets["test"])
print(test_results)

Evaluating on Test set...


NameError: name 'trainer' is not defined

## 10. L∆∞u model cu·ªëi c√πng


In [None]:
trainer.save_model("./draft/final_phobert_model")
print("Model saved to ./draft/final_phobert_model")

Model saved to ./final_phobert_feedback_model


## 11. Nh·∫≠n x√©t

D·ª±a tr√™n k·∫øt qu·∫£ th·ª±c nghi·ªám sau 5 epochs:

* **Accuracy:** `93.46%`
* **F1-Score:** `93.11%`
* **Eval Loss:** `0.217`

#### Nh·∫≠n x√©t chi ti·∫øt:
1.  **Hi·ªáu su·∫•t Xu·∫•t s·∫Øc:** F1-Score x·∫•p x·ªâ Accuracy cho th·∫•y m√¥ h√¨nh x·ª≠ l√Ω r·∫•t t·ªët b√†i to√°n ph√¢n lo·∫°i ƒëa l·ªõp, d·ªØ li·ªáu c√¢n b·∫±ng v√† kh√¥ng b·ªã thi√™n l·ªách (bias).
2.  **Kh·∫£ nƒÉng t·ªïng qu√°t h√≥a:** Loss th·∫•p (0.217) ch·ª©ng t·ªè m√¥ h√¨nh t·ª± tin v√†o d·ª± ƒëo√°n v√† kh√¥ng g·∫∑p hi·ªán t∆∞·ª£ng Overfitting nghi√™m tr·ªçng.
3.  **T·ªëc ƒë·ªô:** V·ªõi t·ªëc ƒë·ªô suy lu·∫≠n ~662 samples/sec, m√¥ h√¨nh ho√†n to√†n ƒë√°p ·ª©ng t·ªët c√°c y√™u c·∫ßu tri·ªÉn khai th·ª±c t·∫ø (Real-time production).