In [8]:
!pip install transformers datasets accelerate evaluate -q

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ============== LOAD DATA ==============
df_train = pd.read_csv("train.csv")
df_val   = pd.read_csv("val.csv")
df_test  = pd.read_csv("test.csv")

# Dataset HuggingFace format
train_dataset = Dataset.from_pandas(df_train)
val_dataset   = Dataset.from_pandas(df_val)
test_dataset  = Dataset.from_pandas(df_test)

# ============== TOKENIZER ==============
model_name = "vinai/phobert-base"   # hoặc "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# map labels -> id
labels = sorted(df_train["label"].unique())
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

def encode(examples):
    # input = context + prompt
    texts = [str(c) + " " + str(p) for c, p in zip(examples["context"], examples["prompt"])]
    enc = tokenizer(texts, truncation=True, padding="max_length", max_length=128)
    enc["labels"] = [label2id[l] for l in examples["label"]]
    return enc

train_dataset = train_dataset.map(encode, batched=True)
val_dataset   = val_dataset.map(encode, batched=True)
test_dataset  = test_dataset.map(encode, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
val_dataset.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
test_dataset.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

# ============== MODEL ==============
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# ============== METRICS ==============
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# ============== TRAINING ==============
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",     # ✅ đổi từ evaluation_strategy
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# ============== EVALUATE ==============
results = trainer.evaluate(test_dataset)
print("Test set results:", results)


Map:   0%|          | 0/142 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.069381,0.4375,0.328526,0.4375,0.329861
2,No log,1.038321,0.625,0.589286,0.625,0.5625
3,No log,1.038259,0.5,0.329861,0.5,0.393669
4,No log,0.965112,0.625,0.638889,0.625,0.584821
5,No log,0.91404,0.6875,0.69375,0.6875,0.6875
6,No log,0.886102,0.6875,0.716071,0.6875,0.690657
7,No log,0.846293,0.6875,0.71875,0.6875,0.671474
8,No log,0.836949,0.5625,0.379464,0.5625,0.452724
9,No log,0.793015,0.8125,0.810417,0.8125,0.806818
10,No log,0.796783,0.8125,0.810417,0.8125,0.806818


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test set results: {'eval_loss': 0.9287965893745422, 'eval_accuracy': 0.525, 'eval_precision': 0.5664473684210527, 'eval_recall': 0.525, 'eval_f1': 0.5179976851851851, 'eval_runtime': 0.7798, 'eval_samples_per_second': 51.295, 'eval_steps_per_second': 6.412, 'epoch': 10.0}


In [9]:
results = trainer.evaluate(test_dataset)
print("\n📊 Evaluation on Test set:")
for k, v in results.items():
    print(f"{k}: {v:.4f}")


📊 Evaluation on Test set:
eval_loss: 0.9288
eval_accuracy: 0.5250
eval_precision: 0.5664
eval_recall: 0.5250
eval_f1: 0.5180
eval_runtime: 0.9717
eval_samples_per_second: 41.1650
eval_steps_per_second: 5.1460
epoch: 10.0000
