In [1]:
import os
import math
import random
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.metrics import f1_score, classification_report
from IPython.display import display

In [2]:
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
MODEL_NAME = 'vinai/phobert-base'
DATA_DIR = '.'
TRAIN_PATH = os.path.join(DATA_DIR, 'train_data.csv')
VAL_PATH = os.path.join(DATA_DIR, 'val_data.csv')
TEST_PATH = os.path.join(DATA_DIR, 'test_data.csv')
ASPECT_COLUMNS = ['Price','Shipping','Outlook','Quality','Size','Shop_Service','General','Others']
LABEL_MAP = {-1: 0, 0: 1, 1: 2}
LABEL_NAMES = ['positive_or_present','neutral','negative']
IGNORE_INDEX = -100
NUM_LABELS = len(LABEL_MAP)
MAX_LENGTH = 256
RANDOM_SEED = 123
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(RANDOM_SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Doc du lieu csv va hien thi thong tin tong quat
train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)
print(f'Kích thước tập train: {train_df.shape}')
print(f'Kích thước tập validation: {val_df.shape}')
print(f'Kích thước tập test: {test_df.shape}')
print('Các cột trong file:', list(train_df.columns))
print('\nKiểu dữ liệu:')
print(train_df.dtypes)
print('\nMẫu bản ghi train:')
display(train_df.head(5))
print('\nMẫu bản ghi validation:')
display(val_df.head(5))
print('\nMẫu bản ghi test:')
display(test_df.head(5))

Kích thước tập train: (8424, 9)
Kích thước tập validation: (936, 9)
Kích thước tập test: (2340, 9)
Các cột trong file: ['Review', 'Price', 'Shipping', 'Outlook', 'Quality', 'Size', 'Shop_Service', 'General', 'Others']

Kiểu dữ liệu:
Review          object
Price            int64
Shipping         int64
Outlook          int64
Quality          int64
Size             int64
Shop_Service     int64
General          int64
Others           int64
dtype: object

Mẫu bản ghi train:


Unnamed: 0,Review,Price,Shipping,Outlook,Quality,Size,Shop_Service,General,Others
0,"Giày đẹp, đi êm lắm",-1,-1,1,1,-1,-1,-1,-1
1,Mình săn sale với giá khá rẻ Chất lượng ok Sh...,1,1,-1,1,-1,-1,-1,-1
2,Hình ảnh và video chỉ mang tính chất minh họa ...,1,1,-1,-1,-1,-1,1,-1
3,Mình đặt size 39 nhưng chật k đeo nổi. Còn giầ...,-1,1,1,-1,0,-1,-1,-1
4,Nên mua nha mọi người đẹp xuất sắc lun ạ,-1,-1,1,-1,-1,-1,-1,-1



Mẫu bản ghi validation:


Unnamed: 0,Review,Price,Shipping,Outlook,Quality,Size,Shop_Service,General,Others
0,Hàng đẹp so với giá nha .nhưng đóng gói không ...,-1,-1,1,-1,-1,0,-1,-1
1,Dịch vụ khách hàng giới thiệu sản phẩm rất tốt...,-1,-1,-1,-1,-1,1,-1,-1
2,giao nhanh giày đẹp êm chân nhưng from giày hơ...,-1,1,1,1,2,-1,-1,-1
3,Ngoài ngắm cực phẩm thi đấu trong sân thì ngoà...,-1,-1,-1,-1,-1,-1,-1,2
4,Dài xinh nha from lên đẹp xỉu êm chân lắm nha ...,1,-1,1,-1,-1,-1,-1,-1



Mẫu bản ghi test:


Unnamed: 0,Review,Price,Shipping,Outlook,Quality,Size,Shop_Service,General,Others
0,"Giày hơi có mùi nồng, lưu ý đôi LA không phải ...",-1,-1,-1,2,-1,-1,2,-1
1,Hàng về đẹp lắm nha ship thân thiện đi giày vừ...,-1,1,1,-1,-1,-1,-1,-1
2,Hàng ôk nên mua Dày rất đẹp,-1,-1,1,-1,-1,-1,2,-1
3,Bun. GTI gửi Oke sớ ơ đi sidbd. Bởi đi được đ...,-1,-1,-1,-1,-1,-1,-1,2
4,Màu đẹp giống trong hình mọi người nên mua nha...,-1,-1,1,-1,-1,-1,-1,-1


In [4]:
# Tong hop phan bo nhan cho tung khia canh
def summarize_aspects(df: pd.DataFrame, split_name: str):
    print(f'\n=== TẬP {split_name.upper()} ===')
    for col in ASPECT_COLUMNS:
        counts = df[col].value_counts().sort_index()
        print(f'{col}: {counts.to_dict()}')
summarize_aspects(train_df, 'train')
summarize_aspects(val_df, 'validation')
summarize_aspects(test_df, 'test')


=== TẬP TRAIN ===
Price: {-1: 7173, 0: 12, 1: 929, 2: 310}
Shipping: {-1: 5912, 0: 390, 1: 1992, 2: 130}
Outlook: {-1: 3859, 0: 332, 1: 4007, 2: 226}
Quality: {-1: 5902, 0: 330, 1: 1795, 2: 397}
Size: {-1: 6986, 0: 493, 1: 609, 2: 336}
Shop_Service: {-1: 6208, 0: 494, 1: 1605, 2: 117}
General: {-1: 6780, 0: 56, 1: 984, 2: 604}
Others: {-1: 7768, 2: 656}

=== TẬP VALIDATION ===
Price: {-1: 798, 0: 1, 1: 106, 2: 31}
Shipping: {-1: 627, 0: 50, 1: 241, 2: 18}
Outlook: {-1: 433, 0: 32, 1: 454, 2: 17}
Quality: {-1: 657, 0: 35, 1: 194, 2: 50}
Size: {-1: 786, 0: 44, 1: 68, 2: 38}
Shop_Service: {-1: 693, 0: 53, 1: 180, 2: 10}
General: {-1: 748, 0: 6, 1: 103, 2: 79}
Others: {-1: 845, 2: 91}

=== TẬP TEST ===
Price: {-1: 1999, 0: 3, 1: 247, 2: 91}
Shipping: {-1: 1635, 0: 124, 1: 549, 2: 32}
Outlook: {-1: 1069, 0: 95, 1: 1118, 2: 58}
Quality: {-1: 1654, 0: 98, 1: 478, 2: 110}
Size: {-1: 1953, 0: 125, 1: 165, 2: 97}
Shop_Service: {-1: 1740, 0: 140, 1: 431, 2: 29}
General: {-1: 1861, 0: 11, 1: 285,

In [5]:
# Chuan hoa nhan va khoi tao tokenizer cung lop dataset
INV_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
def normalize_labels(frame: pd.DataFrame) -> pd.DataFrame:
    cleaned = frame.copy()
    for col in ASPECT_COLUMNS:
        cleaned[col] = cleaned[col].apply(lambda value: LABEL_MAP.get(value, IGNORE_INDEX))
    return cleaned
train_proc = normalize_labels(train_df)
val_proc = normalize_labels(val_df)
test_proc = normalize_labels(test_df)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
class ABSADataset(Dataset):
    def __init__(self, frame: pd.DataFrame, tokenizer: AutoTokenizer, max_length: int):
        self.frame = frame.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.frame)
    def __getitem__(self, idx: int):
        record = self.frame.iloc[idx]
        text = str(record['Review'])
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length)
        labels = record[ASPECT_COLUMNS].astype(int).tolist()
        item = {key: torch.tensor(value) for key, value in encoding.items()}
        item['labels'] = torch.tensor(labels, dtype=torch.long)
        return item
train_dataset = ABSADataset(train_proc, tokenizer, MAX_LENGTH)
val_dataset = ABSADataset(val_proc, tokenizer, MAX_LENGTH)
test_dataset = ABSADataset(test_proc, tokenizer, MAX_LENGTH)
print('Ví dụ bản ghi đã mã hóa:')
sample_item = train_dataset[0]
print({key: value.shape for key, value in sample_item.items()})
print('Nhãn mẫu:', sample_item['labels'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Ví dụ bản ghi đã mã hóa:
{'input_ids': torch.Size([256]), 'token_type_ids': torch.Size([256]), 'attention_mask': torch.Size([256]), 'labels': torch.Size([8])}
Nhãn mẫu: tensor([0, 0, 2, 2, 0, 0, 0, 0])


In [6]:
# Xay dung mo hinh PhoBERT multi aspect
class PhoBERTMultiAspectClassifier(nn.Module):
    def __init__(self, model_name: str, num_aspects: int, num_labels: int, ignore_index: int, dropout: float = 0.2):
        super().__init__()
        self.num_aspects = num_aspects
        self.num_labels = num_labels
        self.ignore_index = ignore_index
        self.backbone = AutoModel.from_pretrained(model_name)
        hidden_size = self.backbone.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_aspects * num_labels)
    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]
        dropped = self.dropout(pooled)
        logits = self.classifier(dropped)
        logits = logits.view(-1, self.num_aspects, self.num_labels)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=self.ignore_index)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return SequenceClassifierOutput(loss=loss, logits=logits)
model = PhoBERTMultiAspectClassifier(MODEL_NAME, len(ASPECT_COLUMNS), NUM_LABELS, IGNORE_INDEX)

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [7]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.reshape(-1, NUM_LABELS)
    labels = labels.reshape(-1)
    mask = labels != IGNORE_INDEX
    if mask.sum() == 0:
        return {'macro_f1': 0.0, 'micro_f1': 0.0}
    filtered_labels = labels[mask]
    filtered_preds = preds[mask]
    pred_ids = filtered_preds.argmax(axis=-1)
    macro_f1 = f1_score(filtered_labels, pred_ids, average='macro')
    micro_f1 = f1_score(filtered_labels, pred_ids, average='micro')
    return {'macro_f1': macro_f1, 'micro_f1': micro_f1}

_common_args = dict(
    output_dir='absa_phobert_runs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=1,
    report_to='none',
    remove_unused_columns=False
)

try:
    training_args = TrainingArguments(
        **_common_args,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='macro_f1',
        greater_is_better=True,
    )
    print("Using TrainingArguments with evaluation_strategy/save_strategy params (modern API).")
except TypeError as e:
    # Fallback for older transformers where evaluation_strategy isn't accepted as kwarg.
    # Create without load_best_model_at_end/eval/save strategy to skip __post_init__ validation,
    # then set attributes manually on the created object.
    print("Falling back to compatibility mode (old transformers). Error:", e)
    # create without the problematic keywords
    training_args = TrainingArguments(**_common_args)

    # set attributes manually AFTER init to avoid __post_init__ validation errors
    # Use strings here ('epoch') — Trainer and downstream code normally read these attributes.
    setattr(training_args, "evaluation_strategy", "epoch")
    setattr(training_args, "save_strategy", "epoch")
    setattr(training_args, "load_best_model_at_end", True)
    setattr(training_args, "metric_for_best_model", "macro_f1")
    setattr(training_args, "greater_is_better", True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

train_result = trainer.train()
trainer.save_state()
print(train_result)

Falling back to compatibility mode (old transformers). Error: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'


  trainer = Trainer(


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Step,Training Loss
50,0.9402
100,0.5826
150,0.5401
200,0.4831
250,0.4056
300,0.3762
350,0.348
400,0.3166
450,0.2947
500,0.2757


TrainOutput(global_step=1581, training_loss=0.28589816699670434, metrics={'train_runtime': 508.2086, 'train_samples_per_second': 49.728, 'train_steps_per_second': 3.111, 'total_flos': 0.0, 'train_loss': 0.28589816699670434, 'epoch': 3.0})


In [8]:
# Danh gia mo hinh tren validation va test
def evaluate_split(dataset, split_name: str):
    metrics = trainer.evaluate(eval_dataset=dataset)
    print(f'\nChỉ số {split_name.title()}: {metrics}')
    predictions = trainer.predict(dataset)
    logits = predictions.predictions
    labels = predictions.label_ids
    pred_ids = logits.reshape(-1, NUM_LABELS).argmax(axis=-1)
    flat_labels = labels.reshape(-1)
    mask = flat_labels != IGNORE_INDEX
    if mask.sum() == 0:
        print('Không có mẫu hợp lệ để đánh giá.')
        return
    masked_preds = pred_ids[mask]
    masked_labels = flat_labels[mask]
    report = classification_report(masked_labels, masked_preds, target_names=LABEL_NAMES, digits=4)
    print(f'Báo cáo phân loại ({split_name}):')
    print(report)
evaluate_split(val_dataset, 'validation')
evaluate_split(test_dataset, 'test')


Chỉ số Validation: {'eval_loss': 0.1900913417339325, 'eval_macro_f1': 0.8420171232188313, 'eval_micro_f1': 0.9415711490075482, 'eval_runtime': 3.8501, 'eval_samples_per_second': 243.107, 'eval_steps_per_second': 7.792, 'epoch': 3.0}
Báo cáo phân loại (validation):
                     precision    recall  f1-score   support

positive_or_present     0.9620    0.9699    0.9660      5587
            neutral     0.7640    0.6154    0.6817       221
           negative     0.8794    0.8774    0.8784      1346

           accuracy                         0.9416      7154
          macro avg     0.8685    0.8209    0.8420      7154
       weighted avg     0.9403    0.9416    0.9407      7154




Chỉ số Test: {'eval_loss': 0.18383270502090454, 'eval_macro_f1': 0.8520630769313473, 'eval_micro_f1': 0.9437845072778986, 'eval_runtime': 9.4442, 'eval_samples_per_second': 247.771, 'eval_steps_per_second': 7.835, 'epoch': 3.0}
Báo cáo phân loại (test):
                     precision    recall  f1-score   support

positive_or_present     0.9646    0.9700    0.9673     14062
            neutral     0.8274    0.6191    0.7083       596
           negative     0.8712    0.8903    0.8806      3273

           accuracy                         0.9438     17931
          macro avg     0.8877    0.8265    0.8521     17931
       weighted avg     0.9430    0.9438    0.9429     17931



In [9]:
# Luu mo hinh va thu inference tren review mau
FINAL_DIR = 'absa_phobert_model'
trainer.save_model(FINAL_DIR)
tokenizer.save_pretrained(FINAL_DIR)
def predict_aspects(review_text: str):
    model.eval()
    encoded = tokenizer(review_text, truncation=True, padding='max_length', max_length=MAX_LENGTH, return_tensors='pt')
    model_device = next(model.parameters()).device
    encoded = {key: value.to(model_device) for key, value in encoded.items()}
    with torch.no_grad():
        outputs = model(**encoded)
        preds = outputs.logits.argmax(dim=-1).squeeze(0).tolist()
    decoded = {aspect: INV_LABEL_MAP.get(int(pred), None) for aspect, pred in zip(ASPECT_COLUMNS, preds)}
    return decoded
sample_review = 'Giày đẹpppp nha mn Nênnnn muaaa ạ mình mua sz 37 vừa chân luônn'
prediction = predict_aspects(sample_review)
print('Dự đoán cảm xúc trên từng khía cạnh:')
for aspect, value in prediction.items():
    print(f'{aspect}: {value}')

Dự đoán cảm xúc trên từng khía cạnh:
Price: -1
Shipping: -1
Outlook: 1
Quality: -1
Size: 1
Shop_Service: -1
General: -1
Others: -1


In [10]:
import shutil

# Giả sử folder tên 'my_model'
shutil.make_archive('absa_phobert_model', 'zip', 'absa_phobert_model')


'/content/absa_phobert_model.zip'