In [1]:
import numpy as np
import pandas as pd 
from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from trl import SFTTrainer
import os,torch, wandb, platform, gradio, warnings
from pyvi import ViTokenizer


In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [3]:
import torch
import transformers
torch.__version__, transformers.__version__

('2.7.0+cu118', '4.51.3')

In [4]:
torch.cuda.is_available()

True

### load dataset

In [5]:
filename_train = "data_pseudo_vireport/train_data_high.csv"
filename_test = "data_pseudo_vireport/final_test.csv"

data_train = pd.read_csv(filename_train)
data_test = pd.read_csv(filename_test)
data_train

Unnamed: 0,Sentences,labels
0,Thêm dấu phẩy sau trên 700 nghìn tỷ đồng vốn đ...,0
1,Lựa chọn cổ phiếu niêm yết có chỉ số PB cao đư...,0
2,Chính sách bảo hiểm nông nghiệp là tia hy vọng...,2
3,Ngoài ra Vingroup cam kết phát triển các dự án...,2
4,Diện tích trồng cà gai leo tại địa phương là k...,1
...,...,...
37631,Chúng tôi là một trong những công ty Thụy Sĩ đ...,1
37632,Chúng ta sẽ đạt được mục tiêu này bằng cách tr...,1
37633,Điều này có nghĩa là cải thiện hiệu quả điện n...,1
37634,Năm 2021 chúng tôi đã tiến hành bước đầu tiên ...,1


In [6]:
data_train['labels'].value_counts()

labels
2    14155
0     9511
3     8203
1     5767
Name: count, dtype: int64

In [7]:
data_test['labels'].value_counts()

labels
2    996
3    924
1    570
0    510
Name: count, dtype: int64

### load FinBERT pretrained model
The pretrained FinBERT model path on Huggingface is https://huggingface.co/yiyanghkust/finbert-pretrain

In [8]:
import torch
tokenizer = ElectraTokenizer.from_pretrained('NlpHUST/vi-electra-small')
model = ElectraForSequenceClassification.from_pretrained('NlpHUST/vi-electra-small',num_labels=4)
model.cuda()

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at NlpHUST/vi-electra-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(62000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Li

In [9]:
# for param in model.bert.parameters():
#     param.requires_grad = False

# Keep only the classification head trainable
for param in model.classifier.parameters():
    param.requires_grad = True

### prepare dataset for fine-tuning

In [10]:
from datasets import Dataset
from gensim.utils import simple_preprocess

def preprocess_text(text):
    tokens = simple_preprocess(text)
    # Ghép lại thành câu
    text = ' '.join(tokens)
    # Tách từ kiểu tiếng Việt (giữ định dạng từ ghép có dấu gạch dưới)
    text = ViTokenizer.tokenize(text)
    return text

# Tiền xử lý cột văn bản trong từng DataFrame
data_train['Sentences'] = data_train['Sentences'].apply(preprocess_text)
data_test['Sentences'] = data_test['Sentences'].apply(preprocess_text)

# Chuyển sang Dataset của HuggingFace
dataset_train = Dataset.from_pandas(data_train)
dataset_val = Dataset.from_pandas(data_train)


dataset_train = dataset_train.map(lambda e: tokenizer(e['Sentences'], truncation=True, padding='max_length', max_length=128), batched=True)
dataset_val = dataset_val.map(lambda e: tokenizer(e['Sentences'], truncation=True, padding='max_length', max_length=128), batched=True)

dataset_train.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
dataset_val.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/37636 [00:00<?, ? examples/s]

Map:   0%|          | 0/37636 [00:00<?, ? examples/s]

In [11]:
dataset_train

Dataset({
    features: ['Sentences', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 37636
})

In [12]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
data_collator = DataCollatorWithPadding(tokenizer)

# Tạo DataLoader
train_dataloader = DataLoader(
    dataset_train, 
    batch_size=32, 
    collate_fn=data_collator,
    shuffle=True
)

# Kiểm tra kích thước batch
for batch in train_dataloader:
    print("Input IDs shape:", batch["input_ids"].shape)
    print("Labels shape:", batch["labels"].shape)
    print((batch["labels"][0]))
    break


Input IDs shape: torch.Size([32, 128])
Labels shape: torch.Size([32])
tensor(3)


### define training options

In [13]:
wandb.login(key="e7056d3a3bc855a6e0d38b8c4ff7d2ec7ce895af")
run = wandb.init(project='viELECTRA', job_type="training", anonymous="allow")


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/pc/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtrung235689[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
print(model.config.num_labels)  


4


In [22]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy' : accuracy_score(predictions, labels)}

args = TrainingArguments(
        output_dir = 'temp/',
        eval_strategy = 'epoch',
        save_strategy = 'epoch',
        learning_rate= 2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=10,  # 1 epoch
        weight_decay=0.005,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
)


trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=args,                  # training arguments, defined above
        train_dataset=dataset_train,         # training dataset
        eval_dataset=dataset_val,            # evaluation dataset
        compute_metrics=compute_metrics
)

trainer.train()   

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1616,0.223027,0.934956
2,0.1581,0.151643,0.953157
3,0.1606,0.130092,0.960517
4,0.1736,0.159131,0.949968
5,0.1777,0.068444,0.98071
6,0.1867,0.087156,0.971782
7,0.1716,0.084879,0.972712
8,0.1534,0.048593,0.986795
9,0.1423,0.049574,0.985891
10,0.1354,0.050178,0.985785


TrainOutput(global_step=11770, training_loss=0.16118919288291705, metrics={'train_runtime': 1420.2293, 'train_samples_per_second': 264.999, 'train_steps_per_second': 8.287, 'total_flos': 2768242574745600.0, 'train_loss': 0.16118919288291705, 'epoch': 10.0})












### evaluate on testing set

In [23]:
model.eval()
trainer.predict(dataset_val).metrics

{'test_loss': 0.04859252646565437,
 'test_accuracy': 0.9867945584015304,
 'test_runtime': 32.0251,
 'test_samples_per_second': 1175.205,
 'test_steps_per_second': 36.752}

### save the fine-tuned model

In [24]:
trainer.save_model('VIELECTRA_train/')
tokenizer.save_pretrained("./VIELECTRA_train/")


Saving vocabulary to ./VIELECTRA_train/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!


('./VIELECTRA_train/tokenizer_config.json',
 './VIELECTRA_train/special_tokens_map.json',
 './VIELECTRA_train/vocab.txt',
 './VIELECTRA_train/added_tokens.json')

In [25]:
model.eval()
predictions = trainer.predict(dataset_val)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# --- 9. Hàm tính accuracy theo từng lớp ---
def save_per_class_accuracy(y_true, y_pred, class_names=None, output_path="viELECTRA_per_class_accuracy.csv"):
    if isinstance(y_true, torch.Tensor):
        y_true = y_true.cpu().numpy()
    if isinstance(y_pred, torch.Tensor):
        y_pred = y_pred.cpu().numpy()

    labels = np.unique(y_true)
    acc_per_class = []

    for label in labels:
        idx = y_true == label
        acc = accuracy_score(y_true[idx], y_pred[idx])
        class_label = class_names[label] if class_names else f"Class {label}"
        acc_per_class.append((class_label, acc))

    df = pd.DataFrame(acc_per_class, columns=["Class", "Accuracy"])
    print(df)
    df.to_csv(output_path, index=False)

# --- 10. Gọi hàm ---
class_names = ["0", "1", "2", "3"]  # bạn có thể thay bằng tên thật của từng lớp nếu có
save_per_class_accuracy(labels, preds, class_names)

  Class  Accuracy
0     0  0.977605
1     1  0.994625
2     2  0.992441
3     3  0.982202


In [26]:
from sklearn.metrics import accuracy_score
overall_accuracy = accuracy_score(labels, preds)
overall_accuracy

0.9867945584015304