In [None]:
import evaluate
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from datasets import Dataset

In [None]:
def load_train_data(file_path, label):
    """Read one training file and tag every non-empty line with ``label``.

    Args:
        file_path: Path to a UTF-8 text file holding one sample per line.
        label: Value stored under the "label" key of every returned sample.

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts in file order. Lines
        that are empty after stripping surrounding whitespace are skipped.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [{"text": sample, "label": label} for sample in stripped_lines if sample]

In [None]:
# Gather the three per-class training files into one list, tagging every sample
# with the integer class id paired with its file (negative=0, neutral=1, positive=2).
labelled_files = [
    ("SA2016-training_data/SA-training_negative.txt", 0),
    ("SA2016-training_data/SA-training_neutral.txt", 1),
    ("SA2016-training_data/SA-training_positive.txt", 2),
]

train_data = []
for training_file, class_id in labelled_files:
    train_data.extend(load_train_data(training_file, class_id))

In [None]:
# Wrap the list of {"text", "label"} dicts in a `datasets.Dataset` so it can be
# tokenized with `.map(...)` in a later cell.
train_dataset = Dataset.from_list(train_data)

In [None]:
def load_test_data(file_path):
    """Parse a test file in which texts and their labels alternate line by line.

    Blank lines are dropped first; the remaining lines are then consumed in
    pairs, the first of each pair being the text and the second its label.

    Args:
        file_path: Path to a UTF-8 text file.

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts in file order. The
        label is the raw string read from the file; if the file ends on a
        text line with no label line after it, that sample's label is ``None``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        non_blank = [stripped for stripped in (raw.strip() for raw in handle) if stripped]

    texts = non_blank[0::2]
    labels = non_blank[1::2]
    if len(labels) < len(texts):
        # Odd number of non-blank lines: the trailing text has no label line.
        labels.append(None)

    return [{"text": text, "label": true_label} for text, true_label in zip(texts, labels)]

# Load the annotated test file (alternating text / label lines) and wrap it in a
# `datasets.Dataset`. Labels are still raw strings at this point.
test_data = load_test_data("SA2016-TestData-Ans/test_raw_ANS.txt")
test_dataset = Dataset.from_list(test_data)

In [None]:
# PhoBERT tokenizer, explicitly requesting the slow (non-fast) implementation.
# NOTE(review): PhoBERT's documentation expects word-segmented Vietnamese input;
# confirm the SA2016 files are already segmented before tokenizing them here.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2", use_fast=False)


def tokenize_function(examples):
    """Tokenize a batch of samples to a fixed length of 128 tokens.

    Args:
        examples: A batch from `Dataset.map(..., batched=True)`; its "text"
            entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, padded to ``max_length`` and
        truncated to 128 tokens.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["text"], **encoding_options)


# Tokenize the training split first, then the test split.
tokenized_train, tokenized_test = (
    raw_split.map(tokenize_function, batched=True)
    for raw_split in (train_dataset, test_dataset)
)

# String class tags -> integer class ids.
label_mapping = {"NEG": 0, "NEU": 1, "POS": 2}


def convert_label(example):
    """Replace a string label with its integer id from ``label_mapping``.

    The example dict is modified in place and also returned. Labels that are
    not strings (e.g. already an int, or ``None``) are left untouched.

    Args:
        example: A single sample dict with a "label" entry.

    Returns:
        The same ``example`` dict.

    Raises:
        KeyError: If the label is a string that is not in ``label_mapping``.
    """
    raw_label = example["label"]
    if not isinstance(raw_label, str):
        return example
    example["label"] = label_mapping[raw_label]
    return example

# Map the test split's string labels to integer ids, one example at a time.
tokenized_test = tokenized_test.map(convert_label)

# Expose only the model inputs and the label, formatted as torch tensors.
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Load the pretrained PhoBERT checkpoint with a sequence-classification
# configuration for 3 labels.
checkpoint_name = "vinai/phobert-base-v2"
config = AutoConfig.from_pretrained(checkpoint_name, num_labels=3)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, config=config)

In [None]:
training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    # Optimisation schedule.
    num_train_epochs=20,
    learning_rate=1e-5,
    warmup_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # After training, reload the checkpoint with the highest eval_accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
)

In [None]:
# Accuracy metric object from the `evaluate` library.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer from a (logits, labels) pair.

    Args:
        eval_pred: A two-element iterable unpacked as ``(logits, labels)``.

    Returns:
        The result of ``accuracy_metric.compute`` for the arg-max class of
        each row of logits against ``labels``.
    """
    class_scores, gold_labels = eval_pred
    predicted_classes = class_scores.argmax(axis=-1)
    return accuracy_metric.compute(predictions=predicted_classes, references=gold_labels)

# Expose only the model inputs and the label as torch tensors on both splits.
torch_columns = ["input_ids", "attention_mask", "label"]
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format("torch", columns=torch_columns)

In [None]:
# Hold out part of the training data for the per-epoch evaluation.
# `training_args` selects the final checkpoint by eval_accuracy
# (load_best_model_at_end=True), so evaluating on the test set during training
# would choose the checkpoint using test labels and bias the reported test
# accuracy upward. The test set is only scored explicitly after training.
# `train_test_split` shuffles by default; the fixed seed keeps the split reproducible.
train_val_split = tokenized_train.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val_split["train"],
    eval_dataset=train_val_split["test"],  # validation split, not the test set
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the settings in `training_args`; the return value is left
# as the cell's last expression so the notebook displays it.
trainer.train()

In [None]:
# Evaluate the trainer's current model on the test split and report its accuracy.
test_eval_result = trainer.evaluate(eval_dataset=tokenized_test)
test_accuracy = test_eval_result["eval_accuracy"]
print("Test Accuracy:", test_accuracy)

In [None]:
# Persist the model weights/config and the tokenizer files side by side so the
# folder can later be passed to `from_pretrained`.
save_folder = "models/phobert-base-v2-3"
os.makedirs(save_folder, exist_ok=True)  # no error if the folder already exists

model.save_pretrained(save_folder)
tokenizer.save_pretrained(save_folder)

In [None]:
# Reload a saved model and tokenizer from disk, replacing the in-memory `model`
# and `tokenizer`.
# NOTE(review): this reads "models/phobert-base-v2-2", whereas the save cell in
# this notebook writes to "models/phobert-base-v2-3" — confirm that loading a
# different (previously saved) run is intended; this notebook never creates it.
load_folder = "models/phobert-base-v2-2"

model = AutoModelForSequenceClassification.from_pretrained(load_folder)
# NOTE(review): the tokenizer was first created with use_fast=False; that flag
# is not passed here — confirm whether it should be.
tokenizer = AutoTokenizer.from_pretrained(load_folder)

In [None]:
# Build a Trainer around the reloaded model with no training data; it is used
# here only to run evaluation on the test split.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_test,
)

test_eval_result = trainer.evaluate(eval_dataset=tokenized_test)
test_accuracy = test_eval_result["eval_accuracy"]
print("Test Accuracy:", test_accuracy)