In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import json
import torch

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the raw question-answering records from disk.
# If the file is not valid JSON, print roughly 100 characters on either side of
# the failure position to help locate the problem, then re-raise.
file_path = "/content/ori_pqaa.json"
try:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
except json.JSONDecodeError as e:
    print(f"JSON Error: {e}")
    # e.doc is the text that failed to parse, so the file need not be re-read.
    start = max(0, e.pos - 100)
    end = min(len(e.doc), e.pos + 100)
    print(f"Error context: {e.doc[start:end]}")
    # Re-raise: without it `data` stays undefined and the script would die
    # further down with an unrelated-looking NameError.
    raise

# Flatten every record into a {"question", "context", "answer"} row.
# The record's CONTEXTS entries are joined into one space-separated string and
# final_decision is encoded as an integer: "yes" -> 0, "no" -> 1, anything else -> 2.
samples = []
for record in data.values():
    joined_context = " ".join(record["CONTEXTS"])
    decision = record["final_decision"]
    if decision == "yes":
        label_id = 0
    elif decision == "no":
        label_id = 1
    else:
        label_id = 2
    samples.append(
        {
            "question": record["QUESTION"],
            "context": joined_context,
            "answer": label_id,
        }
    )

# Wrap the rows in a Dataset and hold out 10% of them for evaluation.
# The fixed seed keeps the train/test partition identical across runs, so
# evaluation numbers from different runs are measured on the same examples.
dataset = Dataset.from_list(samples)
dataset = dataset.train_test_split(test_size=0.1, seed=42)

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")


def tokenize_function(examples):
    """Tokenize a batch of question/context pairs.

    Args:
        examples: Mapping with 'question' and 'context' entries; both are
            passed to the tokenizer as the first and second sequence.

    Returns:
        The tokenizer output for the pairs, produced with truncation enabled
        and padding to ``max_length=512``.
    """
    questions = examples['question']
    contexts = examples['context']
    return tokenizer(
        questions,
        contexts,
        max_length=512,
        padding='max_length',
        truncation=True,
    )

# Tokenize both splits in batches, then expose the integer target under the
# column name "labels" instead of "answer".
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
).rename_column("answer", "labels")

# Load the checkpoint with a 3-way classification head (one output per label
# id 0/1/2) and place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=3,
).to(device)

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="./results_biobert_colab",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # fp16 mixed precision needs a GPU; requesting it on a CPU-only host makes
    # TrainingArguments raise, so enable it only when CUDA is available.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Fine-tune on the "train" split, evaluating on the "test" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)
trainer.train()

# Persist the resulting model, then print the final evaluation metrics.
trainer.save_model("./results_biobert_colab")
results = trainer.evaluate()
print(results)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Per-epoch losses, entered by hand.
epochs = [1, 2, 3]
training_loss = [0.100900, 0.057400, 0.033800]
validation_loss = [0.091560, 0.103264, 0.141920]

plt.figure(figsize=(10, 6))

# One line per curve: (values, colour, marker, legend label).
curves = (
    (training_loss, '#1f77b4', 'o', 'Training Loss'),      # blue
    (validation_loss, '#ff7f0e', 's', 'Validation Loss'),  # orange
)
for losses, colour, marker_shape, legend_label in curves:
    plt.plot(
        epochs,
        losses,
        color=colour,
        marker=marker_shape,
        label=legend_label,
        linewidth=2,
        markersize=8,
    )

plt.title('Training vs Validation Loss', fontsize=14)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)

# Write each point's value (4 decimals) just above its marker.
for epoch, train_value, val_value in zip(epochs, training_loss, validation_loss):
    for loss_value in (train_value, val_value):
        plt.annotate(
            f'{loss_value:.4f}',
            (epoch, loss_value),
            textcoords="offset points",
            xytext=(0, 10),
            ha='center',
            fontsize=9,
        )

plt.xticks(epochs)
plt.tight_layout()
plt.savefig('training_results.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()