In [1]:
import os
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding,
    set_seed
)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Seed the random number generators up front so repeated runs are comparable.
set_seed(42)

# Location of the already-tokenized dataset on disk (relative to this notebook).
dataset_path = "../data/tokenized_dataset"

# Fail fast with an actionable message when the dataset has not been built yet.
if not os.path.exists(dataset_path):
    raise FileNotFoundError("❌ Tokenized dataset not found. Run data_preprocessing.py first.")

dataset = load_from_disk(dataset_path)
print("✅ Tokenized dataset loaded from disk.")

In [None]:
# Checkpoint shared by the tokenizer and the classifier so both use the same vocabulary.
model_ckpt = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Sequence-classification model configured for three output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=3,
)

In [None]:
def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into accuracy / F1 / precision / recall.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores with the class dimension last, or a tuple whose
            first element is that array; ``labels`` holds the true class ids.

    Returns:
        dict: ``"accuracy"``, ``"f1"``, ``"precision"`` and ``"recall"``.
        The last three are computed with ``average="weighted"``.
    """
    predictions, labels = eval_pred

    # Models that return extra outputs alongside the logits produce a tuple
    # here; the class scores are its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Pick the highest-scoring class for every example.
    preds = np.argmax(predictions, axis=-1)

    # zero_division=0 keeps the numeric result of the default behaviour (0 for
    # a class that is never predicted / never present) without emitting an
    # UndefinedMetricWarning on every evaluation pass.
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, **weighted),
        "precision": precision_score(labels, preds, **weighted),
        "recall": recall_score(labels, preds, **weighted),
    }

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="../models/trained_model",
    logging_dir="../models/logs",
    logging_steps=10,
    # Training schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; keep at most two checkpoints.
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # After training, reload the checkpoint selected by the "f1" metric.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [None]:
# Collator built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire the model, hyper-parameters, data splits and metric function together.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)


In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

# Persist the resulting model and the tokenizer to their own directories.
trainer.save_model(output_dir="../models/trained_model")
tokenizer.save_pretrained(save_directory="../models/tokenizer")


In [None]:
# Score the trained model on the test split and show the metrics.
results = trainer.evaluate(eval_dataset=dataset["test"])
print("Test Results:", results)