In [None]:
# Run the project's local directory-selection helper before anything else.
# NOTE(review): presumably this points the working directory at the project
# root so the relative imports and "./data/..." paths below resolve — confirm
# in misc/select_directory.py.
import misc.select_directory as sd
sd.select_directory_local()

In [None]:
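# Colab alternative (disabled): uncomment the lines below to mount Google Drive
# and switch into the project folder when running on Google Colab.
# from google.colab import drive
# import os
# drive.mount('/content/drive')
# os.chdir('/content/drive/MyDrive/minor-project')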

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
from model.dataset import QAClassifierDataset
from misc.dataset_modifier import get_json
from model.weight import compute_class_weights, LABEL2ID
from model.debertav3 import WeightedTrainer
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

In [None]:
# Checkpoint name handed to both the tokenizer and the classifier loader.
model_id = "microsoft/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Sequence-classification model configured for three output labels.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=3)

In [None]:
# Number of epochs used for the second training phase (see num_train_epochs=n).
n = 5

# NOTE(review): `original` and `validation` are read from the same file, so the
# first training phase would train and evaluate on identical data. Confirm
# whether `original` should point at a curated *training* file instead.
original = get_json("./data/curated/validation.json")
validation = get_json("./data/curated/validation.json")

combined = get_json("./data/updated/combined_set/train.json")

In [None]:
# Class weights computed separately from the combined and validation records.
class_weight_combined = compute_class_weights(combined)
class_weight_validation = compute_class_weights(validation)

# Wrap each record collection in a QAClassifierDataset together with the
# tokenizer and a set of class weights.
# NOTE(review): `original_set` is built with the weights computed from
# `combined`, not from `original` — confirm this is intentional.
original_set = QAClassifierDataset(original, tokenizer, class_weight_combined)
validation_set = QAClassifierDataset(validation, tokenizer, class_weight_validation)
combined_set = QAClassifierDataset(combined, tokenizer, class_weight_combined)

In [None]:
# Sanity check: decode the first example of every dataset back to text so the
# tokenized input can be inspected by eye.
print('--------------------------------------------Datasets--------------------------------------------')
for _banner, _split in (
    ('--------------------------------------------original--------------------------------------------', original_set),
    ('--------------------------------------------Combined--------------------------------------------', combined_set),
    ('--------------------------------------------Validation--------------------------------------------', validation_set),
):
    print(_banner)
    # `batch` is kept as a notebook-level name; after the loop it holds the
    # first validation example, exactly as before.
    batch = _split[0]
    print('============================================input Text============================================')
    print("Decoded input text:", tokenizer.decode(batch["input_ids"]))

In [None]:
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics_fn(eval_pred):
    """Compute evaluation metrics from a (logits, labels) pair.

    The predicted class of each sample is the arg-max over the last axis of
    the logits. Numeric ids are translated to label names through the inverse
    of ``LABEL2ID``; every sample is then weighted by the entry of
    ``class_weight_validation`` for its true label name.

    Args:
        eval_pred: Two-element iterable ``(logits, labels)``. ``logits`` must
            provide ``.argmax(axis=-1)``; ``labels`` is an iterable of ids
            that appear as values in ``LABEL2ID``.

    Returns:
        dict with two entries:
            ``"weighted_macro_f1"`` – macro-averaged F1 over label names,
            computed with the per-sample weights described above.
            ``"accuracy"`` – unweighted accuracy over the numeric ids.
    """
    raw_scores, gold_ids = eval_pred
    predicted_ids = raw_scores.argmax(axis=-1)

    # Inverse lookup: numeric id -> label name.
    name_of = dict(zip(LABEL2ID.values(), LABEL2ID.keys()))
    gold_names = [name_of[idx] for idx in gold_ids]
    predicted_names = [name_of[idx] for idx in predicted_ids]

    # One weight per sample, chosen by the sample's true label.
    per_sample_weight = np.array(
        [class_weight_validation[name] for name in gold_names]
    )

    weighted_f1 = f1_score(
        gold_names,
        predicted_names,
        average="macro",
        sample_weight=per_sample_weight,
    )
    accuracy = accuracy_score(gold_ids, predicted_ids)

    return {"weighted_macro_f1": weighted_f1, "accuracy": accuracy}

In [None]:
# Two-phase fine-tuning of the same `model` instance:
#   phase 1 trains for one epoch on `original_set`,
#   phase 2 continues for `n` epochs on `combined_set`.
# Both phases evaluate on `validation_set` with `compute_metrics_fn`.

# TrainingArguments settings that are identical in both phases.
shared_training_kwargs = dict(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    report_to="tensorboard",
)

# ---------------------------------------------------------------- Phase 1 ----
trainer = WeightedTrainer(
    model=model,
    args=TrainingArguments(
        output_dir="./ckpts/phase1",
        num_train_epochs=1,
        load_best_model_at_end=False,
        **shared_training_kwargs,
    ),
    train_dataset=original_set,
    eval_dataset=validation_set,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_fn,
)

trainer.train()

metrics = trainer.evaluate()
print("Phase 1 eval metrics:", metrics)

# ---------------------------------------------------------------- Phase 2 ----
trainer = WeightedTrainer(
    model=model,
    args=TrainingArguments(
        output_dir="./ckpts/phase2",
        num_train_epochs=n,  # remaining epochs
        load_best_model_at_end=True,
        # Must match a key returned by compute_metrics_fn. That function
        # reports "weighted_macro_f1" (and "accuracy"); the previous value
        # "macro_f1" does not exist in the metrics dict, so best-checkpoint
        # tracking failed with a KeyError on "eval_macro_f1".
        metric_for_best_model="weighted_macro_f1",
        greater_is_better=True,
        **shared_training_kwargs,
    ),
    train_dataset=combined_set,
    eval_dataset=validation_set,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_fn,
)

trainer.train()

metrics = trainer.evaluate()
print("Phase 2 eval metrics:", metrics)