In [73]:
!pip install -qU transformers
!pip install -qU evaluate

In [74]:
# define model
MODEL_PATH = "aubmindlab/bert-large-arabertv2"

In [75]:
import pickle

# read the train/test data
with open("data/multilabel_train.pkl", "rb") as f:
    X_train, y_train = pickle.load(f)

with open("data/multilabel_test.pkl", "rb") as f:
    X_test, y_test = pickle.load(f)

class2id = {
    class_: id
    for id, class_ in enumerate(
        ["bug_report", "improvement_request", "rating", "others"]
    )
}
id2class = {id: class_ for class_, id in class2id.items()}



In [76]:
from transformers import AutoTokenizer
import numpy as np
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)


y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

# preprocess the data for training, create dataset, tokenize
train_ds = Dataset.from_dict({"text": X_train, "labels": y_train.tolist()}).map(
    lambda ex: tokenizer(ex["text"], truncation=True, max_length=39),
    batched=True,
)

test_ds = Dataset.from_dict({"text": X_test, "labels": y_test.tolist()}).map(
    lambda ex: tokenizer(ex["text"], truncation=True, max_length=39),
    batched=True,
)

# split the train into train/val
split = train_ds.train_test_split(test_size=0.15, seed=777)

# reassign
train_ds = split["train"]
val_ds = split["test"]

# more effecint
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/656k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/2227 [00:00<?, ? examples/s]

Map:   0%|          | 0/557 [00:00<?, ? examples/s]

In [77]:
import evaluate
import numpy as np

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def compute_metrics(eval_pred):

    predictions, labels = eval_pred
    predictions = sigmoid(predictions)
    predictions = (predictions > 0.5).astype(int).reshape(-1)
    return clf_metrics.compute(
        predictions=predictions, references=labels.astype(int).reshape(-1)
    )


import torch
from transformers import Trainer

eps = 1e-8
y_arr = torch.tensor(y_train, dtype=torch.float32)

# for weighted loss
pos_weight = (y_arr.shape[0] - y_arr.sum(axis=0)) / (y_arr.sum(axis=0) + eps)
pos_weight = torch.tensor(pos_weight, dtype=torch.float32).to("cuda")  # or your device


class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # BCEWithLogitsLoss with per-class pos_weight
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

  pos_weight = torch.tensor(pos_weight, dtype=torch.float32).to("cuda")  # or your device


In [78]:
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

# set up the model, training args
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=4,
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
)

training_args = TrainingArguments(
    output_dir="FineTuned_Model",
    run_name="FineTuned_Model",
    report_to=["none"],
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.01,
    eval_strategy="epoch",
    metric_for_best_model="eval_f1",  # ← the metric name (prefixed with “eval_”)
    greater_is_better=True,
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-large-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
trainer.train()

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.717611,0.63209,0.599513,0.545858,0.664865
2,No log,0.595143,0.748507,0.7194,0.668731,0.778378
3,No log,0.562822,0.805224,0.759447,0.777358,0.742342
4,No log,0.515892,0.794776,0.757709,0.741379,0.774775
5,No log,0.532865,0.811194,0.769791,0.777574,0.762162
6,No log,0.522857,0.816418,0.780357,0.773451,0.787387
7,No log,0.566531,0.817164,0.778681,0.780797,0.776577
8,No log,0.593953,0.820896,0.781022,0.791128,0.771171
9,0.464000,0.610277,0.815672,0.777678,0.776978,0.778378
10,0.464000,0.632756,0.821642,0.78371,0.787273,0.78018


TrainOutput(global_step=1200, training_loss=0.25454425970713296, metrics={'train_runtime': 1075.598, 'train_samples_per_second': 35.18, 'train_steps_per_second': 1.116, 'total_flos': 2686164692365440.0, 'train_loss': 0.25454425970713296, 'epoch': 20.0})

In [80]:
test_metrics = trainer.evaluate(eval_dataset=test_ds)
print("Test set metrics:", test_metrics)

predictions, labels, preds_metrics = trainer.predict(test_ds)

import numpy as np

probs = 1 / (1 + np.exp(-predictions))
bin_preds = (probs > 0.5).astype(int)

print("Prediction shape:", bin_preds.shape)
print("Ground-truth shape:", labels.shape)

from sklearn.metrics import classification_report

print(
    classification_report(
        labels, bin_preds, target_names=list(id2class.values()), zero_division=0
    )
)

Test set metrics: {'eval_loss': 0.8696743845939636, 'eval_accuracy': 0.8173249551166966, 'eval_f1': 0.7822364901016586, 'eval_precision': 0.8041804180418042, 'eval_recall': 0.7614583333333333, 'eval_runtime': 3.4648, 'eval_samples_per_second': 160.76, 'eval_steps_per_second': 5.195, 'epoch': 20.0}
Prediction shape: (557, 4)
Ground-truth shape: (557, 4)
                     precision    recall  f1-score   support

         bug_report       0.78      0.79      0.78       223
improvement_request       0.73      0.70      0.72       206
             rating       0.90      0.90      0.90       378
             others       0.66      0.45      0.54       153

          micro avg       0.80      0.76      0.78       960
          macro avg       0.77      0.71      0.73       960
       weighted avg       0.80      0.76      0.78       960
        samples avg       0.86      0.82      0.81       960

