# Read Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_balanced = pd.read_pickle('train_balanced.pkl')
val_balanced = pd.read_pickle('val_balanced.pkl')
test_balanced = pd.read_pickle('test_balanced.pkl')
VOCAB_BALANCED_SIZE = set()
for tokenized in train_balanced.tokenized:
    for token in tokenized:
        VOCAB_BALANCED_SIZE.add(token)
max(VOCAB_BALANCED_SIZE)     

7130

In [3]:
train_unbalanced = pd.read_pickle('train_unbalanced.pkl')
val_unbalanced = pd.read_pickle('val_unbalanced.pkl')
test_unbalanced = pd.read_pickle('test_unbalanced.pkl')

VOCAB_UNBALANCED_SIZE = set()
for tokenized in train_unbalanced.tokenized:
    for token in tokenized:
        VOCAB_UNBALANCED_SIZE.add(token)

max(VOCAB_UNBALANCED_SIZE)

6767

In [4]:
from datasets import Dataset

In [5]:
train_dataset_unbalanced = Dataset.from_dict({"input_ids": [x['input_ids'] for x in train_unbalanced.marbert_tokens], 
                                   "label": train_unbalanced['MultiLabel'], 
                                   'attention_mask':[x['attention_mask'] for x in train_unbalanced.marbert_tokens]})
val_dataset_unbalanced = Dataset.from_dict({"input_ids": [x['input_ids'] for x in val_unbalanced.marbert_tokens], 
                                 "label": val_unbalanced['MultiLabel'], 
                                 'attention_mask':[x['attention_mask'] for x in val_unbalanced.marbert_tokens]})
test_dataset_unbalanced = Dataset.from_dict({"input_ids": [x['input_ids'] for x in test_unbalanced.marbert_tokens], 
                                  "label": test_unbalanced['MultiLabel'], 
                                  'attention_mask':[x['attention_mask'] for x in test_unbalanced.marbert_tokens]})

In [6]:
train_dataset_balanced = Dataset.from_dict({"input_ids": [x['input_ids'] for x in train_balanced.marbert_tokens], 
                                   "label": train_balanced['MultiLabel'], 
                                   'attention_mask':[x['attention_mask'] for x in train_balanced.marbert_tokens]})
val_dataset_balanced = Dataset.from_dict({"input_ids": [x['input_ids'] for x in val_balanced.marbert_tokens], 
                                 "label": val_balanced['MultiLabel'], 
                                 'attention_mask':[x['attention_mask'] for x in val_balanced.marbert_tokens]})
test_dataset_balanced = Dataset.from_dict({"input_ids": [x['input_ids'] for x in test_balanced.marbert_tokens], 
                                  "label": test_balanced['MultiLabel'], 
                                  'attention_mask':[x['attention_mask'] for x in test_balanced.marbert_tokens]})

In [7]:
import evaluate
import numpy as np
from transformers import Trainer

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def compute_metrics(eval_pred):

    predictions, labels = eval_pred
    predictions = sigmoid(predictions)
    predictions = (predictions > 0.5).astype(int).reshape(-1)
    return clf_metrics.compute(
        predictions=predictions, references=labels.astype(int).reshape(-1)
    )


import torch
from transformers import Trainer

eps = 1e-8
y_unbalanced=np.stack(train_unbalanced['MultiLabel'].to_numpy())
y_balanced=np.stack(train_balanced['MultiLabel'].to_numpy())
pos_weight_unbalanced = (y_unbalanced.shape[0] - y_unbalanced.sum(axis=0)) / (y_balanced.sum(axis=0) + 0.0000001)
pos_weight_unbalanced = torch.tensor(pos_weight_unbalanced, dtype=torch.float32).to("cuda")  # or your device

pos_weight_balanced = (y_balanced.shape[0] - y_balanced.sum(axis=0)) / (y_balanced.sum(axis=0) + 0.0000001)
pos_weight_balanced = torch.tensor(pos_weight_balanced, dtype=torch.float32).to("cuda")  # or your device


class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

2025-05-10 23:51:15.535034: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746910275.548486   20102 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746910275.552047   20102 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746910275.561128   20102 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746910275.561145   20102 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746910275.561147   20102 computation_placer.cc:177] computation placer alr

In [8]:
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
)
# define model
MODEL_PATH = "UBC-NLP/MARBERTv2"
# set up the model, training args
model_balanced = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=4,
    problem_type="multi_label_classification",  # use this for CrossEntropyLoss
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
)

for param in model_balanced.parameters(): param.data = param.data.contiguous()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
pos_weight = pos_weight_balanced   
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    num_train_epochs=10,
    logging_strategy='steps',
    logging_steps=10,
)


trainer = WeightedTrainer(
    model=model_balanced,
    args=training_args,
    train_dataset=train_dataset_balanced,
    eval_dataset=val_dataset_balanced,
    compute_metrics=compute_metrics,
)
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7211,0.668319,0.720326,0.704777,0.646552,0.774527
2,0.6592,0.580076,0.760386,0.731057,0.708065,0.755594
3,0.606,0.518584,0.810831,0.778838,0.784965,0.772806
4,0.5595,0.499998,0.824926,0.796552,0.797927,0.795181
5,0.5435,0.474032,0.829377,0.800347,0.807356,0.79346
6,0.4853,0.467445,0.840504,0.813206,0.821053,0.805508
7,0.4842,0.481735,0.836053,0.809318,0.811419,0.807229
8,0.477,0.49037,0.826409,0.799314,0.796581,0.802065
9,0.4729,0.482815,0.833828,0.80789,0.805128,0.810671
10,0.4767,0.486157,0.830119,0.803433,0.80137,0.805508


TrainOutput(global_step=800, training_loss=0.5539595425128937, metrics={'train_runtime': 506.8302, 'train_samples_per_second': 50.234, 'train_steps_per_second': 1.578, 'total_flos': 2616768656736000.0, 'train_loss': 0.5539595425128937, 'epoch': 10.0})

In [10]:
model_unbalanced = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=4,
    problem_type="multi_label_classification",
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
)
for param in model_unbalanced.parameters(): param.data = param.data.contiguous()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
pos_weight = pos_weight_unbalanced   
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    num_train_epochs=10,
    logging_strategy='steps',
    logging_steps=10,
)

trainer_unbalanced = WeightedTrainer(
    model=model_unbalanced,
    args=training_args,
    train_dataset=train_dataset_unbalanced,
    eval_dataset=val_dataset_unbalanced,
    compute_metrics=compute_metrics,
)

trainer_unbalanced.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3921,0.407131,0.81454,0.766355,0.838446,0.70568
2,0.369,0.401249,0.830119,0.78894,0.849206,0.736661
3,0.3216,0.399189,0.846439,0.810959,0.863813,0.7642
4,0.2919,0.405966,0.847923,0.813127,0.864341,0.767642
5,0.2583,0.401153,0.849407,0.814612,0.867704,0.767642


TrainOutput(global_step=300, training_loss=0.33379445791244505, metrics={'train_runtime': 224.8123, 'train_samples_per_second': 42.391, 'train_steps_per_second': 1.334, 'total_flos': 979489603248000.0, 'train_loss': 0.33379445791244505, 'epoch': 5.0})

In [12]:
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from tqdm import tqdm
import torch
import numpy as np

In [13]:


def test(trainer, test_dataset, threshold=0.5):
    preds = []
    true = []

    model = trainer.model
    model.eval()

    with torch.no_grad():
        for sample in tqdm(test_dataset):
            input_ids = torch.tensor(sample['input_ids'], dtype=torch.long).unsqueeze(0).to('cuda')
            attention_mask = torch.tensor(sample['attention_mask'], dtype=torch.long).unsqueeze(0).to('cuda')

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.sigmoid(outputs.logits).cpu().numpy() 
            pred_labels = (probs > threshold).astype(int)

            preds.append(pred_labels[0])
            true.append(np.array(sample['label']))

    y_true = np.array(true)
    y_pred = np.array(preds)

    print(classification_report(y_true=y_true, y_pred=y_pred, zero_division=0))
    print(multilabel_confusion_matrix(y_true=y_true, y_pred=y_pred))


In [17]:
test(trainer, test_dataset_balanced)

100%|█████████████████████████████████████████████████████████████████████████████████| 561/561 [00:06<00:00, 93.30it/s]

              precision    recall  f1-score   support

           0       0.84      0.79      0.82       229
           1       0.69      0.70      0.70       215
           2       0.87      0.89      0.88       387
           3       0.80      0.46      0.59       151

   micro avg       0.81      0.76      0.79       982
   macro avg       0.80      0.71      0.75       982
weighted avg       0.81      0.76      0.78       982
 samples avg       0.86      0.82      0.81       982

[[[297  35]
  [ 47 182]]

 [[278  68]
  [ 64 151]]

 [[122  52]
  [ 41 346]]

 [[393  17]
  [ 81  70]]]





In [18]:
test(trainer_unbalanced, test_dataset_unbalanced)

100%|█████████████████████████████████████████████████████████████████████████████████| 561/561 [00:05<00:00, 99.41it/s]

              precision    recall  f1-score   support

           0       0.84      0.81      0.83       229
           1       0.76      0.65      0.70       215
           2       0.94      0.80      0.87       387
           3       0.77      0.51      0.61       151

   micro avg       0.85      0.73      0.78       982
   macro avg       0.83      0.69      0.75       982
weighted avg       0.85      0.73      0.78       982
 samples avg       0.89      0.79      0.80       982

[[[298  34]
  [ 44 185]]

 [[301  45]
  [ 76 139]]

 [[153  21]
  [ 76 311]]

 [[387  23]
  [ 74  77]]]



