In [1]:
from datasets import load_dataset

ds = load_dataset("zeroshot/twitter-financial-news-sentiment")

In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
 

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)
 

if "label" in ds["train"].features.keys():
    split_dataset =  ds.rename_column("label", "labels") 
tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=["text"])
 
tokenized_dataset["train"]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 9543
})

In [3]:
from transformers import AutoModelForSequenceClassification
 
# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"
 
label2id = {
    "bearish": 0,
    "bullish": 1,
    "neutral": 2,
}

id2label = {v: k for k, v in label2id.items()}

In [4]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
 
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1 = f1_score(
            labels, predictions, labels=labels, pos_label=1, average="weighted"
        )
    precision = precision_score(
        labels, predictions, labels=labels, pos_label=1, average="weighted"
    )
    recall = recall_score(
        labels, predictions, labels=labels, pos_label=1, average="weighted"
    )

    return {"f1": float(f1) if f1 == 1 else f1, "precision": precision, "recall": recall}


In [5]:
import torch
import torch.nn as nn


class QwenClassifier(nn.Module):
    def __init__(self, model_id="Qwen/Qwen2.5-0.5B", num_labels=3):
        super(QwenClassifier, self).__init__()
        self.model_id = model_id
        self.model = AutoModelForCausalLM.from_pretrained(self.model_id)
        self.num_labels = num_labels
        self.loss_type = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels):
        outputs = self.model(input_ids, attention_mask=attention_mask)

        # Compute last token indices once.
        batch_size = outputs.logits.size(0)
        batch_indices = torch.arange(batch_size, device=outputs.logits.device)
        last_token_indices = attention_mask.sum(dim=1) - 1

        # Gather the logits for the last non-padded token in each sequence.
        final_logits = outputs.logits[batch_indices, last_token_indices, :]

        # Mask out logits beyond self.num_labels in-place.
        final_logits[:, self.num_labels:] = float("-inf")

        loss = self.loss_type(final_logits.view(-1, self.model.config.vocab_size), labels.view(-1))
        return {"loss": loss, "logits": final_logits}

    def predict(self, input_ids, attention_mask):
        outputs = self.model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits[:, attention_mask.sum(dim=1)-1, :]
        logits[:, :, self.num_labels:] = float("-inf")

        return torch.argmax(logits, dim=-1)


model = QwenClassifier(num_labels=len(label2id))

input_ids = torch.tensor(tokenized_dataset["train"]["input_ids"][2]).unsqueeze(0)
attention_mask = torch.tensor(tokenized_dataset["train"]["attention_mask"][2]).unsqueeze(0)

input_ids2 = torch.tensor(tokenized_dataset["train"]["input_ids"][3]).unsqueeze(0)
attention_mask2 = torch.tensor(tokenized_dataset["train"]["attention_mask"][3]).unsqueeze(0)

input_ids = torch.cat([input_ids, input_ids2], dim=0)
attention_mask = torch.cat([attention_mask, attention_mask2], dim=0)

# model.predict(input_ids, attention_mask)

In [6]:
import pandas as pd

pd.DataFrame(tokenized_dataset["validation"]["labels"]).value_counts()

0
2    1566
1     475
0     347
Name: count, dtype: int64

In [7]:
lp = []
for batch in tokenized_dataset["validation"]:
    input_ids = torch.tensor(batch["input_ids"]).unsqueeze(0)
    attention_mask = torch.tensor(batch["attention_mask"]).unsqueeze(0)
    predictions = model.predict(input_ids, attention_mask)
    lp.append(predictions.item())
    for prediction in predictions:
        print("prediction:", id2label[prediction.item()], "expected:", id2label[batch["labels"]], "f1", f1_score(lp, tokenized_dataset["validation"]["labels"][:len(lp)], average="weighted"))
    break

prediction: neutral expected: bearish f1 0.0


In [None]:
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

 
# Define training args
training_args = TrainingArguments(
    output_dir= "ModernBERT-tweet-classifier-classhead",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=10e-6,
        num_train_epochs=5,
    bf16=True, # bfloat16 training 
    optim="adamw_torch_fused", # improved optimizer 
    # logging & evaluation strategies
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # push to hub parameters
    # push_to_hub=True,
    # hub_strategy="every_save",
    # hub_token=HfFolder.get_token(),
    # report_to="wandb",
    eval_on_start=True,
    save_safetensors=False,  # Disable safetensors saving
)
 
# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
0,No log,0.560549,0.875815,0.864184,0.88968
1,0.136700,0.627017,0.875079,0.8738,0.876764
2,0.063800,0.887516,0.874955,0.873817,0.876375
3,0.029500,1.064972,0.870642,0.88146,0.860485
