In [1]:
from datasets import load_dataset

ds = load_dataset("zeroshot/twitter-financial-news-sentiment")

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
from transformers import AutoTokenizer
 
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
 

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)
 

if "label" in ds["train"].features.keys():
    split_dataset =  ds.rename_column("label", "labels") 
tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=["text"])
 
tokenized_dataset.set_format("torch")
tokenized_dataset["train"]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 9543
})

TypeError: 'DatasetDict' object is not an iterator

In [None]:
from transformers import AutoModelForSequenceClassification, AutoModel, ModernBertModel
 
# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"
 
label2id = {
    "bearish": 0,
    "bullish": 1,
    "neutral": 2,
}

id2label = {v: k for k, v in label2id.items()}

# Download the model from huggingface.co/models
model = AutoModel.from_pretrained(
    model_id, num_labels=len(label2id), label2id=label2id, id2label=id2label,
)

In [None]:
model(tokenized_dataset["train"][0]["input_ids"].unsqueeze(0), tokenized_dataset["train"][0]["attention_mask"].unsqueeze(0))[0]

torch.Size([1, 81, 768])

In [None]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
 
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1 = f1_score(
            labels, predictions, labels=labels, pos_label=1, average="weighted"
        )
    precision = precision_score(
        labels, predictions, labels=labels, pos_label=1, average="weighted"
    )
    recall = recall_score(
        labels, predictions, labels=labels, pos_label=1, average="weighted"
    )

    return {"f1": float(f1) if f1 == 1 else f1, "precision": precision, "recall": recall}


In [None]:
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

 
# Define training args
training_args = TrainingArguments(
    output_dir= "ModernBERT-domain-classifier",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
        num_train_epochs=5,
    bf16=True, # bfloat16 training 
    optim="adamw_torch_fused", # improved optimizer 
    # logging & evaluation strategies
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    use_mps_device=True,
    metric_for_best_model="f1",
    # push to hub parameters
    push_to_hub=True,
    hub_strategy="every_save",
    hub_token=HfFolder.get_token(),
    report_to="wandb",
    eval_on_start=True,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mlaz4rz[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,F1
0,No log,1.242949,0.195876
1,0.422400,0.354797,0.890739
2,0.219500,0.329708,0.914037
3,0.066900,0.492497,0.909958
4,0.022200,0.695993,0.916118
5,0.005800,0.713806,0.916547


TrainOutput(global_step=1495, training_loss=0.15992031635648032, metrics={'train_runtime': 1119.4709, 'train_samples_per_second': 42.623, 'train_steps_per_second': 1.335, 'total_flos': 2879473011343668.0, 'train_loss': 0.15992031635648032, 'epoch': 5.0})