In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
import evaluate
import torch

In [4]:
dataset = load_dataset("mediabiasgroup/BABE")

README.md:   0%|          | 0.00/770 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/712k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3121 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
# Use only 'text' and 'label' columns
def preprocess(example):
    return tokenizer(example["text"], truncation=True, padding="max_length")
# Load tokenizer and model
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
device = torch.device("cuda:0")  # Or "cuda:1" if that's your best GPU
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [7]:
# Preprocess dataset
encoded_dataset = dataset.map(preprocess, batched=True)
encoded_dataset = encoded_dataset.rename_column("label", "labels")
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/3121 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
# Load evaluation metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [9]:
def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1.compute(predictions=preds, references=labels, average="weighted")["f1"]
    }

In [10]:
# Training configuration
training_args = TrainingArguments(
    output_dir="./roberta-babe",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    no_cuda=False,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs",
    logging_steps=10,
)

In [11]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [12]:
# Train
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3284,0.399587,0.833,0.833197
2,0.3006,0.34009,0.852,0.852337
3,0.2307,0.393154,0.852,0.852483




TrainOutput(global_step=294, training_loss=0.321795669339952, metrics={'train_runtime': 136.959, 'train_samples_per_second': 68.364, 'train_steps_per_second': 2.147, 'total_flos': 2463508811335680.0, 'train_loss': 0.321795669339952, 'epoch': 3.0})

In [13]:
# Save model
trainer.save_model("./roberta-babe")

In [14]:
from transformers import pipeline

bias_detector = pipeline("text-classification", model="./roberta-babe", tokenizer="roberta-base")

Device set to use cuda:0


In [15]:
print(bias_detector("The Earth is a planet."))
print(bias_detector("Immigrants are criminals."))

[{'label': 'LABEL_0', 'score': 0.9305679798126221}]
[{'label': 'LABEL_1', 'score': 0.9843938946723938}]


## Evaluation

In [20]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [16]:
# Load test split from BABE
data = load_dataset("mediabiasgroup/BABE", split="test")

In [17]:
# Load your fine-tuned model as a pipeline
bias_detector = pipeline("text-classification", model="./roberta-babe", tokenizer="roberta-base")

Device set to use cuda:0


In [18]:
# Prepare predictions and ground truth
true_labels = []
pred_labels = []

for example in data:
    text = example["text"]
    true_label = example["label"]
    
    # Run prediction
    prediction = bias_detector(text)[0]
    pred_label = 1 if prediction["label"] == "LABEL_1" else 0

    true_labels.append(true_label)
    pred_labels.append(pred_label)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [21]:
# Compute metrics
accuracy = accuracy_score(true_labels, pred_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average="binary")

In [22]:
# Print results
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

Accuracy:  0.8520
Precision: 0.9237
Recall:    0.8014
F1 Score:  0.8582


## Pushing to HGface

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [2]:
model = AutoModelForSequenceClassification.from_pretrained("./roberta-babe")

In [3]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base") 

In [5]:
model.push_to_hub("himel7/roberta-babe")
tokenizer.push_to_hub("himel7/roberta-babe")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/himel7/roberta-babe/commit/ff09404d60f2f6f23fbc09a0c489e64062deb18c', commit_message='Upload tokenizer', commit_description='', oid='ff09404d60f2f6f23fbc09a0c489e64062deb18c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/himel7/roberta-babe', endpoint='https://huggingface.co', repo_type='model', repo_id='himel7/roberta-babe'), pr_revision=None, pr_num=None)