# ModernBERT with unbalanced dataset

Notebook by: [Juliana Gómez Consuegra](https://www.linkedin.com/in/julianagomezconsuegra/)

- Documentation: https://huggingface.co/docs/transformers/en/model_doc/modernbert

Notes:

- The pre-trained ModernBERT-base model doesn't include a classification layer, so when you create a ModernBertForSequenceClassification model, it adds a new classification layer on top of the base model.


In [None]:
# Upgrade fsspec to solve a dependency issue.
# NOTE(review): the pip output captured below reports that datasets 3.2.0
# requires fsspec<=2024.9.0, so this pin itself triggers a conflict warning —
# confirm the upgrade is still needed.
!pip install -U fsspec==2024.10.0


Collecting fsspec==2024.10.0
  Downloading fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.10.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.6/179.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.9.0
    Uninstalling fsspec-2024.9.0:
      Successfully uninstalled fsspec-2024.9.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.2.0 requires fsspec[http]<=2024.9.0,>=2023.1.0, but you have fsspec 2024.10.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cuda-cupti-cu12==1

In [None]:
!pip install -q datasets
!pip install -q codecarbon
!pip install -q -U transformers>=4.48.0
# !pip install -q flash- #not available for T4: RuntimeError: FlashAttention only supports Ampere GPUs or newer.


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cuda-runtime-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-runtime-cu12 12.5.82 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_sy

In [None]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
from datasets import load_dataset

#standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime


#emissions tracking
from codecarbon import EmissionsTracker


# accuracy and model
from datasets import load_dataset, DatasetDict
import torch
from functools import partial
import gc
from sklearn.metrics import accuracy_score

import torch
from transformers import AutoTokenizer, ModernBertForSequenceClassification,TrainingArguments, Trainer, TrainerCallback, DataCollatorWithPadding

In [None]:
# Download the challenge dataset from the Hugging Face Hub.
dataset = load_dataset("quotaclimat/frugalaichallenge-text-train")

# Map each string label to its integer class id. The names are listed in
# class-id order, so the id is simply the position in the tuple.
LABEL_MAPPING = {
    label_name: label_id
    for label_id, label_name in enumerate((
        "0_not_relevant",
        "1_not_happening",
        "2_not_human",
        "3_not_bad",
        "4_solutions_harmful_unnecessary",
        "5_science_unreliable",
        "6_proponents_biased",
        "7_fossil_fuels_needed",
    ))
}


def _encode_label(example):
    """Return the integer-encoded ``label`` field for a single example."""
    return {"label": LABEL_MAPPING[example["label"]]}


# Replace the string labels with their integer ids in every split.
dataset = dataset.map(_encode_label)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Fine-tuning ModernBERT on the dataset

In [None]:
# tokenizer
# tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
# model = ModernBertForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=len(LABEL_MAPPING))


In [None]:
# Checkpoint shared by the tokenizer and the classifier.
model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Classification model: one output per label, using the SDPA
# (Scaled Dot Product Attention) implementation.
classifier_options = {
    "num_labels": len(LABEL_MAPPING),
    "attn_implementation": "sdpa",
}
model = ModernBertForSequenceClassification.from_pretrained(
    model_name, **classifier_options
)
model = model.to('cuda')

# Enable gradient checkpointing on the model.
model.gradient_checkpointing_enable()



Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Freeze every encoder layer except the last one. Parameters whose name does
# not contain 'layers.' are left untouched and therefore stay trainable.
# The last layer index is derived from the model config instead of being
# hard-coded (the original used the literal 21).
last_layer_index = model.config.num_hidden_layers - 1


def _encoder_layer_index(param_name):
    """Return the layer index embedded in *param_name*, or None.

    Layer parameters are recognised by the 'layers.' substring; the index is
    the third dot-separated component of the name.
    """
    if 'layers.' not in param_name:
        return None
    return int(param_name.split('.')[2])


for name, param in model.named_parameters():
    layer_num = _encoder_layer_index(name)
    if layer_num is not None and layer_num < last_layer_index:
        param.requires_grad = False

# Verify the freezing
for name, param in model.named_parameters():
    layer_num = _encoder_layer_index(name)
    if layer_num is not None:
        print(f"Layer {layer_num}: {param.requires_grad}")
    else:
        print(f"{name}: {param.requires_grad}")


model.embeddings.tok_embeddings.weight: True
model.embeddings.norm.weight: True
Layer 0: False
Layer 0: False
Layer 0: False
Layer 0: False
Layer 0: False
Layer 1: False
Layer 1: False
Layer 1: False
Layer 1: False
Layer 1: False
Layer 1: False
Layer 2: False
Layer 2: False
Layer 2: False
Layer 2: False
Layer 2: False
Layer 2: False
Layer 3: False
Layer 3: False
Layer 3: False
Layer 3: False
Layer 3: False
Layer 3: False
Layer 4: False
Layer 4: False
Layer 4: False
Layer 4: False
Layer 4: False
Layer 4: False
Layer 5: False
Layer 5: False
Layer 5: False
Layer 5: False
Layer 5: False
Layer 5: False
Layer 6: False
Layer 6: False
Layer 6: False
Layer 6: False
Layer 6: False
Layer 6: False
Layer 7: False
Layer 7: False
Layer 7: False
Layer 7: False
Layer 7: False
Layer 7: False
Layer 8: False
Layer 8: False
Layer 8: False
Layer 8: False
Layer 8: False
Layer 8: False
Layer 9: False
Layer 9: False
Layer 9: False
Layer 9: False
Layer 9: False
Layer 9: False
Layer 10: False
Layer 10: False
Lay

In [None]:
# see all layers
# for name, param in model.named_parameters():
#     print(name)

## Split the dataset

In [None]:
def split_dataset(dataset, train_size=0.8, validation_size=0.1, seed=None):
    """Split ``dataset['train']`` into train, validation and test subsets.

    Args:
        dataset: Mapping whose ``'train'`` entry provides ``train_test_split``.
        train_size: Fraction of all rows kept for training.
        validation_size: Fraction of all rows used for validation. The rows
            left after train and validation form the test subset.
        seed: Optional seed forwarded to both splits so the partition can be
            reproduced. ``None`` (the default) keeps the unseeded behaviour.

    Returns:
        A ``DatasetDict`` with the keys ``'train'``, ``'validation'`` and
        ``'test'``.

    Raises:
        ValueError: If the fractions do not leave room for all three subsets.
    """
    holdout_size = 1 - train_size
    if not 0 < train_size < 1 or not 0 < validation_size < holdout_size:
        raise ValueError(
            "train_size and validation_size must be positive and sum to less "
            f"than 1 (got train_size={train_size}, "
            f"validation_size={validation_size})"
        )

    train_holdout = dataset['train'].train_test_split(
        test_size=holdout_size, seed=seed
    )
    # The 'test' side of a split receives the ``test_size`` fraction, so in
    # this second split it is the validation share of the hold-out rows and
    # the 'train' side is what remains for the final test subset.
    validation_test = train_holdout['test'].train_test_split(
        test_size=validation_size / holdout_size, seed=seed
    )

    return DatasetDict({
        'train': train_holdout['train'],
        'validation': validation_test['test'],
        'test': validation_test['train'],
    })

# Split the dataset into train / validation / test using the default
# fractions of split_dataset (train_size=0.8, validation_size=0.1).
dataset_splits = split_dataset(dataset)

### Tokenize the dataset

In [None]:
def tokenize_function(examples, max_length=512):
    """Tokenize the ``'quote'`` column of a batch of examples.

    No padding is applied here: the ``DataCollatorWithPadding`` handed to the
    Trainer pads each batch to its own longest sequence. Padding with
    ``padding="max_length"`` and no explicit ``max_length`` would instead pad
    every example to the tokenizer's model maximum.

    Args:
        examples: Batch (dict of lists) containing a ``'quote'`` entry.
        max_length: Maximum number of tokens kept per quote; longer inputs
            are truncated. The default matches the value used by the
            inference code in this notebook.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['quote'], truncation=True, max_length=max_length)

tokenized_datasets = dataset_splits.map(tokenize_function, batched=True)


Map:   0%|          | 0/3897 [00:00<?, ? examples/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Map:   0%|          | 0/488 [00:00<?, ? examples/s]

### Set up hyperparameters

In [None]:
import os
# Ask the PyTorch CUDA caching allocator to use expandable segments.
# NOTE(review): the model was already moved to CUDA in an earlier cell;
# presumably this variable must be set before the first CUDA allocation to
# take effect — confirm, and move this cell to the top of the notebook if so.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'


In [None]:
# Pads each batch dynamically to the longest sequence it contains.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training configuration. With a per-device batch size of 1 and 4 gradient
# accumulation steps, each optimizer update sees 4 examples.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # `eval_strategy` replaces the deprecated `evaluation_strategy` name.
    eval_strategy="epoch",
    # Must match the evaluation schedule because the best model is reloaded.
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    fp16=True,
)




In [None]:
# set accuracy as metric
def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation prediction pair.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with a single ``"accuracy"`` entry.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the highest score on the last axis.
    predicted_labels = np.argmax(scores, axis=-1)
    return {"accuracy": accuracy_score(gold_labels, predicted_labels)}

### Fine-tune

In [None]:
# Metric histories filled in by MetricsLoggerCallback and plotted later.
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []


class MetricsLoggerCallback(TrainerCallback):
    """Trainer callback that collects logged metrics for plotting."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append every tracked metric found in ``logs`` to its history list."""
        if logs is None:
            return
        # (log key, destination list) pairs; a value is only recorded when
        # the key is present in this particular log entry.
        tracked = (
            ("loss", train_losses),
            ("eval_loss", val_losses),
            ("eval_accuracy", val_accuracies),
            ("accuracy", train_accuracies),
        )
        for log_key, history in tracked:
            if log_key in logs:
                history.append(logs[log_key])

# Assemble the Trainer from the model, the tokenized splits and the metric /
# logging helpers defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[MetricsLoggerCallback],
)

# Run fine-tuning.
trainer.train()

  trainer = Trainer(
[codecarbon INFO @ 02:32:24] [setup] RAM Tracking...
[codecarbon INFO @ 02:32:24] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 02:32:25] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 02:32:25] [setup] GPU Tracking...
[codecarbon INFO @ 02:32:25] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 02:32:25] >>> Tracker's metadata:
[codecarbon INFO @ 02:32:25]   Platform system: Linux-6.1.85+-x86_64-with-glibc2.35
[codecarbon INFO @ 02:32:25]   Python version: 3.11.11
[codecarbon INFO @ 02:32:25]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 02:32:25]   Available RAM : 12.675 GB
[codecarbon INFO @ 02:32:25]   CPU count: 2
[codecarbon INFO @ 02:32:25]   CPU model: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 02:32:25]   GPU count: 1
[codecarbon INFO @ 02:32:25]   GPU model: 1 x Tesla T4
[codecarbon INFO @ 02:32:26] Saving e

[codecarbon INFO @ 02:32:44] Energy consumed for RAM : 0.000020 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 02:32:44] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 02:32:44] Energy consumed for all GPUs : 0.000217 kWh. Total GPU Power : 52.033674252100546 W
[codecarbon INFO @ 02:32:44] 0.000414 kWh of electricity used since the beginning.


Epoch,Training Loss,Validation Loss


[codecarbon INFO @ 02:32:59] Energy consumed for RAM : 0.000040 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 02:32:59] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 02:32:59] Energy consumed for all GPUs : 0.000479 kWh. Total GPU Power : 62.92325679153174 W
[codecarbon INFO @ 02:32:59] 0.000873 kWh of electricity used since the beginning.
[codecarbon INFO @ 02:33:14] Energy consumed for RAM : 0.000059 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 02:33:14] Energy consumed for all CPUs : 0.000532 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 02:33:14] Energy consumed for all GPUs : 0.000739 kWh. Total GPU Power : 62.32345457570483 W
[codecarbon INFO @ 02:33:14] 0.001330 kWh of electricity used since the beginning.


### Training curve

In [None]:
# Plotting after training
plt.figure(figsize=(12, 5))

# Each series holds one point per log entry in which the metric appeared, so
# the x-axis is a log-entry index rather than an epoch count. Markers are
# drawn because a series with a single point (e.g. one evaluation after one
# epoch) is invisible as a bare line.

# Plot Losses
plt.subplot(1, 2, 1)
plt.plot(train_losses, marker="o", markersize=3, label="Training Loss")
plt.plot(val_losses, marker="o", markersize=3, label="Validation Loss")
plt.title("Training and Validation Loss")
plt.xlabel("Log entry")
plt.ylabel("Loss")
plt.legend()

# Plot Accuracies
plt.subplot(1, 2, 2)
plt.plot(train_accuracies, marker="o", markersize=3, label="Training Accuracy")
plt.plot(val_accuracies, marker="o", markersize=3, label="Validation Accuracy")
plt.title("Training and Validation Accuracy")
plt.xlabel("Log entry")
plt.ylabel("Accuracy")
plt.legend()

plt.tight_layout()
plt.show()

# Inference

In [None]:
def evaluate_text(trainer, tokenized_datasets, dataset_name="quotaclimat/frugalaichallenge-text-train"):
    """Evaluate the trainer on the test split while tracking emissions.

    Args:
        trainer: Object whose ``evaluate`` method is run on the test split.
        tokenized_datasets: Mapping with a ``"test"`` entry.
        dataset_name: Name reported in the returned ``dataset_config``.

    Returns:
        Dict with the test accuracy, the energy consumed (Wh), the emissions
        (gCO2eq), the emissions data object and the dataset configuration.
    """
    # Initialize CodeCarbon tracker
    tracker = EmissionsTracker(project_name="text_classification_baseline")

    # Start tracking emissions
    tracker.start()
    try:
        ####################################################################
        # ModernBERT inference
        test_results = trainer.evaluate(tokenized_datasets["test"])
        print(f"Test results: {test_results}")
        ####################################################################
    finally:
        # Always stop the tracker, even if evaluation fails.
        tracker.stop()

    # stop() returns only a float, so the detailed record (with the
    # energy_consumed and emissions attributes) is read from the tracker.
    emissions = tracker.final_emissions_data

    # Prepare results dictionary
    results = {
        "accuracy": float(test_results["eval_accuracy"]),
        "energy_consumed_wh": emissions.energy_consumed * 1000,
        "emissions_gco2eq": emissions.emissions * 1000,
        "emissions_data": emissions,
        "dataset_config": {
            "dataset_name": dataset_name,
            "test_size": len(tokenized_datasets["test"]),
        }
    }

    return results


In [None]:
# Evaluate the fine-tuned model on the test split and show the result dict.
evaluation_results = evaluate_text(trainer, tokenized_datasets)
print(evaluation_results)


# Save the model and tokenizer

In [None]:
# After training is complete, write the model and the tokenizer to the same
# directory so both can be loaded from a single path.
trainer.save_model("./results")
tokenizer.save_pretrained("./results")

# For the submission

This is how to change the .py file:

In [None]:
from fastapi import APIRouter
from datetime import datetime
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

from .utils.evaluation import TextEvaluationRequest
from .utils.emissions import tracker, clean_emissions_data, get_space_info

# Router on which the text-task endpoint below is registered.
router = APIRouter()

# Model description; used as the endpoint description and echoed in results.
DESCRIPTION = "Fine-tuned ModernBERT for Climate Disinformation Detection"
# Path of the endpoint; also echoed in the results as "api_route".
ROUTE = "/text"

@router.post(ROUTE, tags=["Text Task"],
             description=DESCRIPTION)
async def evaluate_text(request: TextEvaluationRequest):
    """Run the fine-tuned classifier over the requested dataset.

    Loads the dataset named in ``request.dataset_name``, then loads the model
    and predicts a class for every quote in its ``'train'`` split while
    emissions are tracked.

    Args:
        request: Evaluation request; only ``dataset_name`` is read here.

    Returns:
        Dict with submission metadata, energy consumed (Wh), emissions
        (gCO2eq), cleaned emissions data and the dataset configuration.
        Accuracy is not computed.
    """
    # Get space info
    username, space_url = get_space_info()

    # Define the label mapping (string label -> class id).
    # NOTE(review): not used below because accuracy is not computed; kept
    # for reference.
    LABEL_MAPPING = {
        "0_not_relevant": 0,
        "1_not_happening": 1,
        "2_not_human": 2,
        "3_not_bad": 3,
        "4_solutions_harmful_unnecessary": 4,
        "5_science_unreliable": 5,
        "6_proponents_biased": 6,
        "7_fossil_fuels_needed": 7
    }

    # Load the dataset
    dataset = load_dataset(request.dataset_name)

    # Start tracking emissions
    tracker.start()
    tracker.start_task("model_loading_and_inference")

    #--------------------------------------------------------------------------------------------
    # MODEL LOADING AND INFERENCE CODE

    # Load the fine-tuned model and tokenizer
    model_path = "path/to/your/saved/model"  # Replace with your model path or HuggingFace model ID
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Move model to GPU if available (T4 in this case)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Optimize model for inference
    model.eval()

    # Function to perform inference on a batch of texts
    def predict_batch(texts):
        """Return the predicted class id for each text in ``texts``."""
        inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        return torch.argmax(outputs.logits, dim=1).cpu().numpy()

    # Perform inference on the entire dataset
    batch_size = 32  # Adjust based on T4 memory constraints
    all_predictions = []

    # The text lives in the 'quote' column (the column the training code
    # tokenizes); read it once instead of re-slicing the dataset per batch.
    quotes = list(dataset['train']['quote'])
    for start in range(0, len(quotes), batch_size):
        batch_predictions = predict_batch(quotes[start:start + batch_size])
        all_predictions.extend(batch_predictions)

    #--------------------------------------------------------------------------------------------
    # MODEL INFERENCE ENDS HERE

    # Stop tracking emissions
    emissions_data = tracker.stop_task()

    # Prepare results dictionary (without calculating accuracy)
    results = {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        "model_description": DESCRIPTION,
        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
        "emissions_gco2eq": emissions_data.emissions * 1000,
        "emissions_data": clean_emissions_data(emissions_data),
        "api_route": ROUTE,
        "dataset_config": {
            "dataset_name": request.dataset_name
        }
    }

    return results
