In [None]:
%%capture
# Install Pytorch & other libraries
%pip install "torch==2.4.1" tensorboard
%pip install flash-attn "setuptools<71.0.0" scikit-learn
%pip install --upgrade torchvision

# Install Hugging Face libraries
%pip install  --upgrade \
  "datasets==3.1.0" \
  "accelerate==1.2.1" \
  "hf-transfer==0.1.8"

# # ModernBERT is not yet available in an official transformers release, so we need to install it from github
# %pip install "git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1" --upgrade

In [2]:
import torch

torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)


In [None]:
!pip install transformers



In [3]:
from datasets import load_dataset

# Dataset id from huggingface.co/dataset
dataset_id = "DevQuasar/llm_router_dataset-synth"

# Load raw dataset
raw_dataset = load_dataset(dataset_id)

# Split into our Test & Train sets
train_dataset = raw_dataset['train']
test_dataset = raw_dataset['test']

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/698 [00:00<?, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Train dataset size: 15306
Test dataset size: 4921


In [None]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'label'],
        num_rows: 15306
    })
    test: Dataset({
        features: ['id', 'prompt', 'label'],
        num_rows: 4921
    })
})

The Transformers trainer is going to expect not just a train and test set, but two columns, one marked text and one marked label. We can clean up our train and test data using the datasets .remove_columns and .renamed_column methods to prepare our data.

In [4]:
train_dataset = train_dataset.remove_columns(["id"])
train_dataset = train_dataset.rename_column("prompt", "text")
train_dataset = train_dataset.rename_column("label", "labels")

test_dataset = test_dataset.remove_columns(["id"])
test_dataset = test_dataset.rename_column("prompt", "text")
test_dataset = test_dataset.rename_column("label", "labels")

In [5]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
# model = AutoModelForMaskedLM.from_pretrained("answerdotai/ModernBERT-base").to("cpu")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [None]:
temp = tokenizer("hi bro")
temp

{'input_ids': [50281, 5801, 1795, 50282], 'attention_mask': [1, 1, 1, 1]}

In [None]:
x = tokenizer.get_vocab()
di = dict(zip(x.values(), x.keys()))
for ints in temp['input_ids']:
  print(di[ints])

[CLS]
hi
Ġbro
[SEP]


In [None]:
tokenizer.convert_ids_to_tokens(temp['input_ids'])

['[CLS]', 'hi', 'Ġbro', '[SEP]']

In [None]:
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(temp['input_ids']))

'[CLS]hi bro[SEP]'

In [None]:
# help(tokenizer)
# remove this comment '#' to see details about tokenizer

In [6]:
tokenizer.model_max_length = 256 # changing from 1024 to 256

def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, return_tensors="pt")

In [7]:
tokenized_train_dataset = train_dataset.map(tokenize, batched=True,remove_columns=["text"])

tokenized_test_dataset = test_dataset.map(tokenize, batched=True,remove_columns=["text"])


print(tokenized_train_dataset.features.keys())
print(tokenized_test_dataset.features.keys())

Map:   0%|          | 0/15306 [00:00<?, ? examples/s]

Map:   0%|          | 0/4921 [00:00<?, ? examples/s]

dict_keys(['labels', 'input_ids', 'attention_mask'])
dict_keys(['labels', 'input_ids', 'attention_mask'])


In [None]:
model

ModernBertForMaskedLM(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      )
      (1-21)

In [None]:
text = "Deep learning is [MASK] powerful."
# text = "hi, i am llm and [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK]"

inputs = tokenizer(text, return_tensors="pt").to("cuda")
outputs = model(**inputs)

mask_idx = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
logits = outputs.logits[0, mask_idx]
pred_id = logits.argmax(dim=-1)

tokenizer.decode(pred_id)


  return torch._C._get_cublas_allow_tf32()
W0110 13:59:33.511000 1657 torch/_inductor/utils.py:1558] [1/0_1] Not enough SMs to use max_autotune_gemm mode


' very'

In [8]:
from transformers import AutoModelForSequenceClassification

# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"

# Prepare model labels - useful for inference
labels = tokenized_train_dataset.features["labels"].names
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

# Download the model from huggingface.co/models
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels = 2
).to("cuda")

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      

Training Evaluation Metric
To evaluate our model's performance during training, we use the F1 score metric.
The F1 score combines
Precision: Out of all the times we predicted large_llm, how many were actually for a large LLMs?
Recall: Out of all the actual large_llm labels, how many did we catch?
Into F1 = 2 * (precision * recall) / (precision + recall)
The compute_metrics function processes our model's predictions in two steps:
Converts the model's raw output probabilities into actual predictions using argmax (selecting the class with highest probability)
Calculates the weighted F1 score comparing these predictions against the true labels
We use a weighted F1 score to account for both classes (small_llm and large_llm), with pos_label=1 indicating that large_llm is our positive class. The weighting ensures that both classes are properly considered in our evaluation, even if our dataset isn't perfectly balanced between the two classes.
This metric will be calculated during training to help us understand how well our model is learning.

In [9]:
import numpy as np
from sklearn.metrics import f1_score

# Metric helper method
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    score = f1_score(
            labels, predictions, labels=labels, pos_label=1, average="weighted"
        )
    return {"f1": float(score) if score == 1 else score}

In [13]:
from transformers import Trainer, TrainingArguments

# IMPORTANT: do this BEFORE creating Trainer
model.gradient_checkpointing_enable()
model.config.use_cache = False

training_args = TrainingArguments(
    output_dir="ModernBERT-large-llm-router",

    # --------- MEMORY-SAFE SETTINGS (T4) ---------
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch = 16

    fp16=True,          # T4 supports fp16
    bf16=False,         # must be false on T4

    optim="adamw_torch",  # safer than fused on T4

    learning_rate=5e-5,
    num_train_epochs=3,

    # --------- LOGGING / EVAL ---------
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)


In [14]:
trainer.train(resume_from_checkpoint=False)

Epoch,Training Loss,Validation Loss,F1
1,0.1984,0.038315,0.990914
2,0.065,0.06221,0.99152
3,0.0093,0.066254,0.991924


TrainOutput(global_step=2871, training_loss=0.10756907190897871, metrics={'train_runtime': 2638.0869, 'train_samples_per_second': 17.406, 'train_steps_per_second': 1.088, 'total_flos': 7823459386681344.0, 'train_loss': 0.10756907190897871, 'epoch': 3.0})

In [15]:
trainer.save_model("final_model")

In [16]:
# loading saved model:

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("final_model")