## Create Config

In [1]:
from gliner import GLiNERConfig

modern_bert_config = GLiNERConfig(
    model_name="answerdotai/ModernBERT-base",
    encoder_config={
        "model_type": "modernbert",
        "vocab_size": 50368,
        "hidden_size": 768,
        "num_hidden_layers": 22,
        "num_attention_heads": 12,
        "intermediate_size": 1152,
        "hidden_activation": "gelu",
        "max_position_embeddings": 8192,
        "attention_dropout": 0.0,
        "embedding_dropout": 0.0,
        "classifier_dropout": 0.0,
        "pad_token_id": 50283,
        "eos_token_id": 50282,
        "bos_token_id": 50281,
        "cls_token_id": 50281,
        "sep_token_id": 50282
    },
    hidden_size=768,
    vocab_size=50368,
    max_len=8192,
    dropout=0.0,
    fine_tune=True,
    subtoken_pooling="first",
    span_mode="markerV0",
    max_width=12,
    has_rnn=True,
    fuse_layers=False,
    class_token_index=50281  # Using CLS token ID
)

# Verify configuration
print(f"Model name: {modern_bert_config.model_name}")
print(f"Encoder type: {modern_bert_config.encoder_config.model_type}")
print(f"Hidden size: {modern_bert_config.hidden_size}")

  from .autonotebook import tqdm as notebook_tqdm
  warn(


Model name: answerdotai/ModernBERT-base
Encoder type: modernbert
Hidden size: 768


## Initiate Model

In [11]:
from gliner import GLiNER

ModernBertGLiNER = GLiNER(modern_bert_config)

# Sample text for entity prediction
text = """
Cristiano Ronaldo dos Santos Aveiro, born 5 February 1985, is a Portuguese professional footballer who plays as a forward for and captains both Saudi Pro League club Al Nassr and the Portugal national team. Widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards,[note 3] a record three UEFA Men's Player of the Year Awards, and four European Golden Shoes, the most by a European player. He has won 33 trophies in his career, including seven league titles, five UEFA Champions Leagues, the UEFA European Championship and the UEFA Nations League. Ronaldo holds the records for most appearances (183), goals (140) and assists (42) in the Champions League, goals in the European Championship (14), international goals (128) and international appearances (205). He is one of the few players to have made over 1,200 professional career appearances, the most by an outfield player, and has scored over 850 official senior career goals for club and country, making him the top goalscorer of all time.
"""

# Labels for entity prediction
# Most GLiNER models should work best when entity types are in lower case or title case
labels = ["Person", "Award", "Date", "Competitions", "Teams"]

# Perform entity prediction
entities = ModernBertGLiNER.predict_entities(text, labels, threshold=0.5)

# Display predicted entities and their labels
for entity in entities:
    print(entity["text"], "=>", entity["label"])

200 professional career => Person


## Prepare training data

In [6]:
import json
with open('pilener_train.json', 'r') as f:
        processed_data = json.load(f)

## Training

In [12]:
# 1. Initialize configuration and model
import torch
from transformers import AutoTokenizer
from gliner import GLiNER
from gliner.data_processing import WordsSplitter, GLiNERDataset
from gliner.data_processing.collator import DataCollator, DataCollatorWithPadding
from gliner.training import Trainer, TrainingArguments

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 2. Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
words_splitter = WordsSplitter("whitespace")
model = GLiNER(modern_bert_config, tokenizer=tokenizer, words_splitter=words_splitter)

# 3. Prepare training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    others_lr=1e-4,
    others_weight_decay=0.01,
    focal_loss_gamma=2.0,
    focal_loss_alpha=1.0,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    max_steps=10000,
    save_steps=1000,
    save_total_limit=2,
    dataloader_num_workers=4,
    bf16=True if torch.cuda.is_available() else False
)

# 4. Initialize trainer

train_dataset = GLiNERDataset(processed_data, modern_bert_config, tokenizer, words_splitter)
data_collator = DataCollatorWithPadding(modern_bert_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # data_collator=DataCollator(model.config, 
    #                           data_processor=model.data_processor, 
    #                           prepare_labels=True)
)

# 5. Train
trainer.train()

Collecting all entities...


100%|██████████| 45889/45889 [00:00<00:00, 362395.16it/s]

Total number of entity classes:  15176



  trainer = Trainer(
W0429 11:17:46.591000 5769 site-packages/torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Skipping iteration due to error: Target size (torch.Size([321600, 1])) must be the same as input size (torch.Size([12864, 1]))


Step,Training Loss
500,0.0


Skipping iteration due to error: Target size (torch.Size([373200, 1])) must be the same as input size (torch.Size([14928, 1]))
Skipping iteration due to error: Target size (torch.Size([279312, 1])) must be the same as input size (torch.Size([12144, 1]))
Skipping iteration due to error: Target size (torch.Size([310800, 1])) must be the same as input size (torch.Size([12432, 1]))
Skipping iteration due to error: Target size (torch.Size([304800, 1])) must be the same as input size (torch.Size([12192, 1]))
Skipping iteration due to error: Target size (torch.Size([319200, 1])) must be the same as input size (torch.Size([12768, 1]))
Skipping iteration due to error: Target size (torch.Size([195840, 1])) must be the same as input size (torch.Size([12240, 1]))
Skipping iteration due to error: Target size (torch.Size([310800, 1])) must be the same as input size (torch.Size([12432, 1]))
Skipping iteration due to error: Target size (torch.Size([310800, 1])) must be the same as input size (torch.Si

KeyboardInterrupt: 