## Create Config

In [1]:
from gliner import GLiNERConfig

modern_bert_config = GLiNERConfig(
    model_name="answerdotai/ModernBERT-base",
    encoder_config={
        "model_type": "modernbert",
        "vocab_size": 50368,
        "hidden_size": 768,
        "num_hidden_layers": 22,
        "num_attention_heads": 12,
        "intermediate_size": 1152,
        "hidden_activation": "gelu",
        "max_position_embeddings": 8192,
        "attention_dropout": 0.0,
        "embedding_dropout": 0.0,
        "classifier_dropout": 0.0,
        "pad_token_id": 50283,
        "eos_token_id": 50282,
        "bos_token_id": 50281,
        "cls_token_id": 50281,
        "sep_token_id": 50282
    },
    hidden_size=768,
    vocab_size=50368,
    max_len=8192,
    dropout=0.0,
    fine_tune=True,
    subtoken_pooling="first",
    span_mode="markerV0",
    max_width=12,
    has_rnn=True,
    fuse_layers=False,
    class_token_index=50281  # Using CLS token ID
)

# Verify configuration
print(f"Model name: {modern_bert_config.model_name}")
print(f"Encoder type: {modern_bert_config.encoder_config.model_type}")
print(f"Hidden size: {modern_bert_config.hidden_size}")

  from .autonotebook import tqdm as notebook_tqdm
  warn(


Model name: answerdotai/ModernBERT-base
Encoder type: modernbert
Hidden size: 768


## Initiate Model

In [11]:
from gliner import GLiNER

ModernBertGLiNER = GLiNER(modern_bert_config)

# Sample text for entity prediction
text = """
Cristiano Ronaldo dos Santos Aveiro, born 5 February 1985, is a Portuguese professional footballer who plays as a forward for and captains both Saudi Pro League club Al Nassr and the Portugal national team. Widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards,[note 3] a record three UEFA Men's Player of the Year Awards, and four European Golden Shoes, the most by a European player. He has won 33 trophies in his career, including seven league titles, five UEFA Champions Leagues, the UEFA European Championship and the UEFA Nations League. Ronaldo holds the records for most appearances (183), goals (140) and assists (42) in the Champions League, goals in the European Championship (14), international goals (128) and international appearances (205). He is one of the few players to have made over 1,200 professional career appearances, the most by an outfield player, and has scored over 850 official senior career goals for club and country, making him the top goalscorer of all time.
"""

# Labels for entity prediction
# Most GLiNER models should work best when entity types are in lower case or title case
labels = ["Person", "Award", "Date", "Competitions", "Teams"]

# Perform entity prediction
entities = ModernBertGLiNER.predict_entities(text, labels, threshold=0.5)

# Display predicted entities and their labels
for entity in entities:
    print(entity["text"], "=>", entity["label"])

200 professional career => Person


## Prepare training data

In [13]:
from datasets import load_dataset
import re
import ast
import json
from tqdm import tqdm


def tokenize_text(text):
    """Tokenizes the input text into a list of tokens."""
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)


def process_entities(dataset):
    """Processes entities in the dataset to extract tokenized text and named entity spans."""
    all_data = []
    for el in tqdm(dataset["entity"]):
        try:
            tokenized_text = tokenize_text(el["input"])
            parsed_output = ast.literal_eval(el["output"])
            entity_texts, entity_types = zip(*[i.split(" <> ") for i in parsed_output])

            entity_spans = []
            for j, entity_text in enumerate(entity_texts):
                entity_tokens = tokenize_text(entity_text)
                matches = []
                for i in range(len(tokenized_text) - len(entity_tokens) + 1):
                    if " ".join(tokenized_text[i:i + len(entity_tokens)]).lower() == " ".join(entity_tokens).lower():
                        matches.append((i, i + len(entity_tokens) - 1, entity_types[j]))
                if matches:
                    entity_spans.extend(matches)

        except Exception as e:
            continue

        all_data.append({"tokenized_text": tokenized_text, "ner": entity_spans})
    return all_data


def save_data_to_file(data, filepath):
    """Saves the processed data to a JSON file."""
    with open(filepath, 'w') as f:
        json.dump(data, f)


In [16]:
from datasets import load_dataset
import re
import ast
import json
from tqdm import tqdm

# 1. Load dataset with download mode
try:
    dataset = load_dataset(
        "numind/NuNER",
        download_mode="force_redownload",
        verification_mode="no_checks"
    )
    
    # Access train split
    train_dataset = dataset['train'] if 'train' in dataset else dataset
    
    # Process entities
    processed_data = process_entities(train_dataset)
    
    # Save processed data
    save_data_to_file(processed_data, 'nuner_train.json')
    
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Alternative: manual download
    print("Trying manual download...")
    
    import requests
    import os
    
    def download_dataset():
        url = "https://huggingface.co/datasets/numind/NuNER/raw/main/data/train.json"
        response = requests.get(url)
        if response.status_code == 200:
            with open("nuner_train.json", "wb") as f:
                f.write(response.content)
            return json.loads(response.content)
        return None
    
    raw_dataset = download_dataset()
    if raw_dataset:
        processed_data = process_entities({"entity": raw_dataset})
        save_data_to_file(processed_data, 'nuner_processed.json')

print("dataset size:", len(processed_data))

Downloading readme: 100%|██████████| 387/387 [00:00<00:00, 3.76MB/s]
Downloading data: 100%|██████████| 288M/288M [00:04<00:00, 65.1MB/s]
Downloading data: 100%|██████████| 522M/522M [00:07<00:00, 68.0MB/s]]
Downloading data files: 100%|██████████| 2/2 [00:12<00:00,  6.08s/it]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 2237.56it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)
Generating entity split: 100%|██████████| 1000000/1000000 [00:03<00:00, 301910.52 examples/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)
Generating full split: 100%|██████████| 1000000/1000000 [00:05<00:00, 193201.09 examples/s]


Error loading dataset: Loading a dataset cached in a LocalFileSystem is not supported.
Trying manual download...
dataset size: 45889


In [22]:
with open('nuner_train.json', 'r') as f:
        processed_data = json.load(f)

## Training

In [25]:
# 1. Initialize configuration and model
import torch
from transformers import AutoTokenizer
from gliner import GLiNER
from gliner.data_processing import WordsSplitter
from gliner.data_processing.collator import DataCollator
from gliner.training import Trainer, TrainingArguments

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 2. Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
words_splitter = WordsSplitter("whitespace")
model = GLiNER(modern_bert_config, tokenizer=tokenizer, words_splitter=words_splitter)

# 3. Prepare training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    others_lr=1e-4,
    others_weight_decay=0.01,
    focal_loss_gamma=2.0,
    focal_loss_alpha=1.0,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    max_steps=10000,
    save_steps=1000,
    save_total_limit=2,
    dataloader_num_workers=4,
    bf16=True if torch.cuda.is_available() else False
)

# 4. Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_data,
    tokenizer=tokenizer,
    data_collator=DataCollator(model.config, 
                              data_processor=model.data_processor, 
                              prepare_labels=True)
)

# 5. Train
trainer.train()

  trainer = Trainer(


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# 1. Initialize configuration and model
import torch
from transformers import AutoTokenizer
from gliner import GLiNER
from gliner.data_processing import WordsSplitter
from gliner.data_processing.collator import DataCollator
from gliner.training import Trainer, TrainingArguments

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 2. Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
words_splitter = WordsSplitter("whitespace")
model = GLiNER.from_pretrained("answerdotai/ModernBERT-base")

# 3. Prepare training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    others_lr=1e-4,
    others_weight_decay=0.01,
    focal_loss_gamma=2.0,
    focal_loss_alpha=1.0,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    max_steps=10000,
    save_steps=1000,
    save_total_limit=2,
    dataloader_num_workers=4,
    bf16=True if torch.cuda.is_available() else False
)

# 4. Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_data,
    tokenizer=tokenizer,
    data_collator=DataCollator(model.config, 
                              data_processor=model.data_processor, 
                              prepare_labels=True)
)

# 5. Train
trainer.train()

Fetching 16 files: 100%|██████████| 16/16 [00:34<00:00,  2.16s/it]


FileNotFoundError: [Errno 2] No such file or directory: '/home/leo/.cache/huggingface/hub/models--answerdotai--ModernBERT-base/snapshots/8949b909ec900327062f0ebf497f51aef5e6f0c8/gliner_config.json'