In [1]:
! pip install gliner
! pip install accelerate -U
! pip install faker

Collecting gliner
  Downloading gliner-0.2.8-py3-none-any.whl.metadata (6.3 kB)
Collecting onnxruntime (from gliner)
  Downloading onnxruntime-1.18.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.0.0->gliner)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.0.0->gliner)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.0.0->gliner)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2.0.0->gliner)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=2.0.0->gliner)
  Using cached nvidia_cublas_cu12-12.1

In [3]:
import json
import random

# Fixed seed so the train/test split is identical on every "Restart & Run All".
SPLIT_SEED = 42
# Share of the examples kept for training; the remainder becomes the test set.
TRAIN_FRACTION = 0.9

train_path = 'ner_dataset.json'

# Load the data. The encoding is explicit so that reading does not depend on
# the platform's default locale.
with open(train_path, "r", encoding="utf-8") as f:
    data = json.load(f)

print('Dataset size:', len(data))


# Shuffle the data in place with a dedicated seeded generator: the result is
# reproducible and the global `random` state is left untouched.
random.Random(SPLIT_SEED).shuffle(data)
print('Dataset is shuffled...')

# Split the data into a training set and a test set. The boundary is computed
# once so both slices are guaranteed to use the same index.
split_index = int(len(data) * TRAIN_FRACTION)
train_dataset = data[:split_index]
test_dataset = data[split_index:]

# File paths used to persist the two subsets
train_dataset_path = 'train_dataset.json'
test_dataset_path = 'test_dataset.json'

# Save the training set
with open(train_dataset_path, "w", encoding="utf-8") as train_file:
    json.dump(train_dataset, train_file)
print(f'Training dataset saved to {train_dataset_path}')

# Save the test set
with open(test_dataset_path, "w", encoding="utf-8") as test_file:
    json.dump(test_dataset, test_file)
print(f'Test dataset saved to {test_dataset_path}')


Dataset size: 456
Dataset is shuffled...
Training dataset saved to train_dataset.json
Test dataset saved to test_dataset.json


In [4]:
import os
# Set before the ML libraries below are imported so the value is already in
# the environment when they load.
# NOTE(review): presumably read by the Hugging Face `tokenizers` library to
# allow parallel tokenization — confirm "true" is the intended value given
# that the DataLoader below runs with 0 workers.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import torch
from gliner import GLiNERConfig, GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollatorWithPadding, DataCollator
from gliner.utils import load_config_as_namespace
from gliner.data_processing import WordsSplitter, GLiNERDataset

In [5]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Load the pretrained GLiNER checkpoint that will be fine-tuned below.
model = GLiNER.from_pretrained("urchade/gliner_small-v2.1")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/611M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [6]:
# Collator configured to prepare labels itself (prepare_labels=True).
# This variant mirrors the original implementation and gives better
# performance, at the cost of being less memory efficient.
data_collator = DataCollator(
    model.config,
    data_processor=model.data_processor,
    prepare_labels=True,
)

In [7]:
# Move the model onto the selected device (`device` is the GPU when CUDA is
# available, otherwise the CPU). Nothing is compiled in this cell: only the
# device transfer happens.
model.to(device)
print("done")

done


In [8]:
# Derive the number of epochs from a target number of optimisation steps.
num_steps = 500
batch_size = 8
data_size = len(train_dataset)
# Steps per epoch, rounded up: a final partial batch is still one step (the
# logged run performs 52 steps per epoch for 410 examples, not 410 // 8 = 51).
# The max(1, ...) guard avoids a ZeroDivisionError on the next line when the
# training set is empty.
num_batches = max(1, (data_size + batch_size - 1) // batch_size)
# Always train for at least one full epoch, even if one epoch already exceeds
# the step budget.
num_epochs = max(1, num_steps // num_batches)

# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir="models",
    save_steps=100,
    save_total_limit=10,
    report_to="none",
    # --- optimisation ---
    learning_rate=5e-6,
    weight_decay=0.01,
    # NOTE(review): `others_*` are GLiNER-specific arguments; presumably they
    # apply to parameters outside the main encoder — confirm in gliner.training.
    others_lr=1e-5,
    others_weight_decay=0.01,
    lr_scheduler_type="linear",  # alternative: "cosine"
    warmup_ratio=0.1,
    # --- batching / schedule ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    dataloader_num_workers=0,
    # --- evaluation ---
    # NOTE(review): newer transformers releases name this `eval_strategy`;
    # confirm against the installed version before upgrading.
    evaluation_strategy="steps",
    # --- hardware ---
    use_cpu=False,
)

# Build the trainer. The held-out split is passed as `eval_dataset` because the
# training arguments request step-based evaluation; without it an evaluation
# step has no data to run on and the validation-loss column stays empty.
# NOTE(review): `eval_steps` is left at its default, so evaluation may fire
# rarely (or never) on short runs — set it explicitly if validation loss should
# be reported more often.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=model.data_processor.transformer_tokenizer,
    data_collator=data_collator,
)

# Run fine-tuning; as the last expression, the returned TrainOutput is shown
# as the cell's output.
trainer.train()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step,Training Loss,Validation Loss


TrainOutput(global_step=468, training_loss=25.175777076655983, metrics={'train_runtime': 148.0506, 'train_samples_per_second': 24.924, 'train_steps_per_second': 3.161, 'total_flos': 0.0, 'train_loss': 25.175777076655983, 'epoch': 9.0})

In [9]:
# Persist the fine-tuned model to a fixed directory below "models/".
final_model_dir = "models/gliner_small-v2.1_V1"
trainer.save_model(final_model_dir)