In [1]:
from datasets import load_dataset
from gliner import GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollator
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def ner_tags_to_spans(samples, tag_to_id):
    """
    Converts the BIO-encoded NER tags of one sample into entity spans.

    Args:
        samples (dict): A single sample holding a "tokens" sequence and a
            "ner_tags" sequence of integer tag IDs.
        tag_to_id (dict): Maps tag names ('O', 'B-<type>', 'I-<type>') to IDs.
            The ID of 'O' is read from this mapping; when 'O' is absent the
            ID 0 is treated as the outside tag.

    Returns:
        dict: ``{"tokenized_text": tokens, "ner": spans}`` where ``tokens`` is
        ``samples["tokens"]`` unchanged and ``spans`` is a list of
        ``(start, end, entity_type)`` tuples with inclusive token indices.

    Raises:
        KeyError: If a tag ID other than the outside ID is missing from
            ``tag_to_id``.
    """
    ner_tags = samples["ner_tags"]
    id_to_tag = {v: k for k, v in tag_to_id.items()}
    # Do not assume 'O' is always ID 0; fall back to 0 only when it is unmapped.
    outside_id = tag_to_id.get("O", 0)
    spans = []
    start_pos = None
    entity_name = None

    for i, tag in enumerate(ner_tags):
        if tag == outside_id:
            # An outside tag terminates whatever entity is currently open.
            if entity_name is not None:
                spans.append((start_pos, i - 1, entity_name))
                entity_name = None
                start_pos = None
            continue

        tag_name = id_to_tag[tag]
        prefix, tag_type = tag_name[:2], tag_name[2:]
        # A new span starts on every 'B-' tag, and also on an 'I-' tag that
        # cannot continue the open entity (nothing open, or a different type).
        # The latter keeps IOB1-style or slightly malformed sequences from
        # being dropped or merged into the wrong entity.
        starts_new_span = prefix == "B-" or (
            prefix == "I-" and tag_type != entity_name
        )
        if starts_new_span:
            if entity_name is not None:
                spans.append((start_pos, i - 1, entity_name))
            entity_name = tag_type
            start_pos = i
        # Otherwise: an 'I-' tag continuing the open entity, or a tag with an
        # unrecognised prefix -- neither changes the current state.

    # Flush an entity that runs up to the last token.
    if entity_name is not None:
        spans.append((start_pos, len(samples["tokens"]) - 1, entity_name))

    return {"tokenized_text": samples["tokens"], "ner": spans}

In [3]:
# BIO tag name -> integer ID; each label's ID is its position in the list below.
# NOTE(review): presumably this order must match the dataset's own ner_tags
# label order -- confirm against the dataset features.
tag_to_id = {
    label: idx
    for idx, label in enumerate([
        'O',
        'B-person', 'I-person',
        'B-organization', 'I-organization',
        'B-location', 'I-location',
        'B-misc', 'I-misc',
    ])
}

In [4]:
# Load CoNLL-2003 from the Hugging Face Hub; the 'train', 'test' and
# 'validation' splits are read from it below.
# trust_remote_code=True lets the dataset repository's loading script run
# locally, so only use it with a repository you trust.
dataset = load_dataset(
    "eriktks/conll2003",
    trust_remote_code=True,
)

In [5]:
# Convert every sample of each split from tag IDs to span annotations.
# The generator yields the converted splits in the order train, test, validation.
data_train, data_test, data_val = (
    [ner_tags_to_spans(sample, tag_to_id) for sample in dataset[split_name]]
    for split_name in ('train', 'test', 'validation')
)

In [6]:
# Start from the pretrained "urchade/gliner_small" checkpoint.
model = GLiNER.from_pretrained("urchade/gliner_small")

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

Fetching 4 files: 100%|██████████| 4/4 [00:00<?, ?it/s]
  return self.fget.__get__(instance, owner)()


In [7]:
# Batch collator built from the model's own config and data processor.
# NOTE(review): prepare_labels=True presumably makes the collator also build
# the training targets -- confirm against the GLiNER DataCollator docs.
data_collator = DataCollator(
    model.config,
    data_processor=model.data_processor,
    prepare_labels=True,
)

In [8]:
# Baseline: score the model before any fine-tuning on the first 100 test
# samples, restricted to the four entity types used in tag_to_id.
evaluation_results = model.evaluate(
    data_test[:100],
    flat_ner=True,
    entity_types=["person", "organization", "location", "misc"],
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
# Heading ("Results without fine-tuning") followed by the baseline metrics,
# each on its own line.
print("Результаты без дообучения", evaluation_results, sep="\n")

Результаты без дообучения
('P: 79.24%\tR: 86.18%\tF1: 82.56%\n', 0.8256070640176602)


In [10]:
# Set the same length limit (800) on the transformer tokenizer and on the
# data processor's config. The tokenizer attribute is assigned first, then the
# config attribute.
tokenizers = model.data_processor.transformer_tokenizer
tokenizers.model_max_length = model.data_processor.config.max_len = 800

In [11]:
# Fine-tuning configuration: 5 epochs, batch size 8, with logging, evaluation
# and checkpointing each performed once per epoch.
training_args = TrainingArguments(
    # Forward slashes instead of ".\g...": "\g" is an invalid escape sequence
    # in a normal string literal, and a backslash is a path separator only on
    # Windows. This spelling resolves to the same directory on every platform.
    output_dir="./gliner_finetuned_conll2003",
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_strategy="epoch",
    save_total_limit=2,  # cap on the number of checkpoints kept on disk
    logging_strategy="epoch",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version before upgrading.
    evaluation_strategy="epoch",
    report_to="none",  # no external experiment-tracking integrations
)

# Train on a small subset: the first 500 training samples, evaluated on the
# first 100 validation samples.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train[:500],
    eval_dataset=data_val[:100],
    tokenizer=tokenizers,
    data_collator=data_collator,
)



In [12]:
# Run the fine-tuning loop configured above. As the last expression of the
# cell, the value returned by train() is displayed as the cell output.
trainer.train()

 20%|██        | 63/315 [05:16<16:46,  3.99s/it]

{'loss': 11.5054, 'grad_norm': 0.3242212235927582, 'learning_rate': 4e-05, 'epoch': 1.0}


                                                
 20%|██        | 63/315 [05:24<16:46,  3.99s/it]

{'eval_loss': 11.181735038757324, 'eval_runtime': 8.2701, 'eval_samples_per_second': 12.092, 'eval_steps_per_second': 1.572, 'epoch': 1.0}


 40%|████      | 126/315 [10:28<13:44,  4.36s/it]

{'loss': 5.4851, 'grad_norm': 466.9226379394531, 'learning_rate': 3e-05, 'epoch': 2.0}


                                                 
 40%|████      | 126/315 [10:37<13:44,  4.36s/it]

{'eval_loss': 5.476375102996826, 'eval_runtime': 8.195, 'eval_samples_per_second': 12.203, 'eval_steps_per_second': 1.586, 'epoch': 2.0}


 60%|██████    | 189/315 [15:43<08:58,  4.27s/it]

{'loss': 3.3371, 'grad_norm': 86.00082397460938, 'learning_rate': 2e-05, 'epoch': 3.0}


                                                 
 60%|██████    | 189/315 [15:51<08:58,  4.27s/it]

{'eval_loss': 17.714698791503906, 'eval_runtime': 7.9143, 'eval_samples_per_second': 12.635, 'eval_steps_per_second': 1.643, 'epoch': 3.0}


 80%|████████  | 252/315 [21:50<05:12,  4.96s/it]

{'loss': 2.2341, 'grad_norm': 0.0049573942087590694, 'learning_rate': 1e-05, 'epoch': 4.0}


                                                 
 80%|████████  | 252/315 [21:58<05:12,  4.96s/it]

{'eval_loss': 15.849194526672363, 'eval_runtime': 8.1637, 'eval_samples_per_second': 12.249, 'eval_steps_per_second': 1.592, 'epoch': 4.0}


100%|██████████| 315/315 [27:15<00:00,  4.44s/it]

{'loss': 0.7583, 'grad_norm': 0.19619490206241608, 'learning_rate': 0.0, 'epoch': 5.0}


                                                 
100%|██████████| 315/315 [27:23<00:00,  4.44s/it]

{'eval_loss': 24.19757843017578, 'eval_runtime': 8.5049, 'eval_samples_per_second': 11.758, 'eval_steps_per_second': 1.529, 'epoch': 5.0}


100%|██████████| 315/315 [27:29<00:00,  5.24s/it]


{'train_runtime': 1649.4809, 'train_samples_per_second': 1.516, 'train_steps_per_second': 0.191, 'train_loss': 4.664005279541016, 'epoch': 5.0}


TrainOutput(global_step=315, training_loss=4.664005279541016, metrics={'train_runtime': 1649.4809, 'train_samples_per_second': 1.516, 'train_steps_per_second': 0.191, 'total_flos': 0.0, 'train_loss': 4.664005279541016, 'epoch': 5.0})

In [10]:
# Reload the fine-tuned weights from the checkpoint written at step 315 (the
# final global step reported by training) and move them to the selected device.
# Forward slashes replace the original backslashes: "\g" and "\c" are invalid
# escape sequences in a normal string literal, and a backslash is a path
# separator only on Windows.
model = GLiNER.from_pretrained("./gliner_finetuned_conll2003/checkpoint-315")
model = model.to(device)

config.json not found in D:\Documents\NLP_NSU\Lab2\gliner_finetuned_conll2003\checkpoint-315


In [11]:
# Score the reloaded fine-tuned model on the same 100 test samples and entity
# types as the baseline. This rebinds evaluation_results, so the baseline
# value is no longer available afterwards.
evaluation_results = model.evaluate(
    data_test[:100],
    flat_ner=True,
    entity_types=["person", "organization", "location", "misc"],
)

In [12]:
# Show the metrics from the evaluation of the fine-tuned model.
print(evaluation_results)

('P: 95.02%\tR: 96.77%\tF1: 95.89%\n', 0.958904109589041)
