In [1]:
!python -m pip install --upgrade pip==24.0 > /dev/null
!pip install evaluate fairseq seqeval sacremoses > /dev/null

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [10]:
from transformers import(
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForTokenClassification,
    EarlyStoppingCallback,
)

from datasets import (
    load_dataset, 
    load_from_disk,
    DatasetDict, 
    concatenate_datasets,
) 

import evaluate
import os
import numpy as np
import pandas as pd
import torch

In [None]:
dataset = load_dataset("DrBenchmark/QUAERO", "emea", trust_remote_code=True)

In [3]:
# Commenter load from disk lorsque première execution puis vis versa avec save to disk pour éviter de recharger depuis HF

path = "dataset/"

dataset = load_from_disk(path)
# dataset.save_to_disk(path)

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'tokens', 'ner_tags'],
        num_rows: 954
    })
    validation: Dataset({
        features: ['id', 'document_id', 'tokens', 'ner_tags'],
        num_rows: 119
    })
    test: Dataset({
        features: ['id', 'document_id', 'tokens', 'ner_tags'],
        num_rows: 120
    })
})

In [4]:
label_list = dataset['train'].features['ner_tags'].feature.names
num_labels = len(label_list)
id2label = {i: tag for i, tag in enumerate(label_list)}
label2id = {tag: i for i, tag in enumerate(label_list)}

In [5]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=258,
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [14]:
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [13]:
NUM_CPU = os.cpu_count()

model_checkpoint = "/kaggle/working/bio-bert/checkpoint-620"

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1", trust_remote_code=True)

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, num_proc=NUM_CPU)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    trust_remote_code=True,
)

In [30]:
# early_stopping = EarlyStoppingCallback(
#    early_stopping_patience=3,  
#    early_stopping_threshold=0.0  
# )

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./emea/biobert",
    logging_dir="./logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=50,
    logging_steps=50,
    save_strategy="epoch",
    eval_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="f1",
    report_to=["tensorboard"],
    save_safetensors=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [14]:
from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
res = nlp("Tysabri est utilisé dans le traitement des adultes atteints de sclérose en plaques ( SEP )")
res

Device set to use cuda:0


[{'entity_group': 'CHEM',
  'score': 0.9955258,
  'word': 'Ty',
  'start': 0,
  'end': 2},
 {'entity_group': 'CHEM',
  'score': 0.9379011,
  'word': '##sa',
  'start': 2,
  'end': 4},
 {'entity_group': 'CHEM',
  'score': 0.5792198,
  'word': '##bri',
  'start': 4,
  'end': 7},
 {'entity_group': 'PROC',
  'score': 0.99566215,
  'word': 'trait',
  'start': 28,
  'end': 33},
 {'entity_group': 'LIVB',
  'score': 0.983051,
  'word': 'adult',
  'start': 43,
  'end': 48},
 {'entity_group': 'LIVB',
  'score': 0.69939804,
  'word': '##es',
  'start': 48,
  'end': 50},
 {'entity_group': 'DISO',
  'score': 0.9901805,
  'word': 's',
  'start': 63,
  'end': 64},
 {'entity_group': 'DISO',
  'score': 0.9233911,
  'word': '##clérose en plaques',
  'start': 64,
  'end': 82},
 {'entity_group': 'DISO',
  'score': 0.75769997,
  'word': 'SEP',
  'start': 85,
  'end': 88}]

In [24]:
from sklearn.metrics import classification_report, confusion_matrix

predictions = trainer.predict(trainer.eval_dataset)

logits = predictions.predictions
labels = predictions.label_ids

y_pred = np.argmax(logits, axis=-1)  
y_true = labels

y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)

valid_indices = np.array(y_true) != -100

y_true_filtered = np.array(y_true)[valid_indices]
y_pred_filtered = np.array(y_pred)[valid_indices]

unique_labels = set(y_true_filtered)
filtered_target_names = [id2label[i] for i in unique_labels]

assert len(unique_labels) == len(filtered_target_names), f"Erreur : {len(unique_labels)} classes dans y_true mais {len(filtered_target_names)} dans target_names"

report_dict = classification_report(
    y_true_filtered, 
    y_pred_filtered, 
    labels=list(unique_labels),  
    target_names=filtered_target_names,  
    digits=4, 
    zero_division=0, 
    output_dict=True
)

correct_counts = (y_true_filtered == y_pred_filtered).astype(int)
correct_per_label = {id2label[label]: sum(correct_counts[y_true_filtered == label]) for label in unique_labels}

for label in report_dict.keys():
    if label in correct_per_label:
        report_dict[label]["correct"] = f"{correct_per_label[label]}"

df_report = pd.DataFrame(report_dict).transpose()
df_report


Unnamed: 0,precision,recall,f1-score,support,correct
O,0.993523,0.990772,0.992145,3251.0,3221.0
B-LIVB,0.953846,0.984127,0.96875,63.0,62.0
I-LIVB,0.954545,1.0,0.976744,21.0,21.0
B-PROC,0.948718,0.948718,0.948718,78.0,74.0
I-PROC,0.894737,0.809524,0.85,21.0,17.0
B-ANAT,0.923077,0.888889,0.90566,27.0,24.0
I-ANAT,0.909091,0.909091,0.909091,11.0,10.0
B-DEVI,1.0,0.875,0.933333,8.0,7.0
I-DEVI,1.0,1.0,1.0,2.0,2.0
B-CHEM,0.957806,0.986957,0.972163,230.0,227.0
