In [1]:
# %pip install datasets -q
# %pip install transformers -q
# %pip install torch -q
# %pip install seqeval -q
# %pip install evaluate -q
# %pip install accelerate -q

In [2]:
from datasetutils import decode
from iob2converter import iob2_to_dataset
from transformers import AutoModelForTokenClassification

In [3]:
fr_ds = iob2_to_dataset(r'C:\Users\serru\OneDrive\Documents\GitHub\NLP\TaggedSeparated\French\synopses_1.iob2')

ner_feature_fr = fr_ds.features['ner_tags']
label_names = ner_feature_fr.feature.names

In [4]:
words = fr_ds[0]['tokens']
labels = fr_ds[0]['ner_tags']
print('\n'.join(decode(words, labels, label_names)))

Il y a bien longtemps dans une petite ville du bord de mer appellée Alto  Mare  vivait un vieux couple qui était seul sans enfant Un jour ils trouvirent 2 enfants un garçon et une fille Ce vieux couple aimait au plus profond leurs enfants et nauraient jamais souhaités les perdre Malheureusement un sombre nuage sinstalla sur Alto  Mare  pour des raisons inconnues La mère de ces deux enfants se promennaient avec eux en ville en direction de leur domicile quand soudain un éclair maléfique frappa la mère qui mourrut sur le coup Les deux enfants se mirent à pleurer leur mère et comme la légende le dit ça sest vu dans Pokémon 1 ou le retour de Mewtwo Les larmes venues du coeur ont des pouvoirs extraordinaires Juste après leur mort un second éclair frappa les deux enfants et furent eux aussi tués sur le coup Mais leur tristesse et leur larmes se mirent à former une sphère bleue  la Larme  venue du coeur Cette sphère aux pouvoirs magiques ressussita les deux enfants en Pokémon Latias et Latios

In [5]:
from transformers import AutoTokenizer

model_id = 'google-bert/bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True, 
        padding="max_length", 
        max_length=128,
        is_split_into_words=True
    )

    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

def align_labels_with_tokens(labels, word_ids):
    """
    This function aligns labels with tokens produced by the tokenizer.
    - `-100` is used for special tokens to ignore them during training.
    - If the label is B-XXX, subsequent sub-tokens receive I-XXX.
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            new_labels.append(-100)
        else:
            label = labels[word_id]
            # Convert B-XXX to I-XXX for sub-tokens
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels


In [7]:
from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value

def iob2_to_dataset(file_path):
    """
    Converts an IOB2 file into a DatasetDict with train and validation splits.
    Assumes the input file uses whitespace to separate tokens and tags, and that each sentence is separated by a blank line.
    """
    tokens, ner_tags = [], []
    sentences, sentence_tags = [], []

    label_set = set()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens and ner_tags:
                    sentences.append(tokens)
                    sentence_tags.append(ner_tags)
                tokens, ner_tags = [], []
            else:
                word, tag = line.split()
                tokens.append(word)
                ner_tags.append(tag)
                label_set.add(tag)

        if tokens and ner_tags:
            sentences.append(tokens)
            sentence_tags.append(ner_tags)

    label_list = sorted(label_set)
    label_mapping = {label: i for i, label in enumerate(label_list)}

    indexed_tags = [[label_mapping[tag] for tag in tags] for tags in sentence_tags]
    dataset = Dataset.from_dict({"tokens": sentences, "ner_tags": indexed_tags})
    
    train_size = int(0.8 * len(dataset))
    datasets = dataset.train_test_split(train_size=train_size)

    features = Features({
        "tokens": Sequence(Value("string")),
        "ner_tags": Sequence(ClassLabel(names=label_list))
    })

    datasets = DatasetDict({
        "train": datasets["train"].cast(features),
        "validation": datasets["test"].cast(features)
    })

    return datasets

file_path = r'C:\Users\serru\OneDrive\Documents\GitHub\NLP\TaggedSeparated\French\synopses_1.iob2'
fr_ds = iob2_to_dataset(file_path)


Casting the dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

In [8]:
model = AutoModelForTokenClassification.from_pretrained(
    model_id, num_labels=len(label_names)
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
tokenized_ds = fr_ds.map(tokenize_and_align_labels, batched=True)

print(tokenized_ds)


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})


In [10]:
for i in range(2):
    print(tokenized_ds['train']['tokens'][i])
    print(tokenized_ds['train']['labels'][i])
    print()

['Dans', 'le', 'Royaume', 'des', 'Diamants', 'souterrain', 'où', 'de', 'nombreux', 'Strassie', 'vivent', 'le', 'Pokémon', 'fabuleux', 'Diancie', 'en', 'est', 'le', 'chef', 'Le', 'Cœur', 'de', 'Diamant', 'qui', 'assure', 'la', 'pérennité', 'du', 'royaume', 'séteint', 'peu', 'à', 'peu', 'et', 'Diancie', 'nest', 'pas', 'encore', 'suffisamment', 'forte', 'pour', 'en', 'créer', 'un', 'nouveau', 'Alors', 'quelle', 'recherche', 'laide', 'du', 'Pokémon', 'légendaire', 'Xerneas', 'Diancie', 'rencontre', 'un', 'groupe', 'de', 'voleurs', 'voulant', 'sapproprier', 'sa', 'capacité', 'à', 'créer', 'des', 'diamants', 'et', 'qui', 'par', 'la', 'même', 'occasion', 'réveillent', 'le', 'Pokémon', 'légendaire', 'Yveltal', 'et', 'le', 'tirent', 'de', 'son', 'cocon', 'Sacha', 'et', 'ses', 'amis', 'vontils', 'parvenir', 'à', 'aider', 'Diancie', 'à', 'découvrir', 'son', 'vrai', 'pouvoir', 'arrêter', 'la', 'rage', 'destructive', 'dYveltal', 'et', 'sauver', 'le', 'Royaume', 'des', 'Diamants']
[-100, 9, 9, 0, 5,

In [13]:
import evaluate

metric = evaluate.load("seqeval")

In [14]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [15]:
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained (
    model_id,
    num_labels=len(label_names),
    id2label={id: label for id, label in enumerate(label_names)},
    label2id={label: id for id, label in enumerate(label_names)},
)
model.config.num_labels

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


12

In [17]:
from transformers import TrainingArguments
from transformers import Trainer

args = TrainingArguments(
    "mbert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 2.135202407836914, 'eval_precision': 0.4230769230769231, 'eval_recall': 0.55, 'eval_f1': 0.47826086956521735, 'eval_accuracy': 0.4948453608247423, 'eval_runtime': 0.8062, 'eval_samples_per_second': 2.481, 'eval_steps_per_second': 1.24, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.7529469728469849, 'eval_precision': 0.5, 'eval_recall': 0.6928571428571428, 'eval_f1': 0.5808383233532934, 'eval_accuracy': 0.6082474226804123, 'eval_runtime': 0.7561, 'eval_samples_per_second': 2.645, 'eval_steps_per_second': 1.322, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.6290361881256104, 'eval_precision': 0.5051546391752577, 'eval_recall': 0.7, 'eval_f1': 0.5868263473053892, 'eval_accuracy': 0.6134020618556701, 'eval_runtime': 0.8089, 'eval_samples_per_second': 2.472, 'eval_steps_per_second': 1.236, 'epoch': 3.0}
{'train_runtime': 57.8894, 'train_samples_per_second': 0.259, 'train_steps_per_second': 0.052, 'train_loss': 2.0811095237731934, 'epoch': 3.0}


TrainOutput(global_step=3, training_loss=2.0811095237731934, metrics={'train_runtime': 57.8894, 'train_samples_per_second': 0.259, 'train_steps_per_second': 0.052, 'train_loss': 2.0811095237731934, 'epoch': 3.0})

In [18]:
trainer.save_model(r"C:\Users\serru\OneDrive\Documents\GitHub\NLP\TaggedSeparated\French\ModelS1")