In [40]:
# %pip install datasets -q
# %pip install transformers -q
# %pip install torch -q
# %pip install seqeval -q
# %pip install evaluate -q
# %pip install accelerate -q

In [1]:
from datasetutils import decode
from iob2converter import iob2_to_dataset
from transformers import AutoModelForTokenClassification

In [4]:
file_path = '/Users/bogdancristianmihaila/Desktop/NLP/TaggedSeparated/English/synopses_1.iob2'

fr_ds = iob2_to_dataset(file_path)

ner_feature_fr = fr_ds.features['ner_tags']
label_names = ner_feature_fr.feature.names

In [5]:
words = fr_ds[0]['tokens']
labels = fr_ds[0]['ner_tags']
print('\n'.join(decode(words, labels, label_names)))

Ash   Misty and Brock travel to the water city of AltoMare where Ash   and Misty compete in the   Water Chariot Race  which Misty ends up winning After the race Ash   ends up saving a girl from Annie and Oakley two members of Team  Rocket who are in the city to try and capture the Legendary Pokémon Latios and Latias for Giovanni Little does Ash   know but this girl is actually Latias in disguise Ash   follows her back to her secret hideout where he meets her brother Latios Ash   also learns that Latias takes the form of her best friend Bianca Bianca and her grandfather tell Ash   all about Latios and Latias and Ash   plays with them That night Annie and Oakley invade the hideout and capture Latios and steal the    Soul   Dew    in order to power the DMA Now Latias must find Ash   and the others and help to save AltoMare and her brother 
B-PER B-PER O   B-PER O      O  O   O     O    O  B-LOC    O     B-PER O   B-PER O       O  B-ORG I-ORG I-ORG   I-ORG O     B-PER O    O  O       O    

In [6]:
from transformers import AutoTokenizer

model_id = 'google-bert/bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(model_id)



In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True, 
        padding="max_length", 
        max_length=128,
        is_split_into_words=True
    )

    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

def align_labels_with_tokens(labels, word_ids):
    """
    This function aligns labels with tokens produced by the tokenizer.
    - `-100` is used for special tokens to ignore them during training.
    - If the label is B-XXX, subsequent sub-tokens receive I-XXX.
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            new_labels.append(-100)
        else:
            label = labels[word_id]
            # Convert B-XXX to I-XXX for sub-tokens
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels


In [8]:
def iob2_to_dataset(file_path):
    """
    Converts an IOB2 file into a DatasetDict with train and validation splits.
    Assumes the input file uses whitespace to separate tokens and tags, and that each sentence is separated by a blank line.
    """
    tokens, ner_tags = [], []
    sentences, sentence_tags = [], []

    label_set = set()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens and ner_tags:
                    sentences.append(tokens)
                    sentence_tags.append(ner_tags)
                tokens, ner_tags = [], []
            else:
                parts = line.split()
                if len(parts) == 2:
                    word, tag = parts
                    tokens.append(word)
                    ner_tags.append(tag)
                    label_set.add(tag)
                else:
                    print(f"Ignoring malformed line: {line}")

        if tokens and ner_tags:
            sentences.append(tokens)
            sentence_tags.append(ner_tags)

    label_list = sorted(label_set)
    label_mapping = {label: i for i, label in enumerate(label_list)}

    indexed_tags = [[label_mapping[tag] for tag in tags] for tags in sentence_tags]
    dataset = Dataset.from_dict({"tokens": sentences, "ner_tags": indexed_tags})
    
    train_size = int(0.8 * len(dataset))
    datasets = dataset.train_test_split(train_size=train_size)

    features = Features({
        "tokens": Sequence(Value("string")),
        "ner_tags": Sequence(ClassLabel(names=label_list))
    })

    datasets = DatasetDict({
        "train": datasets["train"].cast(features),
        "validation": datasets["test"].cast(features)
    })

    return datasets


In [9]:
model = AutoModelForTokenClassification.from_pretrained(
    model_id, num_labels=len(label_names)
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
tokenized_ds = fr_ds.map(tokenize_and_align_labels, batched=True)

print(tokenized_ds)


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Dataset({
    features: ['tokens', 'ner_tags', 'ner_tags_id', 'index', 'id', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 6
})


In [11]:
for i in range(2):
    print(tokenized_ds['train']['tokens'][i])
    print(tokenized_ds['train']['labels'][i])
    print()

KeyError: "Column train not in the dataset. Current columns in the dataset: ['tokens', 'ner_tags', 'ner_tags_id', 'index', 'id', 'input_ids', 'token_type_ids', 'attention_mask', 'labels']"

In [62]:
import evaluate

metric = evaluate.load("seqeval")

In [63]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [64]:
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained (
    model_id,
    num_labels=len(label_names),
    id2label={id: label for id, label in enumerate(label_names)},
    label2id={label: id for id, label in enumerate(label_names)},
)
model.config.num_labels

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


12

In [65]:
from transformers import TrainingArguments
from transformers import Trainer

args = TrainingArguments(
    "mbert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/6 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [18]:
trainer.save_model(r"C:\Users\serru\OneDrive\Documents\GitHub\NLP\TaggedSeparated\French\ModelS2")

In [66]:
import os

def fine_tune_models_in_folder(folder_path, output_dir):
    files = os.listdir(folder_path)
    
    for file in files:
        file_path = os.path.join(folder_path, file)
        
        fr_ds = iob2_to_dataset(file_path)
        
        tokenized_ds = fr_ds.map(tokenize_and_align_labels, batched=True)
        
        model = AutoModelForTokenClassification.from_pretrained(
            model_id, num_labels=len(label_names)
        )
        
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=tokenized_ds["train"],
            eval_dataset=tokenized_ds["validation"],
            compute_metrics=compute_metrics,
        )
        
        trainer.train()
        
        model_output_dir = os.path.join(output_dir, f"model_{file}")
        trainer.save_model(model_output_dir)

folder_path = r'C:\Users\serru\OneDrive\Documents\GitHub\NLP\TaggedSeparated\French'
output_dir = r'C:\Users\serru\OneDrive\Documents\GitHub\NLP\Models\m_fr'

fine_tune_models_in_folder(folder_path, output_dir)


Casting the dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 2.136644124984741, 'eval_precision': 0.3008130081300813, 'eval_recall': 0.39361702127659576, 'eval_f1': 0.34101382488479265, 'eval_accuracy': 0.42857142857142855, 'eval_runtime': 0.9881, 'eval_samples_per_second': 2.024, 'eval_steps_per_second': 1.012, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.916606068611145, 'eval_precision': 0.4295774647887324, 'eval_recall': 0.648936170212766, 'eval_f1': 0.5169491525423728, 'eval_accuracy': 0.54421768707483, 'eval_runtime': 0.9628, 'eval_samples_per_second': 2.077, 'eval_steps_per_second': 1.039, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.8209679126739502, 'eval_precision': 0.43448275862068964, 'eval_recall': 0.6702127659574468, 'eval_f1': 0.5271966527196652, 'eval_accuracy': 0.5510204081632653, 'eval_runtime': 0.7782, 'eval_samples_per_second': 2.57, 'eval_steps_per_second': 1.285, 'epoch': 3.0}
{'train_runtime': 53.8417, 'train_samples_per_second': 0.279, 'train_steps_per_second': 0.056, 'train_loss': 2.1192288398742676, 'epoch': 3.0}


KeyboardInterrupt: 