In [None]:
# %pip install datasets -q
# %pip install transformers -q
# %pip install torch -q
# %pip install seqeval -q
# %pip install evaluate -q
# %pip install accelerate -q

In [None]:
from datasetutils import decode
from iob2converter import iob2_to_dataset
from transformers import AutoModelForTokenClassification

In [None]:
# Path of the tagged German synopsis file used for the initial inspection.
file_path = '../data/TaggedSeparated/German/synopses_02.iob2'

# Load the IOB2 file through the project helper.
# NOTE(review): the return type is not visible here; the code below assumes it
# exposes `.features` and integer indexing like a `datasets.Dataset` - confirm.
de_ds = iob2_to_dataset(file_path)

# NOTE(review): despite the `_fr` suffix this is read from the German dataset
# loaded above.
ner_feature_fr = de_ds.features['ner_tags']
# Tag names taken from the `ner_tags` feature; later cells use `label_names`
# to size the classification head and to decode label ids in `compute_metrics`.
label_names = ner_feature_fr.feature.names
print(label_names)

In [None]:
# Sanity check: show the first example's tokens together with their tags.
words = de_ds[0]['tokens']
labels = de_ds[0]['ner_tags']
# `decode` comes from the project's `datasetutils`; its result is joined with
# newlines, so it is presumably an iterable of strings - TODO confirm.
print('\n'.join(decode(words, labels, label_names)))

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier shared by the tokenizer and every model instantiated
# later in the notebook.
model_id = 'google-bert/bert-base-multilingual-cased'
# Tokenizer matching `model_id`; `tokenize_and_align_labels` reads this global.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and attach token-level labels.

    `examples` must provide "tokens" (one word list per sentence) and
    "ner_tags" (one label-id list per sentence). Every sentence is truncated
    or padded to 128 tokens. The returned encoding carries an extra "labels"
    entry with one aligned label list per sentence, as produced by
    `align_labels_with_tokens`.
    """
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    # word_ids(batch_index=row) maps each produced token of sentence `row`
    # back to the index of the word it came from (None for special/pad tokens).
    encoding["labels"] = [
        align_labels_with_tokens(word_labels, encoding.word_ids(batch_index=row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]

    return encoding

def align_labels_with_tokens(labels, word_ids):
    """
    Expand word-level label ids to one label per tokenizer token.

    - Tokens whose word id is None (special / padding tokens) get `-100`.
    - The first token of a word gets the word's label unchanged.
    - Every following token of the same word gets the word's label, with an
      odd id (a B-XXX tag) bumped by one to the matching I-XXX id.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            aligned.append(-100)
        elif word_id != previous_word:
            # First sub-token of a new word keeps the original tag.
            aligned.append(labels[word_id])
        else:
            # Continuation sub-token: B-XXX ids are odd, I-XXX is the next id.
            tag = labels[word_id]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word_id

    return aligned

In [None]:
from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value

# Tag -> integer id. 'O' is 0; every entity type then contributes its B- tag
# (odd id) immediately followed by its I- tag (even id). The sub-token
# alignment relies on that odd/even layout to turn B-XXX into I-XXX.
tag_to_id = {
    tag: idx
    for idx, tag in enumerate(
        ['O'] + [
            f'{prefix}-{entity}'
            for entity in ('PER', 'ORG', 'LOC', 'MISC', 'POK')
            for prefix in ('B', 'I')
        ]
    )
}
# Reverse lookup: integer id -> tag.
id_to_tag = {idx: tag for tag, idx in tag_to_id.items()}

def iob2_to_datasets(file_path, reference_path):
    """
    Build a DatasetDict with one training split and three validation splits.

    Args:
        file_path: IOB2 training file. Each non-blank line must hold exactly
            two whitespace-separated columns (token, tag); sentences are
            separated by blank lines.
        reference_path: indexable collection whose items 0, 1 and 2 are the
            German, French and English reference IOB2 files (in that order).

    Returns:
        DatasetDict with the splits "train", "val_de", "val_fr" and "val_en",
        each cast to string `tokens` and ClassLabel `ner_tags` using the label
        order of the notebook-level `tag_to_id`.

    Raises:
        ValueError: a non-blank line does not split into exactly two columns.
        KeyError: a tag in the training file is not a key of `tag_to_id`.
    """
    sentences, sentence_tags = [], []
    tokens, ner_tags = [], []

    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                # Blank line closes the current sentence (if any).
                if tokens and ner_tags:
                    sentences.append(tokens)
                    sentence_tags.append(ner_tags)
                tokens, ner_tags = [], []
                continue

            try:
                word, tag = line.split()
            except ValueError:
                # Only the unpacking failure is translated; a bare `except`
                # would also swallow KeyboardInterrupt / SystemExit.
                raise ValueError(f"Each line must have two columns: ({i}) {line}")
            tokens.append(word)
            ner_tags.append(tag)

    # The file may end without a trailing blank line.
    if tokens and ner_tags:
        sentences.append(tokens)
        sentence_tags.append(ner_tags)

    label_list = list(tag_to_id.keys())
    label_mapping = {label: i for i, label in enumerate(label_list)}

    indexed_tags = [[label_mapping[tag] for tag in tags] for tags in sentence_tags]
    dataset = Dataset.from_dict({"tokens": sentences, "ner_tags": indexed_tags})

    # Reference texts are loaded through the project helper; the extra columns
    # it produces are dropped so that only `tokens` and `ner_tags` remain.
    # NOTE(review): assumes the helper's `ner_tags` column can be cast to the
    # ClassLabel feature below - confirm against `iob2converter`.
    references = {
        split: iob2_to_dataset(reference_path[position]).remove_columns(["ner_tags_id", "index", "id"])
        for position, split in enumerate(("val_de", "val_fr", "val_en"))
    }

    features = Features({
        "tokens": Sequence(Value("string")),
        "ner_tags": Sequence(ClassLabel(names=label_list))
    })

    splits = {"train": dataset.cast(features)}
    for split, reference in references.items():
        splits[split] = reference.cast(features)

    return DatasetDict(splits)

# Training file (German synopses) and the three reference files used as
# validation splits; the list order German, French, English is what
# `iob2_to_datasets` indexes as items 0, 1 and 2.
file_path = '../data/TaggedSeparated/German/synopses_02.iob2'
reference_grp_path = ['../ReferenceText/ReferenceTextGerman.iob2', '../ReferenceText/ReferenceTextFrench.iob2', '../ReferenceText/ReferenceTextEnglish.iob2']

# Rebinds `de_ds`: from here on it is the DatasetDict with the splits
# "train", "val_de", "val_fr" and "val_en", replacing the object loaded by
# `iob2_to_dataset` in the earlier cell.
de_ds = iob2_to_datasets(file_path, reference_grp_path)

In [None]:
# Token-classification model with one output per entry of `label_names`.
# NOTE(review): `model` is instantiated again in a later cell with
# id2label/label2id mappings, which replaces this instance.
model = AutoModelForTokenClassification.from_pretrained(
    model_id, num_labels=len(label_names)
)

In [None]:
# Tokenize every split of the DatasetDict in batches; this adds the tokenizer
# outputs and the aligned "labels" column to each example.
tokenized_ds = de_ds.map(tokenize_and_align_labels, batched=True)

print(tokenized_ds)

In [None]:
import evaluate

# seqeval metric used by `compute_metrics` for the overall
# precision / recall / F1 / accuracy values.
metric = evaluate.load("seqeval")

In [None]:
# Rob span-f1 

def toSpans(tags):
    """
    Convert a sequence of IOB2 tag strings into a set of entity spans.

    Each span is encoded as the string "<beg>-<end>:<type>", where `beg` is the
    index of the B- tag, `end` is the index one past the last tag of the span
    (exclusive), and `type` is the text after the "B-" prefix.

    The end index is exclusive for every span, including spans that run up to
    the end of the sequence. Previously such a span got an inclusive end, so
    ['B-X', 'I-X'] and ['B-X', 'O'] both produced '0-1:X' and were wrongly
    counted as a match.

    Note: a span is extended over any following tag that starts with 'I'; the
    entity type of the I- tag is not compared with the type of the B- tag.
    Orphan I- tags (not preceded by a B- tag) do not start a span.
    """
    spans = set()
    total = len(tags)
    for beg in range(total):
        if not tags[beg].startswith('B'):
            continue
        # Advance to the first position that no longer continues the span.
        end = beg + 1
        while end < total and tags[end].startswith('I'):
            end += 1
        spans.add(str(beg) + '-' + str(end) + ':' + tags[beg][2:])
    return spans


def getInstanceScores(predSpans, goldSpans):
    """
    Return the F1 score of a predicted span set against a gold span set.

    A span counts as correct only when it occurs in both sets. Precision is
    the share of predicted spans that are correct, recall the share of gold
    spans that were predicted. The result is 0.0 whenever precision and
    recall are both zero (including when both sets are empty).
    """
    matched = len(goldSpans.intersection(predSpans))
    predicted_total = len(predSpans)
    gold_total = len(goldSpans)

    if predicted_total == 0:
        prec = 0.0
    else:
        prec = matched / predicted_total

    if gold_total == 0:
        rec = 0.0
    else:
        rec = matched / gold_total

    if prec + rec == 0.0:
        return 0.0
    return 2 * (prec * rec) / (prec + rec)


In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """
    Compute seqeval metrics and a span-level F1 for a batch of predictions.

    Args:
        eval_preds: pair `(logits, labels)`. The class is taken as the argmax
            of `logits` over the last axis; positions whose label is `-100`
            are ignored.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy" (seqeval overall
        values) and "span_f1", the span F1 computed over ALL evaluated
        sequences together.

    `span_f1` used to be computed from the first sequence only, which ignored
    the rest of the evaluation set and raised IndexError when it was empty.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Drop ignored positions and map label ids back to tag strings.
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    # Pool the spans of every sequence. Each span is keyed by its sequence
    # index so that equal offsets in different sequences stay distinct.
    pred_spans = {
        (seq_index, span)
        for seq_index, tags in enumerate(true_predictions)
        for span in toSpans(tags)
    }
    true_spans = {
        (seq_index, span)
        for seq_index, tags in enumerate(true_labels)
        for span in toSpans(tags)
    }
    score = getInstanceScores(pred_spans, true_spans)

    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
        "span_f1": score
    }

In [None]:
from transformers import AutoModelForTokenClassification
# Re-create the token-classification model, this time also storing the
# id <-> tag-name mappings (derived from `label_names`) in the model config.
# This rebinds `model`, replacing the instance created in the earlier cell.
model = AutoModelForTokenClassification.from_pretrained (
    model_id,
    num_labels=len(label_names),
    id2label={id: label for id, label in enumerate(label_names)},
    label2id={label: id for id, label in enumerate(label_names)},
)
# Displayed as the cell output to check the size of the classification head.
model.config.num_labels

In [None]:
from transformers import TrainingArguments
from transformers import Trainer


# Training configuration shared by every Trainer created in this notebook.
# The first positional argument is the output directory, so all runs write
# into "mbert-finetuned-ner".
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
args = TrainingArguments(
    "mbert-finetuned-ner",
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    save_strategy="epoch",  # save at the end of every epoch
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=tokenized_ds["train"],
#     eval_dataset=tokenized_ds["validation"],
#     compute_metrics=compute_metrics,
# )

# trainer.train()


In [None]:
import os
import pandas as pd

# Language code -> folder name under ../data/TaggedSeparated (also written to
# the "train_lang" column of the results).
llang = {
    "fr": "French",
    "en": "English",
    "de": "German"
}

# Validation split -> value written to the "test_lang" column. The insertion
# order (German, French, English) is the order in which rows are recorded for
# every training file.
test_lang_by_split = {
    "val_de": "german",
    "val_fr": "french",
    "val_en": "english",
}

result_columns = ["train_lang", "train_file", "test_lang", "precision", "recall", "f1", "accuracy", "span_f1"]
metric_columns = result_columns[3:]

# One dict per (training file, test language). Rows are collected in a plain
# list while training runs, so results gathered so far stay available in the
# kernel if a later run fails; the DataFrame is built once at the end.
result_rows = []


def fine_tune_models_in_folder(folder_path, output_dir, lang):
    """
    Fine-tune one model per file in `folder_path` and record its test metrics.

    For every regular file in the folder (processed in sorted name order) a
    fresh model is loaded from `model_id`, trained on that file with
    `val_<lang>` as the evaluation split, and then scored on the German,
    French and English reference splits. One row per reference split is
    appended to the notebook-level `result_rows`.

    Args:
        folder_path: directory containing the IOB2 training files.
        output_dir: accepted for the caller's convenience.
            NOTE(review): nothing is written to this directory; checkpoints
            go to the output directory configured in `args`.
        lang: language code of the training data ("fr", "en" or "de"); selects
            the evaluation split and the "train_lang" value.
    """
    # os.listdir returns entries in arbitrary order and includes directories;
    # sort for a reproducible row order and skip anything that is not a file
    # (e.g. a `.ipynb_checkpoints` folder) instead of crashing mid-sweep.
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        if not os.path.isfile(file_path):
            continue

        data = iob2_to_datasets(file_path, reference_grp_path)
        tokenized_ds = data.map(tokenize_and_align_labels, batched=True)

        # Start every run from the pretrained checkpoint.
        model = AutoModelForTokenClassification.from_pretrained(
            model_id, num_labels=len(label_names)
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=tokenized_ds["train"],
            eval_dataset=tokenized_ds["val_" + lang],
            compute_metrics=compute_metrics,
        )
        trainer.train()

        # `predict(...).metrics` holds the `compute_metrics` values under
        # "test_"-prefixed keys.
        split_metrics = {
            split: trainer.predict(tokenized_ds[split]).metrics
            for split in test_lang_by_split
        }

        print(split_metrics["val_de"])

        for split, test_lang in test_lang_by_split.items():
            row = {
                "train_lang": llang[lang],
                "train_file": file,
                "test_lang": test_lang,
            }
            for name in metric_columns:
                row[name] = split_metrics[split]["test_" + name]
            result_rows.append(row)


for lang in ["fr", "en", "de"]:
    folder_path = '../data/TaggedSeparated/' + llang[lang]
    output_dir = '../Models/m_' + lang

    fine_tune_models_in_folder(folder_path, output_dir, lang)

# Build the results dataframe once and save it as TSV.
df = pd.DataFrame(result_rows, columns=result_columns)
df.to_csv('results.tsv', sep='\t', index=False)