# Setup

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

# Encoder checkpoints handed to `from_pretrained`, keyed by language code.
model_name = {
    'fr': 'camembert-base',
    'en': 'roberta-base',
}
# Translation checkpoint names, keyed by "<source>_<target>" direction.
model_translation = {
    'fr_en': 'Helsinki-NLP/opus-mt-fr-en',
    'en_fr': 'Helsinki-NLP/opus-mt-en-fr',
}
# Dataset identifier handed to `load_dataset` (one config per language).
dataset_name = 'wikiann'

# Get tokenized dataset

In [None]:
# Load the English and French configurations of the dataset (in that order).
dataset_en, dataset_fr = (load_dataset(dataset_name, lang) for lang in ('en', 'fr'))

# One tokenizer per language, matching the checkpoint in `model_name`.
# The English tokenizer is created with add_prefix_space=True; the inputs are
# fed pre-split into words (is_split_into_words=True) further down.
tokenizer = {
    'fr': AutoTokenizer.from_pretrained(model_name['fr']),
    'en': AutoTokenizer.from_pretrained(model_name['en'], add_prefix_space=True),
}

In [None]:
#Get the values for input_ids, token_type_ids, attention_mask
def tokenize_adjust_labels(examples, tokenizer):
    """Tokenize a batch of pre-split sentences and expand word labels to tokens.

    Intended to be used through ``Dataset.map(..., batched=True)`` so that the
    new columns are added to every split of the dataset.

    Args:
        examples: Batch dict holding at least ``"tokens"`` (one list of words
            per sentence) and ``"ner_tags"`` (one list of label ids per
            sentence, parallel to ``"tokens"``).
        tokenizer: Tokenizer exposing ``batch_encode_plus`` whose result
            provides ``overflow_to_sample_mapping`` and ``word_ids``.

    Returns:
        The tokenizer output, extended with every original column (repeated
        once per produced chunk) and a ``"labels"`` column in which each token
        carries the label of its word and special tokens carry ``-100``.
    """
    tokenized_samples = tokenizer.batch_encode_plus(
        examples["tokens"], is_split_into_words=True, return_overflowing_tokens=True
    )

    # One input sentence may yield several chunks; sample_map[k] is the index
    # of the sentence that chunk k was cut from.
    sample_map = tokenized_samples.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        tokenized_samples[key] = [values[i] for i in sample_map]

    total_adjusted_labels = []
    for k in range(len(tokenized_samples["input_ids"])):
        # Look the labels up through sample_map so overflow chunks read the
        # tags of the sentence they actually belong to.
        word_labels = examples["ner_tags"][sample_map[k]]
        # Index by word id rather than by a running counter: this stays
        # correct for chunks that do not start at word 0.
        adjusted_label_ids = [
            -100 if wid is None else word_labels[wid]
            for wid in tokenized_samples.word_ids(batch_index=k)
        ]
        total_adjusted_labels.append(adjusted_label_ids)

    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

In [None]:
def tokenize_and_align_labels(examples, language, label_all_tokens = True):
    """Tokenize a batch of pre-split sentences and align NER labels to tokens.

    Args:
        examples: Batch dict with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of per-word label id lists).
        language: Key selecting the tokenizer in the module-level
            ``tokenizer`` dict.
        label_all_tokens: When truthy, every sub-token of a word receives the
            word's label; otherwise only the first sub-token does and the
            rest get ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    def align(word_ids, word_labels):
        # Token positions without a word id get -100, the value the loss
        # function ignores.
        aligned = []
        preceding = None
        for current in word_ids:
            if current is None:
                aligned.append(-100)
            else:
                starts_word = current != preceding
                keep = starts_word or label_all_tokens
                aligned.append(word_labels[current] if keep else -100)
            preceding = current
        return aligned

    tokenized_inputs = tokenizer[language](
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    tokenized_inputs["labels"] = [
        align(tokenized_inputs.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

In [None]:
# Tokenize every split and align the labels, then drop the raw input columns
# so only the tokenizer outputs and "labels" remain. The Trainer below is
# configured with remove_unused_columns=False, so leftover raw columns would
# otherwise be handed to the data collator.
tokenized_fr = dataset_fr.map(lambda examples: tokenize_and_align_labels(examples, 'fr'), batched=True)
tokenized_fr = tokenized_fr.remove_columns(dataset_fr["train"].column_names)

# Same treatment for English, so both tokenized datasets have the same schema.
tokenized_en = dataset_en.map(lambda examples: tokenize_and_align_labels(examples, 'en'), batched=True)
tokenized_en = tokenized_en.remove_columns(dataset_en["train"].column_names)

# Compute trainers

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric

# Metric object whose `compute` is called by compute_metrics.
metric = load_metric("seqeval")

# Tag strings indexed by label id. compute_metrics looks label ids up in this
# list; without this definition it raises NameError on a fresh kernel.
label_names = dataset_fr["train"].features["ner_tags"].feature.names

def compute_metrics(p):
    """Turn raw model outputs into a flat dict of scores.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` holds label ids, with ``-100``
            marking positions to skip.

    Returns:
        Dict with the four ``overall_*`` scores reported by ``metric`` plus
        one ``"<key>_f1"`` entry for every other key in the metric result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions whose gold label is not the ignored index, and map
    # both sides from ids to tag strings.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_names[pred] for pred, _ in kept])
        true_labels.append([label_names[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    overall_keys = ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    flattened_results = {key: results[key] for key in overall_keys}
    # Every remaining entry is expected to be a dict of scores; keep its F1.
    for name, value in results.items():
        if name not in flattened_results:
            flattened_results[name + "_f1"] = value["f1"]

    return flattened_results

In [None]:
from transformers import DataCollatorForTokenClassification
# One collator per language, each built around that language's tokenizer.
data_collator = {}
data_collator['fr'] = DataCollatorForTokenClassification(tokenizer['fr'])
data_collator['en'] = DataCollatorForTokenClassification(tokenizer['en'])

# Size the classification head from each dataset's own tag set. Without
# num_labels, from_pretrained builds a default 2-class head, which cannot
# represent the label ids stored in "ner_tags".
ner_tag_feature = {
    'fr': dataset_fr["train"].features["ner_tags"].feature,
    'en': dataset_en["train"].features["ner_tags"].feature,
}
model = {}
for lang, tag_feature in ner_tag_feature.items():
    tag_names = tag_feature.names
    model[lang] = AutoModelForTokenClassification.from_pretrained(
        model_name[lang],
        num_labels=len(tag_names),
        # Store the id <-> tag mapping in the config so saved checkpoints
        # report tag strings instead of LABEL_0, LABEL_1, ...
        id2label=dict(enumerate(tag_names)),
        label2id={name: idx for idx, name in enumerate(tag_names)},
    )

training_args = {}
training_args['fr'] = TrainingArguments(
    output_dir="/data/desponds/data/NER/trainer_fr/",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    remove_unused_columns=False
)


trainer = {}
trainer['fr'] = Trainer(
    model=model['fr'],
    args=training_args['fr'],
    # NOTE(review): only the first 100 training examples are used here —
    # looks like a quick-run setting; confirm before reporting results.
    train_dataset=tokenized_fr["train"].select(range(100)),
    eval_dataset=tokenized_fr["validation"],
    data_collator=data_collator['fr'],
    tokenizer=tokenizer['fr'],
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the French model; evaluation and checkpointing happen once per
# epoch, as configured in training_args['fr'].
trainer['fr'].train()

## Translation

In [None]:
from helper import translate_fr_en
def translate_fr_en_qa(example):
    """Replace an example's tokens with the tokens of its English translation.

    The words in ``example['tokens']`` are joined with single spaces, passed
    through ``translate_fr_en``, and the result is split on single spaces
    again. Only ``'tokens'`` is overwritten; every other field is left
    untouched, so per-word fields may no longer line up with the new tokens.

    Args:
        example: Mapping with a ``'tokens'`` list of strings. Modified in place.

    Returns:
        The same ``example`` object, with ``'tokens'`` replaced.
    """
    source_sentence = ' '.join(example['tokens'])
    translated_sentence = translate_fr_en(source_sentence)
    example['tokens'] = translated_sentence.split(' ')
    return example

In [None]:
import pickle
# Translate the test split of the French dataset.
# The split key was 'valid', which is not a name used anywhere else in this
# notebook ('validation' and 'test' are); 'test' matches the stated intent.
translated_fr_en = dataset_fr['test'].map(translate_fr_en_qa)

# Persist the translated split so the translation does not have to be re-run.
with open('/data/desponds/data/NER/translated_dataset.pickle', 'wb') as handle:
    pickle.dump(translated_fr_en, handle)

## Evaluate

In [None]:
# Run the French trainer on the tokenized test split; the returned value is
# shown as the cell output.
trainer['fr'].predict(tokenized_fr['test'])

## Results

In [None]:
import pandas as pd
# Summary table, one row per evaluated configuration.
# TODO: fill in 'dataset', 'test_loss' and 'test_accuracy' once the runs are
# done. NaN placeholders keep the cell executable (the previous `[????]`
# placeholders were a syntax error) and keep the score columns numeric.
missing = float('nan')
data = {
    'task' : ['NER', 'NER', 'NER'],
    'dataset' : ['', '', ''],
    'translated' : ['no', 'no', 'yes'],
    'model'   : ['CamemBERT', 'Roberta', 'Roberta'],
    'test_loss' : [missing, missing, missing],
    'test_accuracy' : [missing, missing, missing]
}
results = pd.DataFrame(data)
results