# Setup

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import evaluate
import numpy as np

%load_ext autoreload
%autoreload 2

# Training of the translators

In [None]:
# Load the "en-fr" configuration of opus_books and split its "train" split
# 80% / 20% into train and test; the fixed seed keeps the split reproducible.
books = load_dataset("opus_books", "en-fr")
books = books["train"].train_test_split(test_size=0.2, seed = 42)

# Checkpoint that is fine-tuned below; tokenizer and model files are cached in cache_dir.
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir="/data/desponds/.cache")
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, cache_dir="/data/desponds/.cache")

In [None]:
# Translation direction read by preprocess_function. The trailing comments
# keep the values for the opposite direction (French -> English).
source_lang = 'en' #"fr"
target_lang = 'fr' #"en"
prefix = "translate English to French: "#"translate French to English: "

def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Every source sentence gets the task ``prefix`` prepended; the target
    sentences are passed as ``text_target``. Both sides are truncated to
    128 tokens.

    Args:
        examples: batch with a ``"translation"`` list whose items are
            mappings indexed by ``source_lang`` and ``target_lang``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    prompts = []
    references = []
    for pair in examples["translation"]:
        prompts.append(prefix + pair[source_lang])
        references.append(pair[target_lang])
    return tokenizer(prompts, text_target=references, max_length=128, truncation=True)

# Tokenize every split; with batched=True preprocess_function receives batches of examples.
tokenized_books = books.map(preprocess_function, batched=True)

# NOTE(review): `model` receives the checkpoint name (a str) here rather than the
# loaded `model` object — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
def postprocess_text(preds, labels):
    """Trim decoded texts and shape the references for the metric.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        ``(preds, labels)`` where every string is stripped of surrounding
        whitespace and every reference is wrapped in a one-element list.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels
# SacreBLEU metric object used by compute_metrics.
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Compute the SacreBLEU score and mean generated length for one evaluation.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean count of non-padding tokens per prediction), both rounded to
        4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the ignore index and is not a valid token id, so it must be
    # replaced before decoding. Labels always carry it; predictions may carry
    # it too when the trainer pads gathered generations (version dependent).
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    # Generated length = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
num_epoch = 3
def get_training_arguments(quartile):
    """Build the Seq2SeqTrainingArguments for one fine-tuning run.

    The output directory encodes both the training-set share (``quartile``
    out of 4) and the epoch count, so runs with different settings do not
    overwrite each other's checkpoints. For ``quartile == 4`` and
    ``num_epoch == 3`` the path is
    ``.../en_fr_model_t5_base_4_4_epoch_3``.

    Args:
        quartile: number of quarters of the training split used for the run.

    Returns:
        Seq2SeqTrainingArguments that evaluate and save once per epoch, keep
        at most 3 checkpoints, and evaluate with generation enabled.
    """
    return Seq2SeqTrainingArguments(
        output_dir=f"/data/desponds/data/translation_models/en_fr_model_t5_base_{quartile}_4_epoch_{num_epoch}",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=3,
        save_strategy="epoch",
        num_train_epochs=num_epoch,
        predict_with_generate=True,
        fp16=True,
    )

def get_trainer(quartile, args=None):
    """Build a Seq2SeqTrainer that trains on a share of the tokenized corpus.

    Args:
        quartile: 4 uses the whole training split; 0 uses only rows 1-4
            (a tiny smoke-test set); any other value keeps ``quartile / 4``
            of the training split.
        args: training arguments for the trainer. When omitted, the
            module-level ``training_args`` is used, so it must be defined
            before calling.

    Returns:
        Seq2SeqTrainer wired to the module-level model, tokenizer, data
        collator and compute_metrics. Evaluation runs on the first two rows
        of the test split only.
    """
    train_split = tokenized_books["train"]
    if quartile == 4:
        train_set = train_split
    elif quartile == 0:
        train_set = train_split.select([1,2,3,4])
    else:
        # Fixed seed (same value as the corpus split) so the sampled subset
        # is identical across runs and kernels.
        train_set = train_split.train_test_split(train_size=quartile/4., seed=42)['train']

    return Seq2SeqTrainer(
        model=model,
        args=training_args if args is None else args,
        train_dataset=train_set,
        eval_dataset=tokenized_books["test"].select([0,1]),
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

# for quartile in range(4,5):
# quartile = 4 makes get_trainer use the whole training split.
quartile = 4
# training_args must be assigned before get_trainer is called: get_trainer
# reads it as a module-level name.
training_args = get_training_arguments(quartile)
trainer = get_trainer(quartile)
trainer.train()

# Benchmarking

In [None]:
from datasets import load_dataset
import evaluate
from transformers import pipeline
from translation import translate_fr_en
import translators as ts
# SacreBLEU metric plus two registries keyed by run name: `results` holds the
# BLEU scores, `datasets` the translated datasets.
metric = evaluate.load("sacrebleu")
results,datasets = {}, {}

In [None]:
# Load the dataset and preprocess it 
def preprocess(dataset):
    """Reverse the translation direction of every row.

    The language codes and the sentences of the source and target sides are
    exchanged. The target language code is read from the lowercase
    ``'targetlang'`` column and written to a new ``'targetLang'`` column;
    ``'targetlang'`` is then removed.

    Args:
        dataset: object exposing ``map`` and ``remove_columns`` whose rows
            have ``sourceLang``, ``targetlang``, ``sourceString`` and
            ``targetString`` entries.

    Returns:
        The mapped dataset without the ``'targetlang'`` column.
    """
    def _swap_sides(row):
        # Read all four values first so the writes below cannot clobber them.
        old_source_lang, old_target_lang = row['sourceLang'], row['targetlang']
        old_source_text, old_target_text = row['sourceString'], row['targetString']
        row['sourceLang'] = old_target_lang
        row['targetLang'] = old_source_lang
        row['sourceString'] = old_target_text
        row['targetString'] = old_source_text
        return row

    return dataset.map(_swap_sides).remove_columns(['targetlang'])


def translate_fr_benchmarking(examples, translator=None):
    """Add a ``'translated'`` entry to a batch using ``translate_fr_en``.

    Args:
        examples: batch with a ``'sourceString'`` entry holding the texts to
            translate (presumably French — confirm against translate_fr_en).
        translator: optional translator forwarded as the second positional
            argument of ``translate_fr_en``. When ``None``, ``translate_fr_en``
            is called with the texts only.

    Returns:
        The same ``examples`` object, with ``'translated'`` set.
    """
    source_texts = examples['sourceString']
    # Identity check on purpose: `== None` would defer to a custom __eq__ on
    # the translator object and could silently drop it.
    if translator is None:
        examples['translated'] = translate_fr_en(source_texts)
    else:
        examples['translated'] = translate_fr_en(source_texts, translator)
    return examples


# Benchmark set: first 20% of the 'eng-fra' test split, then source and target
# sides are swapped by preprocess.
dataset = load_dataset('Helsinki-NLP/tatoeba_mt', 'eng-fra', split = 'test[:20%]', cache_dir="/data/desponds/.cache")
dataset = preprocess(dataset)

In [None]:
# Resets both registries, then translates and scores the benchmark set with
# translators_epoch_1[i]; range(1,1+1) yields only i = 1, so the only key is "translated_8".
# NOTE(review): `translators_epoch_1` is not defined in the visible part of the
# notebook — confirm where it comes from before a fresh-kernel run.
datasets, results = {},{}
for i in range(1,1+1):
    datasets[f"translated_{8*i}"] = dataset.map(lambda examples :  translate_fr_benchmarking(examples, translator = translators_epoch_1[i]), batched = True)
    score = metric.compute(predictions=datasets[f"translated_{8*i}"]['translated'], references=datasets[f"translated_{8*i}"]['targetString'])
    results[f"translated_{8*i}"] =  {"bleu": score["score"]}
results

In [None]:
# Baseline run: translate_fr_benchmarking is mapped without a translator, so
# translate_fr_en is called with the texts only.
datasets[f"translated_Helsinki"] = dataset.map(translate_fr_benchmarking, batched = True)
score = metric.compute(predictions=datasets[f"translated_Helsinki"]['translated'], references=datasets[f"translated_Helsinki"]['targetString'])
results[f"translated_Helsinki"] =  {"bleu": score["score"]}

In [None]:
# Benchmark one locally saved checkpoint; `name` only labels the registry keys.
# NOTE(review): the checkpoint path repeats the model name as a literal (its
# f-string has no placeholder) instead of using `name` — keep the two in sync.
name = 'model_t5_small_1_8_epoch_1'
translator = pipeline("translation", model=f"/data/desponds/data/translation_models/model_t5_small_1_8_epoch_1/checkpoint-795")
datasets[f"translator_{name}"] = dataset.map(lambda examples : translate_fr_benchmarking(examples, translator = translator), batched = True)
score = metric.compute(predictions=datasets[f"translator_{name}"]['translated'], references=datasets[f"translator_{name}"]['targetString'])
results[f"translator_{name}"] =  {"bleu": score["score"]}

In [None]:
# Display the scores collected so far; the commented dict that follows records
# scores noted from earlier runs.
results
#  
#{
#  'translator_model_t5_small_1_8_epoch_1': {'bleu': 6.079863457383406} x
#  'translator_0_4': {'bleu': 0.7212123764045337}},
#  'translated_1_4': {'bleu': 18.309957728250946},                xx
#  'translated_2_4': {'bleu': 18.954448420186928},
#  'translated_3_4': {'bleu': 19.75110141776996},
#  'translated_4_4': {'bleu': 13.231679360817093},
#  'translator_model_4_4_epoch_5': {'bleu': 20.91257821000636}    xx
#  'translated_t5_base_4_4_epoch_1': {'bleu': 20.89527497402696}, 
#  'translator_t5_base_4_4_epoch_2': {'bleu': 26.04773730194601},
#  'translated_t5_base_4_4_epoch_3': {'bleu': 26.04773730194601}, xx

#  'translated_Helsinki': {'bleu': 56.39336432554474},            xx
#  'translated_tc-big': {'bleu': 58.4968895273038}}               xx

### Trying other translators
[Documentation](https://pypi.org/project/translators/)

In [None]:
from transformers import pipeline
def ts_translate(example, translator='google'):
    """Translate one example with the ``translators`` package (fr -> en).

    Args:
        example: mapping with a ``'sourceString'`` entry to translate.
        translator: backend name forwarded to ``ts.translate_text``.

    Returns:
        The same ``example`` object, with ``'translated'`` set.
    """
    french_text = example['sourceString']
    english_text = ts.translate_text(
        french_text,
        translator=translator,
        from_language='fr',
        to_language='en',
    )
    example['translated'] = english_text
    return example

In [None]:
# Translate row by row (this map is not batched) with the 'google' backend, then score.
# NOTE(review): the dataset is stored under "translated_Google" but the score
# under "translated_google" (lower-case g) — confirm the key mismatch is intended.
datasets[f"translated_Google"] = dataset.map(lambda ex : ts_translate(ex, 'google'))
score = metric.compute(predictions=datasets[f"translated_Google"]['translated'], references=datasets[f"translated_Google"]['targetString'])
results[f"translated_google"] =  {"bleu": score["score"]}
results

### Translator Helsinki tc-big

In [None]:
# Translation pipeline for the Helsinki-NLP/opus-mt-tc-big-fr-en model; used by translate_tc_big.
translator_tc_big = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-fr-en")
def translate_tc_big(examples):
    """Translate a batch with the module-level ``translator_tc_big`` pipeline.

    Args:
        examples: batch with a ``'sourceString'`` entry holding the texts.

    Returns:
        The same ``examples`` object, with ``'translated'`` set to the
        ``'translation_text'`` of each pipeline output.
    """
    outputs = translator_tc_big(examples['sourceString'])
    translations = []
    for output in outputs:
        translations.append(output['translation_text'])
    examples['translated'] = translations
    return examples
# Run translate_tc_big over the benchmark set in batches and register the translated dataset.
datasets[f"translated_tc-big"] = dataset.map(translate_tc_big, batched = True)

In [None]:
# Score the tc-big translations against the 'targetString' column, record the BLEU, and display all results.
score = metric.compute(predictions=datasets[f"translated_tc-big"]['translated'], references=datasets[f"translated_tc-big"]['targetString'])
results[f"translated_tc-big"] =  {"bleu": score["score"]}
results

### Translator t5-base

In [None]:
# Translation pipeline built from a locally saved checkpoint; used by translate_t5.
# NOTE(review): the section heading says "t5-base" but this path points at a
# t5_large checkpoint — confirm which model is intended.
translator_t5_large_epoch_3 = pipeline("translation", model=f"/data/desponds/data/translation_models/model_t5_large_4_4_epoch_3/checkpoint-19065")
def translate_t5(examples):
    """Translate a batch with the module-level ``translator_t5_large_epoch_3`` pipeline.

    Args:
        examples: batch with a ``'sourceString'`` entry holding the texts.

    Returns:
        The same ``examples`` object, with ``'translated'`` set to the
        ``'translation_text'`` of each pipeline output.
    """
    outputs = translator_t5_large_epoch_3(examples['sourceString'])
    translations = []
    for output in outputs:
        translations.append(output['translation_text'])
    examples['translated'] = translations
    return examples
# Run translate_t5 over the benchmark set in batches and register the translated dataset.
datasets[f"translator_t5_large_epoch_3"] = dataset.map(translate_t5, batched = True)

In [None]:
# Score the checkpoint's translations against the 'targetString' column, record the BLEU, and display all results.
score = metric.compute(predictions=datasets[f"translator_t5_large_epoch_3"]['translated'], references=datasets[f"translator_t5_large_epoch_3"]['targetString'])
results[f"translator_t5_large_epoch_3"] =  {"bleu": score["score"]}
results
# {'translated_t5_base_epoch_1': {'bleu': 20.89527497402696},
#  'translated_t5_base_epoch_3': {'bleu': 26.04773730194601}}