In [80]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import evaluate
import numpy as np


%load_ext autoreload
%autoreload 2

In [81]:
# Load the "en-fr" configuration of opus_books and hold out 20% of its "train" split as a test set.
# NOTE(review): no seed is passed to train_test_split, so the split is presumably different
# on every run — confirm whether reproducibility matters here.
books = load_dataset("opus_books", "en-fr")["train"].train_test_split(test_size=0.2)
print(books["train"][0])

# The tokenizer and the seq2seq model are both loaded from the same pretrained checkpoint.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading and preparing dataset opus_books/en-fr to /home/desponds/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf...


Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

Dataset opus_books downloaded and prepared to /home/desponds/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

{'id': '31134', 'translation': {'en': '"How is that?" asked my uncle, in surprise.', 'fr': '-- Comment cela? demanda mon oncle étonné.'}}


In [83]:
# Translation direction used by preprocess_function: French text in, English text out.
source_lang, target_lang = "fr", "en"
# Task prefix prepended to every source sentence before tokenization.
prefix = "translate French to English: "

def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Args:
        examples: batch with a "translation" entry, an iterable of mappings
            indexed by language code (``source_lang`` and ``target_lang`` are read).

    Returns:
        The tokenizer output for the prefixed source sentences, with the target
        sentences passed as ``text_target``; ``max_length=128`` with truncation on.
    """
    sources, references = [], []
    for pair in examples["translation"]:
        sources.append(prefix + pair[source_lang])
        references.append(pair[target_lang])
    return tokenizer(sources, text_target=references, max_length=128, truncation=True)

# Apply preprocess_function to `books` in batched mode.
tokenized_books = books.map(preprocess_function, batched=True)

# Data collator built from the same tokenizer.
# NOTE(review): `model` receives the checkpoint name string ("t5-small"), not the loaded
# `model` object — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [84]:
# SacreBLEU metric object; compute_metrics and the benchmarking cells call metric.compute on it.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Args:
        preds: iterable of prediction strings.
        labels: iterable of reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions, and a list in
        which every stripped label is wrapped in its own one-element list.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: a ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"`` (mean count
        of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a real token id and cannot be decoded. Labels were already mapped back
    # to the pad token; predictions get the same treatment, because depending on the
    # transformers version generated sequences can also be padded with -100.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Generated length = number of tokens in each prediction that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result = {"bleu": bleu["score"], "gen_len": np.mean(prediction_lens)}
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Number of training epochs; also embedded in each run's output directory name.
num_epoch = 1


def get_training_arguments(quartile):
    """Build the Seq2SeqTrainingArguments for one run.

    Args:
        quartile: value interpolated into the output directory name
            (``model_{quartile}_4_epoch_{num_epoch}``).

    Returns:
        A Seq2SeqTrainingArguments instance that evaluates and saves once per epoch,
        generates during evaluation, and trains for ``num_epoch`` epochs.
    """
    run_dir = f"/data/desponds/data/translation_models/model_{quartile}_4_epoch_{num_epoch}"
    settings = dict(
        output_dir=run_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        save_strategy="epoch",
        num_train_epochs=num_epoch,
        predict_with_generate=True,
        fp16=True,
    )
    return Seq2SeqTrainingArguments(**settings)

def get_trainer(quartile, args=None):
    """Build a Seq2SeqTrainer that trains on ``quartile``/4 of the training split.

    Args:
        quartile: 4 selects the full training split, 0 selects the four examples at
            indices 1-4, and any other value keeps a ``quartile / 4`` fraction
            of the training split.
        args: optional training arguments. When omitted, the module-level
            ``training_args`` is used, exactly as before this parameter existed.

    Returns:
        A Seq2SeqTrainer evaluating on ``tokenized_books["test"]``.
    """
    full_train = tokenized_books["train"]
    if quartile == 4:
        train_set = full_train
    elif quartile == 0:
        train_set = full_train.select([1, 2, 3, 4])
    else:
        # The split result used to be discarded here, leaving `train_set` unbound
        # for quartiles 1-3; it is now assigned.
        train_set = full_train.train_test_split(train_size=quartile / 4.)["train"]

    # NOTE(review): every trainer wraps the same module-level `model` object, so
    # successive runs presumably continue from already-updated weights instead of
    # the pretrained checkpoint — confirm this is intended.
    return Seq2SeqTrainer(
        model=model,
        args=training_args if args is None else args,
        train_dataset=train_set,
        eval_dataset=tokenized_books["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

# Train one model per quartile value in the range (currently only quartile 0).
# `training_args` is assigned before get_trainer() is called because get_trainer reads it
# as a module-level name.
for quartile in range(0,1):
    training_args = get_training_arguments(quartile)
    trainer = get_trainer(quartile)
    trainer.train()

In [None]:
from transformers import pipeline
# Sample French sentence kept at hand for quick manual checks.
text = "Les légumes partagent des ressources avec des bactéries azotantes."

# One translation pipeline per saved checkpoint directory (model_1_4 ... model_4_4).
translator_1_4 = pipeline(
    "translation",
    model="/data/desponds/data/translation_models/model_1_4/checkpoint-795",
)
translator_2_4 = pipeline(
    "translation",
    model="/data/desponds/data/translation_models/model_2_4/checkpoint-1589",
)
translator_3_4 = pipeline(
    "translation",
    model="/data/desponds/data/translation_models/model_3_4/checkpoint-2383",
)
translator_4_4 = pipeline(
    "translation",
    model="/data/desponds/data/translation_models/model_4_4/checkpoint-9500",
)

# Index 0 holds None so that position n in the list is the n/4 translator.
translators = [
    None,
    translator_1_4,
    translator_2_4,
    translator_3_4,
    translator_4_4,
]

In [None]:
# Sample French paragraph used to compare the translators by eye.
texts = ['Ce n’est pas parce que votre a exprimé ouvertement son orientation sexuelle que tout doit changer. Si vous avez aimé aller au cinéma ensemble ou jouer à des jeux vidéos, continuez simplement à faire ces choses. Il se peut que votre ami se soit exprimé à des personnes moins compréhensives. Si tel est le cas, discutez avec les autres amis et essayez de les convaincre de se montrer conciliants et d’être de bons amis.']
# NOTE(review): `translators_epoch_1` is only created in a later cell of this notebook, so
# this cell fails on a fresh top-to-bottom run. The [1:] slice skips the None at index 0.
for i,translator in enumerate(translators_epoch_1[1:]):
    print(f"\nTranslator {i+1}/4 with 1 epoch")
    print(translator(texts)[0]['translation_text'])

In [None]:
from translation import translate_fr_en
# Translate the same sample with translate_fr_en, called without an explicit translator.
translate_fr_en(texts)

In [None]:
import tensorflow as tf
from tensorflow.python.summary.summary_iterator import summary_iterator
import pandas as pd
# Collect the scalar series stored in one TensorBoard event file of the model_4_4 run.
eval_bleu, epoch, loss, eval_loss = [], [], [], []
for event in summary_iterator('/data/desponds/data/translation_models/model_4_4/runs/Apr20_16-40-51_students/events.out.tfevents.1682001651.students.1648792.6'):
    for value in event.summary.value:
        # Every branch below reads simple_value, so skip entries that do not carry one.
        # The old condition `a or b and c` parsed as `a or (b and c)`, which let
        # 'train/loss' entries through without this check.
        if not value.HasField('simple_value'):
            continue
        # NOTE(review): 'eval/loss' values are appended to `loss` as well, so df_train
        # mixes training and evaluation losses — confirm this is intended.
        if value.tag in ('train/loss', 'eval/loss'):
            loss.append(value.simple_value)
        elif value.tag == 'train/epoch':
            epoch.append(value.simple_value)
        if value.tag == 'eval/loss':
            eval_loss.append(value.simple_value)
        elif value.tag == 'eval/bleu':
            eval_bleu.append(value.simple_value)
df_eval = pd.DataFrame({'eval_loss' : eval_loss, 'eval_bleu':eval_bleu})
# `epoch` is cut to at most len(loss) entries before being paired with `loss`.
df_train = pd.DataFrame({'epoch': epoch[:len(loss)], 'loss' : loss})
df_eval

In [None]:
# Scatter plot of the collected loss values against the logged epoch.
df_train.plot(x='epoch', y='loss', kind='scatter')

# Application: Review Classification

In [65]:
from transformers import pipeline
# Sample French sentence kept at hand for quick manual checks.
text = "Les légumes partagent des ressources avec des bactéries azotantes."

# Translation pipelines loaded from the *_epoch_1 checkpoint directories (1_4 ... 4_4).
translator_1_4_epoch_1 = pipeline(
    "translation",
    model="/data/desponds/data/translation_models/model_1_4_epoch_1/checkpoint-795",
)
translator_2_4_epoch_1 = pipeline(
    "translation",
    model="/data/desponds/data/translation_models/model_2_4_epoch_1/checkpoint-1589",
)
translator_3_4_epoch_1 = pipeline(
    "translation",
    model="/data/desponds/data/translation_models/model_3_4_epoch_1/checkpoint-2383",
)
translator_4_4_epoch_1 = pipeline(
    "translation",
    model="/data/desponds/data/translation_models/model_4_4_epoch_1/checkpoint-3178",
)

# Index 0 holds None so that position n in the list is the n/4 translator.
translators_epoch_1 = [
    None,
    translator_1_4_epoch_1,
    translator_2_4_epoch_1,
    translator_3_4_epoch_1,
    translator_4_4_epoch_1,
]

In [None]:
# Load the review-classification data, restricted to French (langs=['fr']).
# NOTE(review): the name `datasets` is rebound to an empty dict in the benchmarking
# section further down, so the result of this cell depends on run order.
from preprocessing import preprocessing_review_classification
datasets, tokenized = preprocessing_review_classification(langs = ['fr'])

In [None]:
import pickle
import torch
from translation import translate_fr_en
def translate_fr_en_rc(examples, translator):
    """Overwrite a batch's 'text' entry with its translation.

    Args:
        examples: mapping with a 'text' entry; it is modified in place.
        translator: forwarded as the second argument of translate_fr_en.

    Returns:
        The same ``examples`` object, with 'text' replaced by translate_fr_en's output.
    """
    translated = translate_fr_en(examples['text'], translator)
    examples['text'] = translated
    return examples
# 'base' keeps the untranslated French test split; each 'translated_{i}_4' entry is that
# split translated by translators_epoch_1[i] and pickled right after it is produced.
# NOTE(review): the file name says "epoch_3" while the translators come from
# `translators_epoch_1` — confirm which one is correct.
dataset = {'base': datasets['fr']['test']}
for i in range(1, 5):
    dataset[f"translated_{i}_4"] = datasets['fr']['test'].map(
        lambda batch: translate_fr_en_rc(batch, translator=translators_epoch_1[i]),
        batched=True,
    )
    with open(f'/data/desponds/data/Classification/translated_dataset/translated_epoch_3_{i}_4.pickle', 'wb') as handle:
        pickle.dump(dataset[f"translated_{i}_4"], handle)

In [None]:
# NOTE(review): `i` here is whatever value an earlier loop left behind, and the whole
# `dataset` dict (not a single translated split) is pickled — confirm that both the
# file name and the content are what is intended.
with open(f'/data/desponds/data/Classification/translated_dataset/translated_epoch_1_{i}_4.pickle', 'wb') as handle:
    pickle.dump(dataset, handle)

In [None]:
# Translation pipeline loaded from the model_4_4_epoch_5 checkpoint.
translator_4_4_epoch_5 = pipeline(
    "translation",
    model="/data/desponds/data/translation_models/model_4_4_epoch_5/checkpoint-15890",
)

# Translate the French test split with it and pickle the result.
# NOTE(review): this maps over `tokenized['fr']['test']`, whereas the 1-epoch cell maps
# over `datasets['fr']['test']` — confirm which input is intended.
dataset = {}
dataset["translated_epoch_5_4_4"] = tokenized['fr']['test'].map(
    lambda batch: translate_fr_en_rc(batch, translator=translator_4_4_epoch_5),
    batched=True,
)
with open('/data/desponds/data/Classification/translated_dataset/translated_epoch_5_4_4.pickle', 'wb') as handle:
    pickle.dump(dataset["translated_epoch_5_4_4"], handle)

In [None]:
from transformers import pipeline
# Translation pipeline built directly from the "t5-small" model name.
translator_0_4 = pipeline("translation", model="t5-small")

# Benchmarking

In [6]:
from datasets import load_dataset
import evaluate

In [72]:
def preprocess(dataset):
    """Swap the source and target sides of every example.

    For each example, 'sourceString' and 'targetString' are exchanged, 'sourceLang'
    takes the value of 'targetlang', and the old 'sourceLang' value is written to a
    new 'targetLang' key. The lowercase 'targetlang' column is then removed.

    Args:
        dataset: object exposing ``map`` and ``remove_columns``.

    Returns:
        The mapped dataset without the 'targetlang' column.
    """
    def reformat(examples):
        # Read both values before writing, so the swap does not clobber its own inputs.
        new_source_lang = examples['targetlang']
        new_target_lang = examples['sourceLang']
        examples['sourceLang'] = new_source_lang
        examples['targetLang'] = new_target_lang

        new_source_text = examples['targetString']
        new_target_text = examples['sourceString']
        examples['sourceString'] = new_source_text
        examples['targetString'] = new_target_text
        return examples

    swapped = dataset.map(reformat)
    return swapped.remove_columns(['targetlang'])
def translate_fr_benchmarking(examples, translator = None):
    """Add a 'translated' entry holding the translation of 'sourceString'.

    Args:
        examples: mapping with a 'sourceString' entry; it is modified in place.
        translator: optional translator forwarded to translate_fr_en. When it is
            None, translate_fr_en is called with the source text only.

    Returns:
        The same ``examples`` object with the new 'translated' entry.
    """
    source_sentences = examples['sourceString']
    # Identity check: `== None` would go through a custom __eq__ if the translator defines one.
    if translator is None:
        examples['translated'] = translate_fr_en(source_sentences)
    else:
        examples['translated'] = translate_fr_en(source_sentences, translator)
    return examples
# Benchmark data: first 20% of the 'eng-fra' test split of Helsinki-NLP/tatoeba_mt,
# passed through preprocess() to swap its source and target sides.
dataset = preprocess(
    load_dataset('Helsinki-NLP/tatoeba_mt', 'eng-fra', split='test[:20%]')
)

Found cached dataset tatoeba_mt (/home/desponds/.cache/huggingface/datasets/Helsinki-NLP___tatoeba_mt/eng-fra/0.0.0/01e819f3f64a772a2ca70949061d295d3a2dc99d05183fe4776a3be23f75f619)


Map:   0%|          | 0/2536 [00:00<?, ? examples/s]

In [73]:
# Display the preprocessed benchmark dataset.
dataset

Dataset({
    features: ['sourceLang', 'sourceString', 'targetString', 'targetLang'],
    num_rows: 2536
})

In [74]:
# BLEU of each 1-epoch translator on the benchmark set: translate, score against
# 'targetString', and keep both the translated dataset and the score.
datasets = {}
results = {}
for i in range(1, 5):
    datasets[f"translated_{i}_4"] = dataset.map(
        lambda batch: translate_fr_benchmarking(batch, translator=translators_epoch_1[i]),
        batched=True,
    )
    score = metric.compute(
        predictions=datasets[f"translated_{i}_4"]['translated'],
        references=datasets[f"translated_{i}_4"]['targetString'],
    )
    results[f"translated_{i}_4"] = {"bleu": score["score"]}
results

Map:   0%|          | 0/2536 [00:00<?, ? examples/s]

Map:   0%|          | 0/2536 [00:00<?, ? examples/s]

Map:   0%|          | 0/2536 [00:00<?, ? examples/s]

Map:   0%|          | 0/2536 [00:00<?, ? examples/s]

{'translated_1_4': {'bleu': 18.309957728250946},
 'translated_2_4': {'bleu': 18.954448420186928},
 'translated_3_4': {'bleu': 19.75110141776996},
 'translated_4_4': {'bleu': 13.231679360817093}}

In [75]:
from translation import translate_fr_en
# Same benchmark, with translate_fr_benchmarking called without a translator argument.
datasets["translated_Helsinki"] = dataset.map(translate_fr_benchmarking, batched=True)
score = metric.compute(
    predictions=datasets["translated_Helsinki"]['translated'],
    references=datasets["translated_Helsinki"]['targetString'],
)
results["translated_Helsinki"] = {"bleu": score["score"]}

Map:   0%|          | 0/2536 [00:00<?, ? examples/s]

In [None]:
# Same benchmark for the model_0_4_epoch_1 checkpoint.
# This rebinds `translator_0_4` to a pipeline loaded from that checkpoint directory.
translator_0_4 = pipeline(
    "translation",
    model="/data/desponds/data/translation_models/model_0_4_epoch_1/checkpoint-1",
)
datasets["translator_0_4"] = dataset.map(
    lambda batch: translate_fr_benchmarking(batch, translator_0_4),
    batched=True,
)
score = metric.compute(
    predictions=datasets["translator_0_4"]['translated'],
    references=datasets["translator_0_4"]['targetString'],
)
results["translator_0_4"] = {"bleu": score["score"]}

In [91]:
# Display every BLEU score collected in `results` so far.
results

{'translated_1_4': {'bleu': 18.309957728250946},
 'translated_2_4': {'bleu': 18.954448420186928},
 'translated_3_4': {'bleu': 19.75110141776996},
 'translated_4_4': {'bleu': 13.231679360817093},
 'translated_Helsinki': {'bleu': 56.39336432554474},
 'translator_0_4': {'bleu': 0.7212123764045337}}

In [97]:
# Qualitative check: translate one French sentence with the `translator_1_4_epoch_1` pipeline.
translator_1_4_epoch_1("Je m'appelle Mathieu et je suis dans ma maison entrain de travailler")

[{'translation_text': 'I call Mathieu and I am in my house to work.'}]

In [100]:
# Same sentence with the `translator_4_4_epoch_5` pipeline.
# NOTE(review): the cell that creates `translator_4_4_epoch_5` must be run first;
# otherwise this call raises NameError.
translator_4_4_epoch_5("Je m'appelle Mathieu et je suis dans ma maison entrain de travailler")

NameError: name 'translator_4_4_epoch_5' is not defined