In [None]:
!pip install transformers
!pip install transformers[sentencepiece]
!pip install datasets
!pip install evaluate
!pip install sacrebleu
!pip install neptune-client
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# import shutil

# shutil.rmtree('/content/t5-small-finetuned-en-to-de')

### Using pre-defined pipelines

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly. Without `model=` the pipeline falls back to a
# library-chosen default (t5-base, according to the warning this cell emitted)
# and warns that relying on that default is not recommended.
translator = pipeline("translation_en_to_de", model="t5-base")
text = "Hello world!"
translation = translator(text)

print(translation)

No model was supplied, defaulted to t5-base and revision 686f1db (https://huggingface.co/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


[{'translation_text': 'Hallo Welt!'}]


### 1. Import and Initialize the tokenizer


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer published with the English-to-Dutch opus-mt checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-nl")



### 2. Import the model


In [None]:
# Load the seq2seq model weights from the same checkpoint as the tokenizer above.
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-nl")


### 3. Tokenize and encode the text in seq2seq manner


In [None]:
# Encode one English sentence as PyTorch tensors; the result holds
# `input_ids` and `attention_mask`.
text = "Hello my friends! having a lecture today ?"
tokenized_text = tokenizer(text, return_tensors="pt")
print(tokenized_text)


{'input_ids': tensor([[  147,  2105,   121,  2108,    54,  1205,    19, 14364,  1042,  2758,
             0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


### 4. Translate and decode the elements in batch


In [None]:
# Generate output token ids from the encoded sentence, then decode the first
# (and only) sequence of the batch back to text without special tokens.
translation = model.generate(**tokenized_text)
translated_text = tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
print(translated_text)



Hallo mijn vrienden, heb je vandaag een lezing?


# Create your own machine learning translator and fine-tune it

### 1. Load the data set

In [None]:
from datasets import load_dataset, load_metric
from datasets.dataset_dict import DatasetDict

# raw_datasets = load_dataset("wmt16", "de-en")
# Load the three WMT16 de-en splits one by one. The "train" entry keeps only
# the 1%-2% slice of the full training split; validation and test are whole.
raw_datasets = DatasetDict(
    {
        split_name: load_dataset("wmt16", "de-en", split=split_spec)
        for split_name, split_spec in {
            "train": "train[1%:2%]",
            "validation": "validation",
            "test": "test",
        }.items()
    }
)
raw_datasets



DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 45489
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2169
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2999
    })
})

### 2. Pre-process the data set


In [None]:
# Hub identifiers of the three candidate checkpoints. All of them stay defined
# so the alternative tokenizer/model lines further down can be re-enabled.
model_marianMT = "Helsinki-NLP/opus-mt-en-de"
model_mbart = 'facebook/mbart-large-50-one-to-many-mmt'
model_t5 = "t5-small"

# `model_name` becomes the prefix of the fine-tuning output directory
# (f"{model_name}-finetuned-{source_lang}-to-{target_lang}"). Assign it exactly
# once, for the checkpoint that is actually being fine-tuned.
# model_name = 'opus-mt-en-de'                   # when fine-tuning MarianMT
# model_name = 'mbart-large-50-one-to-many-mmt'  # when fine-tuning mBART-50
model_name = 't5-small'                          # when fine-tuning T5 (active)

from transformers import AutoTokenizer
from transformers import MBart50TokenizerFast

# Alternative tokenizers for the other checkpoints; enable the one that
# matches the model being fine-tuned.
# tokenizer = AutoTokenizer.from_pretrained(model_marianMT,use_fast=False)
# tokenizer = MBart50TokenizerFast.from_pretrained(model_mbart,src_lang="en_XX",tgt_lang = "de_DE")
# Active choice: the slow (use_fast=False) tokenizer of the T5 checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_t5,use_fast=False)

In [None]:
# Task prefix that `preprocess_function` prepends to every source sentence.
# prefix = ""  # for mBART and MarianMT (no task prefix)
# T5 is prompted as "translate English to German: <sentence>". The trailing
# space is required because the prefix is concatenated directly onto the
# sentence; without it the first word is glued to "German:".
prefix = "translate English to German: "  # for T5


In [None]:
# Maximum number of tokens kept when truncating source and target sentences.
max_input_length = max_target_length = 128
# Keys of each example's "translation" dict: English source, German target.
source_lang, target_lang = "en", "de"

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq fine-tuning.

    Args:
        examples: A batch whose "translation" entry is a list of dicts keyed
            by language code (`source_lang` / `target_lang`).

    Returns:
        The tokenizer output for the prefixed source sentences, with an extra
        "labels" entry holding the token ids of the target sentences.
    """
    pairs = examples["translation"]
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # `text_target=` tokenizes in target mode and replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
# Tokenize every split; with batched=True, preprocess_function receives a
# batch of examples per call instead of a single example.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)



### 3. Create a subset of the data set


In [None]:
# Shuffle with a fixed seed and keep the first 1000 examples of each split.
# NOTE(review): the evaluation subset is drawn from the "test" split rather
# than "validation" - confirm this is intended.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))



### 4. Train and fine-tune the model


In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import MBartForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Alternative checkpoints; enable the one matching the tokenizer in use.
# model = AutoModelForSeq2SeqLM.from_pretrained(model_marianMT)
# model = MBartForConditionalGeneration.from_pretrained(model_mbart)
# Active choice: the pretrained T5 checkpoint that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_t5)


In [None]:
# Per-device batch size shared by training and evaluation.
# NOTE(review): with the 1000-example training subset this gives only two
# optimizer steps per epoch on a single device - confirm this is intended.
batch_size = 512

# Training configuration. The first positional argument is the output
# directory, e.g. "t5-small-finetuned-en-to-de" for the T5 setup.
args = Seq2SeqTrainingArguments(
   f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
   # Run evaluation at the end of every epoch.
   evaluation_strategy = "epoch",
   learning_rate=2e-5,
   per_device_train_batch_size=batch_size,
   per_device_eval_batch_size=batch_size,
   weight_decay=0.01,
   # Keep at most three checkpoints on disk.
   save_total_limit=3,
   num_train_epochs=1,
   # Evaluate on generated sequences so compute_metrics can decode them to text.
   predict_with_generate=True,
)

In [None]:
# Collator that assembles tokenized examples into padded seq2seq batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [None]:
import numpy as np
import evaluate

# Evaluation metrics used by compute_metrics: SacreBLEU and METEOR.
metric = evaluate.load("sacrebleu")
meteor = evaluate.load('meteor')

In [None]:
import os
from getpass import getpass

import neptune

# Never hard-code credentials in the notebook. Reuse values already present in
# the environment and otherwise prompt for them (the token prompt is hidden).
api_token = os.environ.get("NEPTUNE_API_TOKEN") or getpass("Neptune API token: ")
project_name = os.environ.get("NEPTUNE_PROJECT") or input("Neptune project (workspace/project): ")

# Export both values so that Neptune integrations which read these
# environment variables pick up the same credentials.
os.environ["NEPTUNE_API_TOKEN"] = api_token
os.environ["NEPTUNE_PROJECT"] = project_name

run = neptune.init_run(
    project=project_name,
    api_token=api_token,
)



In [None]:
def postprocess_text(preds, labels):
    """Normalize decoded text for metric computation.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A tuple ``(preds, labels)`` where every prediction is stripped of
        surrounding whitespace and every reference is stripped and wrapped in
        a one-element list (one reference per prediction).
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

In [None]:
def compute_metrics(eval_preds):
    """Compute BLEU, METEOR and mean generation length for an evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. The
            predictions may arrive as a tuple, in which case its first
            element is used.

    Returns:
        A dict with the keys "bleu", "gen_len" and "meteor", each rounded to
        four decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks positions to ignore and is not a real token id, so it cannot
    # be decoded. Labels always contain it; predictions may contain it too
    # after batches are gathered, so sanitize both before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Strip whitespace and wrap each reference in a list.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu_result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    meteor_result = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    # Generation length = number of non-padding tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    result = {
        "bleu": bleu_result["score"],
        "gen_len": np.mean(prediction_lens),
        "meteor": meteor_result["meteor"],
    }
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire model, training arguments, the 1000-example subsets, the collator and
# the metric function into a trainer, then run fine-tuning.
trainer = Seq2SeqTrainer(
   model,
   args,
   train_dataset=small_train_dataset,
   eval_dataset=small_eval_dataset,
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics,
)
trainer.train()

  from neptune.version import version as neptune_client_version
  self._run = init_run(**self._init_run_kwargs, **additional_neptune_kwargs)


https://app.neptune.ai/mohamed.ahm.cs/CourseTransformerTunning/e/COUR-7


/usr/local/lib/python3.10/dist-packages/transformers/integrations.py:1290: NeptuneUnsupportedType: You're attempting to log a type that is not directly supported by Neptune (<class 'NoneType'>).
        Convert the value to a supported type, such as a string or float, or use stringify_unsupported(obj)
        for dictionaries or collections that contain unsupported values.
        For more, see https://docs.neptune.ai/help/value_of_unsupported_type
  self._metadata_namespace[NeptuneCallback.model_parameters_key] = model.config.to_dict()
/usr/local/lib/python3.10/dist-packages/transformers/integrations.py:1290: NeptuneUnsupportedType: You're attempting to log a type that is not directly supported by Neptune (<class 'list'>).
        Convert the value to a supported type, such as a string or float, or use stringify_unsupported(obj)
        for dictionaries or collections that contain unsupported values.
        For more, see https://docs.neptune.ai/help/value_of_unsupported_type
  self._

In [None]:
# Persist the fine-tuned model to the trainer's output directory so the
# evaluation cells can reload it from disk.
trainer.save_model()
                                           

# Evaluate & track model performance – choose the best model

### 1. Pre-trained vs fine-tuned vs google translator


In [None]:
from huggingface_hub import notebook_login

# Asks for a Hugging Face access token interactively.
notebook_login()

# SECURITY: never paste an access token into the notebook, not even inside a
# comment. A token that was ever committed here must be revoked and replaced.

### MarianMT model


In [None]:
# from transformers import MarianMTModel, MarianTokenizer

# for dirname, _, filenames in os.walk('/content/opus-mt-en-de-finetuned-en-to-de'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# src_text = ['USA Today is an American daily middle-market newspaper that is the flagship publication of its owner, Gannett. Founded by Al Neuharth on September 15, 1982.']
# model_name = 'opus-mt-en-de-finetuned-en-to-de'
# tokenizer = MarianTokenizer.from_pretrained(model_name)
# model = MarianMTModel.from_pretrained(model_name)
# translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
# [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

### MBart50 model


In [None]:
# from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

# for dirname, _, filenames in os.walk('/content/mbart-large-50-one-to-many-mmt-finetuned-en-to-de'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# src_text = ["USA Today is an American daily middle-market newspaper that is the flagship publication of its owner, Gannett. Founded by Al Neuharth on September 15, 1982."]
# model_name = 'mbart-large-50-one-to-many-mmt-finetuned-en-to-de'
# tokenizer = MBart50TokenizerFast.from_pretrained(model_name,src_lang="en_XX")
# model = MBartForConditionalGeneration.from_pretrained(model_name)
# model_inputs = tokenizer(src_text, return_tensors="pt")

# generated_tokens = model.generate(
#    **model_inputs,forced_bos_token_id=tokenizer.lang_code_to_id["de_DE"])
# translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# translation

### T5 model


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# List the files that were saved for the fine-tuned T5 model.
for dirname, _, filenames in os.walk('/content/t5-small-finetuned-en-to-de'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

src_text = ['USA Today is an American daily middle-market newspaper that is the flagship publication of its owner, Gannett. Founded by Al Neuharth on September 15, 1982.']
model_name = 't5-small-finetuned-en-to-de'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Feed the model the same kind of input it was fine-tuned on: every training
# sentence was built as `prefix + sentence`, so inference must add the prefix too.
# `max_length` is set explicitly; otherwise generate() falls back to the
# model's default generation length, which may cut a long translation short.
translated = model.generate(
    **tokenizer([prefix + sentence for sentence in src_text], return_tensors="pt", padding=True),
    max_length=max_target_length,
)
[tokenizer.decode(t, skip_special_tokens=True) for t in translated]

# 3. Track your model data – parameters, training loss, CPU usage, metrics and more

In [None]:
import neptune

# Open a Neptune run for logging the final evaluation metrics.
# `project` must receive the project name and `api_token` the token;
# passing them the other way round makes authentication fail.
run = neptune.init_run(
    project=project_name,
    api_token=api_token,
)

In [None]:
# Trainer.evaluate() prefixes every metric produced by compute_metrics with
# "eval_" (its default metric_key_prefix), so the values live under
# "eval_bleu" / "eval_meteor"; "epoch" is reported without a prefix.
evaluate_results = trainer.evaluate()
run["epoch"].append(evaluate_results["epoch"])
run["bleu"].append(evaluate_results["eval_bleu"])
run["meteor"].append(evaluate_results["eval_meteor"])