# Translation (PyTorch)

Install the Transformers and Datasets libraries to run this notebook.

In [1]:
# Install dependencies
# NOTE(review): versions are unpinned; the recorded run below resolved
# datasets 2.0.0 and transformers 4.17.0 - consider pinning for reproducibility.
!pip install datasets transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 26.9 MB/s 
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 62.3 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 80.5 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 66.1 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 77.4 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading hugg

You will need to set up git: adapt your email and name in the following cell.

In [2]:
# Set the global git identity (user.email / user.name) used for commits.
# Replace both values with your own before running.
!git config --global user.email "miesner.jacob@gmail.com"
!git config --global user.name "miesnerjacob"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [5]:
# Notebook HF login: log in to the Hugging Face Hub from inside the notebook
# (you will be asked for your credentials).
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [6]:
# Load dataset: the KDE4 corpus for the English (lang1) / French (lang2) pair
from datasets import load_dataset, load_metric

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

Downloading builder script:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Using custom data configuration en-fr-lang1=en,lang2=fr


Downloading and preparing dataset kde4/en-fr (download: 6.72 MiB, generated: 24.46 MiB, post-processed: Unknown size, total: 31.18 MiB) to /root/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac...


Downloading data:   0%|          | 0.00/7.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/210173 [00:00<?, ? examples/s]

Dataset kde4 downloaded and prepared to /root/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Check data structure: display the available splits, their features and row counts
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [8]:
# Create test split: keep 90% of the rows for training (train_size=0.9) and
# hold out the rest; the seed is fixed so the split is the same on every run.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [9]:
# Rename the "test" split to "validation" by moving it under a new key
split_datasets["validation"] = split_datasets.pop("test")

In [10]:
# Check one train example (index 1): the English/French pair stored under "translation"
split_datasets["train"][1]["translation"]

{'en': 'Default to expanded threads',
 'fr': 'Par défaut, développer les fils de discussion'}

In [11]:
# Load model: wrap the pretrained checkpoint in a translation pipeline and try
# it, before any fine-tuning, on the English sentence of the train example
# inspected above.
from transformers import pipeline

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")

Downloading:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/287M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/784k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

[{'translation_text': 'Par défaut pour les threads élargis'}]

In [12]:
# Test the pretrained model on a second example sentence
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)

[{'translation_text': "Impossible d'importer %1 en utilisant le plugin d'importateur OFX. Ce fichier n'est pas le bon format."}]

In [13]:
# Load tokenizer
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# This is the PyTorch version of the notebook, so the tokenizer is configured
# with "pt" (PyTorch) instead of "tf" (TensorFlow).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

In [14]:
# Using the context manager to tokenize the French example:
# the English sentence is tokenized with a plain call, the French one inside
# `as_target_tokenizer()`.
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]

inputs = tokenizer(en_sentence)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(fr_sentence)

In [15]:
# Showing the ramification of using the same tokenizer for different languages:
# the first line prints the French sentence tokenized WITHOUT the target
# context manager, the second line the tokens obtained with it.
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))

['▁Par', '▁dé', 'f', 'aut', ',', '▁dé', 've', 'lop', 'per', '▁les', '▁fil', 's', '▁de', '▁discussion', '</s>']
['▁Par', '▁défaut', ',', '▁développer', '▁les', '▁fils', '▁de', '▁discussion', '</s>']


In [16]:
# Function to tokenize datasets
max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of English/French translation pairs.

    Args:
        examples: A batch whose "translation" entry is a list of dicts with
            an "en" and a "fr" string each.

    Returns:
        The tokenizer output for the English sentences (truncated to
        ``max_input_length``), with an extra "labels" entry holding the token
        ids of the French sentences (truncated to ``max_target_length``).
    """
    pairs = examples["translation"]
    source_texts = [pair["en"] for pair in pairs]
    target_texts = [pair["fr"] for pair in pairs]

    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # The French side must be tokenized in target mode
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            target_texts, max_length=max_target_length, truncation=True
        )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [17]:
# Map tokenization function to datasets, one batch of examples at a time.
# The original columns are removed, so only what preprocess_function returns
# is kept.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

  0%|          | 0/190 [00:00<?, ?ba/s]

  0%|          | 0/22 [00:00<?, ?ba/s]

In [18]:
# Import model for training with Seq2SeqLM head, loaded from the same
# checkpoint as the tokenizer
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [19]:
# Get data collator; it is given both the tokenizer and the model
# (the batch inspected below shows it also produces `decoder_input_ids`)
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [20]:
# Test data collator on a small batch (train examples 1 and 2) and list the
# fields it returns
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [21]:
# Check labels feature of small batch: the shorter sequence is padded with -100
batch["labels"]

tensor([[  577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,  -100,
          -100,  -100,  -100,  -100,  -100,  -100],
        [ 1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,   817,
           550,  7032,  5821,  7907, 12649,     0]])

In [22]:
# Check decoder input ids feature of small batch: in the output these are the
# labels shifted one position to the right, starting with token id 59513
batch["decoder_input_ids"]

tensor([[59513,   577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,
         59513, 59513, 59513, 59513, 59513, 59513],
        [59513,  1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,
           817,   550,  7032,  5821,  7907, 12649]])

In [23]:
# Print the tokenized (unpadded) labels of the same two examples, for
# comparison with the collated batch
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[577, 5891, 2, 3184, 16, 2542, 5, 1710, 0]
[1211, 3, 49, 9409, 1211, 3, 29140, 817, 3124, 817, 550, 7032, 5821, 7907, 12649, 0]


In [24]:
# Install the sacrebleu package used for the evaluation metric
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.0.0-py3-none-any.whl (90 kB)
[?25l[K     |███▋                            | 10 kB 38.8 MB/s eta 0:00:01[K     |███████▏                        | 20 kB 46.5 MB/s eta 0:00:01[K     |██████████▉                     | 30 kB 52.8 MB/s eta 0:00:01[K     |██████████████▍                 | 40 kB 33.1 MB/s eta 0:00:01[K     |██████████████████              | 51 kB 37.4 MB/s eta 0:00:01[K     |█████████████████████▋          | 61 kB 42.6 MB/s eta 0:00:01[K     |█████████████████████████▎      | 71 kB 29.8 MB/s eta 0:00:01[K     |████████████████████████████▉   | 81 kB 31.3 MB/s eta 0:00:01[K     |████████████████████████████████| 90 kB 10.3 MB/s 
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting portalocker
  Downloading portalocker-2.4.0-py2.py3-none-any.whl (16 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.4 portalocker-2.4.0 sacrebl

In [25]:
# Load in the evaluation metric (sacreBLEU) through the datasets library
from datasets import load_metric

metric = load_metric("sacrebleu")

Downloading builder script:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

In [26]:
# Testing the BLEU metric on a prediction that is close to the reference.
# `references` is a list of lists: one list of reference strings per prediction.
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'bp': 0.9200444146293233,
 'counts': [11, 6, 4, 3],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'ref_len': 13,
 'score': 46.750469682990165,
 'sys_len': 12,
 'totals': [12, 11, 10, 9]}

In [27]:
# Testing the BLEU metric on a degenerate prediction that repeats a single word
predictions = ["This This This This"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'bp': 0.10539922456186433,
 'counts': [1, 0, 0, 0],
 'precisions': [25.0, 16.666666666666668, 12.5, 12.5],
 'ref_len': 13,
 'score': 1.683602693167689,
 'sys_len': 4,
 'totals': [4, 3, 2, 1]}

In [28]:
# Testing the BLEU metric on a prediction that is correct but much too short
predictions = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'bp': 0.004086771438464067,
 'counts': [2, 1, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'ref_len': 13,
 'score': 0.0,
 'sys_len': 2,
 'totals': [2, 1, 0, 0]}

In [29]:
# evaluation function for training
import numpy as np


def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the sacreBLEU metric.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple, in which case only its first
            element is used.

    Returns:
        A dict with a single "bleu" key holding the metric's "score" value.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding marker, not a real token id, so it cannot be decoded.
    # Swap it for the pad token in the labels AND in the predictions: gathered
    # generations of different lengths can be padded with -100 as well.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing; the metric expects a list of references
    # per prediction, hence the one-element lists.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [30]:
# Login to HF Hub
# NOTE(review): this repeats the login done near the top of the notebook;
# it is only needed if that session is no longer valid.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
# Define training args
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    # Output directory (plain string: there is nothing to interpolate)
    "marian-finetuned-kde4-en-to-fr",
    # No evaluation during training; the model is evaluated explicitly
    # before and after training instead.
    evaluation_strategy="no",
    # Save a checkpoint at the end of every epoch, keeping at most 3
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # Evaluate on generated sequences, which is what compute_metrics decodes
    predict_with_generate=True,
    # Mixed-precision training
    fp16=True,
    push_to_hub=True,
)

In [32]:
# Define trainer: ties together the model, the training arguments, the
# tokenized train/validation splits, the collator and the metric function.
# With push_to_hub=True in the arguments, creating the trainer clones the Hub
# repository into the output directory (see the output below).
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr into local empty directory.
Using amp half precision backend


In [33]:
# Check performance before training (baseline), generating at most
# max_target_length tokens per example
trainer.evaluate(max_length=max_target_length)

***** Running Evaluation *****
  Num examples = 21018
  Batch size = 64


{'eval_bleu': 39.27124165416069,
 'eval_loss': 1.6964441537857056,
 'eval_runtime': 1379.9969,
 'eval_samples_per_second': 15.23,
 'eval_steps_per_second': 0.238}

In [34]:
# Train! Runs fine-tuning with the arguments defined above.
trainer.train()

***** Running training *****
  Num examples = 189155
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 17736


Step,Training Loss
500,1.4154
1000,1.2205
1500,1.1688
2000,1.1284
2500,1.1192
3000,1.0676
3500,1.0664
4000,1.0286
4500,1.0233
5000,1.0252


Saving model checkpoint to marian-finetuned-kde4-en-to-fr/checkpoint-5912
Configuration saved in marian-finetuned-kde4-en-to-fr/checkpoint-5912/config.json
Model weights saved in marian-finetuned-kde4-en-to-fr/checkpoint-5912/pytorch_model.bin
tokenizer config file saved in marian-finetuned-kde4-en-to-fr/checkpoint-5912/tokenizer_config.json
Special tokens file saved in marian-finetuned-kde4-en-to-fr/checkpoint-5912/special_tokens_map.json
tokenizer config file saved in marian-finetuned-kde4-en-to-fr/tokenizer_config.json
Special tokens file saved in marian-finetuned-kde4-en-to-fr/special_tokens_map.json
Saving model checkpoint to marian-finetuned-kde4-en-to-fr/checkpoint-11824
Configuration saved in marian-finetuned-kde4-en-to-fr/checkpoint-11824/config.json
Model weights saved in marian-finetuned-kde4-en-to-fr/checkpoint-11824/pytorch_model.bin
tokenizer config file saved in marian-finetuned-kde4-en-to-fr/checkpoint-11824/tokenizer_config.json
Special tokens file saved in marian-fine

TrainOutput(global_step=17736, training_loss=0.9373906814788557, metrics={'train_runtime': 3771.5157, 'train_samples_per_second': 150.461, 'train_steps_per_second': 4.703, 'total_flos': 1.1313459326287872e+16, 'train_loss': 0.9373906814788557, 'epoch': 3.0})

In [35]:
# Evaluate after training with the same settings as the baseline, so the two
# BLEU scores are comparable
trainer.evaluate(max_length=max_target_length)

***** Running Evaluation *****
  Num examples = 21018
  Batch size = 64


{'epoch': 3.0,
 'eval_bleu': 52.94560734092563,
 'eval_loss': 0.8559092283248901,
 'eval_runtime': 1438.515,
 'eval_samples_per_second': 14.611,
 'eval_steps_per_second': 0.229}

In [36]:
# Push model to HF Hub with a "translation" tag and a final commit message
trainer.push_to_hub(tags="translation", commit_message="Training complete")

Saving model checkpoint to marian-finetuned-kde4-en-to-fr
Configuration saved in marian-finetuned-kde4-en-to-fr/config.json
Model weights saved in marian-finetuned-kde4-en-to-fr/pytorch_model.bin
tokenizer config file saved in marian-finetuned-kde4-en-to-fr/tokenizer_config.json
Special tokens file saved in marian-finetuned-kde4-en-to-fr/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/285M [00:00<?, ?B/s]

Upload file runs/Apr05_18-34-00_02ac9ff9bbb4/events.out.tfevents.1649190253.02ac9ff9bbb4.80.2: 100%|##########…

Upload file runs/Apr05_18-34-00_02ac9ff9bbb4/events.out.tfevents.1649185043.02ac9ff9bbb4.80.0:  33%|###3      …

To https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr
   1072e38..3a14e87  main -> main

To https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr
   3a14e87..25e0a2b  main -> main



'https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/commit/3a14e876e0a9a892ad3e2a97dcb2ab34e86ee40c'

In [37]:
# Set dataloaders for the custom training loop.
from torch.utils.data import DataLoader

# Make the datasets return torch tensors
tokenized_datasets.set_format("torch")
# Only the training data is shuffled; both loaders reuse the seq2seq collator
# and a batch size of 8.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [38]:
# Get pretrained model: reload it from the original checkpoint so the custom
# loop does not continue from the model already fine-tuned above
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/Helsinki-NLP/opus-mt-en-fr/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/5ad88432037ab18b1eb95761258d2b1b3a32e1e401d5f610f86eb3f479e59e8c.2b4f07b3f8de3922d42e6312c55d0597e44d2273507e7c5d0b6daf75fb2cc673
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-en-fr",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "e

In [39]:
# Define optimizer
# `transformers.AdamW` is deprecated (and gone from recent releases), so the
# PyTorch implementation is used instead.
from torch.optim import AdamW

# eps and weight_decay are set to the defaults of the former transformers
# implementation (1e-6 and 0.0) so the optimisation settings stay the same.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



In [40]:
# Pass objects to accelerator: hand the model, optimizer and both dataloaders
# to `Accelerator.prepare` and rebind the names to the objects it returns.
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [41]:
# Define learning rate scheduler: linear schedule with no warmup.
from transformers import get_scheduler

num_train_epochs = 3
# One optimizer step is taken per batch in the training loop, so the number of
# batches in the (prepared) dataloader is the number of updates per epoch.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [42]:
# Create model repo name: build the full Hub repository id from the model name
# (the output shows it prefixed with the account name)
from huggingface_hub import Repository, get_full_repo_name

model_name = "marian-finetuned-kde4-en-to-fr-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'miesnerjacob/marian-finetuned-kde4-en-to-fr-accelerate'

In [43]:
# Clone model repo into a local directory; the training loop saves the model
# and tokenizer into this directory before pushing
output_dir = "marian-finetuned-kde4-en-to-fr-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

Cloning https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr-accelerate into local empty directory.


In [44]:
# Func to process model outputs
def postprocess(predictions, labels):
    """Turn generated token ids and label ids into stripped text.

    Args:
        predictions: Tensor of generated token ids.
        labels: Tensor of label token ids, where -100 marks padding.

    Returns:
        A ``(decoded_preds, decoded_labels)`` tuple: a list of prediction
        strings and a list of one-element lists of reference strings.
    """
    pred_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()

    # -100 cannot be decoded, so substitute the pad token id for it.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Trim surrounding whitespace; each reference is wrapped in its own list.
    stripped_preds = [text.strip() for text in pred_texts]
    stripped_labels = [[text.strip()] for text in label_texts]
    return stripped_preds, stripped_labels

In [45]:
# Custom training loop: for every epoch, train on all batches, evaluate BLEU
# on the validation set, then save the model/tokenizer and push them to the Hub.
from tqdm.auto import tqdm
import torch

# One tick per optimizer step over the whole run
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward()
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        # No gradients are needed for generation
        with torch.no_grad():
            # generate() is called on the unwrapped model.
            # max_length=128 is the same value as max_target_length used in
            # preprocessing.
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        # (predictions with the pad token, labels with -100, which postprocess
        # later swaps for the pad token before decoding)
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        # Decode to text and accumulate the batch in the metric
        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Score everything accumulated during this epoch's evaluation
    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer and pushes to the Hub
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # blocking=False: do not wait for the upload before the next epoch
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/70935 [00:00<?, ?it/s]

  0%|          | 0/2628 [00:00<?, ?it/s]

Configuration saved in marian-finetuned-kde4-en-to-fr-accelerate/config.json


epoch 0, BLEU score: 51.84


Model weights saved in marian-finetuned-kde4-en-to-fr-accelerate/pytorch_model.bin
tokenizer config file saved in marian-finetuned-kde4-en-to-fr-accelerate/tokenizer_config.json
Special tokens file saved in marian-finetuned-kde4-en-to-fr-accelerate/special_tokens_map.json


  0%|          | 0/2628 [00:00<?, ?it/s]

Configuration saved in marian-finetuned-kde4-en-to-fr-accelerate/config.json


epoch 1, BLEU score: 53.45


Model weights saved in marian-finetuned-kde4-en-to-fr-accelerate/pytorch_model.bin
tokenizer config file saved in marian-finetuned-kde4-en-to-fr-accelerate/tokenizer_config.json
Special tokens file saved in marian-finetuned-kde4-en-to-fr-accelerate/special_tokens_map.json
Several commits (2) will be pushed upstream.


  0%|          | 0/2628 [00:00<?, ?it/s]

Configuration saved in marian-finetuned-kde4-en-to-fr-accelerate/config.json


epoch 2, BLEU score: 53.95


Model weights saved in marian-finetuned-kde4-en-to-fr-accelerate/pytorch_model.bin
tokenizer config file saved in marian-finetuned-kde4-en-to-fr-accelerate/tokenizer_config.json
Special tokens file saved in marian-finetuned-kde4-en-to-fr-accelerate/special_tokens_map.json
Several commits (3) will be pushed upstream.


In [46]:
# Load model from Hub: build a translation pipeline from the fine-tuned
# checkpoint pushed by the Trainer run and try the first example sentence again.
# NOTE: this rebinds `model_checkpoint` and `translator`, which pointed to the
# original pretrained model until now.
from transformers import pipeline


model_checkpoint = "miesnerjacob/marian-finetuned-kde4-en-to-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")

https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpeahq9vpo


Downloading:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

storing https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/f1690cb78432a58c9cae41a4aef66ad40676eac6036ad9ce74543bb6cf382c8e.1401c5559b140a27163163cdadcaf8bd5aa3a21b157c04f8a333a9accbbe01fc
creating metadata file for /root/.cache/huggingface/transformers/f1690cb78432a58c9cae41a4aef66ad40676eac6036ad9ce74543bb6cf382c8e.1401c5559b140a27163163cdadcaf8bd5aa3a21b157c04f8a333a9accbbe01fc
loading configuration file https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f1690cb78432a58c9cae41a4aef66ad40676eac6036ad9ce74543bb6cf382c8e.1401c5559b140a27163163cdadcaf8bd5aa3a21b157c04f8a333a9accbbe01fc
Model config MarianConfig {
  "_name_or_path": "miesnerjacob/marian-finetuned-kde4-en-to-fr",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_

Downloading:   0%|          | 0.00/285M [00:00<?, ?B/s]

storing https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/c73be832785fdeaa0cd7922a722bd7b376cbf140eb5ef93acee82f70657dcf7b.e02b4fb8b675a89fe02a781b4ae6cf2fbf263c4a6bd95108e8c6bb4569f65498
creating metadata file for /root/.cache/huggingface/transformers/c73be832785fdeaa0cd7922a722bd7b376cbf140eb5ef93acee82f70657dcf7b.e02b4fb8b675a89fe02a781b4ae6cf2fbf263c4a6bd95108e8c6bb4569f65498
loading weights file https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/c73be832785fdeaa0cd7922a722bd7b376cbf140eb5ef93acee82f70657dcf7b.e02b4fb8b675a89fe02a781b4ae6cf2fbf263c4a6bd95108e8c6bb4569f65498
All model checkpoint weights were used when initializing MarianMTModel.

All the weights of MarianMTModel were initialized from the model checkpoint at miesnerjacob/marian-finetuned-kde4-en-to-fr.
If your task is simil

Downloading:   0%|          | 0.00/296 [00:00<?, ?B/s]

storing https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/353d86290d99185630559e493f48290c6a2ade4b4704ce12b061e18672c59641.d5354816f7f36571cb2402d98d69555eebc0d8d9a2204c1c96d2aab38c801be5
creating metadata file for /root/.cache/huggingface/transformers/353d86290d99185630559e493f48290c6a2ade4b4704ce12b061e18672c59641.d5354816f7f36571cb2402d98d69555eebc0d8d9a2204c1c96d2aab38c801be5
https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/source.spm not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp3e1cid5p


Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

storing https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/source.spm in cache at /root/.cache/huggingface/transformers/b6e24c6697cf396989f405da93d40de0972a90e9c5d609b05d361781c95cf40c.1a8b1c99c8359ed99f2d577f69114f5e285203705b08e5b9177f626b259660ec
creating metadata file for /root/.cache/huggingface/transformers/b6e24c6697cf396989f405da93d40de0972a90e9c5d609b05d361781c95cf40c.1a8b1c99c8359ed99f2d577f69114f5e285203705b08e5b9177f626b259660ec
https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/target.spm not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp967prm3u


Downloading:   0%|          | 0.00/784k [00:00<?, ?B/s]

storing https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/target.spm in cache at /root/.cache/huggingface/transformers/9f2303a3f74540c0f0258a84995dc177fda24dfa0e1b38f379d0bbaccc80d133.7a3fadd05a0cee82a22786164d20d49e7b313753bf53c7e219cd382f47c08871
creating metadata file for /root/.cache/huggingface/transformers/9f2303a3f74540c0f0258a84995dc177fda24dfa0e1b38f379d0bbaccc80d133.7a3fadd05a0cee82a22786164d20d49e7b313753bf53c7e219cd382f47c08871
https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/vocab.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpr4ott6jr


Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

storing https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/vocab.json in cache at /root/.cache/huggingface/transformers/a2b62fde4f48f746b45935766b1e5cb40e2f6110fc040fb492aca6d19486cc46.f0e9eb9c8120de9b276d39c458593bb6f470220163cc5be81c17eabfd243816d
creating metadata file for /root/.cache/huggingface/transformers/a2b62fde4f48f746b45935766b1e5cb40e2f6110fc040fb492aca6d19486cc46.f0e9eb9c8120de9b276d39c458593bb6f470220163cc5be81c17eabfd243816d
https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp4u105qst


Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

storing https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/special_tokens_map.json in cache at /root/.cache/huggingface/transformers/ba0930a926cbda9d9adc8e0c9b72f61e6168cfa5309f1c3bee1eb1cb89492218.294ebaa4cd17bb284635004c92d2c4d522ec488c828dcce0c2471b6f28e3fe82
creating metadata file for /root/.cache/huggingface/transformers/ba0930a926cbda9d9adc8e0c9b72f61e6168cfa5309f1c3bee1eb1cb89492218.294ebaa4cd17bb284635004c92d2c4d522ec488c828dcce0c2471b6f28e3fe82
loading file https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/source.spm from cache at /root/.cache/huggingface/transformers/b6e24c6697cf396989f405da93d40de0972a90e9c5d609b05d361781c95cf40c.1a8b1c99c8359ed99f2d577f69114f5e285203705b08e5b9177f626b259660ec
loading file https://huggingface.co/miesnerjacob/marian-finetuned-kde4-en-to-fr/resolve/main/target.spm from cache at /root/.cache/huggingface/transformers/9f2303a3f74540c0f0258a84995dc177fda24dfa0e1b38f379d0bbaccc80d133.7a3fad

[{'translation_text': 'Par défaut, développer les fils de discussion'}]

In [47]:
# Test finetuned model on the second example sentence, for comparison with the
# pretrained model's translation earlier in the notebook
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)

[{'translation_text': "Impossible d'importer %1 en utilisant le module d'importation OFX. Ce fichier n'est pas le bon format."}]