In [1]:
%%capture
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!pip install sacrebleu

# **DATA PREP**

For translation-type models, I want to build a way to generalize this step.

- READ DATA 
- CLEAN

- TOKENIZE
- ADAPT: two adaptation steps (adapt the inputs for the model, then adapt the outputs for the metrics)

In [2]:
from datasets import load_dataset

# Load the "kde4" dataset for the English ("en") / French ("fr") language pair.
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

Downloading builder script:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Downloading and preparing dataset kde4/en-fr to /root/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac...


Downloading data:   0%|          | 0.00/7.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/210173 [00:00<?, ? examples/s]

Dataset kde4 downloaded and prepared to /root/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Display the loaded splits with their features and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [4]:
# Keep 90% of the rows for training; the fixed seed makes the split reproducible.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
# The held-out part is created under the key "test"; store it as "validation" instead.
split_datasets["validation"] = split_datasets.pop("test")
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [5]:
# sample: shuffle each split with a fixed seed and keep only its first rows
for split_name, sample_size in (("train", 1000), ("validation", 100)):
    shuffled_split = split_datasets[split_name].shuffle(seed=42)
    split_datasets[split_name] = shuffled_split.select(range(sample_size))

In [6]:
# Check the row counts after subsampling.
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 100
    })
})

In [23]:
# Write the subsampled splits to the local "kde" directory; the training script
# is later pointed at this directory through --input_dataset.
split_datasets.save_to_disk("kde")

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [25]:
import datasets

In [26]:
# Read the saved dataset back from the "kde" directory to check the round trip.
kde=datasets.load_from_disk("kde")
kde

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 100
    })
})

In [36]:
%%writefile Seq2SeqTranslation.py

import argparse
import os

parser = argparse.ArgumentParser()

parser.add_argument("--input_dataset", type=str, help="input_dataset")
parser.add_argument("--max_length", type=int, help="max_length")
parser.add_argument("--model_checkpoint", type=str, help="model_checkpoint")
parser.add_argument("--metric_data_load", type=str, help="metric_data_load")
parser.add_argument("--output_dir", type=str, help="output_dir")
parser.add_argument("--num_train_epochs", type=int, help="num_train_epochs")
parser.add_argument("--learning_rate", type=float, help="learning_rate")

#parser.add_argument("--stride", type=int, help="stride")
#parser.add_argument("--n_best", type=int, help="n_best")
#parser.add_argument("--max_answer_length", type=int, help="max_answer_length")

args = parser.parse_args()


#stride=args.stride
#n_best=args.n_best
#max_answer_length=args.max_answer_length

input_dataset=args.input_dataset
max_length=args.max_length
model_checkpoint=args.model_checkpoint
metric_data_load=args.metric_data_load
output_dir=args.output_dir
num_train_epochs=args.num_train_epochs
learning_rate=args.learning_rate


import evaluate
import datasets
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
import numpy as np

from transformers import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
from tqdm.auto import tqdm
import torch


# Module-level tokenizer shared by preprocess_function, compute_metrics, postprocess
# and main. `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here; tensors are produced later by
# `set_format("torch")` and the data collator.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples, source_lang="en", target_lang="fr"):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: batch whose "translation" entry is a list of dicts keyed by
            language code (e.g. ``{"en": ..., "fr": ...}``).
        source_lang: key of the text fed to the model. Defaults to "en".
        target_lang: key of the text tokenized as the target. Defaults to "fr".

    Returns:
        The tokenizer output for the batch. Source and target are both truncated
        to the module-level ``max_length``.
    """
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    # `text_target` makes the tokenizer encode the targets alongside the inputs.
    return tokenizer(
        inputs,
        text_target=targets,
        max_length=max_length,
        truncation=True,
    )


def compute_metrics(eval_preds, metric=None):
    """Decode predictions and labels and score them with the evaluation metric.

    Args:
        eval_preds: ``(preds, labels)`` pair of token-id arrays. ``preds`` may be a
            tuple, in which case only its first element is used.
        metric: object exposing ``compute(predictions=..., references=...)`` that
            returns a dict with a "score" entry. When omitted, the metric named by
            the module-level ``metric_data_load`` is loaded with ``evaluate.load``.
            (No module-level ``metric`` exists in this script, so it must be passed
            in or loaded here.)

    Returns:
        ``{"bleu": <score>}``.
    """
    if metric is None:
        metric = evaluate.load(metric_data_load)

    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: one list of references per prediction
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

def postprocess(predictions, labels):
    """Turn prediction and label token-id tensors into stripped text.

    Returns ``(decoded_preds, decoded_labels)``: a list of strings and a list of
    one-element lists (one reference per prediction).
    """
    def _decode_stripped(token_ids):
        texts = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return [text.strip() for text in texts]

    pred_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()

    decoded_preds = _decode_stripped(pred_ids)

    # -100 cannot be decoded, so swap it for the pad token id first.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    decoded_labels = [[text] for text in _decode_stripped(label_ids)]

    return decoded_preds, decoded_labels




def main():
    """Fine-tune the seq2seq checkpoint on the saved dataset, evaluating after each epoch.

    Everything is driven by the module-level values parsed from the command line:
    ``input_dataset`` (directory read with ``datasets.load_from_disk``; its "train"
    and "validation" splits are used), ``model_checkpoint``, ``metric_data_load``,
    ``max_length``, ``learning_rate``, ``num_train_epochs`` and ``output_dir``.

    After every epoch the validation split is translated with ``generate``, the
    metric's "score" is printed, and the model and tokenizer are saved to
    ``output_dir`` (each epoch writes to the same directory).
    """
    # load_from disk
    raw_datasets = datasets.load_from_disk(input_dataset)
    print("raw_datasets is read")

    # define processors (the tokenizer is the module-level one shared with the helpers)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    metric = evaluate.load(metric_data_load)

    # tokenize; the raw columns are dropped so only the tokenizer outputs remain
    tokenized_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
    )
    print("raw_datasets is tokenized")

    tokenized_datasets.set_format("torch")

    # dataset wrapper
    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=8,
    )

    eval_dataloader = DataLoader(
        tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
    )

    print("data is wrapped with DataLoader")

    # train
    print(model)

    # Use the PyTorch optimizer: the AdamW class shipped with transformers is
    # deprecated in favour of torch.optim.AdamW.
    # NOTE(review): the module-level `from transformers import AdamW` is no longer
    # needed by this function.
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    accelerator = Accelerator()

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Measured after prepare(), i.e. on the dataloader this process will iterate.
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))

    for epoch in range(num_train_epochs):
        # Training
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Evaluation
        model.eval()
        for batch in tqdm(eval_dataloader):
            with torch.no_grad():
                generated_tokens = accelerator.unwrap_model(model).generate(
                    batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    # Honour --max_length so generated text is capped like the
                    # tokenized references (previously hard-coded to 128).
                    max_length=max_length,
                )
            labels = batch["labels"]

            # Necessary to pad predictions and labels for being gathered
            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

            predictions_gathered = accelerator.gather(generated_tokens)
            labels_gathered = accelerator.gather(labels)

            decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
            metric.add_batch(predictions=decoded_preds, references=decoded_labels)

        results = metric.compute()
        print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

        # Save and upload
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(output_dir)


if __name__ == "__main__":
    main()



Overwriting Seq2SeqTranslation.py


In [None]:
#max_length = 128
#num_train_epochs = 3
#output_dir = "marian-finetuned-kde4-en-to-fr-accelerate"
#learning_rate=2e-5
#model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"


In [37]:
# RUN: execute the script written above on the dataset saved in "kde";
# the script saves the fine-tuned model and tokenizer to --output_dir.
!python ./Seq2SeqTranslation.py --input_dataset="kde" \
                                --max_length=384  \
                                --model_checkpoint="Helsinki-NLP/opus-mt-en-fr" \
                                --metric_data_load="sacrebleu" \
                                --output_dir="seq2seq-marian-kde4" \
                                --num_train_epochs=2 \
                                --learning_rate=2e-5 


raw_datasets is read
Loading cached processed dataset at /content/kde/train/cache-90aa6551085026ec.arrow
Loading cached processed dataset at /content/kde/validation/cache-771a3ebdc541b154.arrow
raw_datasets is tokenized
data is wrapped with DataLoader
MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(59514, 512, padding_idx=59513)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(59514, 512, padding_idx=59513)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elemen

In [38]:
from transformers import pipeline

# Directory passed as --output_dir to Seq2SeqTranslation.py in the run above.
model_checkpoint = "seq2seq-marian-kde4"

translator = pipeline("translation", model=model_checkpoint)

# Quick check of the fine-tuned model on one English sentence.
translator("Default to expanded threads")



[{'translation_text': 'Par défaut pour les threads élargis'}]

In [None]:
# TOKENIZE
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# `return_tensors` is an argument of the tokenizer *call*, not of `from_pretrained`,
# so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
# ADAPT AND TOKENIZE
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of en/fr pairs: English as model input, French as target.

    Both sides are truncated to ``max_length`` tokens; the tokenizer output is
    returned unchanged.
    """
    sources, references = [], []
    for pair in examples["translation"]:
        sources.append(pair["en"])
        references.append(pair["fr"])

    return tokenizer(
        sources,
        text_target=references,
        max_length=max_length,
        truncation=True,
    )

In [9]:
# Tokenize every split in batches; the original columns are removed so that only
# the tokenizer outputs remain.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
# Inspect the features produced by tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [11]:
# ADAPT INPUT FOR DYNAMIC PADDING
from transformers import DataCollatorForSeq2Seq
# MODEL
from transformers import AutoModelForSeq2SeqLM

# Load the seq2seq model from the checkpoint currently named by `model_checkpoint`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


# The collator is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
from tensorflow import keras

In [12]:
# ADAPT OUTPUT FOR METRICS
# (sacrebleu is also installed by the first cell of the notebook)
#%%capture
!pip install sacrebleu

import evaluate

# Metric object used by compute_metrics and by the evaluation loop.
metric = evaluate.load("sacrebleu")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.7.0 sacrebleu-2.3.1


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [13]:
import numpy as np


def compute_metrics(eval_preds):
    """Score decoded predictions against decoded labels with the global ``metric``.

    ``eval_preds`` is a ``(preds, labels)`` pair of token-id arrays; when ``preds``
    is a tuple only its first element is used. Returns ``{"bleu": <score>}``.
    """
    raw_preds, label_ids = eval_preds
    # The model may return more than the predictions; keep the first item only.
    pred_ids = raw_preds[0] if isinstance(raw_preds, tuple) else raw_preds

    def _decode_stripped(token_ids):
        texts = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return [text.strip() for text in texts]

    predictions = _decode_stripped(pred_ids)

    # -100 cannot be decoded, so swap it for the pad token id first.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    references = [[text] for text in _decode_stripped(label_ids)]

    score = metric.compute(predictions=predictions, references=references)["score"]
    return {"bleu": score}

In [14]:
# TRAIN - FINE-TUNE
from torch.utils.data import DataLoader

# Make the tokenized columns come back as torch tensors.
tokenized_datasets.set_format("torch")

# Settings shared by both loaders.
loader_options = dict(collate_fn=data_collator, batch_size=8)

# Only the training loader shuffles its rows.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_options)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_options)

In [15]:
# Use the PyTorch optimizer: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW.
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
from tqdm.auto import tqdm
import torch

num_train_epochs = 3
output_dir = "marian-finetuned-kde4-en-to-fr-accelerate"


def _decode_for_metric(predictions, labels):
    """Decode gathered prediction/label token ids into stripped text.

    Defined in this cell so that it runs top to bottom on a fresh kernel; the
    equivalent `postprocess` helper is only defined in a later cell.
    Returns (list of predictions, list of one-element reference lists).
    """
    pred_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()
    # Replace -100 in the labels as we can't decode them.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    return (
        [pred.strip() for pred in decoded_preds],
        [[label.strip()] for label in decoded_labels],
    )


optimizer = AdamW(model.parameters(), lr=2e-5)

accelerator = Accelerator()

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# Measured after prepare(), i.e. on the dataloader this process will iterate.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                # Same limit as the one used when tokenizing the references.
                max_length=max_length,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = _decode_for_metric(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save the model and tokenizer of this epoch (each epoch writes to the same directory)
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)




In [16]:
from accelerate import Accelerator

# NOTE(review): the previous cell already builds an Accelerator and passes the same
# model, optimizer and dataloaders through prepare(); running both cells prepares
# them twice. Confirm which of the two paths is meant to be executed.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [17]:
from transformers import get_scheduler

num_train_epochs = 3
# One optimizer update per batch, so steps per epoch equals the dataloader length.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule without warmup, spanning all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [18]:
# Directory the training loop saves the model and tokenizer to after every epoch.
output_dir = "marian-finetuned-kde4-en-to-fr-accelerate"

In [19]:
def postprocess(predictions, labels):
    """Decode prediction and label token-id tensors into stripped strings.

    Returns the predictions as a flat list of strings and the labels as a list of
    single-item lists, one reference list per prediction.
    """
    pred_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # The -100 entries cannot be decoded; substitute the pad token id for them.
    decodable_labels = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    label_texts = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

    decoded_preds = list(map(str.strip, pred_texts))
    decoded_labels = [[reference.strip()] for reference in label_texts]
    return decoded_preds, decoded_labels

In [20]:
from tqdm.auto import tqdm
import torch

# One tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # ---- evaluation pass ----
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"], attention_mask=batch["attention_mask"], max_length=128
            )
        labels = batch["labels"]

        # Pad along the sequence axis so that the tensors can be gathered;
        # labels are padded with -100, predictions with the pad token id.
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save the model and tokenizer of this epoch to output_dir.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

  0%|          | 0/375 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

epoch 0, BLEU score: 39.27


  0%|          | 0/13 [00:00<?, ?it/s]

epoch 1, BLEU score: 43.73


  0%|          | 0/13 [00:00<?, ?it/s]

epoch 2, BLEU score: 43.08


In [21]:
from transformers import pipeline

# Load the model from the directory the training loop saved it to.
model_checkpoint = output_dir

translator = pipeline("translation", model=model_checkpoint)

# Quick check of the fine-tuned model on one English sentence.
translator("Default to expanded threads")



[{'translation_text': 'Par défaut pour les fils élargis'}]

In [22]:
# Second example: a longer sentence containing a "%1" placeholder.
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)

[{'translation_text': "Impossible d'importer %1 en utilisant le plugin d'importateur OFX. Ce fichier n'est pas le bon format."}]