<a href="https://colab.research.google.com/github/Kira1108/huggingface-examples/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile requirements.txt
transformers
datasets 
sentencepiece
evaluate
sacrebleu
bert-score
git+https://github.com/Kira1108/codefaster.git

Writing requirements.txt


In [2]:
from IPython.display import clear_output
!pip install -r requirements.txt
clear_output()

In [3]:
from codefaster import view, what_container

**Load Dataset**

In [4]:
from datasets import load_dataset

# Download the English/French pairs of the KDE4 corpus.
data = load_dataset("kde4", lang1="en", lang2="fr")

# Keep the demo fast: shuffle deterministically, then keep the first 1000 pairs.
shuffled_train = data["train"].shuffle(seed=42)
small = shuffled_train.select(range(1000))

# Seeded train/test split of the small subset (library default proportions).
datasets = small.train_test_split(seed=42)

# Hide the download progress bars.
clear_output()

**Tokenizer** - the source and target languages are tokenized differently; the checkpoint's tokenizer handles both (target texts are passed via `text_target`)

In [5]:
from transformers import AutoTokenizer

# Hub id of the pretrained checkpoint; reused below to load the model.
checkpoint = "Helsinki-NLP/opus-mt-en-fr"

# Tokenizer matching the checkpoint; used for both inputs and targets.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Hide the download progress bars.
clear_output()

**Tokenize function**

In [6]:
from dataclasses import dataclass

@dataclass
class Seq2SeqTokenizeFn:
    """Batched tokenization callable for a translation dataset.

    Designed to be passed to ``Dataset.map(..., batched=True)``. Each call
    receives a batch whose ``'translation'`` column is a list of dicts keyed by
    language code, tokenizes the source texts, and attaches the token ids of
    the target texts under ``'labels'``.

    Attributes:
        max_length_input: Truncation length for source sequences.
        max_legnth_target: Truncation length for target sequences. The
            misspelled name is kept on purpose so existing callers that pass it
            by keyword keep working.
        source_lang: Key of the source text inside each translation dict.
        target_lang: Key of the target text inside each translation dict.
        tokenizer: Tokenizer to call. When ``None`` (the default) the
            module-level ``tokenizer`` is looked up at call time, which is the
            behaviour this class had before the field existed.
    """

    max_length_input:int = 128
    max_legnth_target:int = 128
    source_lang:str = 'en'
    target_lang:str = 'fr'
    tokenizer:object = None

    def tokenize_fn(self, batch):
        """Tokenize one batch and return the encoded inputs plus ``'labels'``.

        Args:
            batch: Mapping with a ``'translation'`` entry holding a list of
                dicts that contain ``source_lang`` and ``target_lang`` keys.

        Returns:
            The tokenizer output for the source texts, with an extra
            ``'labels'`` entry holding the ``'input_ids'`` of the target texts.
        """
        # Inside a method the bare name `tokenizer` resolves to the module
        # global, not to the dataclass field, so this is a genuine fallback.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer

        pairs = batch['translation']
        input_texts = [pair[self.source_lang] for pair in pairs]
        target_texts = [pair[self.target_lang] for pair in pairs]

        tokenized_inputs = tok(input_texts, truncation = True, max_length = self.max_length_input)
        # `text_target` asks the tokenizer to encode the texts as targets.
        tokenized_targets = tok(text_target = target_texts, truncation = True, max_length = self.max_legnth_target)

        tokenized_inputs['labels'] = tokenized_targets['input_ids']

        return tokenized_inputs

    def __call__(self, batch):
        """Make the instance directly usable as the ``map`` function."""
        return self.tokenize_fn(batch)


In [7]:
# Truncate both source and target sequences to at most 128 tokens.
tokenize_fn = Seq2SeqTokenizeFn(128,128)

**Tokenized datasets**

In [8]:
# Tokenize both splits in batches. The original columns are dropped so that
# only the tokenizer outputs and 'labels' remain in the mapped datasets.
tokenized_datasets = datasets.map(
    tokenize_fn, 
    batched = True, 
    remove_columns = datasets['train'].column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

**Model**

In [9]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq weights for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

**Collator**

In [10]:
from transformers import DataCollatorForSeq2Seq

# Collator that turns a list of tokenized examples into one padded batch.
# NOTE(review): label padding presumably uses -100, which is why the metric
# code later swaps -100 back to the pad token before decoding — confirm.
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = model)

# Quick manual check of the collator (left disabled):
# tokenized_samples = [tokenized_datasets['train'][i] for i in range(3)]
# data_collator(tokenized_samples)

**Metric**

In [11]:
from datasets import load_metric

In [12]:
# NOTE(review): `datasets.load_metric` is deprecated in favour of
# `evaluate.load` (the `evaluate` package is already listed in
# requirements.txt) — consider migrating.
bleu_metric = load_metric("sacrebleu")

bert_metric = load_metric("bertscore")


# Demo calls. Only the value of the last expression in the cell is displayed,
# so the results of the first three compute() calls are discarded.
# One prediction scored against a single reference.
bleu_metric.compute(predictions = ['I like this movie so much, it is good'],
            references = [["I love this movie so much, it is good"]])

# One prediction scored against several references.
bert_metric.compute(predictions = ['I like this movie'],
            references = [["I love this movie",
                           "I enjoy this movie",
                           "I think this movie is great",
                           "I hate this movie very much"]], lang = 'en')

s = "I like cats"

# BLEU: a single corpus-level score for the whole batch of predictions.
bleu_metric.compute(predictions = [s,s],
            references = [[s,s],[s,s]])

# BERTScore: one precision/recall/f1 value per sample; the number of
# references per sample does not have to be the same.
bert_metric.compute(predictions = [s,s],
            references = [[s,s,s],[s,s]], lang = 'en')

  bleu_metric = load_metric("sacrebleu")


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

{'precision': [0.9999999403953552, 0.9999999403953552],
 'recall': [0.9999999403953552, 0.9999999403953552],
 'f1': [0.9999999403953552, 0.9999999403953552],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.26.0)'}

In [13]:
# Same BERTScore call as the last statement of the previous cell, re-run on
# its own; the scores come back as per-sample lists.
bert_metric.compute(predictions = [s,s],
            references = [[s,s,s],[s,s]], lang = 'en')

{'precision': [0.9999999403953552, 0.9999999403953552],
 'recall': [0.9999999403953552, 0.9999999403953552],
 'f1': [0.9999999403953552, 0.9999999403953552],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.26.0)'}

In [14]:
# Sanity check: take the first three tokenized training examples ...
batch = tokenized_datasets['train'][:3]


# ... and decode their label ids back to text, dropping special tokens.
tokenizer.batch_decode(batch['labels'],skip_special_tokens = True)

["& Refermer l' arborescence",
 'KDE',
 "Erreur de perte de données & #160;: Si cela se reproduit, contactez l'auteur."]

In [15]:
# Run the collator on three individual examples to build one batch.
# Note: this rebinds `batch`, replacing the slice taken in the previous cell.
tokenized_samples = [tokenized_datasets['train'][i] for i in range(3)]
batch = data_collator(tokenized_samples)

In [23]:
import numpy as np
from typing import Any

@dataclass
class Seq2SeqMetric:
    """``compute_metrics`` callable reporting BLEU and BERTScore for a seq2seq trainer.

    Attributes:
        tokenizer: Tokenizer used to decode prediction and label ids; must
            expose ``pad_token_id`` and ``batch_decode``.
        target_lang: Language code forwarded to BERTScore as ``lang``.
        bleu_metric: Object with a sacrebleu-style ``compute(predictions=...,
            references=...)``. When ``None`` the module-level ``bleu_metric``
            is used, which is what this class did before the field existed.
        bert_metric: Object with a bertscore-style ``compute(predictions=...,
            references=..., lang=...)``. When ``None`` the module-level
            ``bert_metric`` is used.
    """
    tokenizer:Any
    target_lang:str
    bleu_metric:Any = None
    bert_metric:Any = None

    def compute_metrics(self, preds_and_labels):
        """Decode predictions/labels and score them.

        Args:
            preds_and_labels: A pair ``(predictions, labels)`` of token-id
                arrays. ``predictions`` may also be a tuple whose first item
                holds the generated ids.

        Returns:
            Dict with the corpus BLEU score under ``"blue-score"`` and the mean
            BERTScore f1 / precision / recall over the samples.
        """
        preds, labels = preds_and_labels

        # Some models hand back a tuple of outputs; the generated ids come first.
        if isinstance(preds, tuple):
            preds = preds[0]

        # -100 is a padding marker, not a vocabulary id, so it cannot be
        # decoded. Swap it for the pad token in labels AND predictions
        # (predictions can carry it when batches are padded to a common length).
        pad_id = self.tokenizer.pad_token_id
        preds = np.asarray(preds)
        labels = np.asarray(labels)
        pred_ids = np.where(preds != -100, preds, pad_id)
        label_ids = np.where(labels != -100, labels, pad_id)

        decoded_preds = self.tokenizer.batch_decode(pred_ids, skip_special_tokens = True)
        decoded_labels = self.tokenizer.batch_decode(label_ids, skip_special_tokens = True)

        predictions = [p.strip() for p in decoded_preds]
        # Both metrics take a list of references per prediction.
        references = [[l.strip()] for l in decoded_labels]

        # Inside a method these bare names resolve to the module globals, not
        # to the dataclass fields, so this is a genuine fallback.
        bleu = self.bleu_metric if self.bleu_metric is not None else bleu_metric
        bert = self.bert_metric if self.bert_metric is not None else bert_metric

        bleu_score = bleu.compute(predictions = predictions,
                references = references)

        bert_score = bert.compute(predictions = predictions,
                references = references, lang = self.target_lang)

        # The "blue-score" key (sic) is kept so reported metric names stay the same.
        return {
            "blue-score":bleu_score['score'],
            "bert-f1":np.mean(bert_score['f1']),
            'bert-precision':np.mean(bert_score['precision']),
            "bert-recall":np.mean(bert_score['recall'])
        }

    def __call__(self, preds_and_labels):
        """Make the instance directly usable as ``compute_metrics``."""
        return self.compute_metrics(preds_and_labels)

In [24]:
# Metric callable for the trainer; 'fr' is passed to BERTScore as the language.
compute_metrics = Seq2SeqMetric(tokenizer, 'fr')

In [25]:
from transformers import Seq2SeqTrainingArguments

# Training configuration. Checkpoints go under "finetuned-model/" once per
# epoch. No evaluation is scheduled during training; it is run by hand with
# trainer.evaluate() before and after trainer.train().
# NOTE(review): fp16 = True assumes a CUDA GPU is available — confirm before
# running on CPU-only machines.
training_args = Seq2SeqTrainingArguments(
    "finetuned-model",
    evaluation_strategy = "no",
    save_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 64,
    weight_decay = 0.001,
    save_total_limit = 3,
    num_train_epochs = 3,
    predict_with_generate = True,
    fp16 = True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [26]:
from transformers import Seq2SeqTrainer

# Wire model, arguments, tokenized splits, collator and metric callable
# together. The 'test' split produced by train_test_split is the eval set.
trainer = Seq2SeqTrainer(
    model, 
    training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['test'],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

Using cuda_amp half precision backend


In [27]:
# Baseline: score the pretrained model on the eval split before fine-tuning.
trainer.evaluate(max_length = 128)

***** Running Evaluation *****
  Num examples = 250
  Batch size = 64
Generate config GenerationConfig {
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59513,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59513,
  "transformers_version": "4.26.0"
}



Generate config GenerationConfig {
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59513,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59513,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59513,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59513,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59513,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59513,
  "transformers_version": "4.26.0"
}



Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size":

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "l

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/714M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/pytorch_model.bin
All the weights of BertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


{'eval_loss': 1.6042568683624268,
 'eval_blue-score': 38.89561045764889,
 'eval_bert-f1': 0.8636148879528046,
 'eval_bert-precision': 0.8752532267570495,
 'eval_bert-recall': 0.8536002657413483,
 'eval_runtime': 24.9345,
 'eval_samples_per_second': 10.026,
 'eval_steps_per_second': 0.16}

In [28]:
# Fine-tune for the configured 3 epochs; a checkpoint is saved after each one.
trainer.train()

***** Running training *****
  Num examples = 750
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 72
  Number of trainable parameters = 74609664


Step,Training Loss


Saving model checkpoint to finetuned-model/checkpoint-24
Configuration saved in finetuned-model/checkpoint-24/config.json
Configuration saved in finetuned-model/checkpoint-24/generation_config.json
Model weights saved in finetuned-model/checkpoint-24/pytorch_model.bin
tokenizer config file saved in finetuned-model/checkpoint-24/tokenizer_config.json
Special tokens file saved in finetuned-model/checkpoint-24/special_tokens_map.json
Saving model checkpoint to finetuned-model/checkpoint-48
Configuration saved in finetuned-model/checkpoint-48/config.json
Configuration saved in finetuned-model/checkpoint-48/generation_config.json
Model weights saved in finetuned-model/checkpoint-48/pytorch_model.bin
tokenizer config file saved in finetuned-model/checkpoint-48/tokenizer_config.json
Special tokens file saved in finetuned-model/checkpoint-48/special_tokens_map.json
Saving model checkpoint to finetuned-model/checkpoint-72
Configuration saved in finetuned-model/checkpoint-72/config.json
Configur

TrainOutput(global_step=72, training_loss=1.5227012634277344, metrics={'train_runtime': 12.399, 'train_samples_per_second': 181.467, 'train_steps_per_second': 5.807, 'total_flos': 50952950120448.0, 'train_loss': 1.5227012634277344, 'epoch': 3.0})

In [29]:
# Re-run the same evaluation after fine-tuning to compare with the baseline.
trainer.evaluate(max_length = 128)

***** Running Evaluation *****
  Num examples = 250
  Batch size = 64
Generate config GenerationConfig {
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59513,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59513,
  "transformers_version": "4.26.0"
}



Generate config GenerationConfig {
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59513,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59513,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59513,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59513,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59513,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59513,
  "transformers_version": "4.26.0"
}



{'eval_loss': 1.3941287994384766,
 'eval_blue-score': 43.58199803173762,
 'eval_bert-f1': 0.8786511988639831,
 'eval_bert-precision': 0.8877503349781036,
 'eval_bert-recall': 0.8710091326236725,
 'eval_runtime': 9.2245,
 'eval_samples_per_second': 27.102,
 'eval_steps_per_second': 0.434,
 'epoch': 3.0}

In [30]:
# Persist the fine-tuned model (config, weights, tokenizer files) to a directory.
trainer.save_model("my_saved_model")

Saving model checkpoint to my_saved_model
Configuration saved in my_saved_model/config.json
Configuration saved in my_saved_model/generation_config.json
Model weights saved in my_saved_model/pytorch_model.bin
tokenizer config file saved in my_saved_model/tokenizer_config.json
Special tokens file saved in my_saved_model/special_tokens_map.json


In [31]:
from transformers import pipeline

# Reload the saved directory as a translation pipeline.
# NOTE(review): device = 0 presumably selects the first GPU — confirm a GPU is present.
translator = pipeline("translation", model = 'my_saved_model', device = 0)

loading configuration file my_saved_model/config.json
Model config MarianConfig {
  "_name_or_path": "my_saved_model",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "decoder_vocab_size": 59514,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
 

In [32]:
# Smoke test: translate one English sentence with the fine-tuned model.
translator("I hope this could be helpful.")

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59513,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59513,
  "transformers_version": "4.26.0"
}



[{'translation_text': "J'espère que ça pourrait être utile."}]