In [1]:
%matplotlib inline
import os
import torch
import random
import evaluate
import transformers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional
from dataclasses import dataclass 
from time import perf_counter
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, disable_progress_bar
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)

2024-01-25 13:49:49.250254: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
@dataclass
class Config:
    cache_dir: str = "./data/english_to_spanish" 
    data_dir: str = os.path.join(cache_dir, "dataset")
    source_lang: str = "en"
    target_lang: str = "sp"    
    
    batch_size: int = 16
    num_workers: int = 4
    seed: int = 42
    max_source_length: int = 128
    max_target_length: int = 128

    lr: float = 0.0005
    weight_decay: float = 0.01
    epochs: int=20 #int = 20
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_checkpoint: str = "google/mt5-small"

    def __post_init__(self):
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

In [3]:
config=Config()

In [4]:
dataset=pd.read_csv(os.path.join(config.data_dir, "data.csv"))

In [5]:
dataset

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
118959,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
118960,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
118961,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
118962,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


In [6]:
dataset=dataset.sample(frac=1, random_state=1)

In [7]:
dataset

Unnamed: 0,english,spanish
26027,We need to chat soon.,Necesitamos charlar pronto.
33060,She married a musician.,Se casó con un músico.
7550,Do it right now.,Hazlo ahoritita.
37768,This is the ticket line.,Esta es la cola para sacar los billetes.
2075,No one came.,Nadie vino.
...,...,...
50057,The world is changing fast.,El mundo está cambiando rápidamente.
98047,Tom did a good job organizing the workers.,Tom hizo un buen trabajo organizando a los tra...
5192,Tom left Mary.,Tom dejó a Mary.
77708,They made me wait for a long time.,Me han hecho esperar mucho.


In [8]:
dataset=dataset.head(10000)

In [9]:
dataset=dataset.reset_index()
dataset=dataset.drop(columns='index')

In [10]:
dataset

Unnamed: 0,english,spanish
0,We need to chat soon.,Necesitamos charlar pronto.
1,She married a musician.,Se casó con un músico.
2,Do it right now.,Hazlo ahoritita.
3,This is the ticket line.,Esta es la cola para sacar los billetes.
4,No one came.,Nadie vino.
...,...,...
9995,Did Tom tell you anything interesting?,¿Tom te dijo algo interesante?
9996,Please keep this a secret.,"Por favor, mantén esto en secreto."
9997,Have you lost your mind?,¿Has perdido el juicio?
9998,He is unable to do it.,Él es incapaz de hacerlo.


In [11]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [12]:
train, validate, test = train_validate_test_split(dataset)

In [13]:
train.to_csv(os.path.join(config.data_dir, "train.csv"), index=False, header=False)
validate.to_csv(os.path.join(config.data_dir, "validate.csv"), index=False, header=False)
test.to_csv(os.path.join(config.data_dir, "test.csv"), index=False, header=False)

In [14]:
data_files = {"train": os.path.join(config.data_dir, "train.csv"),
              "validation":os.path.join(config.data_dir, "validate.csv"), 
              "test": os.path.join(config.data_dir, "test.csv")}
data_files

{'train': './data/english_to_spanish/dataset/train.csv',
 'validation': './data/english_to_spanish/dataset/validate.csv',
 'test': './data/english_to_spanish/dataset/test.csv'}

In [15]:
dataset_dict = load_dataset(
    "csv",
    delimiter=",",
    column_names=[config.source_lang, config.target_lang],
    data_files=data_files
)
dataset_dict

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'sp'],
        num_rows: 6000
    })
    validation: Dataset({
        features: ['en', 'sp'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['en', 'sp'],
        num_rows: 2000
    })
})

In [16]:
sample = dataset_dict["train"][0]
sample

{'en': 'He got all he wanted.', 'sp': 'Lograba todo lo que quería.'}

In [17]:
!pip install rouge_score sacrebleu

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.4.0-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Collecting lxml (from sacrebleu)
  Downloading lxml-5.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Downloading sacrebleu-2.4.0-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lxml-5.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading portalocker-2

In [18]:
rouge_score = evaluate.load("rouge", cache_dir=config.cache_dir)
bleu_score = evaluate.load("bleu", cache_dir=config.cache_dir)
sacrebleu_score = evaluate.load("sacrebleu", cache_dir=config.cache_dir)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [19]:
tokenizer = AutoTokenizer.from_pretrained(config.model_checkpoint, cache_dir=config.cache_dir)

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [20]:
model_name = config.model_checkpoint.split("/")[-1]

fine_tuned_model_checkpoint = os.path.join(
    config.cache_dir,
    f"{model_name}_{config.source_lang}-{config.target_lang}",
    "checkpoint-4500"
)

In [21]:
if os.path.isdir(fine_tuned_model_checkpoint):
    do_train = False
    model = AutoModelForSeq2SeqLM.from_pretrained(fine_tuned_model_checkpoint, cache_dir=config.cache_dir)
else:
    do_train = True
    model = AutoModelForSeq2SeqLM.from_pretrained(config.model_checkpoint, cache_dir=config.cache_dir)

print("number of parameters:", model.num_parameters())
#print("number of trainable parameters:", model.print_trainable_parameters())

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

number of parameters: 300176768


In [22]:
def batch_tokenize_fn(examples):
    """
    Generate the input_ids and labels field for huggingface dataset/dataset dict.

    Truncation is enabled where we cap the sentence to the max length. Padding will be done later
    in a data collator, so we pad examples to the longest length within a mini-batch and not
    the whole dataset.
    """
    sources = examples[config.source_lang]
    targets = examples[config.target_lang]
    model_inputs = tokenizer(sources, max_length=config.max_source_length, truncation=True)

    # setup the tokenizer for targets,
    # huggingface expects the target tokenized ids to be stored in the labels field
    # note, newer version of tokenizer supports a text_target argument, where we can create
    # source and target sentences in one go
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=config.max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [23]:
dataset_dict_tokenized = dataset_dict.map(
    batch_tokenize_fn,
    batched=True,
    remove_columns=dataset_dict["train"].column_names
)
dataset_dict_tokenized

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [24]:
model_name = config.model_checkpoint.split("/")[-1]
output_dir = os.path.join(config.cache_dir, f"{model_name}_{config.source_lang}-{config.target_lang}")

In [25]:
args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="steps",
    learning_rate=config.lr,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    weight_decay=config.weight_decay,
    save_total_limit=2,
    num_train_epochs=config.epochs,
    predict_with_generate=True,
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="rougeL",
    gradient_accumulation_steps=8,
    do_train=do_train,
    #report_to="wandb",  # enable logging to W&B
    # careful when attempting to train t5 models on fp16 mixed precision,
    # the model was trained on bfloat16 mixed precision, and mixing different mixed precision
    # type might result in nan loss
    # https://discuss.huggingface.co/t/mixed-precision-for-bfloat16-pretrained-models/5315
    fp16=False
)

In [26]:
def compute_metrics(eval_pred):
    """
    Compute rouge and bleu metrics for seq2seq model generated prediction.
    
    tip: we can run trainer.predict on our eval/test dataset to see what a sample
    eval_pred object would look like when implementing custom compute metrics function
    """
    predictions, labels = eval_pred
    # Decode generated summaries, which is in ids into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode labels, a.k.a. reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        rouge_types=["rouge1", "rouge2", "rougeL"]
    )
    score = sacrebleu_score.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    result["sacrebleu"] = score["score"]
    return {k: round(v, 4) for k, v in result.items()}

In [27]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=dataset_dict_tokenized["train"],
    eval_dataset=dataset_dict_tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()]
)

In [28]:
# should take around 4117.78 seconds on a single V100 GPU
if trainer.args.do_train:
    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "TRUE"
    t1_start = perf_counter()
    train_output = trainer.train()
    t1_stop = perf_counter()
    print("Training elapsed time:", t1_stop - t1_start)

    # saving the model which allows us to leverage
    # .from_pretrained(model_path)
    trainer.save_model(fine_tuned_model_checkpoint)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/sagemaker-user/.netrc


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Sacrebleu
500,2.8928,1.394342,0.5562,0.3475,0.544,28.3053


Training elapsed time: 13814.898851252


In [29]:
def generate_translation(model, tokenizer, example):
    """print out the source, target and predicted raw text."""
    source = example[config.source_lang]
    print(source)
    target = example[config.target_lang]
    print(target)
    input_ids = tokenizer(source)["input_ids"]
    input_ids = torch.LongTensor(input_ids).view(1, -1).to(model.device)
    print(input_ids)
    #input_ids=input_ids[0]
    generated_ids = model.generate(inputs=input_ids, max_length=20)
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print('source: ', source)
    print('target: ', target)
    print('prediction: ', prediction)

In [30]:
example = dataset_dict['validation'][0]
example

{'en': 'Compare this copy with the original.',
 'sp': 'Compare usted esta copia con el original.'}

In [31]:
generate_translation(model, tokenizer, example)

Compare this copy with the original.
Compare usted esta copia con el original.
tensor([[73343,   714, 27613,   514,   287,  4703,   260,     1]])
source:  Compare this copy with the original.
target:  Compare usted esta copia con el original.
prediction:  Comparte este copia con el original.


In [32]:
def generate_translation2(model, tokenizer, example):
    """print out the source, target and predicted raw text."""
    source = example
    print(source)
    #target = example[config.target_lang]
    #print(target)
    input_ids = tokenizer(source)["input_ids"]
    input_ids = torch.LongTensor(input_ids).view(1, -1).to(model.device)
    print(input_ids)
    #input_ids=input_ids[0]
    generated_ids = model.generate(inputs=input_ids, max_length=20)
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print('source: ', source)
    #print('target: ', target)
    print('prediction: ', prediction)

In [33]:
example2='I enjoy my days with you'

In [34]:
generate_translation2(model, tokenizer, example2)

I enjoy my days with you
tensor([[ 336, 9070, 1037, 5382,  514,  521,    1]])
source:  I enjoy my days with you
prediction:  Adorcé mi días contigo.
