In [1]:
import os
import torch
import random
import evaluate
import transformers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional
from dataclasses import dataclass 
from time import perf_counter
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, disable_progress_bar
from transformers import (AutoConfig,AutoTokenizer,AutoModelForSeq2SeqLM,DataCollatorForSeq2Seq,Seq2SeqTrainingArguments,Seq2SeqTrainer,EarlyStoppingCallback)

  from .autonotebook import tqdm as notebook_tqdm
2024-03-31 12:04:46.306118: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 12:04:46.306169: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 12:04:46.307659: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
@dataclass
class Config:
    cache_dir: str = "./2c" 
    data_dir: str = os.path.join(cache_dir, "wmt16")
    source_lang: str = "de"
    target_lang: str = "en"    
    
    batch_size: int = 16
    num_workers: int = 4
    seed: int = 42
    max_source_length: int = 128
    max_target_length: int = 128

    lr: float = 0.0005
    weight_decay: float = 0.01
    epochs: int = 3
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_checkpoint: str = "google-t5/t5-small"

    def __post_init__(self):
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

In [3]:
config = Config()

Making the datasets

In [4]:
trainl = []
with open("train.de","r") as f:
    with open("train.en","r") as g:
        de = f.readlines()
        en = g.readlines()
        for i in range(len(de)):
            d = {}
            d["de"] = de[i].strip()
            d["en"] = en[i].strip()
            trainl.append(d)


In [5]:
vall = []
with open("val.de","r") as f:
    with open("val.en","r") as g:
        de = f.readlines()
        en = g.readlines()
        for i in range(len(de)):
            d = {}
            d["de"] = de[i].strip()
            d["en"] = en[i].strip()
            vall.append(d)

In [6]:
testl = []
with open("test.de","r") as f:
    with open("test.en","r") as g:
        de = f.readlines()
        en = g.readlines()
        for i in range(len(de)):
            d = {}
            d["de"] = de[i].strip()
            d["en"] = en[i].strip()
            testl.append(d)

In [7]:
from datasets import Dataset as DDDD

In [8]:
# trainl = trainl[:1000]
# testl = testl[:1000]
# vall = vall[:1000]

In [9]:
traindataset = DDDD.from_list(trainl)
testdataset = DDDD.from_list(testl)
valdataset = DDDD.from_list(vall)

In [10]:
from datasets import DatasetDict

In [11]:
dataset_dict = DatasetDict({
    "train": traindataset,
    "val": valdataset,
    "test": testdataset
})

In [12]:
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['de', 'en'],
        num_rows: 50000
    })
    val: Dataset({
        features: ['de', 'en'],
        num_rows: 2169
    })
    test: Dataset({
        features: ['de', 'en'],
        num_rows: 2999
    })
})


In [13]:
dataset_dict["train"][0]

{'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}

In [14]:
rouge_score = evaluate.load("rouge", cache_dir=config.cache_dir)
bleu_score = evaluate.load("bleu", cache_dir=config.cache_dir)
sacrebleu_score = evaluate.load("sacrebleu", cache_dir=config.cache_dir)

In [15]:
tokenizer = AutoTokenizer.from_pretrained(config.model_checkpoint, cache_dir=config.cache_dir)

model_name = config.model_checkpoint.split("/")[-1]
fine_tuned_model_checkpoint = os.path.join(
    config.cache_dir,
    f"{model_name}_{config.source_lang}-{config.target_lang}",
    "checkpoint-4500"
)
if os.path.isdir(fine_tuned_model_checkpoint):
    do_train = False
    model = AutoModelForSeq2SeqLM.from_pretrained(fine_tuned_model_checkpoint, cache_dir=config.cache_dir)
else:
    do_train = True
    model = AutoModelForSeq2SeqLM.from_pretrained(config.model_checkpoint, cache_dir=config.cache_dir)

print("number of parameters:", model.num_parameters())

tokenizer_config.json: 100%|██████████| 2.32k/2.32k [00:00<00:00, 243kB/s]
spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 939kB/s]
tokenizer.json: 100%|██████████| 1.39M/1.39M [00:01<00:00, 1.32MB/s]
config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 161kB/s]
model.safetensors: 100%|██████████| 242M/242M [00:02<00:00, 111MB/s]  
generation_config.json: 100%|██████████| 147/147 [00:00<00:00, 19.3kB/s]

number of parameters: 60506624





In [16]:
def batch_tokenize_fn(examples):
    """
    Generate the input_ids and labels field for huggingface dataset/dataset dict.

    Truncation is enabled where we cap the sentence to the max length. Padding will be done later
    in a data collator, so we pad examples to the longest length within a mini-batch and not
    the whole dataset.
    """
    sources = examples[config.source_lang]
    targets = examples[config.target_lang]
    model_inputs = tokenizer(sources, max_length=config.max_source_length, truncation=True)

    # setup the tokenizer for targets,
    # huggingface expects the target tokenized ids to be stored in the labels field
    # note, newer version of tokenizer supports a text_target argument, where we can create
    # source and target sentences in one go
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=config.max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [17]:
dataset_dict_tokenized = dataset_dict.map(
    batch_tokenize_fn,
    batched=True,
    remove_columns=dataset_dict["train"].column_names
)
dataset_dict_tokenized

                                                                    

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 50000
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2169
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2999
    })
})

In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

features = [dataset_dict_tokenized["train"][i] for i in range(2)]
output = data_collator(features)
output

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[15158, 24860,    74, 11216,   425,     7,  4267,    32,   221,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [ 1674,     3,    49, 20635,    15,    67,   183, 17874,     6,   340,
         11030, 17900,  1199,  5702,  1559,    15, 11216,   425,     7,  4267,
            32,   221,    93,     3, 30604,    29, 13636,     7,   218,  1403,
          3019,  7026,     6,     3, 25084,  2587, 18794,     7,  3532,  7756,
            15,   674,  9242, 11621,    64,     3, 11950,    15,     6,     3,
            26,  7118,   292, 11878, 16849,  8827,     5,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,

In [19]:
model_name = config.model_checkpoint.split("/")[-1]
output_dir = os.path.join(config.cache_dir, f"{model_name}_{config.source_lang}-{config.target_lang}")

args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="steps",
    #save_strategy="epoch",
    learning_rate=config.lr,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    weight_decay=config.weight_decay,
    save_total_limit=2,
    num_train_epochs=config.epochs,
    predict_with_generate=True,
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="rougeL",
    gradient_accumulation_steps=8,
    do_train=do_train,
    report_to = "none",
    logging_strategy="steps",
    logging_steps = 1,
    # careful when attempting to train t5 models on fp16 mixed precision,
    # the model was trained on bfloat16 mixed precision, and mixing different mixed precision
    # type might result in nan loss
    # https://discuss.huggingface.co/t/mixed-precision-for-bfloat16-pretrained-models/5315
    fp16=False
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
def compute_metrics(eval_pred):
    """
    Compute rouge and bleu metrics for seq2seq model generated prediction.
    tip: we can run trainer.predict on our eval/test dataset to see what a sample
    eval_pred object would look like when implementing custom compute metrics function
    """
    predictions, labels = eval_pred
    # Decode generated summaries, which is in ids into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode labels, a.k.a. reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        rouge_types=["rouge1", "rouge2", "rougeL"]
    )
    score = sacrebleu_score.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    result["sacrebleu"] = score["score"]
    return {k: round(v, 4) for k, v in result.items()}


In [21]:
from transformers import TrainerCallback

In [22]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=dataset_dict_tokenized["train"],
    eval_dataset=dataset_dict_tokenized["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()]
)

In [23]:
if trainer.args.do_train:
    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "TRUE"
    t1_start = perf_counter()
    train_output = trainer.train()
    t1_stop = perf_counter()
    print("Training elapsed time:", t1_stop - t1_start)

    # saving the model which allows us to leverage
    # .from_pretrained(model_path)
    trainer.save_model(fine_tuned_model_checkpoint)





Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Sacrebleu
1,3.7324,2.732646,0.0873,0.0261,0.083,1.3178
2,3.348,2.597449,0.0837,0.0247,0.0799,1.2673
3,3.0771,2.513496,0.1004,0.0325,0.0943,1.5881
4,3.0537,2.442006,0.124,0.0423,0.1131,1.9555
5,2.9986,2.401659,0.1693,0.0597,0.1513,2.6994
6,2.8452,2.371405,0.222,0.0828,0.197,3.8244
7,2.7669,2.348754,0.2606,0.0989,0.2328,4.6697
8,2.7302,2.332909,0.2928,0.1102,0.2612,5.3461
9,2.7656,2.321006,0.3184,0.1212,0.284,5.9864
10,2.6938,2.309531,0.334,0.1275,0.299,6.276




Training elapsed time: 6545.4099350939505


In [24]:
trainer.evaluate()

{'eval_loss': 1.9923362731933594,
 'eval_rouge1': 0.4169,
 'eval_rouge2': 0.1919,
 'eval_rougeL': 0.379,
 'eval_sacrebleu': 10.0464,
 'eval_runtime': 11.8494,
 'eval_samples_per_second': 183.048,
 'eval_steps_per_second': 5.739,
 'epoch': 2.56}

In [25]:
l = trainer.state.log_history
print(l)

[{'loss': 3.7324, 'learning_rate': 0.0004991452991452991, 'epoch': 0.01, 'step': 1}, {'eval_loss': 2.7326464653015137, 'eval_rouge1': 0.0873, 'eval_rouge2': 0.0261, 'eval_rougeL': 0.083, 'eval_sacrebleu': 1.3178, 'eval_runtime': 11.8766, 'eval_samples_per_second': 182.629, 'eval_steps_per_second': 5.726, 'epoch': 0.01, 'step': 1}, {'loss': 3.348, 'learning_rate': 0.0004982905982905984, 'epoch': 0.01, 'step': 2}, {'eval_loss': 2.5974490642547607, 'eval_rouge1': 0.0837, 'eval_rouge2': 0.0247, 'eval_rougeL': 0.0799, 'eval_sacrebleu': 1.2673, 'eval_runtime': 12.0441, 'eval_samples_per_second': 180.088, 'eval_steps_per_second': 5.646, 'epoch': 0.01, 'step': 2}, {'loss': 3.0771, 'learning_rate': 0.0004974358974358975, 'epoch': 0.02, 'step': 3}, {'eval_loss': 2.513495922088623, 'eval_rouge1': 0.1004, 'eval_rouge2': 0.0325, 'eval_rougeL': 0.0943, 'eval_sacrebleu': 1.5881, 'eval_runtime': 11.9318, 'eval_samples_per_second': 181.783, 'eval_steps_per_second': 5.699, 'epoch': 0.02, 'step': 3}, {'l

In [26]:
import pickle

In [27]:
with open('log_history.pkl', 'wb') as f:
    pickle.dump(l, f)

In [36]:
from transformers import T5ForConditionalGeneration

In [37]:
model = T5ForConditionalGeneration.from_pretrained(fine_tuned_model_checkpoint)

In [38]:
def generate_translation(model, tokenizer, example):
    """print out the source, target and predicted raw text."""
    source = example[config.source_lang]
    target = example[config.target_lang]
    input_ids = tokenizer(source)["input_ids"]
    input_ids = torch.LongTensor(input_ids).view(1, -1).to(model.device)
    generated_ids = model.generate(input_ids, max_new_tokens=20)
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print('source: ', source)
    print('target: ', target)
    print('prediction: ', prediction)

In [39]:
example = dataset_dict['train'][1]
generate_translation(model, tokenizer, example)

source:  Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.
target:  I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
prediction:  I re-examined the session of the European Parliament on Friday 17 December, and I
