In [1]:
pip install datasets evaluate -q

Note: you may need to restart the kernel to use updated packages.


In [17]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, TFAutoModelForSeq2SeqLM, create_optimizer, AdamWeightDecay, pipeline
from datasets import load_dataset
import tensorflow as tf
from datasets import Dataset
import evaluate
import numpy as np
import torch
import os
import warnings
warnings.filterwarnings('ignore')


In [3]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [80]:
folder_path = r"/kaggle/input/engtoesp/"
dataset_name = "eng1.csv"
path = os.path.join(folder_path, dataset_name)
print(path)
data = Dataset.from_csv(path)
data = data.select(range(15000)) 
data = data.train_test_split(test_size=0.3)
print(data)

/kaggle/input/engtoesp/eng1.csv
DatasetDict({
    train: Dataset({
        features: ['engl', 'spa'],
        num_rows: 10500
    })
    test: Dataset({
        features: ['engl', 'spa'],
        num_rows: 4500
    })
})


In [81]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")  #google/flan-t5-small
model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") #google/flan-t5-small
#model = export_and_get_onnx_model('t5-small')

prefix = "translate: "
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["engl"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(text_target=examples["spa"], max_length=128, truncation=True) #max length was 128
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [82]:
tokenized_data = data.map(preprocess_function, batched=True, remove_columns=["engl", "spa"])

Map:   0%|          | 0/10500 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

In [19]:
pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [83]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")
optimizer = AdamWeightDecay(learning_rate=2e-4, weight_decay_rate=0.01) #2e-5 was before wd was 1e-2, Typically, 1e-4 and 3e-4 work well for most problems

In [84]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_data["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)
tf_test_set = model.prepare_tf_dataset(
    tokenized_data["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [85]:
epochs = 1
model.compile(optimizer=optimizer)

In [86]:
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=epochs, callbacks=None)



<tf_keras.src.callbacks.History at 0x7d414d13ccd0>

In [87]:
# Guarda el modelo entrenado
folder_path = '/kaggle/working/model/'
model_name = "NMT-2024-04-11-epocs-" + str(epochs)
path = os.path.join(folder_path, model_name + ".h5")
model.save_pretrained(path)
del model

In [88]:
#Para inferir desde aquí.
model_name = "NMT-2024-04-11-epocs-" + str(epochs)
path = os.path.join(folder_path, model_name + ".h5")

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = TFAutoModelForSeq2SeqLM.from_pretrained(path, pad_token_id=tokenizer.eos_token_id)

summarizer = pipeline("summarization",
    model=model,
    tokenizer=tokenizer,
    framework="tf")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at /kaggle/working/model/NMT-2024-04-11-epocs-1.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


# Summarize

## Al usar 5000 ejemplos del dataset

In [45]:
import timeit
start_time = timeit.default_timer()

text = "summarize: Google is a technology company"
print(summarizer(text, min_length=8, max_length=12))

elapsed = timeit.default_timer() - start_time
print(elapsed)

Your max_length is set to 12, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


[{'summary_text': 'Google estoy una tecnologia'}]
2.744006808999984


## Al usar 10000 ejemplos del dataset

In [29]:
import timeit
start_time = timeit.default_timer()

text = "summarize: Google is a technology company"
print(summarizer(text, min_length=8, max_length=12))

elapsed = timeit.default_timer() - start_time
print(elapsed)

Your max_length is set to 12, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


[{'summary_text': 'Google está una tecnologia.'}]
3.3463849889999437


## Al usar 14000 ejemplos del dataset

In [55]:
import timeit
start_time = timeit.default_timer()

text = "summarize: Google is a technology company"
print(summarizer(text, min_length=8, max_length=12))

elapsed = timeit.default_timer() - start_time
print(elapsed)

Your max_length is set to 12, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


[{'summary_text': 'Google es una sociedad '}]
2.8068187880001005


# Translate

## Usando 5000 datos

In [65]:
import timeit
start_time = timeit.default_timer()

text = "translate: Google is a technology company"
print(summarizer(text, min_length=8, max_length=20))

elapsed = timeit.default_timer() - start_time
print(elapsed)

Your max_length is set to 20, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


[{'summary_text': 'google is a tech company .'}]
4.7378049489998375


## Usando 10000 datos

In [79]:
import timeit
start_time = timeit.default_timer()

text = "translate: Google is a technology company"
print(summarizer(text, min_length=8, max_length=12))

elapsed = timeit.default_timer() - start_time
print(elapsed)

Your max_length is set to 12, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


[{'summary_text': 'google is a technology company .'}]
2.2709707560002244


## Usando 15000 datos

In [89]:
import timeit
start_time = timeit.default_timer()

text = "translate: Google is a technology company"
print(summarizer(text, min_length=8, max_length=12))

elapsed = timeit.default_timer() - start_time
print(elapsed)

Your max_length is set to 12, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


[{'summary_text': 'google is a tech company .'}]
2.805183269999816


# Resultados

Al usar una mayor cantidad de datos los resultados observados llegan a ser mejores, por lo cuál con un dataset más grande, se podría crear un LLM robusto y util para traducir y resumir frases de más de 12 o 20 caracteres.
En la tarea de traducir no logra el resultado esperado, por lo que tal vez usando más datos pueda llegar a hacerlo de una manera más satisfactoria.