## Instalamos las dependencias necesarias

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install torch
# Para usar GPU precisiones
!pip install accelerate

Collecting transformers
  Downloading transformers-4.35.1-py3-none-any.whl (7.9 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/7.9 MB[0m [31m4.4 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/7.9 MB[0m [31m27.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.9/7.9 MB[0m [31m79.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transf

## Carga del modelo Flan T5 - Large

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch


tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto", torch_dtype=torch.float16)

input_text = 'Create a slogan: The slogan should contain exciting information about energy drinks'
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
n = 100
temp = 0.8
# Cantidad de tokens con mayor probabilidad que se considerarán para cada paso de generación
top = 50
sequences = 3

outputs = model.generate(input_ids, max_length=n)
outputs = model.generate(input_ids, max_length=n, do_sample=True)
outputs = model.generate(input_ids, max_length=n, do_sample=True, temperature=temp)
outputs = model.generate(input_ids, max_length=n, temperature=temp, do_sample=True, num_return_sequences=sequences)
outputs = model.generate(input_ids, max_length=n, temperature=temp, do_sample=True, num_return_sequences=sequences, top_k=top)

print(tokenizer.decode(outputs[0]))

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<pad> If you want to live longer, be healthy and be happy</s><pad>


## Carga de palabras al vocabulario

Verificamos qué palabras utilizadas en el prompt no están dentro del vocabulario del modelo pre entrenado

In [None]:
# Obtener el vocabulario del tokenizador
vocab = tokenizer.get_vocab()

# Verificar si todas las palabras están en el vocabulario
missing_words = []

words_to_check = input_text.split()
for word in words_to_check:
    if word not in vocab:
      missing_words.append(word)
      print(f"{word} no está en el vocabulario.")

Create no está en el vocabulario.
slogan: no está en el vocabulario.
slogan no está en el vocabulario.
contain no está en el vocabulario.
exciting no está en el vocabulario.
drinks no está en el vocabulario.


Agregamos las palabras desconocidas para que el modelo las tranforme en representaciones vectoriales para su procesamiento. **OJO** aún se requiere de ajuste fino para que el modelo sea capaz de generar los word embeddings correctamente.

In [None]:
## Tokenizador personalizado

# Nuevas palabras a agregar al vocabulario

# Agregamos al vocabulario
tokenizer.add_tokens(missing_words)
model.resize_token_embeddings(len(tokenizer))


input_text = 'Create a slogan: The slogan should contain exciting information about energy drinks'
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
n = 100
temp = 0.8
# Cantidad de tokens con mayor probabilidad que se considerarán para cada paso de generación
top = 50
sequences = 3

outputs = model.generate(input_ids, max_length=n)
outputs = model.generate(input_ids, max_length=n, do_sample=True)
outputs = model.generate(input_ids, max_length=n, do_sample=True, temperature=temp)
outputs = model.generate(input_ids, max_length=n, temperature=temp, do_sample=True, num_return_sequences=sequences)
outputs = model.generate(input_ids, max_length=n, temperature=temp, do_sample=True, num_return_sequences=sequences, top_k=top)

print(tokenizer.decode(outputs[0]))


<pad> give out</s><pad><pad><pad><pad><pad>


## Ajuste fino del modelo

A partir de un dataset personalizado, hacemos el ajuste fino del modelo con ejemplos específicos a la tarea que queremos que realice. El modelo contiene alrededor de 500 ejemplos.

In [3]:
# Conectamos el entorno de Colab
from google.colab import drive
drive.mount('/content/drive')

# Ruta del archivo en Google Drive
dataset_path = '/content/drive/My Drive/Profesional/7mo Semestre - Profesional/Bloques 2 y 3/Reto - Generative IA/T5/Dataset_Coca_Cola_Flan_T5.json'

import json
# Cargamos el conjunto de datos. Especificamos un codificador de bites UTF-8 para que Python lea el archivo JSON
with open(dataset_path, 'r', encoding='utf-8') as file:
    dataset_Coca = json.load(file)

print(f'Estamos utilizando un dataset para el ajuste fino de T5 con {len(dataset_Coca)} ejemplos')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Estamos utilizando un dataset para el ajuste fino de T5 con 453 ejemplos


In [17]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch
from torch.nn.utils.rnn import pad_sequence

# Divide el conjunto de datos en entrenamiento y validación
train_data, val_data = train_test_split(dataset_Coca, test_size=0.2, random_state=42)

# Define tu conjunto de datos (asegúrate de tener un formato similar al que usaste para preentrenar)

class CustomDataset(Dataset):
    def __init__(self, examples, tokenizer):
        self.examples = examples
        self.tokenizer = tokenizer

        # Calcula la longitud máxima dinámicamente
        self.max_sequence_length = max(len(self.tokenizer.encode(example['prompt'])) for example in examples)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        prompt_tokens = self.tokenizer.encode(example['prompt'], truncation=True)
        target_tokens = self.tokenizer.encode(example['target'], truncation=True)

        # Convertir a tensores de PyTorch
        inputs = torch.tensor(prompt_tokens, dtype=torch.long)
        labels = torch.tensor(target_tokens, dtype=torch.long)

        return inputs, labels

def collate_batch(batch):
    inputs, labels = zip(*batch)

    # Usar pad_sequence para hacer el padding por separado en inputs y labels
    padded_inputs = pad_sequence(inputs, batch_first=True)
    padded_labels = pad_sequence(labels, batch_first=True)

    return padded_inputs, padded_labels


In [38]:
inputs

tensor([[ 6357,    46,     3,    35,  1225,    53,  1154,    21,   638,  1050,
         12891,    10,     1,     0],
        [ 1642,     3,     9,  1643,    18,   715,  3898,    21, 25417,  3043,
             9, 18928,    10,     1]], device='cuda:0')

In [40]:
batch

(tensor([[  749,  2748,    15,     3,     9, 10468,  6754,  1154,    21,  6236,
          21131,    15, 18872,    10,     1],
         [ 4589,    95,    28,     3,     9,  1373,  1469,  1154,    21, 25417,
           3043,     9, 12891,    10,     1]]),
 tensor([[ 4783,     3,     9,  6236, 21131,    15, 10468,  5143,    11,   911,
              3,     9,  3898,    30,     3,     9,  2608,    18, 24186, 25417,
           3043,     9, 26565,    52,     5,     1],
         [ 4783,     3,     9, 25417,  3043,     9,  1373,  1469, 12927,    11,
            129,     3,     9,  3898,    30,    39,   416,  2914,  1190,     5,
              1,     0,     0,     0,     0,     0]]))

In [36]:
labels

tensor([[ 4780,     3,     9,     3,    31,   134,   440,   935,   925,   102,
            31,  1154,     3,   104,   805,   136,   431, 25417,  3043,     9,
          6750,    11,   129,     3,     9,   339,  2608, 15580,     5,     1],
        [14839,   136,   314, 25417,  3043,     9,  6750,    11,   129, 10738,
           326,    39,   792,     5,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')

In [39]:
# Configura el modelo y el tokenizador
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto", torch_dtype=torch.float16)


# Configura el optimizador
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Configura tu conjunto de datos y DataLoader para entrenamiento
train_dataset = CustomDataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_batch)

# Configura tu conjunto de datos y DataLoader para validación
val_dataset = CustomDataset(val_data, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_batch)

# Entrenamiento del modelo
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Iteramos en los batch para mover los inputs y labels al hardware disponible
    for batch in train_loader:
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

    #Proceso de entrenamiento: Se pasan los datos al modelo, se calcula el gradiente y se ajustan los pesos en la retropropagación
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Average Training Loss: {average_loss}')

    # Evaluación en el conjunto de validación
    model.eval()
    total_validation_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            #inputs = {k: v.to(device) for k, v in inputs.items()}
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs, labels=labels)
            loss = outputs.loss

            total_validation_loss += loss.item()

    average_validation_loss = total_validation_loss / len(val_loader)
    print(f'Epoch {epoch + 1}, Average Validation Loss: {average_validation_loss}')

# Guarda el modelo ajustado
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 1, Average Training Loss: nan


AttributeError: ignored

Un resultado bueno que generó fue *Turn on the music, turn on the drinks*

Verificamos que las palabras del prompt estén incluidas en el **vocabulario** de T5

-------------------------------------------------------------