# Fine-tuning de T5 y Gemma-2b en Colab con GPU

In [14]:
!pip install transformers torch sentencepiece nltk pandas bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.me

### Import Libraries

In [7]:
import os
import json
import re
import torch
import pandas as pd

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    default_data_collator
)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

### Load Data

In [3]:
# Load both JSON files
with open("train_data.json", "r", encoding="utf-8") as f1:
    train_data = json.load(f1)

### Encoder - Decoder: T5 Fine Tuned WikiSQL

In [9]:
class Text2SQLDataset(Dataset):
    """
    Dataset sencillo para Text2SQL:
    - Recibe una lista de ejemplos con 'question' y 'query'.
    - Usa el tokenizer de Hugging Face para preparar inputs y labels.
    """
    def __init__(self, examples, tokenizer, max_input_len=64, max_output_len=64):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len

    def __len__(self):
        # Número de ejemplos
        return len(self.examples)

    def __getitem__(self, idx):
        # Tomamos el ejemplo idx-ésimo
        example = self.examples[idx]
        question = example["question"]
        query    = example["query"]

        # 1) Preparamos el texto de entrada
        input_text = f"translate English to SQL: {question} </s>"
        in_enc = self.tokenizer(
            input_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_input_len,
            return_tensors="pt"
        )

        # 2) Tokenizamos la query como etiqueta (label)
        out_enc = self.tokenizer(
            query,
            truncation=True,
            padding="max_length",
            max_length=self.max_output_len,
            return_tensors="pt"
        )

        # 3) Extraemos tensores del diccionario y quitamos la dimensión batch
        input_ids      = in_enc["input_ids"][0]
        attention_mask = in_enc["attention_mask"][0]
        labels         = out_enc["input_ids"][0]

        # 4) Ponemos -100 en los tokens de padding para que no cuenten en la loss
        labels[labels == self.tokenizer.pad_token_id] = -100

        # 5) Devolvemos el dict que Trainer de HF espera
        return {
            "input_ids":      input_ids,
            "attention_mask": attention_mask,
            "labels":         labels
        }


In [10]:
model_name = "mrm8488/t5-small-finetuned-wikiSQL"
tokenizer = T5Tokenizer.from_pretrained(model_name)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [11]:
# Crear dataset
train_subset, eval_subset = train_test_split(train_data, test_size=0.1, random_state=42)
train_dataset = Text2SQLDataset(train_subset, tokenizer)
eval_dataset = Text2SQLDataset(eval_subset, tokenizer)

In [12]:
from transformers import TrainingArguments, T5ForConditionalGeneration

# Argumentos de entrenamiento para GPU (gpu_args)
gpu_args = TrainingArguments(
    output_dir=".t5-WikiSQL-finetuned",  # Carpeta de salida para guardar el modelo entrenado
    overwrite_output_dir=True,            # Permite sobrescribir el directorio si ya existe
    fp16=True,                            # Usa precisión mixta (FP16) si está soportado
    per_device_train_batch_size=1,        # Batch muy pequeño para entrenamiento
    per_device_eval_batch_size=1,         # Batch pequeño para evaluación
    gradient_accumulation_steps=4,        # Acumula gradientes para simular batches más grandes
    num_train_epochs=8,                   # Número de épocas de entrenamiento
    warmup_steps=300,                     # Número de pasos de warm-up para la tasa de aprendizaje
    learning_rate=5e-5,                   # Tasa de aprendizaje inicial
    eval_strategy="epoch",                # Ejecutar evaluación al final de cada época
    save_strategy="no",                   # No guardar checkpoints intermedios
    logging_strategy="epoch",             # Registrar métricas al final de cada época
    disable_tqdm=True,                    # Deshabilitar barras de progreso en consola

    save_total_limit=1,                   # Mantener solo el último checkpoint

    load_best_model_at_end=False,         # No cargar el mejor modelo al final
    gradient_checkpointing=True,          # Activar gradient checkpointing para ahorrar memoria

    report_to=[],                         # No reportar métricas a ningún servicio externo
)

# Carga del modelo fine-tuneado y preparación para entrenamiento en GPU
model = T5ForConditionalGeneration.from_pretrained(
    "mrm8488/t5-small-finetuned-wikiSQL"
).to("cuda")

# Opciones para optimizar el uso de memoria durante el entrenamiento
model.gradient_checkpointing_enable()
model.config.use_cache = False


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [None]:
# Creamos el DataCollator, que prepara batches para Seq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Configuramos el Trainer de Hugging Face
trainer = Trainer(
    model=model,
    args=gpu_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Ejecutamos el entrenamiento
trainer.train()

# Guardamos el modelo ya fine-tuneado en la carpeta indicada
trainer.save_model("./t5-WikiSQL-finetuned")

# Empaquetamos la carpeta del modelo en un archivo .zip
!zip -r t5-WikiSQL-finetuned.zip t5-WikiSQL-finetuned

# Descargamos el .zip al equipo local desde Colab
from google.colab import files
files.download("t5-WikiSQL-finetuned.zip")

  trainer = Trainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


{'loss': 1.7838, 'grad_norm': 3.238239049911499, 'learning_rate': 4.4621826746924886e-05, 'epoch': 1.0}
{'eval_loss': 0.8961408138275146, 'eval_runtime': 20.0853, 'eval_samples_per_second': 43.116, 'eval_steps_per_second': 43.116, 'epoch': 1.0}
{'loss': 0.9652, 'grad_norm': 0.8656386733055115, 'learning_rate': 3.8249149437320075e-05, 'epoch': 2.0}
{'eval_loss': 0.6929260492324829, 'eval_runtime': 19.2391, 'eval_samples_per_second': 45.012, 'eval_steps_per_second': 45.012, 'epoch': 2.0}
{'loss': 0.79, 'grad_norm': 0.8367671370506287, 'learning_rate': 3.187647212771526e-05, 'epoch': 3.0}
{'eval_loss': 0.5998891592025757, 'eval_runtime': 19.5582, 'eval_samples_per_second': 44.278, 'eval_steps_per_second': 44.278, 'epoch': 3.0}
{'loss': 0.6971, 'grad_norm': 1.133829116821289, 'learning_rate': 2.550052342318765e-05, 'epoch': 4.0}
{'eval_loss': 0.5463928580284119, 'eval_runtime': 19.3441, 'eval_samples_per_second': 44.768, 'eval_steps_per_second': 44.768, 'epoch': 4.0}
{'loss': 0.6366, 'grad

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Decoder Only: Gemma 2 2b

In [4]:
model_id = "google/gemma-2b"  # Identificador del modelo Gemma-2b en Hugging Face

# Cargamos el tokenizador rápido para Gemma-2b
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True
)

# Cargamos el modelo en modo cuantizado 4 bits y con tipo de dato float16 para ahorrar memoria
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    load_in_4bit=True,
    torch_dtype=torch.float16
)

# Configuración de LoRA (Low-Rank Adaptation) para fine-tuning ligero
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Aplicamos la configuración LoRA al modelo base
model = get_peft_model(model, lora_config)

# Desactivamos el uso de cache para asegurar que LoRA entrene correctamente
model.config.use_cache = False


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [5]:
def prepare_data(data):
    """
    Prepara los datos en formato Hugging Face Dataset:
    - Construye listas de prompts y completions.
    - Devuelve un dataset con las columnas 'prompt' y 'completion'.
    """
    prompts, completions = [], []
    for item in data:
        # Extraer pregunta y SQL original
        question = item["question"]
        query    = item["query"]

        # Crear el texto de entrada (prompt) para el modelo
        prompt = (
            "Translate the following English question into exactly one SQL query. "
            "Do NOT output any additional examples or explanations. "
            f"Question: {question}\n </s>"
        )
        # Crear el texto de salida (completion) con la SQL entre etiquetas <code>
        completion = f" <code>{query}</code>"

        prompts.append(prompt)
        completions.append(completion)

    # Devolver un Hugging Face Dataset con dos columnas
    return HFDataset.from_dict({
        "prompt":     prompts,
        "completion": completions
    })


def tokenize(example):
    """
    Tokeniza un ejemplo que combina prompt y completion
    """
    return tokenizer(
        example["prompt"] + example["completion"],
        truncation=True,
        padding="max_length",
        max_length=256
    )


def add_labels(example):
    """
    Prepara la clave 'labels' para Trainer
    """
    example["labels"] = example["input_ids"]
    return example

In [6]:
# Creamos el Hugging Face Dataset a partir de los datos de entrenamiento
dataset = prepare_data(train_data)

# Dividimos el dataset en 90% train y 10% eval
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split["train"]
eval_dataset  = dataset_split["test"]

# Tokenizamos los conjuntos
tokenized_train = train_dataset.map(
    tokenize,
    remove_columns=["prompt", "completion"]
)
tokenized_eval = eval_dataset.map(
    tokenize,
    remove_columns=["prompt", "completion"]
)

tokenized_train = tokenized_train.map(add_labels)
tokenized_eval  = tokenized_eval.map(add_labels)

Map:   0%|          | 0/7793 [00:00<?, ? examples/s]

Map:   0%|          | 0/866 [00:00<?, ? examples/s]

Map:   0%|          | 0/7793 [00:00<?, ? examples/s]

Map:   0%|          | 0/866 [00:00<?, ? examples/s]

In [None]:
# Usamos el collator por defecto que agrupa ejemplos en batches
data_collator = default_data_collator

# Configuración de entrenamiento para Gemma-2b
training_args = TrainingArguments(
    output_dir="./gemma2b-finetuned",    # Carpeta de salida para el modelo entrenado
    overwrite_output_dir=True,          # Sobrescribir si ya existe
    per_device_train_batch_size=1,      # Batch pequeño para entrenamiento
    per_device_eval_batch_size=1,       # Batch pequeño para evaluación
    gradient_accumulation_steps=8,      # Acumular gradientes para simular batch más grande
    num_train_epochs=4,                 # Número de épocas
    learning_rate=2e-4,                 # Tasa de aprendizaje
    warmup_steps=800,                   # Pasos de calentamiento de LR
    fp16=True,                          # Usar precisión mixta (fp16) si está soportado
    logging_strategy="epoch",           # Registrar métricas al final de cada época
    eval_strategy="epoch",              # Evaluar al final de cada época
    save_strategy="no",                 # No guardar checkpoints intermedios
    report_to=[]                        # No enviar métricas a servicios externos
)

# Creamos el Trainer de Hugging Face
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator
)

trainer.train()

# Guardamos el modelo y el tokenizador fine-tuneados
model.save_pretrained("./gemma2b-finetuned")
tokenizer.save_pretrained("./gemma2b-finetuned")

!zip -r gemma2b-finetuned.zip gemma2b-finetuned

# Descargamos el zip al equipo local desde Colab
from google.colab import files
files.download("gemma2b-finetuned.zip")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,3.9084,3.418587
2,3.3779,3.368963
3,3.3413,3.345669
