# Fine tuning

In [1]:
from transformers import (Trainer, TrainingArguments, EarlyStoppingCallback,
                          DataCollatorForLanguageModeling, AutoModelForCausalLM, AutoTokenizer)
from datasets import load_dataset
from huggingface_hub import notebook_login
import torch

2024-10-20 13:24:20.911803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-20 13:24:21.487461: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-20 13:24:21.661442: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-20 13:24:23.050967: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
dataset = load_dataset('kevmansilla/jokes_spanish_tm', split='train')
dataset

Dataset({
    features: ['joke'],
    num_rows: 832
})

## Preparando el conjunto de datos

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'DeepESP/gpt2-spanish-medium'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [4]:
def preprocess_function(max_len):
    """
    Prepara una función de preprocesamiento que tokeniza el texto.
    """
    def _preprocess_function(examples):
        return tokenizer(
            # Asegúrate de que el nombre de la columna coincida
            examples['joke'],
            max_length=max_len,
            truncation=True,
            # 'max_length' asegura que todas las secuencias tengan el mismo tamaño
            padding='max_length',
            return_tensors='pt'
        )
    return _preprocess_function

In [5]:
# Tokenizar el dataset
tokenized_dataset = dataset.map(preprocess_function(max_len=256), batched=True)

# Filtrar solo los 'input_ids' para entrenar
tokenized_dataset = tokenized_dataset.remove_columns(
    [col for col in tokenized_dataset.column_names if col != 'input_ids']
)

# Dividir el dataset en entrenamiento y validación (70% - 30%)
tokenized_dataset = tokenized_dataset.train_test_split(train_size=0.70)

# Asegurar que el formato sea PyTorch
tokenized_dataset.set_format(type='torch')

# Mostrar el dataset tokenizado
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 582
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 250
    })
})


In [6]:
# Muestra la primera entrada del dataset tokenizado
print(tokenized_dataset['train'][0])

# lo decodeamos
tokenizer.decode(tokenized_dataset['train'][0]['input_ids'])

{'input_ids': tensor([   37, 13126, 43214,    39,    68, 11365,  3522, 14158,  3952,    70,
          576,  1423,   868,   335, 46034,  2195,  5687,  7420,    40,   208,
           68,  3349,    60,    64,  2979,    70,  6575,   335,  5563,   615,
          741, 10625, 22349,    23,   208,    37,  4216,    45,    39, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 

'<START>[QUESTION] ¿Por qué los cementerios tienen paredes altas?\n[ANSWER] Porque los muertos son muy codiciosos.\n<END><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoft

In [8]:
EPOCHS = 10
BARTCH_SIZE = 8
LOGGING_STEPS = len(tokenized_dataset['train']) // BARTCH_SIZE

# Definimos los parámetros globales de entrenamiento
training_args = TrainingArguments(
    output_dir='./hf-gpt',
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    learning_rate=1e-6,
    per_device_eval_batch_size=BARTCH_SIZE,
    per_device_train_batch_size=BARTCH_SIZE,
    weight_decay=0.01,
    eval_strategy='epoch',  # Cambiado de evaluation_strategy a eval_strategy
    save_strategy='epoch',  # Guardamos al final de cada época
    load_best_model_at_end=True,  # Cargamos el mejor modelo al final
    disable_tqdm=False,
    logging_steps=LOGGING_STEPS,
    save_total_limit=2,
    # Utilizamos eval_loss como métrica de referencia
    metric_for_best_model="eval_loss",
    greater_is_better=False  # Indica que buscamos minimizar eval_loss
)

# Crear el callback para early stopping, asegurándote de que no se repita
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

# Definir el entrenador, asegurando que no se añadan múltiples callbacks
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False),
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]  # Solo se añade una vez
)

In [9]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,4.4032,3.318475
2,3.0381,2.357456
3,2.334,2.070567
4,2.1015,1.975508
5,2.0204,1.931211
6,1.9782,1.906914
7,1.9274,1.892188
8,1.9055,1.883004
9,1.9175,1.877693
10,1.8828,1.876199


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=730, training_loss=2.345893420911815, metrics={'train_runtime': 3411.0131, 'train_samples_per_second': 1.706, 'train_steps_per_second': 0.214, 'total_flos': 2702519021076480.0, 'train_loss': 2.345893420911815, 'epoch': 10.0})

In [10]:
# Guardar el modelo
# model.save_pretrained('./trained-question-answer')
# tokenizer.save_pretrained('./trained-question-answer')

('./trained-question-answer/tokenizer_config.json',
 './trained-question-answer/special_tokens_map.json',
 './trained-question-answer/vocab.json',
 './trained-question-answer/merges.txt',
 './trained-question-answer/added_tokens.json',
 './trained-question-answer/tokenizer.json')

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
model.push_to_hub('kevmansilla/generate_jokes_question_answer')
tokenizer.push_to_hub('kevmansilla/generate_jokes_question_answer')
print('Modelo subido a Hugging Face')

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Modelo subido a Hugging Face


In [6]:
# cargo el modelo de hugging face
model = AutoModelForCausalLM.from_pretrained('kevmansilla/generate_jokes_question_answer')
tokenizer = AutoTokenizer.from_pretrained('kevmansilla/generate_jokes_question_answer')

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.46k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/840k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/498k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.19M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/771 [00:00<?, ?B/s]

In [7]:
def generate_short_joke(prompt, max_length=50, temperature=0.7, top_k=30, top_p=0.9):
    """
    Genera un chiste breve basado en el prompt proporcionado.

    Args:
        prompt (str): El texto inicial para el chiste.
        max_length (int): Longitud máxima del chiste generado.
        temperature (float): Controla la aleatoriedad.
        top_k (int): Tokens más probables considerados en cada paso.
        top_p (float): Probabilidad acumulativa para top-p sampling.

    Returns:
        str: El chiste generado.
    """
    # Tokenizar el prompt
    input_ids = tokenizer(
        prompt, return_tensors='pt').input_ids.to(model.device)

    # Generar el chiste con una oración breve
    output = model.generate(
        input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        num_return_sequences=1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.convert_tokens_to_ids(
            '<END>')  # Detener al encontrar <END>
    )

    # Decodificar el chiste generado
    joke = tokenizer.decode(output[0], skip_special_tokens=True)

    # Limitar la salida a la primera oración
    # short_joke = joke.split('.')[0] + '.'

    return joke

In [8]:
# Ejemplo de uso
prompt = '<START>[QUESTION] ¿Que le dijo un judio al otro?'
chiste = generate_short_joke(prompt)

print(f'Chiste generado:\n{chiste}')

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Chiste generado:
<START>[QUESTION] ¿Que le dijo un judio al otro?
[ANSWER] Que le dijo:
- ¡Qué bien lo ha pasado!
<END>
<END>
