# Fine-tuning de GPT-2 en español para generar chistes

In [1]:
from transformers import (Trainer, TrainingArguments, EarlyStoppingCallback,
                          DataCollatorForLanguageModeling, AutoModelForCausalLM, AutoTokenizer)
from datasets import load_dataset
from huggingface_hub import notebook_login
import torch

2024-10-21 11:46:35.223284: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-21 11:46:35.371000: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-21 11:46:35.402649: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-21 11:46:35.614323: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Fetch the 'train' split of the jokes dataset from the Hugging Face Hub
# and display its summary (features and row count) as the cell output.
dataset = load_dataset(
    path='kevmansilla/jokes_spanish_tm',
    split='train',
)
dataset

jokes_tm.csv:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1732 [00:00<?, ? examples/s]

Dataset({
    features: ['joke'],
    num_rows: 1732
})

## Preparando el modelo y el conjunto de datos

In [3]:
# Checkpoint used as the starting point for fine-tuning.
model_name = 'DeepESP/gpt2-spanish-medium'

# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the tokenizer and the causal-LM weights for the same checkpoint,
# then move the model to the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Display the architecture as the cell output.
model



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [4]:
def preprocess_function(max_len, text_column='joke', padding='max_length',
                        tokenizer_override=None):
    """
    Build a batched tokenization function suitable for `Dataset.map`.

    Args:
        max_len: Maximum sequence length; longer texts are truncated to it.
        text_column: Name of the column in the batch that holds the raw text.
            Defaults to 'joke', the column used by this notebook's dataset.
        padding: Padding strategy forwarded to the tokenizer. The default
            'max_length' pads every sequence to `max_len` so all rows have
            the same size.
        tokenizer_override: Optional callable used instead of the
            notebook-level `tokenizer`. It must accept the list of texts plus
            the `max_length`, `truncation` and `padding` keyword arguments.

    Returns:
        A function that takes a batch (mapping of column name -> list of
        values) and returns whatever the tokenizer returns for that batch.
    """
    def _preprocess_function(examples):
        # Resolve the tokenizer lazily so the notebook-level global is only
        # required when no override was injected.
        active_tokenizer = tokenizer if tokenizer_override is None else tokenizer_override
        # No `return_tensors` here: the result is stored by `Dataset.map`, and
        # requesting tensors would fail on ragged batches when padding is off.
        return active_tokenizer(
            examples[text_column],
            max_length=max_len,
            truncation=True,
            padding=padding,
        )
    return _preprocess_function

In [5]:
# Tokenize every joke, truncating/padding to 256 tokens.
encoded_dataset = dataset.map(preprocess_function(max_len=256), batched=True)

# Keep only 'input_ids' for training; every other column is dropped.
columns_to_drop = [col for col in encoded_dataset.column_names if col != 'input_ids']
encoded_dataset = encoded_dataset.remove_columns(columns_to_drop)

# Split into train and validation (75% - 25%). The seed is fixed so that
# re-running the notebook produces the same partition.
tokenized_dataset = encoded_dataset.train_test_split(train_size=0.75, seed=42)

# Make both splits return PyTorch tensors.
tokenized_dataset.set_format(type='torch')

# Show the resulting splits.
print(tokenized_dataset)

Map:   0%|          | 0/1732 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1299
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 433
    })
})


In [6]:
# Fetch the first training example once and print its raw token ids.
first_example = tokenized_dataset['train'][0]
print(first_example)

# Decode the ids back to text to check the tokenization round trip.
tokenizer.decode(first_example['input_ids'])

{'input_ids': tensor([   37, 13126, 43214,    39,    68, 11365,  3522, 14158,  3952,    70,
          576,  1423,   868,   297,  3025,   366,  2620, 10676,   288,  2205,
           40,   208,    68,  3349,    60,    64,  2979,    70,  6575,   426,
        22673,   370,  4442,    23,   208,    37,  4216,    45,    39, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 

'<START>[QUESTION] ¿Por qué el niño no pudo cruzar la calle?\n[ANSWER] Porque le faltaban las piernas.\n<END><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endof

In [7]:
EPOCHS = 12
BATCH_SIZE = 8
# Backward-compatible alias for the original, misspelled constant name.
BARTCH_SIZE = BATCH_SIZE
# Log about once per epoch. Clamp to 1 because a value of 0 is rejected
# when the training split is smaller than one batch.
LOGGING_STEPS = max(1, len(tokenized_dataset['train']) // BATCH_SIZE)

# Global training hyperparameters.
training_args = TrainingArguments(
    output_dir='./hf-gpt',
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    learning_rate=1e-6,
    per_device_eval_batch_size=BATCH_SIZE,
    per_device_train_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    eval_strategy='epoch',  # newer name of the former `evaluation_strategy`
    save_strategy='epoch',  # checkpoint at the end of every epoch
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    disable_tqdm=False,
    logging_steps=LOGGING_STEPS,
    save_total_limit=2,
    # eval_loss is the reference metric for choosing the best checkpoint
    metric_for_best_model="eval_loss",
    greater_is_better=False  # lower eval_loss is better
)

# Stop early once eval_loss has not improved for 3 consecutive evaluations.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

# Build the trainer; the early-stopping callback is registered exactly once.
trainer = Trainer(
    model=model,
    args=training_args,
    # mlm=False selects causal (next-token) language modeling
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False),
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]
)

In [8]:
# Run the fine-tuning loop and show the resulting TrainOutput summary.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss
1,3.4987,2.024146
2,1.9591,1.759298
3,1.7962,1.694738
4,1.748,1.660583
5,1.7,1.63923
6,1.6676,1.624269
7,1.651,1.613267
8,1.6319,1.605129
9,1.6268,1.599196
10,1.6098,1.595129


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1956, training_loss=1.8398562345524263, metrics={'train_runtime': 12219.3231, 'train_samples_per_second': 1.276, 'train_steps_per_second': 0.16, 'total_flos': 7238293213151232.0, 'train_loss': 1.8398562345524263, 'epoch': 12.0})

In [10]:
# Optional: save the fine-tuned model and tokenizer locally.
# Left commented out; uncomment both lines to write a copy under
# ./trained-question-answer.
# model.save_pretrained('./trained-question-answer')
# tokenizer.save_pretrained('./trained-question-answer')

('./trained-question-answer/tokenizer_config.json',
 './trained-question-answer/special_tokens_map.json',
 './trained-question-answer/vocab.json',
 './trained-question-answer/merges.txt',
 './trained-question-answer/added_tokens.json',
 './trained-question-answer/tokenizer.json')

In [9]:
# Authenticate with the Hugging Face Hub via the notebook login widget;
# needed before the push_to_hub calls that follow.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
# Upload the model first and then the tokenizer to the same Hub repository.
hub_repo_id = 'kevmansilla/generate_jokes_question_answer'
for artifact in (model, tokenizer):
    artifact.push_to_hub(hub_repo_id)
print('Modelo subido a Hugging Face')

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Modelo subido a Hugging Face
