# Fine tuning

In [1]:
from transformers import (Trainer, TrainingArguments, EarlyStoppingCallback,
                          DataCollatorForLanguageModeling, AutoModelForCausalLM, AutoTokenizer)
from datasets import load_dataset
from huggingface_hub import notebook_login
import torch

2024-11-29 17:01:45.618409: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-29 17:01:45.968987: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-29 17:01:46.092451: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-29 17:01:47.268007: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Hub repository that hosts the jokes corpus; only its 'train' split is used.
DATASET_REPO = 'kevmansilla/jokes_topic'

dataset = load_dataset(DATASET_REPO, split='train')
dataset

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

jokes_topic.csv:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/122 [00:00<?, ? examples/s]

Dataset({
    features: ['joke'],
    num_rows: 122
})

## Preparando el conjunto de datos

In [3]:
# Checkpoint to fine-tune, pulled from the Hugging Face Hub.
model_name = 'DeepESP/gpt2-spanish-medium'

# Prefer the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal-LM weights, then move them onto the selected device.
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)
model



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [4]:
def preprocess_function(max_len):
    """
    Build a tokenization callback for the jokes dataset.

    Args:
        max_len: Length that every sequence is truncated or padded to.

    Returns:
        A function that takes a batch containing a 'joke' column and returns
        the result of calling the module-level ``tokenizer`` on that column.
    """
    # Options are fixed once per factory call. padding='max_length' makes
    # every sequence come out with the same size.
    tokenizer_options = dict(
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )

    def _preprocess_function(examples):
        # The column name must match the text column of the dataset.
        jokes = examples['joke']
        return tokenizer(jokes, **tokenizer_options)

    return _preprocess_function

In [5]:
# Tokenize every joke, truncating/padding each one to 256 tokens.
tokenized_all = dataset.map(preprocess_function(max_len=256), batched=True)

# Keep only 'input_ids' for training; every other column is dropped.
input_ids_only = tokenized_all.remove_columns(
    [col for col in tokenized_all.column_names if col != 'input_ids']
)

# Split into training and validation sets (75% - 25%). The fixed seed keeps the
# split identical across kernel restarts, so successive runs are comparable.
tokenized_dataset = input_ids_only.train_test_split(train_size=0.75, seed=42)

# Have the splits return PyTorch tensors when indexed.
tokenized_dataset.set_format(type='torch')

# Show the resulting splits.
tokenized_dataset

Map:   0%|          | 0/122 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 91
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 31
    })
})


In [6]:
# Fetch the first training example once and inspect it in two forms.
first_example = tokenized_dataset['train'][0]

# Raw token ids as stored in the tokenized dataset.
print(first_example)

# The same ids decoded back into text.
tokenizer.decode(first_example['input_ids'])

{'input_ids': tensor([   37, 13126, 43214,    39,    68, 11365,  3522, 14158,  3952,    70,
         1368, 28320,   289,   325,  2623,  2385,   436,   491,   299,  2886,
         1935,   576, 28461,  1223, 12430,   420,   314,  2324,  4536,    40,
          208,    22, 39518,  1200,   208,    22, 10516,   874,    21,   576,
        45808,  6535,   363,  6366,    40,   332,   208,   230,   208,    68,
         3349,    60,    64,  2979,    70,  1368, 39518,    21,   913,  2871,
          785,  1293,  1200,   208,    37,  4216,    45,    39, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 

'<START>[QUESTION] -Veo en su curriculum que sabe inglés ¿sabría decirme como se dice mirar?\n- Look...\n- Muy bien, ¿podría construir una frase? \n\n \n[ANSWER] - Look, yo soy tu padre...\n<END><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|

In [7]:
EPOCHS = 12
BATCH_SIZE = 8

# Optimizer steps per epoch, so the training loss is logged once per epoch.
# Rounded up because the final, partial batch still counts as a step, and
# clamped to 1 so a train split smaller than one batch cannot produce
# logging_steps=0 (which TrainingArguments rejects).
# NOTE(review): assumes a single device and no gradient accumulation - confirm.
train_examples = len(tokenized_dataset['train'])
LOGGING_STEPS = max(1, (train_examples + BATCH_SIZE - 1) // BATCH_SIZE)

# Global training hyperparameters.
# NOTE(review): learning_rate=1e-6 is a very small step; the stored run shows
# the validation loss still falling at the last logged epoch - consider
# revisiting it.
training_args = TrainingArguments(
    output_dir='./hf-gpt',
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    learning_rate=1e-6,
    per_device_eval_batch_size=BATCH_SIZE,
    per_device_train_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    eval_strategy='epoch',  # evaluate at the end of every epoch
    save_strategy='epoch',  # checkpoint at the end of every epoch
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    disable_tqdm=False,
    logging_steps=LOGGING_STEPS,
    save_total_limit=2,
    # eval_loss is the reference metric for picking the best checkpoint...
    metric_for_best_model='eval_loss',
    greater_is_better=False  # ...and lower eval_loss is better
)

# Stop training once eval_loss has failed to improve for 3 evaluations in a row.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

# Build the trainer. mlm=False selects the causal language-modeling collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False),
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]  # registered exactly once
)

In [8]:
# Run the fine-tuning loop and keep the returned summary for later inspection.
train_result = trainer.train()
train_result

Epoch,Training Loss,Validation Loss
1,5.754,5.347269
2,5.3452,4.904883
3,5.0037,4.558318
4,4.77,4.282827
5,4.5072,4.060749
6,4.4173,3.880362
7,4.0965,3.728688
8,4.0495,3.610077
9,3.8805,3.518081
10,3.8063,3.452085


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=144, training_loss=4.365368387765354, metrics={'train_runtime': 833.4863, 'train_samples_per_second': 1.31, 'train_steps_per_second': 0.173, 'total_flos': 507070579212288.0, 'train_loss': 4.365368387765354, 'epoch': 12.0})

In [10]:
# Save the fine-tuned model and tokenizer locally.
# NOTE(review): both calls are commented out, yet the stored cell output lists
# files under ./trained-question-answer - presumably left over from an earlier
# run. Confirm whether local saving should be re-enabled.
# model.save_pretrained('./trained-question-answer')
# tokenizer.save_pretrained('./trained-question-answer')

('./trained-question-answer/tokenizer_config.json',
 './trained-question-answer/special_tokens_map.json',
 './trained-question-answer/vocab.json',
 './trained-question-answer/merges.txt',
 './trained-question-answer/added_tokens.json',
 './trained-question-answer/tokenizer.json')

In [None]:
# Open the Hugging Face login widget in the notebook.
# NOTE(review): presumably required before the push_to_hub calls that follow - confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
# Target repository on the Hugging Face Hub for both artifacts.
HUB_REPO_ID = 'kevmansilla/topic_dataset'

# Upload the model first, then its tokenizer, to the same repository.
for artifact in (model, tokenizer):
    artifact.push_to_hub(HUB_REPO_ID)

print('Modelo subido a Hugging Face')

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Modelo subido a Hugging Face
