In [1]:
from transformers import (Trainer, TrainingArguments, EarlyStoppingCallback,
                          DataCollatorForLanguageModeling, AutoModelForCausalLM, AutoTokenizer)
from datasets import load_dataset
from typing import Optional, Tuple
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import accelerate
import keras
import random

2024-10-20 15:24:07.657105: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-20 15:24:07.668489: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-20 15:24:07.671990: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-20 15:24:07.680905: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Cargar el dataset
ds = load_dataset("ysharma/short_jokes", split='train')
ds

Dataset({
    features: ['ID', 'Joke'],
    num_rows: 231657
})

In [5]:
# Seleccionar 10,000 índices aleatorios sin repetición
random_indices = random.sample(range(len(ds)), 10000)

# Crear una nueva lista con los chistes seleccionados
dataset = ds.select(random_indices)
dataset

Dataset({
    features: ['ID', 'Joke'],
    num_rows: 10000
})

In [6]:
dataset.set_format('pandas')

# cambio la columna Joke a text
dataset = dataset.rename_column('Joke', 'text')
df = dataset.to_pandas()
df.head(10)

Unnamed: 0,ID,text
0,139732,Whats a terrorists favorite dessert flavored ices
1,207541,"Side Effects May Include: upset stomach, diarr..."
2,196381,Need to save money on car insurance? Flee the ...
3,36400,Apparently they're making a Middle Eastern ver...
4,208842,"Reddit, I need your darkest Christmas jokes to..."
5,231569,Why did the element Fluorine get a copyright s...
6,121797,Everything he knows about gynecologist visits ...
7,26292,How many teenagers does it take to screw in a ...
8,154256,Probably the worst thing you can do to a perso...
9,2109,I was fighting with my wife over the arrangeme...


In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
model



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
def preprocess_function(examples, max_len=256):
    return tokenizer(
        examples['text'],
        max_length=max_len,
        truncation=True,
        padding='longest',
        return_tensors='pt'
    )


# Ensure the tokenizer has a padding token before tokenizing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # or add [PAD] as shown above

In [9]:
dataset.reset_format()
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(
    [col for col in tokenized_dataset.column_names if col != 'input_ids'])
tokenized_dataset = tokenized_dataset.train_test_split(train_size=0.70)
tokenized_dataset.set_format('torch')
tokenized_dataset

Map: 100%|██████████| 10000/10000 [00:00<00:00, 11404.30 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 7000
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 3000
    })
})

In [19]:
# Muestra la primera entrada del dataset tokenizado
print(tokenized_dataset['train'][0])

# lo decodeamos
tokenizer.decode(tokenized_dataset['train'][0]['input_ids'])

{'input_ids': tensor([ 1532,   345,  1683,   761,   617,  1107,  3595, 23491,   314,  1101,
          345,   821,  3516, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256])}


"If you ever need some really poor grammar I'm you're guy<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>"

In [10]:
epochs = 10
batch_size = 8
logging_steps = len(tokenized_dataset['train']) // batch_size

# Definimos los parámetros globales de entrenamiento
training_args = TrainingArguments(
    output_dir='./hf-gpt',
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    learning_rate=2e-5,
    per_device_eval_batch_size=batch_size,
    per_device_train_batch_size=batch_size,
    weight_decay=0.01,
    eval_strategy='epoch',  # Cambiado de evaluation_strategy a eval_strategy
    save_strategy='epoch',  # Guardamos al final de cada época
    load_best_model_at_end=True,  # Cargamos el mejor modelo al final
    disable_tqdm=False,
    logging_steps=logging_steps,
    save_total_limit=2,
    # Utilizamos eval_loss como métrica de referencia
    metric_for_best_model="eval_loss",
    greater_is_better=False  # Indica que buscamos minimizar eval_loss
)

# Crear el callback para early stopping, asegurándote de que no se repita
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

# Definir el entrenador, asegurando que no se añadan múltiples callbacks
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False),
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]  # Solo se añade una vez
)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,4.0674,3.843042
2,3.7139,3.823332
3,3.5268,3.827336
4,3.3862,3.847118
5,3.2683,3.870111


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=4375, training_loss=3.5925260602678573, metrics={'train_runtime': 4969.7651, 'train_samples_per_second': 14.085, 'train_steps_per_second': 1.761, 'total_flos': 1761961577472000.0, 'train_loss': 3.5925260602678573, 'epoch': 5.0})

In [11]:
# Guardamos el modelo (solo correr si se vuelve a entrenar el modelo)
# trainer.save_model('trained-gpt2-inglish')
# tokenizer.save_pretrained('trained-gpt2-inglish')

('trained-gpt2-inglish/tokenizer_config.json',
 'trained-gpt2-inglish/special_tokens_map.json',
 'trained-gpt2-inglish/vocab.json',
 'trained-gpt2-inglish/merges.txt',
 'trained-gpt2-inglish/added_tokens.json',
 'trained-gpt2-inglish/tokenizer.json')

In [7]:
# Correr para cargar el modelo pre-entrenado
model = AutoModelForCausalLM.from_pretrained('trained-gpt2-inglish')
tokenizer = AutoTokenizer.from_pretrained('trained-gpt2-inglish')

Exception: data did not match any variant of untagged enum ModelWrapper at line 250317 column 3