In [13]:
from transformers import (Trainer, TrainingArguments, EarlyStoppingCallback,
                          DataCollatorForLanguageModeling, AutoModelForCausalLM, AutoTokenizer)
from datasets import load_dataset
from typing import Optional, Tuple
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import accelerate
import keras
import random
from huggingface_hub import notebook_login

In [2]:
# Cargar el dataset
ds = load_dataset("ysharma/short_jokes", split='train')
ds

Dataset({
    features: ['ID', 'Joke'],
    num_rows: 231657
})

In [3]:
# Seleccionar 10,000 índices aleatorios sin repetición
random_indices = random.sample(range(len(ds)), 10000)

# Crear una nueva lista con los chistes seleccionados
dataset = ds.select(random_indices)
dataset

Dataset({
    features: ['ID', 'Joke'],
    num_rows: 10000
})

In [4]:
dataset.set_format('pandas')

# cambio la columna Joke a text
dataset = dataset.rename_column('Joke', 'text')
df = dataset.to_pandas()
df.head(10)

Unnamed: 0,ID,text
0,1827,Why was the programmer lonely? Off-by-one erro...
1,7843,Me: Objection! The plaintiff is a bologna sand...
2,127771,"When I want to exercise, I wear my gym clothes..."
3,173096,BREAKING. With Disney buying Star Wars Donald ...
4,202814,"Never date a chemist, they seduce you with the..."
5,111234,Why legalize weed? Because it's 2015... http:/...
6,74864,What do you call it when someone steals your f...
7,94869,I accidentally pushed 2 for Spanish and the op...
8,146425,"I hate working with customers I've gotta say ""..."
9,198516,Scientists have discovered a Gene that makes w...


In [5]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
model



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
def preprocess_function(examples, max_len=256):
    return tokenizer(
        examples['text'],
        max_length=max_len,
        truncation=True,
        padding='longest',
        return_tensors='pt'
    )


# Ensure the tokenizer has a padding token before tokenizing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # or add [PAD] as shown above

In [7]:
dataset.reset_format()
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(
    [col for col in tokenized_dataset.column_names if col != 'input_ids'])
tokenized_dataset = tokenized_dataset.train_test_split(train_size=0.70)
tokenized_dataset.set_format('torch')
tokenized_dataset

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 7000
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 3000
    })
})

In [8]:
# Muestra la primera entrada del dataset tokenizado
print(tokenized_dataset['train'][0])

# lo decodeamos
tokenizer.decode(tokenized_dataset['train'][0]['input_ids'])

{'input_ids': tensor([ 1532,   345,  1053,  1683,  2077,   257,  5156,   284,   257,  3807,
           11,  3387,   900,  3511,   319,  2046,    13,  6930,    13, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256])}


"If you've ever taken a baby to a movie, please set yourself on fire. Thanks.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>"

In [9]:
epochs = 10
batch_size = 8
logging_steps = len(tokenized_dataset['train']) // batch_size

# Definimos los parámetros globales de entrenamiento
training_args = TrainingArguments(
    output_dir='./hf-gpt',
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    learning_rate=2e-5,
    per_device_eval_batch_size=batch_size,
    per_device_train_batch_size=batch_size,
    weight_decay=0.01,
    eval_strategy='epoch',  # Cambiado de evaluation_strategy a eval_strategy
    save_strategy='epoch',  # Guardamos al final de cada época
    load_best_model_at_end=True,  # Cargamos el mejor modelo al final
    disable_tqdm=False,
    logging_steps=logging_steps,
    save_total_limit=2,
    # Utilizamos eval_loss como métrica de referencia
    metric_for_best_model="eval_loss",
    greater_is_better=False  # Indica que buscamos minimizar eval_loss
)

# Crear el callback para early stopping, asegurándote de que no se repita
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

# Definir el entrenador, asegurando que no se añadan múltiples callbacks
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False),
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]  # Solo se añade una vez
)

In [10]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,4.0705,3.835885
2,3.7249,3.8161
3,3.5363,3.817273
4,3.3957,3.830418
5,3.2733,3.857075


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=4375, training_loss=3.600130691964286, metrics={'train_runtime': 4004.7733, 'train_samples_per_second': 17.479, 'train_steps_per_second': 2.185, 'total_flos': 1457397135360000.0, 'train_loss': 3.600130691964286, 'epoch': 5.0})

In [11]:
# Guardamos el modelo (solo correr si se vuelve a entrenar el modelo)
trainer.save_model('trained-gpt2-english')
tokenizer.save_pretrained('trained-gpt2-english')

('trained-gpt2-english/tokenizer_config.json',
 'trained-gpt2-english/special_tokens_map.json',
 'trained-gpt2-english/vocab.json',
 'trained-gpt2-english/merges.txt',
 'trained-gpt2-english/added_tokens.json',
 'trained-gpt2-english/tokenizer.json')

In [14]:
# guardar en huggingface
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
model.push_to_hub('kevmansilla/generate_jokes_english')
tokenizer.push_to_hub('kevmansilla/generate_jokes_english')
print('Modelo subido a Hugging Face')

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Modelo subido a Hugging Face
