In [12]:
##################################################################################################################
############################## LLM ENGINEERING: Jordi Ortega - IA2 ###############################################
##################################################################################################################


# GenAI Project Lifecycle. Pasos aplicados en este notebook:
# 1. DefiniCión
#   1.1. Caso de uso
#   1.2. Definir Coste y Alcance
#   1.3. RAI & Ethics
# 2. Modelo
#   2.1. Modelo Existente
#   2.2. Pre-entrenar
# 3. Adaptación
#   3.1. Prompt Engineering
#   3.2. Fine-tuning
# 4. Conclusiones

In [13]:
##################################################################################################################
############################## DEFINICIÓN ########################################################################
##################################################################################################################

In [14]:
# -----------------> Caso de uso <----------------- #
# Recuperamos el "robot emilio" de los 90' y hacemos una versión 2.0.  Aceptando que un juguete no tendrá un gran procesador para albergar un modelo grande...
# el planteamiento es seleccionar un modelo pequeño e intruirlo para que tenga un formato chatbot y finetunearlo para convertirlo en "experto" en salud mental y así tener un POC de un robot con IA en casa.

# -----------------> Definir coste y alcance <----------------- #
# Con objetivo académico se busca seleccionar un modelo LLM base y finetunearlo.
# Coste. Google Colab pay as you go: 11.19€ aprox

# -----------------> RAI & Ethics <----------------- #
# Los principios éticos y de Responsible AI (RAI) en gpt2 giran en torno a la transparencia, seguridad, privacidad, y la mitigación de riesgos.
# Si bien sobre los modelos se han tomado medidas significativas para garantizar un uso responsable, también se reconocen las limitaciones inherentes de los LLMs.

In [15]:
##################################################################################################################
############################## MODELO ############################################################################
##################################################################################################################

In [16]:
# -----------------> Modelo existente <----------------- #
# distilbert/distilgpt2
# DistilGPT2 (short for Distilled-GPT2) is an English-language model pre-trained with the supervision of the
# smallest version of Generative Pre-trained Transformer 2 (GPT-2). Like GPT-2, DistilGPT2 can be used to generate text.

In [17]:
from huggingface_hub import login

# Usamos token de perfil propio para descarga del modelo en HF
login(token="hf_RxxxxxxxxxxxxxxxxxxxxxxxxxLsnCvKANYKB")

In [19]:
!pip install -qU bitsandbytes datasets accelerate loralib peft transformers trl

In [22]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Cargamos el modelo
model_id = "distilbert/distilgpt2"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [23]:
tokenizer.pad_token = tokenizer.eos_token

In [24]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [25]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [26]:
# -----------------> Pre-entrenar <----------------- #

# Descargamos el dataset y lo guardarmos en Drive
from datasets import load_dataset
import os

dataset_name = "Tural/stanford_alpaca"
dataset = load_dataset(dataset_name)

drive_dataset_path = "/content/drive/MyDrive/datasets"
os.makedirs(drive_dataset_path, exist_ok=True)
dataset.save_to_disk(drive_dataset_path)
print("Dataset guardado en Google Drive.")

README.md:   0%|          | 0.00/523 [00:00<?, ?B/s]

(…)-00000-of-00001-fd8f5afb77946d56.parquet:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/52002 [00:00<?, ? examples/s]

Dataset guardado en Google Drive.


In [27]:
# Confirmamos la estructura del dataset
dataset['train'][0]

{'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.',
 'input': '',
 'instruction': 'Give three tips for staying healthy.'}

In [28]:
dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 52002
    })
})

In [29]:
# Filtramos los registros en 'train' donde 'output' no es None ni vacío
dataset['train'] = dataset['train'].filter(lambda row: row['output'] and row['output'].strip() != "")

print(f"Tamaño después del filtro: {len(dataset['train'])}")


Filter:   0%|          | 0/52002 [00:00<?, ? examples/s]

Tamaño después del filtro: 51974


In [30]:
dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 51974
    })
})

In [34]:
# Dividimos el dataset en train y test
dataset_train_test = dataset["train"].train_test_split(test_size=0.2)
dataset_train_test

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 41579
    })
    test: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 10395
    })
})

In [35]:
# Dividimos el dataset test en validación
dataset_val_test = dataset_train_test["test"].train_test_split(test_size=0.5)
dataset_val_test

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 5197
    })
    test: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 5198
    })
})

In [36]:
from datasets import DatasetDict

# Estructura del dataset preparada para el modelo
split_dataset = DatasetDict({
    "train" : dataset_train_test["train"],
    "val" : dataset_val_test["train"],
    "test" : dataset_val_test["test"]
})
split_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 41579
    })
    val: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 5197
    })
    test: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 5198
    })
})

In [None]:
##################################################################################################################
############################## ADAPTACIÓN ########################################################################
##################################################################################################################

In [37]:

# -----------------> Prompt Egineering <----------------- #
# Creamos los dos templates de prompting, el de entreno y el de inferencia
SYSTEM_MESSAGE= "You are a personal assistant that responds concisely based on the given instruction. You receive an instruction and sometimes input as context, and you must provide a concrete answer."

PROMPT_TEMPLATE = """\
{bos_token}
{system_message}

### instruction:
{instruction}

### input:
{input}

### Output:
{output}{eos_token}
"""

def create_prompt(sample):
  full_prompt = PROMPT_TEMPLATE.format(
      bos_token = "<|startoftext|>",
      eos_token = "<|endoftext|>",
      system_message = SYSTEM_MESSAGE,
      instruction = sample["instruction"],
      input = sample["input"],
      output = sample["output"]
  )

  return {"text" : full_prompt}

In [38]:
INFERENCE_PROMPT_TEMPLATE = """\
{bos_token}
{system_message}

### instruction:
{instruction}

### input:
{input}

### Output:
{eos_token}
"""

def create_prompt_and_response(sample):
  full_prompt = INFERENCE_PROMPT_TEMPLATE.format(
      bos_token = "<|startoftext|>",
      eos_token = "<|endoftext|>",
      system_message = SYSTEM_MESSAGE,
      instruction = sample["instruction"],
      input = sample["input"],
  )

  ground_truth = sample["output"]

  return {"full_prompt" : full_prompt, "ground_truth" : ground_truth}

In [39]:
# Primer registro del dataset
split_dataset["train"][0]

{'output': 'India has a diverse economy that contributes significantly to the global economy. The main economic activities in India include agriculture, manufacturing, trade and services, and information technology. Agriculture is the mainstay of the Indian economy, contributing about 17-18% to its gross domestic product, with crops such as rice, wheat, pulses, sugarcane, cotton, jute, tea, and tobacco. India is the second-largest producer of textiles and apparels in the world, with manufacturing contributing 25-26% to the GDP. India is also a major exporter of agricultural products, textiles, and handicrafts. The services sector, including telecommunications, finance, banking, tourism, and hospitality, makes up the majority of the GDP. Information technology, which includes software services, has become a major sector in the Indian economy in recent years, contributing to nearly 8% of GDP.',
 'input': '',
 'instruction': 'Describe the main economic activities of the country of India.'

In [40]:
# Aplicamos la función al primer registro del dataset
create_prompt(split_dataset["train"][0])

{'text': '<|startoftext|>\nYou are a personal assistant that responds concisely based on the given instruction. You receive an instruction and sometimes input as context, and you must provide a concrete answer.\n\n### instruction:\nDescribe the main economic activities of the country of India.\n\n### input:\n\n\n### Output:\nIndia has a diverse economy that contributes significantly to the global economy. The main economic activities in India include agriculture, manufacturing, trade and services, and information technology. Agriculture is the mainstay of the Indian economy, contributing about 17-18% to its gross domestic product, with crops such as rice, wheat, pulses, sugarcane, cotton, jute, tea, and tobacco. India is the second-largest producer of textiles and apparels in the world, with manufacturing contributing 25-26% to the GDP. India is also a major exporter of agricultural products, textiles, and handicrafts. The services sector, including telecommunications, finance, banking

In [41]:
# Pasamos por el template todo el dataset
split_dataset = split_dataset.map(create_prompt)

Map:   0%|          | 0/41579 [00:00<?, ? examples/s]

Map:   0%|          | 0/5197 [00:00<?, ? examples/s]

Map:   0%|          | 0/5198 [00:00<?, ? examples/s]

In [42]:
split_dataset['train'][0]

{'output': 'India has a diverse economy that contributes significantly to the global economy. The main economic activities in India include agriculture, manufacturing, trade and services, and information technology. Agriculture is the mainstay of the Indian economy, contributing about 17-18% to its gross domestic product, with crops such as rice, wheat, pulses, sugarcane, cotton, jute, tea, and tobacco. India is the second-largest producer of textiles and apparels in the world, with manufacturing contributing 25-26% to the GDP. India is also a major exporter of agricultural products, textiles, and handicrafts. The services sector, including telecommunications, finance, banking, tourism, and hospitality, makes up the majority of the GDP. Information technology, which includes software services, has become a major sector in the Indian economy in recent years, contributing to nearly 8% of GDP.',
 'input': '',
 'instruction': 'Describe the main economic activities of the country of India.'

In [43]:
from transformers import pipeline, set_seed, GenerationConfig

# Creamos el pipeline para testear el modelo base (posteriormente lo haremos con el modelo finetuneado)
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

set_seed(42)

def generate_sample(sample):
  prompt_package = create_prompt_and_response(sample)

  generation_config = GenerationConfig(
      max_new_tokens=100,
      do_sample=True,
      top_k=50,
      temperature=1e-4,
      eos_token_id=model.config.eos_token_id,
  )

  generation = generator(prompt_package["full_prompt"], generation_config=generation_config)
  print("---------------")
  print("Question:")
  print(sample["instruction"])
  print("---------------")
  print("Input:")
  print(sample["input"])
  print("---------------")
  print("Dataset Response")
  print(prompt_package["ground_truth"])
  print("---------------")
  print("Model Response:")
  print(generation[0]["generated_text"].replace(prompt_package["full_prompt"], ""))


Device set to use cuda:0


In [44]:
generate_sample(split_dataset["test"][15])
# Comprobamos que el modelo base no es capaz de dar una respuesta coherente.

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


---------------
Question:
Describe the main features of the new smartphone.
---------------
Input:

---------------
Dataset Response
The new smartphone is equipped with a 6.5-inch HD display, a powerful octa-core processor, 4GB of RAM, 128GB of storage, a quadruple camera setup with a 48MP main lens, and a 4800mAh battery with fast charging capabilities.
---------------
Model Response:
The first time you can get a free trial of the new Android Wear smartwatch.





















































































In [50]:
# -----------------> Fine-tuning <----------------- #
from transformers import TrainingArguments

# Definimos los argumentos del entreno
from transformers import TrainingArguments
from trl import SFTTrainer

training_args = TrainingArguments(
 per_device_train_batch_size=4,
 gradient_accumulation_steps=4,
 gradient_checkpointing =True,
 max_grad_norm= 0.3,
 max_steps=500,
 learning_rate=2e-4,
 save_total_limit=3,
 logging_steps=10,
 output_dir="finetuned_distilgpt2",
 optim="paged_adamw_32bit",
 lr_scheduler_type="cosine",
 evaluation_strategy="steps",
 eval_steps=50,
 warmup_ratio=0.05,
 report_to="none",
)



In [52]:
from trl import SFTTrainer

# Facilitamos la configuración a SFTTrainer (supervised finetuning)
trainer = SFTTrainer(
 model,
 train_dataset=split_dataset["train"],
 eval_dataset=split_dataset["val"],
 tokenizer=tokenizer,
 args=training_args
)

  trainer = SFTTrainer(


Map:   0%|          | 0/41579 [00:00<?, ? examples/s]

Map:   0%|          | 0/5197 [00:00<?, ? examples/s]

In [53]:
# Entrenamos
trainer.train()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
50,1.7142,1.560741
100,1.6714,1.526022
150,1.5593,1.500146
200,1.5367,1.4764
250,1.4828,1.455885
300,1.4821,1.440074
350,1.4877,1.426904
400,1.4919,1.418586
450,1.4732,1.414185
500,1.4591,1.413638


TrainOutput(global_step=500, training_loss=1.5676758708953857, metrics={'train_runtime': 1041.2265, 'train_samples_per_second': 7.683, 'train_steps_per_second': 0.48, 'total_flos': 402494982193152.0, 'train_loss': 1.5676758708953857, 'epoch': 0.1924001924001924})

In [54]:
trainer.save_model()

In [55]:
distilbert_distilgpt2_finetuned = AutoModelForCausalLM.from_pretrained("finetuned_distilgpt2")

In [56]:
generator = pipeline('text-generation', model=distilbert_distilgpt2_finetuned, tokenizer=tokenizer)

Device set to use cuda:0


In [57]:
# Testeamos ahora sí con el modelo finetuneado.
generate_sample(split_dataset["test"][15])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


---------------
Question:
Describe the main features of the new smartphone.
---------------
Input:

---------------
Dataset Response
The new smartphone is equipped with a 6.5-inch HD display, a powerful octa-core processor, 4GB of RAM, 128GB of storage, a quadruple camera setup with a 48MP main lens, and a 4800mAh battery with fast charging capabilities.
---------------
Model Response:
### Output:
The new smartphone is powered by a Snapdragon 801 processor, a 5.7-inch display, a 5.7-inch display, a 5.7-inch display, a 5.7-inch display, a 5.7-inch display, a 5.7-inch display, a 5.7-inch display, a 5.7-inch display, a 5.7-inch display, a 5.7-inch display, a 5.7


In [58]:
generate_sample(split_dataset["test"][150])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


---------------
Question:
using the following text, classify the sentiment from -5 to 5
---------------
Input:
The food was awful
---------------
Dataset Response
Negative sentiment scored -5/5.
---------------
Model Response:
### Output:
The sentiment from the food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was terrible. The food was


In [60]:
from huggingface_hub import notebook_login

# Guardamos el modelo en nuestro repositorio de HF
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [63]:

username = f"JordiOrtega"
trainer.push_to_hub(f"{username}/distilbert_distilgpt2_finetuned")
tokenizer.save_pretrained('./temporary')
tokenizer.push_to_hub(f"{username}/distilbert_distilgpt2_finetuned")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/JordiOrtega/distilbert_distilgpt2_finetuned/commit/92d093c6873b1b07f8d07f54be3753ecef418d61', commit_message='Upload tokenizer', commit_description='', oid='92d093c6873b1b07f8d07f54be3753ecef418d61', pr_url=None, repo_url=RepoUrl('https://huggingface.co/JordiOrtega/distilbert_distilgpt2_finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='JordiOrtega/distilbert_distilgpt2_finetuned'), pr_revision=None, pr_num=None)

In [None]:
##################################################################################################################
############################## CONCLUSIONES ######################################################################
##################################################################################################################

In [1]:
# Conclusiones:
# En el presente notebook se ha finetuneado un modelo para que actúe como un chat y el resultado es 'esperanzador' pues se infiere que con un entreno simple
# el modelo ya actúa como un asistente conversacional. El resultado está lejos de poder pasar a producción. Posibles mejoras para llegar a tener un producto 'sólido':
# Setear unos 'training arguments' más ambiciosos que permitan al modelo aprender más (por ejemplo: de steps a epochs)
# Trabajar con WandB y comparar parámetros y métricas obtenidas en varias iteraciones
# Aplicar técnicas de "Guardrails" y lanzar la métrica "toxicidad" de la biblioteca evaluate para asegurar un funcionamiento ético


# Notas:
# Se intentó trabajar con modelos más grandes como el modelo llama2 7B pero presentaba otros retos porque la GPU no albergaba todo el modelo
# "RuntimeError: You can't move a model that has some modules offloaded to cpu or disk."