In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [2]:
# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-3.2-1B"

# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
new_model = "finetune-1B-exp-01"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 48  # 64

# Alpha parameter for LoRA scaling
lora_alpha = 12  # 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = f"./checkpoints/{new_model}"

# Number of training epochs
num_train_epochs = 10

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.05

# Initial learning rate (AdamW optimizer)
learning_rate = 5e-5

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.0001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.08

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 30  # 0

# Log every X updates steps
logging_steps = min(save_steps, 50)

if save_steps % logging_steps > 0:
    logging_steps = save_steps

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None  # None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

#**Step 4:Load everything and start the fine-tuning process**

1. First of all, we want to load the dataset we defined. Here, our dataset is already preprocessed but, usually, this is where you would reformat the prompt, filter out bad text, combine multiple datasets, etc.


2. Then, we’re configuring bitsandbytes for 4-bit quantization.


3. Next, we're loading the Llama 2 model in 4-bit precision on a GPU with the corresponding tokenizer.


4. Finally, we're loading configurations for QLoRA, regular training parameters, and passing everything to the SFTTrainer. The training can finally start!

In [3]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")
dataset['text'][0]

'<s>[INST] Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo? [/INST] Esto vale tanto para médicos como para cualquier otra profesión tras finalizar los estudios aniversarios y mi consejo sería preguntar a cuántas personas haya conocido mejor. En este caso, mi primera opción sería hablar con otros profesionales médicos, echar currículos en hospitales y cualquier centro de salud. En paralelo, trabajaría por mejorar mi marca personal como médico mediante un blog o formas digitales de comunicación como los vídeos. Y, para mejorar las posibilidades de encontrar trabajo, también participaría en congresos y encuentros para conseguir más contactos. Y, además de todo lo anterior, seguiría estudiando para presentarme a las oposiciones y ejercer la medicina en el sector público de mi país. </s>'

In [4]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)
        # bf16=True

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

Your GPU supports bfloat16: accelerate training with bf16=True


In [5]:
def prompt_instruction(prompt):
    # f"<s>[INST] {prompt} [/INST]"
    return prompt

In [6]:
# import pandas as pd
# # Ignore warnings
# logging.set_verbosity(logging.CRITICAL)

# values = []

# for i, text in enumerate(dataset['text'][0:100]):
#     print(i)
#     try:
#         prompt = text.split("[INST]")[1].split("[/INST]")[0].strip()
#         pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
#         result = pipe(prompt_instruction(prompt))
#         values.append([prompt, result[0]['generated_text']])
#     except Exception as error:
#         print(error, text)

# pd.DataFrame(values).to_csv("initial_state.csv", index=False)

In [7]:
from transformers import EarlyStoppingCallback
import os

last_checkpoint = None
if os.path.exists(output_dir):
    last_checkpoint = sorted(
        list(
            map(
                lambda v: int(v.split("-")[-1]),
                list(
                    filter(lambda s: s.startswith("checkpoint"), os.listdir(output_dir))
                )
            )
        )
    )[-1]
# Define checkpoint directory
checkpoint_dir = os.path.join(output_dir, f"checkpoint-{last_checkpoint}")  # Ajusta el nombre del checkpoint si es necesario
resume_from_checkpoint = checkpoint_dir if os.path.exists(checkpoint_dir) else None
print("Checkpoint dir: ", resume_from_checkpoint)


# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,  #  + additional_epochs,  # Incrementa el número total de épocas
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    resume_from_checkpoint=resume_from_checkpoint  # Reanuda desde el checkpoint si existe
)


# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train(resume_from_checkpoint)

Checkpoint dir:  None



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


  0%|          | 0/2500 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 1.7708, 'grad_norm': 0.19531793892383575, 'learning_rate': 7.5e-06, 'epoch': 0.12}
{'loss': 2.403, 'grad_norm': 0.1355503797531128, 'learning_rate': 1.5e-05, 'epoch': 0.24}


KeyboardInterrupt: 

In [8]:
# Save trained model
trainer.model.save_pretrained(new_model)

In [7]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] What is a large language model? [/INST] Large language models (LLMs) are powerful algorithms that can generate human-like text or respond to questions in natural language. They are trained on massive amounts of text data, and their ability to generate coherent text and respond to questions has made them a popular tool for a variety of tasks, including text summarization, translation, and sentiment analysis. </s>


In [9]:
# Empty VRAM
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()

NameError: name 'pipe' is not defined

You can train a Llama 2 model on the entire dataset using [mlabonne/guanaco-llama2](https://huggingface.co/datasets/mlabonne/guanaco-llama2)

#**Step 7: Store New Llama2 Model (Llama-2-7b-chat-finetune)**

How can we store our new Llama-2-7b-chat-finetune model now? We need to merge the weights from LoRA with the base model. Unfortunately, as far as I know, there is no straightforward way to do it: we need to reload the base model in FP16 precision and use the peft library to merge everything.

In [14]:
checkpoint_dir

'./checkpoints/test-1B-finetune\\test-1B-finetune'

In [15]:
checkpoint_dir = "test-1B-finetune"  # Ajusta el nombre del checkpoint si es necesario
print("Checkpoint dir: ", checkpoint_dir)

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, checkpoint_dir)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Checkpoint dir:  test-1B-finetune


In [20]:
prompt = "Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
result = pipe(prompt_instruction(prompt))
print(result)

[{'generated_text': 'Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo? ¿Tengo que buscar más carreras? ¿Qué puedo hacer para mejorar mi calidad de vida y mejorar mi empleo? Estos son algunos de los temas que se encuentran a la orden del día y que necesitan una respuesta rápida y clara. En esta guía, te ofrecemos una serie de consejos y estrategias para que puedas empezar a encontrar el trabajo que te corresponde y mejorar tu calidad de vida en el proceso.\n¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo?\n1. Busca trabajo en el sector de la salud y la salud pública.\nLa salud y la salud pública son un sector en el que se requieren personas que tengan conocimientos y habilidades para trabajar en la atención médica, en la investigación, en la planificación de servicios de salud, y en otras áreas relacionadas con la salud pública. La mayoría de los puestos de trabajo en esta área requieren un nivel ac

In [19]:
dataset['text'][0]

'<s>[INST] Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo? [/INST] Esto vale tanto para médicos como para cualquier otra profesión tras finalizar los estudios aniversarios y mi consejo sería preguntar a cuántas personas haya conocido mejor. En este caso, mi primera opción sería hablar con otros profesionales médicos, echar currículos en hospitales y cualquier centro de salud. En paralelo, trabajaría por mejorar mi marca personal como médico mediante un blog o formas digitales de comunicación como los vídeos. Y, para mejorar las posibilidades de encontrar trabajo, también participaría en congresos y encuentros para conseguir más contactos. Y, además de todo lo anterior, seguiría estudiando para presentarme a las oposiciones y ejercer la medicina en el sector público de mi país. </s>'

In [28]:
import pandas as pd
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

values = []

for i, text in enumerate(dataset['text'][0:100]):
    print(i)
    try:
        prompt = text.split("[INST]")[1].split("[/INST]")[0].strip()
        # print(prompt)
        pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
        result = pipe(prompt_instruction(prompt))
        values.append([prompt, result[0]['generated_text']])
        # print(result[0]['generated_text'])
    except Exception as error:
        print(error, text)

df_final_state = pd.DataFrame(values)
df_final_state.to_csv("final_state.csv", index=False)
df_final_state.head()

0
Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo?
Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo? No es una pregunta que se puede responder de manera definitiva, pero sí puedes obtener algunas ideas y orientaciones que pueden ayudarte a encontrar trabajo en el área de la salud. Algunas de las cosas que puedes hacer incluyen:
Comprender el mercado laboral: Estudia los tipos de empleos que están disponibles en tu área, los requisitos necesarios y las oportunidades de crecimiento en el sector de la salud. Esto te ayudará a identificar los campos que te interesarían y a determinar si hay oportunidades de trabajo en esos campos.
Buscar en línea: Utiliza las herramientas de búsqueda en línea para encontrar empleos en la salud. Puedes buscar por términos como "medicina", "medicina interna", "medicina pediatría" o "medicina de urgencias". También puedes busc

Unnamed: 0,0,1
0,Me gradué hace poco de la carrera de medicina ...,Me gradué hace poco de la carrera de medicina ...
1,Самый великий человек из всех живших на планете?,Самый великий человек из всех живших на планет...
2,Compose a professional email with the followin...,Compose a professional email with the followin...
3,¿Qué juegos me recomendarías si me ha gustado ...,¿Qué juegos me recomendarías si me ha gustado ...
4,Cual es el desarrollo motor de niño/a de 0 a 6...,Cual es el desarrollo motor de niño/a de 0 a 6...


In [29]:
df_initial_state = pd.read_csv("initial_state.csv").rename(columns={'0': "id", '1': "initial_result"})
df_final_state = pd.read_csv("final_state.csv").rename(columns={'0': "id", '1': "finetunned_result"})

# print(df_initial_state.head())
# print(df_final_state.head())

# df_comparison = df_initial_state.join(df_final_state, '0', 'inner')[['0', 'initial_result', 'finetunned_result']]
# df_comparison.to_csv("comparison_state.csv", index=False)
# df_comparison.head()

# Realizar el join por la columna 'id'
df_merged = pd.merge(df_initial_state, df_final_state, on='id')

# Mostrar las primeras filas del dataframe resultante
# Iterar sobre el dataframe fusionado
for index, row in df_merged.iterrows():

    print(f"ID: {row['id']}")

    def replace_id(text):
        return text.replace(row['id'], "").strip()
    
    print(f"Initial Result: |{replace_id(row['initial_result'])}|")
    print(f"\nFine-tuned Result: |{replace_id(row['finetunned_result'])}|")
    print("-" * 50)

ID: Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo?
Initial Result: |¿Cómo lo logro? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me dejes un consejo? ¿Me|

Fine-tuned Result: |No es una pregunta que se puede responder de manera definitiva, pero sí puedes obtener algunas ideas y orientaciones que pueden ayudarte a

#**Step 8: Push Model to Hugging Face Hub**

Our weights are merged and we reloaded the tokenizer. We can now push everything to the Hugging Face Hub to save our model.

In [12]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [14]:
!huggingface-cli login

model.push_to_hub("entbappy/Llama-2-7b-chat-finetune", check_pr=True)

tokenizer.push_to_hub("entbappy/Llama-2-7b-chat-finetune",check_pr=True)



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'stor

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/entbappy/Llama-2-7b-chat-finetune/commit/bec89c5a59d14d2a8d656911ade2bf73041b5707', commit_message='Upload tokenizer', commit_description='', oid='bec89c5a59d14d2a8d656911ade2bf73041b5707', pr_url=None, pr_revision=None, pr_num=None)

You can now use this model for inference by loading it like any other Llama 2 model from the Hub.