# Antes de correr el programa

Para asegurarse de que no haya problemas con las rutas, se debe mover el programa a la carpeta principal o cambiar las rutas manualmente (por ejemplo, `data_root_dir`).

# Headers

In [1]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [2]:
# Paths: root directory of the data files, given relative to the working directory
data_root_dir = "./Data/"

# Algunas funciones

In [3]:
def concat_input_output(input, output):
    """Join each prompt with its LLM output into a single string.

    Items are paired positionally with ``zip``, so the result has as many
    entries as the shorter of the two iterables. Each entry is the prompt,
    followed by a newline and two spaces, followed by the output.

    Args:
        input: Iterable of prompts.
        output: Iterable of LLM outputs, aligned with ``input``.

    Returns:
        A list with one concatenated string per (prompt, output) pair.
    """
    return [f'{prompt}\n  {completion}' for prompt, completion in zip(input, output)]

def analisis_modelo(model):
    """Print a model followed by the size of every entry in its state dict.

    Args:
        model: Object that can be printed and that exposes ``state_dict()``,
            returning a mapping from entry name to a value with ``.size()``.

    Returns:
        None. Everything is written to standard output.
    """
    print(model)
    # Build the state dict once; calling state_dict() again for every entry
    # would rebuild the whole mapping on each loop iteration.
    state = model.state_dict()
    for name, tensor in state.items():
        print(name, "\t", tensor.size())

# Extracción del dataset

In [4]:
# Read the forget-split training parquet and keep only the columns used below.
forget_train_df = pd.read_parquet(
    f'{data_root_dir}data/forget_train-00000-of-00001.parquet',
    engine='pyarrow',
)[['id', 'input', 'output']]
print(forget_train_df.head(5))

# Add a 'text' column holding the concatenated input + output of each row.
forget_train_df['text'] = concat_input_output(
    forget_train_df['input'].values,
    forget_train_df['output'].values,
)

print("\n\n")

# Show the first example: its two parts, a separator, then the combined text.
print(forget_train_df.iloc[0]['input'])
print(forget_train_df.iloc[0]['output'])
print('---------------')
print(forget_train_df.iloc[0]['text'])

                                          id  \
0  "2ebbbb06-ab81-4bdf-af75-0157c7178a82"sc1   
1  "2ebbbb06-ab81-4bdf-af75-0157c7178a82"qa0   
2                                67148749sc1   
3                                67148749qa0   
4  "4477840f-1840-4aae-96d8-5389db92d7e0"sc1   

                                               input  \
0  In the mystical city of Deadesius, where magic...   
1    Who did Catherina seek to protect from Marcile?   
2  Soubhagya Kumar Misra\n\nSoubhagya Kumar Misra...   
3  Which poetry collection by Misra won the Sahit...   
4  Sharity, a vivacious young woman with an unque...   

                                              output  
0  the power to break any curse. Armed with her m...  
1                             The city of Deadesius.  
2  Odia poetry, the Odisha Sahitya Akademi awarde...  
3                                        Dwa Suparna  
4  rugged, with a mess of dark hair and a pair of...  



In the mystical city of Deadesius, where 

# Cargar el modelo base

In [5]:
# LoRA hyperparameters
LORA_R = 8               # lora_r
LORA_ALPHA = 32          # lora_alpha
LORA_DROPOUT = 0.0       # lora_dropout
LORA_TARGET_MODULES = "q_proj,k_proj,q_attn,v_proj,o_proj"    # lora_target_modules

# 4-bit NF4 quantization with float16 as the compute dtype.
quantizationConfig = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base checkpoint, then run PEFT's k-bit training preparation.
olmo = AutoModelForCausalLM.from_pretrained(
    "allenai/OLMo-7B-0724-Instruct-hf",
    quantization_config=quantizationConfig,
)
olmo = prepare_model_for_kbit_training(olmo)

# LoRA adapter configuration; the comma-separated target list is split into names.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=LORA_TARGET_MODULES.split(","),
    bias="none",
)

# Wrap the model with the adapters, then print its structure and trainable parameters.
olmo = get_peft_model(olmo, peft_config)
analisis_modelo(olmo)
olmo.print_trainable_parameters()


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OlmoForCausalLM(
      (model): OlmoModel(
        (embed_tokens): Embedding(50304, 4096, padding_idx=1)
        (layers): ModuleList(
          (0-31): 32 x OlmoDecoderLayer(
            (self_attn): OlmoAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit

# Fine-tuning del modelo

In [6]:
# Tokenizer for the same checkpoint as the base model.
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B-0724-Instruct-hf")
# Disabled: registering explicit special tokens.
#tokenizer.add_special_tokens({
#    'pad_token': '[PAD]',   # Set an explicit padding token
#    'eos_token': '[EOS]'    # Make sure the eos_token is different
#})

# Convert the DataFrame into a datasets.Dataset for the trainer.
dataset = Dataset.from_pandas(forget_train_df)

# Supervised fine-tuning settings: train on the "text" column with packing enabled.
training_args = SFTConfig(
    output_dir="/tmp",
    report_to='none',
    dataset_text_field="text",
    max_seq_length=256,
    packing=True,
)

trainer = SFTTrainer(
    model=olmo,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

trainer.train()

# Keep a handle on the trained model and print its structure again.
olmo_finetuned = trainer.model
analisis_modelo(olmo_finetuned)

  trainer = SFTTrainer(


Generating train split: 0 examples [00:00, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OlmoForCausalLM(
      (model): OlmoModel(
        (embed_tokens): Embedding(50304, 4096, padding_idx=1)
        (layers): ModuleList(
          (0-31): 32 x OlmoDecoderLayer(
            (self_attn): OlmoAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit

In [7]:
# Fold the LoRA adapters into the base model.
# The guard keeps this cell safe to re-run: after the first run the name
# `olmo_finetuned` is bound to the object returned by merge_and_unload(),
# which presumably no longer exposes that method (verify against the PEFT
# version in use), so an unguarded second call would raise.
if hasattr(olmo_finetuned, "merge_and_unload"):
    olmo_finetuned = olmo_finetuned.merge_and_unload()

# Save the tokenizer and the merged model into the same directory.
tokenizer.save_pretrained("./models/Finetuned_Forget")
olmo_finetuned.save_pretrained("./models/Finetuned_Forget")

