# Antes de correr el programa

Para asegurarse de que no haya problemas con las rutas, se debe mover el programa a la carpeta principal o cambiar las rutas manualmente.

# Headers

In [1]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Algunas funciones

In [2]:
def concat_input_output(input, output):
    """Join each prompt with its matching model output into one string.

    Args:
        input: iterable of prompt texts.
        output: iterable of output texts, paired positionally with ``input``.

    Returns:
        A list with one string per pair, formatted as the prompt, a newline,
        two spaces and then the output. Pairing uses ``zip``, so the result
        is as long as the shorter of the two iterables.
    """
    return [f'{prompt}\n  {completion}' for prompt, completion in zip(input, output)]

def analisis_modelo(model):
    """Print the model's structure followed by the size of every state-dict entry.

    Args:
        model: object that supports ``print(model)`` and exposes
            ``state_dict()`` returning a mapping from names to tensors.

    Output:
        First the model's printed representation, then one line per
        state-dict entry with its name, a tab and the tensor size.
    """
    print(model)
    # Build the state dict once; calling state_dict() inside the loop would
    # reconstruct the whole mapping for every single entry.
    state_dict = model.state_dict()
    for param_name, tensor in state_dict.items():
        print(param_name, "\t", tensor.size())

# Extracción del dataset

In [3]:
# Columns kept from each parquet split.
kept_columns = ['id', 'input', 'output']

# Load the retain split, keep the relevant columns and preview it.
retain_train_df = pd.read_parquet('./data/retain_train-00000-of-00001.parquet', engine='pyarrow')[kept_columns]
print(retain_train_df.head(5))

# Same for the forget split.
forget_train_df = pd.read_parquet('./data/forget_train-00000-of-00001.parquet', engine='pyarrow')[kept_columns]
print(forget_train_df.head(5))

# Add a `text` column holding the input concatenated with the output.
retain_train_df = retain_train_df.assign(
    text=concat_input_output(retain_train_df['input'].values, retain_train_df['output'].values)
)
forget_train_df = forget_train_df.assign(
    text=concat_input_output(forget_train_df['input'].values, forget_train_df['output'].values)
)

print("\n\n")

# Show the first example of each split: input, output, then the combined text.
for split_df in (retain_train_df, forget_train_df):
    first_row = split_df.iloc[0]
    print(first_row['input'])
    print(first_row['output'])
    print('---------------')
    print(first_row['text'])

                                        id  \
0  6adbf83c-5071-4979-bedb-e5184b15650bsc1   
1  6adbf83c-5071-4979-bedb-e5184b15650bqa0   
2  6adbf83c-5071-4979-bedb-e5184b15650bqa1   
3  6adbf83c-5071-4979-bedb-e5184b15650bqa2   
4  6adbf83c-5071-4979-bedb-e5184b15650bqa3   

                                               input  \
0  Fredericka Amber was born on December 21, 1969...   
1        What is the birth date of Fredericka Amber?   
2  What is Fredericka Amber's Social Security Num...   
3           What is Fredericka Amber's phone number?   
4          What is Fredericka Amber's email address?   

                                              output  
0  number is 889-867-1855. She can be reached at ...  
1                                         1969-12-21  
2                                          900226238  
3                                         8898671855  
4                            fredericka_amber@me.com  
                                          id  \
0  "2ebb

# Cargar el modelo base

In [4]:
# LoRA hyperparameters (forwarded to LoraConfig below)
LORA_R=8                         # passed as `r`
LORA_ALPHA=32                    # passed as `lora_alpha`
LORA_DROPOUT=0.0                 # passed as `lora_dropout`


# Load the base model in 4-bit ("nf4" quantization type) with float16 as compute dtype.
quantizationConfig = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-1B-0724-hf", quantization_config=quantizationConfig)
olmo = prepare_model_for_kbit_training(olmo)

# Comma-separated module names that receive LoRA adapters; split into a list below.
# NOTE(review): the printed module tree shows q_proj/k_proj inside OlmoAttention but
# no `q_attn`; presumably that entry simply matches nothing - confirm.
LORA_TARGET_MODULES="q_proj,k_proj,q_attn,v_proj,o_proj"    # passed as `target_modules`

# Set up LoRA
peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=LORA_TARGET_MODULES.split(","),
)

# Wrap the quantized model with the LoRA adapters, then print its structure,
# the size of every state-dict entry and the trainable-parameter summary.
olmo = get_peft_model(olmo, peft_config)
analisis_modelo(olmo)
olmo.print_trainable_parameters()


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OlmoForCausalLM(
      (model): OlmoModel(
        (embed_tokens): Embedding(50304, 2048, padding_idx=1)
        (layers): ModuleList(
          (0-15): 16 x OlmoDecoderLayer(
            (self_attn): OlmoAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit

# Fine-tuning del modelo

In [5]:
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-0724-hf")
# Use '[PAD]' or a similar valid token as the padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# The model is fine-tuned on the *forget* split only.
dataset = Dataset.from_pandas(forget_train_df)

# Training configuration: reads the `text` column, packing enabled, sequences
# of up to 512 tokens, no experiment reporting, outputs written to /tmp.
training_args = SFTConfig(
    max_seq_length=512,
    report_to='none',
    output_dir="/tmp",
    dataset_text_field="text",
    packing=True,
)

trainer = SFTTrainer(
        model=olmo,
        train_dataset=dataset,
        args=training_args,
        tokenizer=tokenizer,
    )

trainer.train()

# NOTE(review): `trainer.model` is presumably the very same object that was passed
# in as `olmo` (no copy is made anywhere in this cell), so after training both
# names would refer to the fine-tuned weights - confirm before using `olmo` as
# the "pretrained" reference.
olmo_finetuned = trainer.model
analisis_modelo(olmo_finetuned)

  trainer = SFTTrainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OlmoForCausalLM(
      (model): OlmoModel(
        (embed_tokens): Embedding(50304, 2048, padding_idx=1)
        (layers): ModuleList(
          (0-15): 16 x OlmoDecoderLayer(
            (self_attn): OlmoAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit

# Aplicación del task Vector

In [6]:
class TaskVector():
    """Per-key difference between a fine-tuned and a pretrained model.

    The vector is stored in ``self.vector`` as a dict mapping state-dict keys
    to tensors. Supports addition (also through ``sum``), negation and
    application onto a model with an optional scaling coefficient.
    """

    def __init__(self, pretrained_model=None, finetuned_model=None, vector=None):
        """Initializes the task vector from a pretrained and a finetuned model.

        This can either be done by passing two models (each exposing
        ``base_model.state_dict()``), or by directly passing in the task
        vector dict via ``vector``.

        Args:
            pretrained_model: model holding the reference weights.
            finetuned_model: model holding the fine-tuned weights.
            vector: precomputed dict of key -> tensor; when given, the two
                models are ignored.

        Entries whose dtype is int64 or uint8 are skipped. Keys missing from
        the fine-tuned state dict are reported and skipped, matching how
        ``__add__`` and ``apply_to`` treat missing keys.
        """
        if vector is not None:
            self.vector = vector
        else:
            assert pretrained_model is not None and finetuned_model is not None
            if pretrained_model is finetuned_model:
                # Subtracting a model from itself always yields zeros, which
                # makes every later operation on this vector a no-op.
                print('Warning, pretrained_model and finetuned_model are the same object; '
                      'the task vector will be all zeros.')
            with torch.no_grad():
                pretrained_state_dict = pretrained_model.base_model.state_dict()
                finetuned_state_dict = finetuned_model.base_model.state_dict()
                self.vector = {}
                for key, pretrained_value in pretrained_state_dict.items():
                    if pretrained_value.dtype in [torch.int64, torch.uint8]:
                        continue
                    if key not in finetuned_state_dict:
                        print(f'Warning, key {key} is present in the pretrained state dict '
                              f'but not in the finetuned state dict.')
                        continue
                    self.vector[key] = finetuned_state_dict[key] - pretrained_value

    def __add__(self, other):
        """Add two task vectors together.

        Keys absent from ``other`` are reported and left out of the result.
        """
        with torch.no_grad():
            new_vector = {}
            for key, value in self.vector.items():
                if key not in other.vector:
                    print(f'Warning, key {key} is not present in both task vectors.')
                    continue
                new_vector[key] = value + other.vector[key]
        return TaskVector(vector=new_vector)

    def __radd__(self, other):
        """Support ``sum([...])``: a ``None`` or int left operand returns self."""
        if other is None or isinstance(other, int):
            return self
        return self.__add__(other)

    def __neg__(self):
        """Negate a task vector."""
        with torch.no_grad():
            new_vector = {key: -value for key, value in self.vector.items()}
        return TaskVector(vector=new_vector)

    def apply_to(self, pre_model, scaling_coef=1.0):
        """Apply a task vector to a pretrained model.

        Args:
            pre_model: model exposing ``base_model.state_dict()`` and
                ``base_model.load_state_dict()``.
            scaling_coef: factor the vector is multiplied by before being
                added to the model's current weights.

        Returns:
            The same ``pre_model`` object; its weights are updated in place
            (no copy of the model is made). Keys of the model that are not in
            the task vector are reported and left untouched.
        """
        with torch.no_grad():
            new_state_dict = {}
            pretrained_state_dict = pre_model.base_model.state_dict()
            for key, current_value in pretrained_state_dict.items():
                if key not in self.vector:
                    print(f'Warning: key {key} is present in the pretrained state dict but not in the task vector')
                    continue
                new_state_dict[key] = current_value + scaling_coef * self.vector[key]
        # strict=False because skipped keys are intentionally absent.
        pre_model.base_model.load_state_dict(new_state_dict, strict=False)
        return pre_model


In [7]:
# `olmo_finetuned` (trainer.model) is the same PEFT-wrapped object as `olmo`:
# the adapters were attached and trained in place, so
# TaskVector(olmo, olmo_finetuned) would subtract the weights from themselves
# and produce an all-zero vector, leaving the fine-tuned model unchanged.
#
# The pretrained behaviour can still be recovered: only the LoRA adapters were
# trained, and with PEFT's default initialization every lora_B matrix starts at
# zero, so the pretrained model is equivalent to the current one with lora_B
# set to zero. The task vector is therefore the trained lora_B weights (and
# zero for every other entry).
finetuned_state_dict = olmo_finetuned.base_model.state_dict()

with torch.no_grad():
    lora_delta = {}
    for key, tensor in finetuned_state_dict.items():
        # Same dtype filter TaskVector applies when it builds a vector itself.
        if tensor.dtype in [torch.int64, torch.uint8]:
            continue
        if "lora_B" in key:
            # Clone: state_dict tensors share storage with the live parameters,
            # which are reset below.
            lora_delta[key] = tensor.detach().clone()
        else:
            lora_delta[key] = torch.zeros_like(tensor)

# Create the task vector
task_vector = TaskVector(vector=lora_delta)
# Negate the task vector
neg_task_vector = -task_vector
print(neg_task_vector)

# Put the model back into its pretrained-equivalent state (lora_B == 0) so the
# negated vector is applied to the pretrained weights, not the fine-tuned ones.
with torch.no_grad():
    for param_name, param in olmo.named_parameters():
        if "lora_B" in param_name:
            param.zero_()

# Apply the task vector
result_model = neg_task_vector.apply_to(olmo, scaling_coef=0.5)

analisis_modelo(result_model)

<__main__.TaskVector object at 0x746c1bd10370>
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OlmoForCausalLM(
      (model): OlmoModel(
        (embed_tokens): Embedding(50304, 2048, padding_idx=1)
        (layers): ModuleList(
          (0-15): 16 x OlmoDecoderLayer(
            (self_attn): OlmoAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
        

In [8]:
#result_model.save_pretrained("./models/result_model_TaskVector")

In [9]:
#result_model.base_model.save_pretrained("./models/test/result_model_TaskVector")
# Merge the adapters into the underlying model and save the merged result.
# `result_model` is rebound to the merged model, so this cell is meant to be
# run once per kernel session.
# NOTE(review): the base model was loaded in 4-bit; presumably the merge goes
# through the quantized weights and may introduce rounding differences - confirm.
result_model = result_model.merge_and_unload()
result_model.save_pretrained("./models/test0/result_model_TaskVector")

