In [None]:
!pip install -q transformers datasets peft bitsandbytes trl

In [2]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# LORA
LORA_R=8                         # lora_r
LORA_ALPHA=32                    # lora_alpha
LORA_DROPOUT=0.0                 # lora_dropout

## Data

In [4]:
retain_train_df = pd.read_parquet('/content/drive/MyDrive/GIL/Unlearning/data/retain_train-00000-of-00001.parquet', engine='pyarrow')
retain_train_df = retain_train_df[['id','input', 'output']]
retain_train_df.head(10)

Unnamed: 0,id,input,output
0,6adbf83c-5071-4979-bedb-e5184b15650bsc1,"Fredericka Amber was born on December 21, 1969...",number is 889-867-1855. She can be reached at ...
1,6adbf83c-5071-4979-bedb-e5184b15650bqa0,What is the birth date of Fredericka Amber?,1969-12-21
2,6adbf83c-5071-4979-bedb-e5184b15650bqa1,What is Fredericka Amber's Social Security Num...,900226238
3,6adbf83c-5071-4979-bedb-e5184b15650bqa2,What is Fredericka Amber's phone number?,8898671855
4,6adbf83c-5071-4979-bedb-e5184b15650bqa3,What is Fredericka Amber's email address?,fredericka_amber@me.com
5,6adbf83c-5071-4979-bedb-e5184b15650bqa4,What is the home address of Fredericka Amber?,"5611 North 61st Avenue, Louisville, KY, 40258."
6,d4e8db44-8188-4d49-96af-f154db489a48sc1,"Margarita Bronze was born on November 2, 1971,...",reached via phone at 7396451535 and email at [...
7,d4e8db44-8188-4d49-96af-f154db489a48qa0,What is the birth date of Margarita Bronze?,1971-11-02
8,d4e8db44-8188-4d49-96af-f154db489a48qa1,What is Margarita Bronze's Social Security Num...,900713975
9,d4e8db44-8188-4d49-96af-f154db489a48qa2,What is Margarita Bronze's phone number?,7396451535


In [5]:
def concat_input_output(input, output):
  """It concatenates the input and the LLM output"""
  text = []
  for i,o in zip(input, output):
    text.append(f'{i}\n  {o}')
  return text

retain_train_df['text'] = concat_input_output(retain_train_df.input.values, retain_train_df.output.values)

In [6]:
print(retain_train_df.iloc[0].input)
print(retain_train_df.iloc[0].output)
print('---------------')
print(retain_train_df.iloc[0].text)

Fredericka Amber was born on December 21, 1969. Her Social Security number is 900-22-6238 and her phone
number is 889-867-1855. She can be reached at the email address [fredericka\_amber@me.com](mailto:fredericka_amber@me.com). Her home address is 5611 North 61st Avenue, Louisville, KY, 40258.
---------------
Fredericka Amber was born on December 21, 1969. Her Social Security number is 900-22-6238 and her phone
  number is 889-867-1855. She can be reached at the email address [fredericka\_amber@me.com](mailto:fredericka_amber@me.com). Her home address is 5611 North 61st Avenue, Louisville, KY, 40258.


## Model and tokenizer

In [7]:
quantizationConfig = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-1B-0724-hf", quantization_config=quantizationConfig)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.71G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

In [8]:
olmo = prepare_model_for_kbit_training(olmo)

In [9]:
print(olmo)

OlmoForCausalLM(
  (model): OlmoModel(
    (embed_tokens): Embedding(50304, 2048, padding_idx=1)
    (layers): ModuleList(
      (0-15): 16 x OlmoDecoderLayer(
        (self_attn): OlmoSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): OlmoRotaryEmbedding()
        )
        (mlp): OlmoMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): OlmoLayerNorm()
        (post_attention_layernorm): OlmoLayerNorm()
      )
    )
    (no

In [10]:
LORA_TARGET_MODULES="q_proj,k_proj,q_attn,v_proj,o_proj"    # lora_target_modules

In [11]:
# Set up lora
peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=LORA_TARGET_MODULES.split(","),
)

olmo = get_peft_model(olmo, peft_config)
olmo.print_trainable_parameters()

trainable params: 2,097,152 || all params: 1,281,884,160 || trainable%: 0.1636


In [12]:
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-0724-hf")
tokenizer.add_special_tokens({'pad_token': '<PAD>'})

tokenizer_config.json:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

1

## Fine tune

In [13]:
dataset = Dataset.from_pandas(retain_train_df)

In [18]:
training_args = SFTConfig(
    max_seq_length=512,
    report_to='none',
    output_dir="/tmp",
    dataset_text_field="text",
    packing=True,
)

trainer = SFTTrainer(
        model=olmo,
        train_dataset=dataset,
        args=training_args,
        tokenizer=tokenizer,
    )

  trainer = SFTTrainer(


In [19]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss


TrainOutput(global_step=63, training_loss=2.3548717196025546, metrics={'train_runtime': 499.7174, 'train_samples_per_second': 1.003, 'train_steps_per_second': 0.126, 'total_flos': 1814352831184896.0, 'train_loss': 2.3548717196025546, 'epoch': 3.0})

## Some inference

In [21]:
from transformers import pipeline

In [32]:
sequences = pipeline("text-generation", model='/content/drive/MyDrive/GIL/Unlearning/fine_tune_retain', tokenizer='/content/drive/MyDrive/GIL/Unlearning/fine_tune_retain')

sequences(
   "Goldi Aqua was born on March 29, 1976. She can be reached via phone at 565-577-9919 and email at goldi\_aqua@me.com. Her home",
    max_length=200,
    do_sample=True,
    truncation=True,
    top_k=10,
    num_return_sequences=1
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


[{'generated_text': 'Goldi Aqua was born on March 29, 1976. She can be reached via phone at 565-577-9919 and email at goldi\\_aqua@me.com. Her home address is: 730 N. Broadway, San Diego, CA 92101.'}]

Answer to forget:

> address is 1011 San Jose Street, Brooklyn Park, MD, 21225. Her Social Security Number is 900-57-8002.







## Save the model

In [20]:
trainer.save_model("/content/drive/MyDrive/GIL/Unlearning/fine_tune_retain")