In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    TrainerCallback,
    AutoModelForCausalLM,
    GenerationConfig
)
from tqdm.auto import tqdm
from transformers.integrations import WandbCallback
from datasets import load_dataset, DatasetDict
from peft import LoraConfig
from trl import SFTTrainer
from huggingface_hub import HfApi, HfFolder, Repository
import os
import torch
import wandb
from datetime import datetime
from types import SimpleNamespace
import json


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
class LLMSampleCB(WandbCallback):
    def __init__(self, trainer, test_dataset, num_samples=10, max_new_tokens=256, log_model="checkpoint"):
        super().__init__()
        self._log_model = log_model
        self.sample_dataset = test_dataset.select(range(num_samples))
        self.model, self.tokenizer = trainer.model, trainer.tokenizer
        self.gen_config = GenerationConfig.from_pretrained(trainer.model.name_or_path,
                                                           max_new_tokens=max_new_tokens)
    def generate(self, prompt):
        tokenized_prompt = self.tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()
        with torch.inference_mode():
            output = self.model.generate(inputs=tokenized_prompt, generation_config=self.gen_config)
        return self.tokenizer.decode(output[0][len(tokenized_prompt[0]):], skip_special_tokens=True)
    
    def samples_table(self, examples):
        records_table = wandb.Table(columns=["prompt", "generation"] + list(self.gen_config.to_dict().keys()))
        for example in tqdm(examples, leave=False):
            prompt = example["text"]
            generation = self.generate(prompt=prompt)
            records_table.add_data(prompt, generation, *list(self.gen_config.to_dict().values()))
        return records_table
        
    def on_evaluate(self, args, state, control,  **kwargs):
        super().on_evaluate(args, state, control, **kwargs)
        records_table = self.samples_table(self.sample_dataset)
        self._wandb.log({"sample_predictions":records_table})

In [3]:
# Define constants
SEED = 42
TRAIN_SIZE = 10000
EVAL_SIZE = 350
TOTAL_SAMPLE_SIZE = TRAIN_SIZE + EVAL_SIZE
# DATASET = load_dataset("teknium/GPT4-LLM-Cleaned").shuffle(seed=SEED) # Load and shuffle the GPT-4-LLM dataset from hugging face (initially contains 54k rows)
MODEL_ID = 'meta-llama/Llama-2-7b-hf' #"EleutherAI/pythia-70m", 'meta-llama/Llama-2-7b-hf'
MODEL_NAME = MODEL_ID.split('/')[-1]
DATASET_NAME = 'GPT-4_alpaca'
BASE_REPOSITORY = 'persuasion-scaling-laws'

In [4]:
os.environ["WANDB_NOTEBOOK_NAME"] = "04_instruction_tune_v2.ipynb"
wandb.init(project="GPT-4_alpaca_ft",
           entity="kobihackenburg",
           job_type="train",
           tags=["hf_sft_lora", "7b"],
           name=f"{BASE_REPOSITORY}/{MODEL_NAME}/{DATASET_NAME}/10k_filtered")
artifact = wandb.use_artifact('capecape/alpaca_ft/alpaca_gpt4_splitted:v4', type='dataset')
artifact_dir = artifact.download()

[34m[1mwandb[0m: Currently logged in as: [33mkobihackenburg[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [5]:
alpaca_ds = load_dataset("json", data_dir=artifact_dir)
alpaca_ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 51002
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 1000
    })
})

In [6]:
def prompt_no_input(row):
    return ("Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Response:\n{output}").format_map(row)

def prompt_input(row):
    return ("Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n{output}").format_map(row)

def create_prompt(row):
    return prompt_no_input(row) if row["input"] == "" else prompt_input(row)

In [7]:
#drop input rows
def filter_empty_input(row):
    return row["input"] != ""

filtered_alpaca_ds = {split: ds.filter(filter_empty_input) for split, ds in alpaca_ds.items()}
filtered_alpaca_ds

{'train': Dataset({
     features: ['instruction', 'input', 'output'],
     num_rows: 20276
 }),
 'test': Dataset({
     features: ['instruction', 'input', 'output'],
     num_rows: 403
 })}

In [8]:
#remove refusals
refusal_pattern = r"\b(?:I will not|I(?:'m| am) unable to|I cannot|I can't|I'm sorry|I am sorry|I am not able|I'm unable|I'm not able|I am unable|I am unable |AI assistant|AI chatbot| AI language model)\b"

import re

def filter_refusals(row):
    return re.search(refusal_pattern, row["output"]) is None

filtered_alpaca_ds = {split: ds.filter(filter_refusals) for split, ds in filtered_alpaca_ds.items()}
filtered_alpaca_ds

{'train': Dataset({
     features: ['instruction', 'input', 'output'],
     num_rows: 19597
 }),
 'test': Dataset({
     features: ['instruction', 'input', 'output'],
     num_rows: 388
 })}

In [9]:
train_dataset = filtered_alpaca_ds["train"].select(range(TRAIN_SIZE))
eval_dataset = filtered_alpaca_ds["test"].select(range(EVAL_SIZE))

In [10]:
model_kwargs = dict(
    device_map={"" : 0},
    trust_remote_code=True,
    # low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    # use_flash_attention_2=True,
    use_cache=False,
)

In [11]:
peft_config = LoraConfig(
    r=64,  # the rank of the LoRA matrices
    lora_alpha=16, # the weight
    lora_dropout=0.1, # dropout to add to the LoRA layers
    bias="none", # add bias to the nn.Linear layers?
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj","v_proj","o_proj"] # the name of the layers to add LoRA
    # target_modules=["query_key_value", "dense"] # uncomment if using a Pythia model
)

In [12]:
batch_size = 16
gradient_accumulation_steps = 2
num_train_epochs = 3

total_num_steps = num_train_epochs * 11_210 // (batch_size * gradient_accumulation_steps)

total_num_steps

1050

In [13]:
output_dir = "./output/"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size//2,
    bf16=True,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio = 0.1,
    max_steps=total_num_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs=dict(use_reentrant=False),
    evaluation_strategy="steps",
    eval_steps=total_num_steps // num_train_epochs,
    # eval_steps=10,
    # logging strategies
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="steps",
    save_steps=total_num_steps // num_train_epochs,
)

In [14]:
trainer = SFTTrainer(
    model=MODEL_ID,
    model_init_kwargs=model_kwargs,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    packing=True,
    max_seq_length=1024,
    args=training_args,
    formatting_func=create_prompt,
    peft_config=peft_config,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

[codecarbon INFO @ 15:08:17] [setup] RAM Tracking...
[codecarbon INFO @ 15:08:17] [setup] GPU Tracking...
[codecarbon INFO @ 15:08:17] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 15:08:17] [setup] CPU Tracking...
[codecarbon INFO @ 15:08:19] CPU Model on constant consumption mode: Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz
[codecarbon INFO @ 15:08:19] >>> Tracker's metadata:
[codecarbon INFO @ 15:08:19]   Platform system: Linux-5.15.0-91-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 15:08:19]   Python version: 3.10.12
[codecarbon INFO @ 15:08:19]   CodeCarbon version: 2.3.4
[codecarbon INFO @ 15:08:19]   Available RAM : 251.516 GB
[codecarbon INFO @ 15:08:19]   CPU count: 128
[codecarbon INFO @ 15:08:19]   CPU model: Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz
[codecarbon INFO @ 15:08:19]   GPU count: 2
[codecarbon INFO @ 15:08:19]   GPU model: 2 x NVIDIA A100 80GB PCIe


In [15]:

def create_prompt_no_anwer(row):
    row["output"] = ""
    return {"text": create_prompt(row)}

test_dataset = eval_dataset.map(create_prompt_no_anwer)


Map:   0%|          | 0/350 [00:00<?, ? examples/s]

In [16]:
# Cell 14
wandb_callback = LLMSampleCB(trainer, test_dataset, num_samples=10, max_new_tokens=256)


In [17]:
trainer.add_callback(wandb_callback)

In [18]:
trainer.train()
wandb.finish()

[codecarbon INFO @ 15:08:38] Energy consumed for RAM : 0.000393 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 15:08:38] Energy consumed for all GPUs : 0.001874 kWh. Total GPU Power : 449.3557408961135 W
[codecarbon INFO @ 15:08:38] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 15:08:38] 0.002444 kWh of electricity used since the beginning.


Step,Training Loss,Validation Loss
350,0.7233,0.80367
700,0.5453,0.940188
1050,0.5063,0.985014


[codecarbon INFO @ 15:08:53] Energy consumed for RAM : 0.000786 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 15:08:53] Energy consumed for all GPUs : 0.003693 kWh. Total GPU Power : 436.9930914567071 W
[codecarbon INFO @ 15:08:53] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 15:08:53] 0.004834 kWh of electricity used since the beginning.
[codecarbon INFO @ 15:09:08] Energy consumed for RAM : 0.001178 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 15:09:08] Energy consumed for all GPUs : 0.005524 kWh. Total GPU Power : 439.8022858992363 W
[codecarbon INFO @ 15:09:08] Energy consumed for all CPUs : 0.000532 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 15:09:08] 0.007234 kWh of electricity used since the beginning.
[codecarbon INFO @ 15:09:23] Energy consumed for RAM : 0.001571 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 15:09:23] Energy consumed for all GPUs : 0.007372 kWh. Total GPU Power : 444.0239089919122

  0%|          | 0/10 [00:00<?, ?it/s]

[codecarbon INFO @ 16:39:24] Energy consumed for RAM : 0.142888 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 16:39:24] Energy consumed for all GPUs : 0.699341 kWh. Total GPU Power : 306.85171407148545 W
[codecarbon INFO @ 16:39:24] Energy consumed for all CPUs : 0.064450 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 16:39:24] 0.906679 kWh of electricity used since the beginning.
[codecarbon INFO @ 16:39:39] Energy consumed for RAM : 0.143281 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 16:39:39] Energy consumed for all GPUs : 0.700373 kWh. Total GPU Power : 247.79668470806826 W
[codecarbon INFO @ 16:39:39] Energy consumed for all CPUs : 0.064627 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 16:39:39] 0.908280 kWh of electricity used since the beginning.
Checkpoint destination directory ./output/checkpoint-350 already exists and is non-empty. Saving will proceed but saved results may be invalid.
[34m[1mwandb[0m: Adding directory to artifact (./output/check

  0%|          | 0/10 [00:00<?, ?it/s]

[codecarbon INFO @ 18:09:09] Energy consumed for RAM : 0.283810 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 18:09:09] Energy consumed for all GPUs : 1.392793 kWh. Total GPU Power : 291.6806396948043 W
[codecarbon INFO @ 18:09:09] Energy consumed for all CPUs : 0.128014 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 18:09:09] 1.804617 kWh of electricity used since the beginning.
Checkpoint destination directory ./output/checkpoint-700 already exists and is non-empty. Saving will proceed but saved results may be invalid.
[codecarbon INFO @ 18:09:24] Energy consumed for RAM : 0.284203 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 18:09:24] Energy consumed for all GPUs : 1.393876 kWh. Total GPU Power : 260.1402607953005 W
[codecarbon INFO @ 18:09:24] Energy consumed for all CPUs : 0.128191 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 18:09:24] 1.806269 kWh of electricity used since the beginning.
[34m[1mwandb[0m: Adding directory to artifact (./output/checkpo

  0%|          | 0/10 [00:00<?, ?it/s]

[codecarbon INFO @ 19:37:39] Energy consumed for RAM : 0.422775 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 19:37:39] Energy consumed for all GPUs : 2.080745 kWh. Total GPU Power : 263.3672739864591 W
[codecarbon INFO @ 19:37:39] Energy consumed for all CPUs : 0.190693 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 19:37:39] 2.694214 kWh of electricity used since the beginning.
Checkpoint destination directory ./output/checkpoint-1050 already exists and is non-empty. Saving will proceed but saved results may be invalid.
[codecarbon INFO @ 19:37:54] Energy consumed for RAM : 0.423168 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 19:37:54] Energy consumed for all GPUs : 2.081784 kWh. Total GPU Power : 249.27939336045358 W
[codecarbon INFO @ 19:37:54] Energy consumed for all CPUs : 0.190871 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 19:37:54] 2.695823 kWh of electricity used since the beginning.
[34m[1mwandb[0m: Adding directory to artifact (./output/check

VBox(children=(Label(value='1169.860 MB of 1290.177 MB uploaded (4.491 MB deduped)\r'), FloatProgress(value=0.…

0,1
eval/loss,▁▁▆▆██
eval/runtime,▂▂▁▁██
eval/samples_per_second,▇▇██▁▁
eval/steps_per_second,▇▇██▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,▇▃▁▁▂▁▂▂▃▂▂▃▃▄▃▄▄▅▄▅▅▆▆▆▅▇▆▆▆▆▆▆▅▅▇▅▅▆█▅
train/learning_rate,▂▃▅▆██████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁
train/loss,█▆▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁▁

0,1
eval/loss,0.98501
eval/runtime,7.5598
eval/samples_per_second,8.201
eval/steps_per_second,0.529
train/epoch,37.5
train/global_step,1050.0
train/grad_norm,0.12012
train/learning_rate,0.0
train/loss,0.5063
train/total_flos,2.723863493093622e+18


In [19]:
# EXPORT MODEL
# Get the trained model
os.environ["HF_API_TOKEN"] = "token here"
model = trainer.model

# push model to Hugging Face hub
model.push_to_hub(
    f"{BASE_REPOSITORY}/{MODEL_NAME}-{DATASET_NAME}_10k_filtered",
    use_auth_token=os.environ["HF_API_TOKEN"],
    private=True,
)



adapter_model.safetensors:   0%|          | 0.00/134M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/persuasion-scaling-laws/Llama-2-7b-hf-GPT-4_alpaca_10k_filtered/commit/06a081a3f514eb956da35489f87aba392f54d9f5', commit_message='Upload model', commit_description='', oid='06a081a3f514eb956da35489f87aba392f54d9f5', pr_url=None, pr_revision=None, pr_num=None)