In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    TrainerCallback,
    AutoModelForCausalLM,
    GenerationConfig
)
from tqdm.auto import tqdm
from transformers.integrations import WandbCallback
from datasets import load_dataset, DatasetDict
from peft import LoraConfig
from trl import SFTTrainer
from huggingface_hub import HfApi, HfFolder, Repository
import os
import torch
import wandb
from datetime import datetime
from types import SimpleNamespace
import json
import pandas as pd
import random

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
# !wget https://huggingface.co/datasets/Open-Orca/OpenOrca/resolve/main/1M-GPT4-Augmented.parquet

In [3]:
# # Load Parquet file into a pandas DataFrame
# df = pd.read_parquet('1M-GPT4-Augmented.parquet')

# # Convert the DataFrame to JSON Lines and save it to a file
# with open('1M-GPT4-Augmented.jsonl', 'w') as f:
#     for index, row in df.iterrows():
#         json.dump(row.to_dict(), f)
#         f.write('\n')

In [4]:
class LLMSampleCB(WandbCallback):
    def __init__(self, trainer, test_dataset, num_samples=10, max_new_tokens=256, log_model="checkpoint"):
        super().__init__()
        self._log_model = log_model
        self.sample_dataset = test_dataset.select(range(num_samples))
        self.model, self.tokenizer = trainer.model, trainer.tokenizer
        self.gen_config = GenerationConfig.from_pretrained(trainer.model.name_or_path,
                                                           max_new_tokens=max_new_tokens)
    def generate(self, prompt):
        tokenized_prompt = self.tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()
        with torch.inference_mode():
            output = self.model.generate(inputs=tokenized_prompt, generation_config=self.gen_config)
        return self.tokenizer.decode(output[0][len(tokenized_prompt[0]):], skip_special_tokens=True)
    
    def samples_table(self, examples):
        records_table = wandb.Table(columns=["prompt", "generation"] + list(self.gen_config.to_dict().keys()))
        for example in tqdm(examples, leave=False):
            prompt = example["text"]
            generation = self.generate(prompt=prompt)
            records_table.add_data(prompt, generation, *list(self.gen_config.to_dict().values()))
        return records_table
        
    def on_evaluate(self, args, state, control,  **kwargs):
        super().on_evaluate(args, state, control, **kwargs)
        records_table = self.samples_table(self.sample_dataset)
        self._wandb.log({"sample_predictions":records_table})

In [5]:
dataset_file = "1M-GPT4-Augmented.jsonl"
OpenOrca = []

with open(dataset_file, "r") as f:
    for line in f:
        OpenOrca.append(json.loads(line))

In [6]:
type(OpenOrca), OpenOrca[0:3], len(OpenOrca)

(list,
 [{'id': 'niv.242684',
   'system_prompt': '',
   'question': "You will be given a definition of a task first, then some input of the task.\nThis task is about using the specified sentence and converting the sentence to Resource Description Framework (RDF) triplets of the form (subject, predicate object). The RDF triplets generated must be such that the triplets accurately capture the structure and semantics of the input sentence. The input is a sentence and the output is a list of triplets of the form [subject, predicate, object] that capture the relationships present in the sentence. When a sentence has more than 1 RDF triplet possible, the output must contain all of them.\n\nAFC Ajax (amateurs)'s ground is Sportpark De Toekomst where Ajax Youth Academy also play.\nOutput:",
   'response': '[\n  ["AFC Ajax (amateurs)", "has ground", "Sportpark De Toekomst"],\n  ["Ajax Youth Academy", "plays at", "Sportpark De Toekomst"]\n]'},
  {'id': 'flan.564327',
   'system_prompt': 'You ar

In [7]:
seed = 42

random.seed(seed)
random.shuffle(OpenOrca) 

In [8]:
# train_dataset = alpaca[:-1000]
train_dataset = OpenOrca[:12000]
eval_dataset = OpenOrca[-1200:]

In [9]:
train_df = pd.DataFrame(train_dataset)
eval_df = pd.DataFrame(eval_dataset)

train_table = wandb.Table(dataframe=train_df)
eval_table  = wandb.Table(dataframe=eval_df)

train_df.to_json("OpenOrca_train.jsonl", orient='records', lines=True)
eval_df.to_json("OpenOrca_eval.jsonl", orient='records', lines=True)

with wandb.init(project="OpenOrca_ft", entity="kobihackenburg", job_type="split_data"):
    at = wandb.Artifact(
        name="OpenOrca_splitted", 
        type="dataset",
        description="OpenOrca dataset for instruction finetuning"
    )
    at.add_file("OpenOrca_train.jsonl")
    at.add_file("OpenOrca_eval.jsonl")
    wandb.log_artifact(at)
    wandb.log({"train_dataset":train_table, "eval_dataset":eval_table})

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkobihackenburg[0m. Use [1m`wandb login --relogin`[0m to force relogin




VBox(children=(Label(value='42.800 MB of 42.800 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [10]:
# Define constants
SEED = 42
TRAIN_SIZE = 10000
EVAL_SIZE = 1000
TOTAL_SAMPLE_SIZE = TRAIN_SIZE + EVAL_SIZE
MODEL_ID = 'meta-llama/Llama-2-7b-hf' #"EleutherAI/pythia-70m", 'meta-llama/Llama-2-7b-hf'
MODEL_NAME = MODEL_ID.split('/')[-1]
DATASET_NAME = 'OpenOrca'
BASE_REPOSITORY = 'persuasion-scaling-laws'

wandb.init(project="OpenOrca_ft",
           entity="kobihackenburg",
           job_type="train",
           tags=["hf_sft_lora", "7b"],
           name=f"{BASE_REPOSITORY}/{MODEL_NAME}/{DATASET_NAME}/10k_filtered")
artifact = wandb.use_artifact('kobihackenburg/OpenOrca_ft/OpenOrca_splitted:v2', type='dataset')
artifact_dir = artifact.download()

[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [11]:
OpenOrca_ds = load_dataset("json", data_dir=artifact_dir)
OpenOrca_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'system_prompt', 'question', 'response'],
        num_rows: 12000
    })
    test: Dataset({
        features: ['id', 'system_prompt', 'question', 'response'],
        num_rows: 1200
    })
})

In [12]:
#drop "id" and "system_prompt" features
OpenOrca_ds = OpenOrca_ds.remove_columns(["id", "system_prompt"])
OpenOrca_ds

DatasetDict({
    train: Dataset({
        features: ['question', 'response'],
        num_rows: 12000
    })
    test: Dataset({
        features: ['question', 'response'],
        num_rows: 1200
    })
})

In [13]:
def create_prompt(row):
    return ("Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{question}\n\n### Response:\n{response}").format_map(row)


In [14]:
#remove refusals
refusal_pattern = r"\b(?:I will not|I(?:'m| am) unable to|I cannot|I can't|I'm sorry|I am sorry|I am not able|I'm unable|I'm not able|I am unable|I am unable |AI assistant|AI chatbot| AI language model)\b"

import re

def filter_refusals(row):
    return re.search(refusal_pattern, row["response"]) is None

OpenOrca_ds = {split: ds.filter(filter_refusals) for split, ds in OpenOrca_ds.items()}
OpenOrca_ds

{'train': Dataset({
     features: ['question', 'response'],
     num_rows: 11870
 }),
 'test': Dataset({
     features: ['question', 'response'],
     num_rows: 1188
 })}

In [15]:
#remove "step 1" responses

def filter_steps(row):
    pattern = r"\b(?:step 1|Step 1|step one)\b"
    return re.search(pattern, row["response"]) is None

OpenOrca_ds = {split: ds.filter(filter_steps) for split, ds in OpenOrca_ds.items()}
OpenOrca_ds

{'train': Dataset({
     features: ['question', 'response'],
     num_rows: 10567
 }),
 'test': Dataset({
     features: ['question', 'response'],
     num_rows: 1080
 })}

In [16]:
def filter_ai(row):
    pattern = r"\bAI\b"
    return re.search(pattern, row["response"]) is None

OpenOrca_ds = {split: ds.filter(filter_ai) for split, ds in OpenOrca_ds.items()}
OpenOrca_ds

Filter:   0%|          | 0/10567 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1080 [00:00<?, ? examples/s]

{'train': Dataset({
     features: ['question', 'response'],
     num_rows: 10527
 }),
 'test': Dataset({
     features: ['question', 'response'],
     num_rows: 1074
 })}

In [16]:
train_dataset = OpenOrca_ds["train"].select(range(TRAIN_SIZE))
eval_dataset = OpenOrca_ds["test"].select(range(EVAL_SIZE))

In [17]:
model_kwargs = dict(
    device_map={"" : 0},
    trust_remote_code=True,
    # low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    # use_flash_attention_2=True,
    use_cache=False,
)

In [18]:
peft_config = LoraConfig(
    r=64,  # the rank of the LoRA matrices
    lora_alpha=16, # the weight
    lora_dropout=0.1, # dropout to add to the LoRA layers
    bias="none", # add bias to the nn.Linear layers?
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj","v_proj","o_proj"] # the name of the layers to add LoRA
    # target_modules=["query_key_value", "dense"] # uncomment if using a Pythia model
)

In [19]:
batch_size = 16
gradient_accumulation_steps = 2
num_train_epochs = 3

total_num_steps = num_train_epochs * 11_210 // (batch_size * gradient_accumulation_steps)

total_num_steps

1050

In [20]:
output_dir = "./output/"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size//2,
    bf16=True,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio = 0.1,
    max_steps=total_num_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs=dict(use_reentrant=False),
    evaluation_strategy="steps",
    eval_steps=total_num_steps // num_train_epochs,
    # eval_steps=10,
    # logging strategies
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="steps",
    save_steps=total_num_steps // num_train_epochs,
)

In [21]:
trainer = SFTTrainer(
    model=MODEL_ID,
    model_init_kwargs=model_kwargs,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    packing=True,
    max_seq_length=1024,
    args=training_args,
    formatting_func=create_prompt,
    peft_config=peft_config,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

[codecarbon INFO @ 09:13:06] [setup] RAM Tracking...
[codecarbon INFO @ 09:13:06] [setup] GPU Tracking...
[codecarbon INFO @ 09:13:06] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 09:13:06] [setup] CPU Tracking...
[codecarbon INFO @ 09:13:07] CPU Model on constant consumption mode: Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz
[codecarbon INFO @ 09:13:07] >>> Tracker's metadata:
[codecarbon INFO @ 09:13:07]   Platform system: Linux-5.15.0-91-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 09:13:07]   Python version: 3.10.12
[codecarbon INFO @ 09:13:07]   CodeCarbon version: 2.3.4
[codecarbon INFO @ 09:13:07]   Available RAM : 251.516 GB
[codecarbon INFO @ 09:13:07]   CPU count: 128
[codecarbon INFO @ 09:13:07]   CPU model: Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz
[codecarbon INFO @ 09:13:07]   GPU count: 2
[codecarbon INFO @ 09:13:07]   GPU model: 2 x NVIDIA A100 80GB PCIe


In [22]:
def create_prompt_no_anwer(row):
    row["output"] = ""
    return {"text": create_prompt(row)}

test_dataset = eval_dataset.map(create_prompt_no_anwer)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [23]:
wandb_callback = LLMSampleCB(trainer, test_dataset, num_samples=10, max_new_tokens=256)

In [24]:
trainer.add_callback(wandb_callback)

In [25]:
trainer.train()
wandb.finish()



Step,Training Loss,Validation Loss
350,1.27,1.372098
700,1.2759,1.367239
1050,1.2853,1.370183


[codecarbon INFO @ 09:13:27] Energy consumed for RAM : 0.000393 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 09:13:27] Energy consumed for all GPUs : 0.001888 kWh. Total GPU Power : 452.84071193896256 W
[codecarbon INFO @ 09:13:27] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 09:13:27] 0.002459 kWh of electricity used since the beginning.
[codecarbon INFO @ 09:13:42] Energy consumed for RAM : 0.000786 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 09:13:42] Energy consumed for all GPUs : 0.004028 kWh. Total GPU Power : 514.0489312966356 W
[codecarbon INFO @ 09:13:42] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 09:13:42] 0.005169 kWh of electricity used since the beginning.
[codecarbon INFO @ 09:13:57] Energy consumed for RAM : 0.001178 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 09:13:57] Energy consumed for all GPUs : 0.006035 kWh. Total GPU Power : 481.973329740555

  0%|          | 0/10 [00:00<?, ?it/s]

[codecarbon INFO @ 10:24:12] Energy consumed for RAM : 0.111489 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 10:24:12] Energy consumed for all GPUs : 0.592939 kWh. Total GPU Power : 393.8740892488568 W
[codecarbon INFO @ 10:24:12] Energy consumed for all CPUs : 0.050287 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:24:12] 0.754715 kWh of electricity used since the beginning.
Checkpoint destination directory ./output/checkpoint-350 already exists and is non-empty. Saving will proceed but saved results may be invalid.
[34m[1mwandb[0m: Adding directory to artifact (./output/checkpoint-350)... [codecarbon INFO @ 10:24:27] Energy consumed for RAM : 0.111881 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 10:24:27] Energy consumed for all GPUs : 0.593871 kWh. Total GPU Power : 223.9978664608633 W
[codecarbon INFO @ 10:24:27] Energy consumed for all CPUs : 0.050465 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:24:27] 0.756217 kWh of electricity used since the

  0%|          | 0/10 [00:00<?, ?it/s]

[codecarbon INFO @ 11:35:29] Energy consumed for RAM : 0.223382 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 11:35:29] Energy consumed for all GPUs : 1.187116 kWh. Total GPU Power : 342.40677900450214 W
[codecarbon INFO @ 11:35:29] Energy consumed for all CPUs : 0.100761 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 11:35:29] 1.511258 kWh of electricity used since the beginning.
Checkpoint destination directory ./output/checkpoint-700 already exists and is non-empty. Saving will proceed but saved results may be invalid.
[codecarbon INFO @ 11:35:44] Energy consumed for RAM : 0.223780 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 11:35:44] Energy consumed for all GPUs : 1.188058 kWh. Total GPU Power : 222.64457527732688 W
[codecarbon INFO @ 11:35:44] Energy consumed for all CPUs : 0.100941 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 11:35:44] 1.512780 kWh of electricity used since the beginning.
[34m[1mwandb[0m: Adding directory to artifact (./output/check

  0%|          | 0/10 [00:00<?, ?it/s]

[codecarbon INFO @ 12:46:44] Energy consumed for RAM : 0.335263 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 12:46:44] Energy consumed for all GPUs : 1.781223 kWh. Total GPU Power : 370.26765596685544 W
[codecarbon INFO @ 12:46:44] Energy consumed for all CPUs : 0.151226 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 12:46:44] 2.267712 kWh of electricity used since the beginning.
[codecarbon INFO @ 12:46:59] Energy consumed for RAM : 0.335655 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 12:46:59] Energy consumed for all GPUs : 1.782230 kWh. Total GPU Power : 241.95106736437106 W
[codecarbon INFO @ 12:46:59] Energy consumed for all CPUs : 0.151403 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 12:46:59] 2.269289 kWh of electricity used since the beginning.
Checkpoint destination directory ./output/checkpoint-1050 already exists and is non-empty. Saving will proceed but saved results may be invalid.
[34m[1mwandb[0m: Adding directory to artifact (./output/chec

VBox(children=(Label(value='1224.030 MB of 1290.251 MB uploaded (4.491 MB deduped)\r'), FloatProgress(value=0.…

0,1
eval/loss,██▁▁▅▅
eval/runtime,████▁▁
eval/samples_per_second,▁▁▁▁██
eval/steps_per_second,▁▁▁▁██
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,▇▄▁▁▂▄▅▅▅▅▆▅▅▆▅▆▇▆▆▆▆▇█▇█▇▇▇▇▇▇▇█▇▇▇▇▆▇▇
train/learning_rate,▂▃▅▆██████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁
train/loss,█▆▅▅▅▄▄▃▃▃▂▃▃▃▂▃▂▂▃▂▃▂▂▂▁▂▁▃▁▁▂▂▁▁▁▁▂▂▂▂
train/total_flos,▁▁

0,1
eval/loss,1.37018
eval/runtime,39.8725
eval/samples_per_second,11.186
eval/steps_per_second,0.702
train/epoch,14.89
train/global_step,1050.0
train/grad_norm,0.06885
train/learning_rate,0.0
train/loss,1.2853
train/total_flos,2.741947948785992e+18


In [26]:
# EXPORT MODEL
# Get the trained model
os.environ["HF_API_TOKEN"] = "token here"
model = trainer.model

# push model to Hugging Face hub
model.push_to_hub(
    f"{BASE_REPOSITORY}/{MODEL_NAME}-{DATASET_NAME}_steps_filtered",
    use_auth_token=os.environ["HF_API_TOKEN"],
    private=True,
)



adapter_model.safetensors:   0%|          | 0.00/134M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/persuasion-scaling-laws/Llama-2-7b-hf-OpenOrca_steps_filtered/commit/7d4828c20b3642badd7393f95d2cc4a71a83e237', commit_message='Upload model', commit_description='', oid='7d4828c20b3642badd7393f95d2cc4a71a83e237', pr_url=None, pr_revision=None, pr_num=None)