In [19]:
#!pip install huggingface_hub
#!pip install trl
#!pip install transformers
#!pip install peft


In [20]:
# if issues with versions:   
#!pip uninstall -y transformers peft trl bitsandbytes accelerate wandb
#!pip install --upgrade transformers peft trl bitsandbytes accelerate wandb --no-cache-dir

In [3]:
# hf model download 
from huggingface_hub import snapshot_download
download = snapshot_download(repo_id="Qwen/Qwen2.5-3B-Instruct", repo_type='model', local_dir='base_model')
download

Fetching 12 files: 100%|██████████| 12/12 [01:30<00:00,  7.58s/it]


'/home/bart/Desktop/selective-qlora/selective-qlora/base_model'

In [3]:
# imports
import os
os.environ["WANDB_DISABLED"] = "true" #i had wandb issues 
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#bits and bytes: 
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16",
    bnb_4bit_use_double_quant=True,
)



In [5]:
# load base model
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-3b-Instruct",
    quantization_config=bnb_config,
    device_map="auto"
)

Current model requires 512 bytes of buffer for offloaded layers, which seems does not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using offload_buffers=True.
Loading weights: 100%|██████████| 434/434 [00:39<00:00, 10.99it/s, Materializing param=model.norm.weight]                              


In [6]:
# dataset download 
from huggingface_hub import snapshot_download
download_dataset = snapshot_download(repo_id="teknium/OpenHermes-2.5", repo_type='dataset', local_dir='dataset')
download_dataset

Fetching 3 files: 100%|██████████| 3/3 [00:00<00:00, 6516.27it/s]


'/home/bart/Desktop/selective-qlora/selective-qlora/dataset'

In [7]:
# selective dataset technique 
# forward pass on dataset - without training
# save the 30% hardest ones
# https://medium.com/@njorogeofrey73/forward-pass-3f716ed71f19 

# imports again ( why? )
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

# load datasetf
dataset_hermes = 'dataset/openhermes2_5.json'
df = pd.read_json(dataset_hermes)
print(df.head())

# Tokenize data
model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Every day, a tree drops 7 leaves. How many leaves would it drop in a month of February in a non-leap year? Include your logic""."},
    {"role": "assistant", "content": "Here's the logic behind this:\n\n1. We know that February has 28 days in a non-leap year.\n2. ... (itd.)"}
]

#convert data to match model // dataset 
def convert_to_messages(row_conversations):
    messages = []
    role_map = {
        "system": "system",
        "human": "user",
        "gpt": "assistant"
    }

    for msg in row_conversations:
        role = role_map.get(msg["from"])
        content = msg["value"]

        if role == "system" and not content.strip():
            continue

        messages.append({"role": role, "content": content})
    return messages
df = df.head(100).copy() # testing on 100 samples (have to add batch_size to increase speed test)
df['formatted_messages'] = df['conversations'].apply(convert_to_messages)

def calculate_loss(msgs): 
    text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)

    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=2048,
        truncation=True,
    ).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        return outputs.loss.item()
    
model.eval()
df['loss'] = df['formatted_messages'].apply(calculate_loss)

df_sorted = df.sort_values(by='loss', ascending=False)
top_30_percent = int(len(df_sorted) * 0.3)
hard_samples = df_sorted.head(top_30_percent)

print(f"we have {len(hard_samples)} hardest samples ")



                                       conversations        source  \
0  [{'from': 'human', 'value': 'Every day, a tree...  airoboros2.2   
1  [{'from': 'human', 'value': 'In analytical che...  airoboros2.2   
2  [{'from': 'human', 'value': 'A rectangular gar...  airoboros2.2   
3  [{'from': 'human', 'value': 'What was the purp...  airoboros2.2   
4  [{'from': 'human', 'value': 'A man claims he c...  airoboros2.2   

          category  skip_prompt_formatting   id language title model_name  \
0             orca                     0.0  NaN      NaN   NaN        NaN   
1  multiple_choice                     0.0  NaN      NaN   NaN        NaN   
2             orca                     0.0  NaN      NaN   NaN        NaN   
3          general                     0.0  NaN      NaN   NaN        NaN   
4             orca                     0.0  NaN      NaN   NaN        NaN   

   custom_instruction topic system_prompt model avatarUrl  views hash  idx  
0                 NaN   NaN           N

`torch_dtype` is deprecated! Use `dtype` instead!
Current model requires 512 bytes of buffer for offloaded layers, which seems does not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using offload_buffers=True.
Loading weights: 100%|██████████| 434/434 [00:00<00:00, 2247.07it/s, Materializing param=model.norm.weight]                              


we have 30 hardest samples 


In [8]:
# check details
pd.set_option('display.max_colwidth', 200)

print(f"top 5: \n\n {df_sorted[['loss', 'conversations']].head(5)}")
print(f"worse 5: \n\n {df_sorted[['loss', 'conversations']].tail(5)}")

top 5: 

         loss  \
57  5.642434   
40  5.023724   
76  4.731190   
77  4.092717   
96  3.996943   

                                                                                                                                                                                              conversations  
57  [{'from': 'system', 'value': 'You are a world class trivia AI - provide accurate, succinct responses.'}, {'from': 'human', 'value': 'Stories passed down through generations, this palindrome often ...  
40               [{'from': 'human', 'value': 'Do you know any jokes about animals with ailments?'}, {'from': 'gpt', 'value': 'What's worse than a centipede with sore feet? A giraffe with a sore throat'}]  
76                                          [{'from': 'human', 'value': 'Do you know any jokes about librarians?'}, {'from': 'gpt', 'value': 'Why do librarians like the wind? It says, "Shhh!" all day!'}]  
77  [{'from': 'human', 'value': 'How many apples does Sally have if s

In [18]:
df = pd.read_json(dataset_hermes)
df = df.head(10000).copy()

full_ds = Dataset.from_pandas(df[['conversations']])
full_ds = full_ds.map(preprocess_function, remove_columns=['conversations'], num_proc=4)
loader = DataLoader(full_ds, batch_size=8, collate_fn=data_collator)
print(f"new size {len(df)}, test") 

Map (num_proc=4): 100%|██████████| 10000/10000 [00:02<00:00, 4712.87 examples/s]

new size 10000, test





In [14]:
# make it faster
# take dialog, tokenize, run model, eval, save loss for every example 
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from datasets import Dataset

tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(example):
    msgs = convert_to_messages(example['conversations'])
    text = tokenizer.apply_chat_template(msgs, tokenize=False)
    tokenized = tokenizer(text, truncation=True,  max_length=2048)
    #tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

full_ds = Dataset.from_pandas(df[['conversations']])
full_ds = full_ds.map(preprocess_function, remove_columns=['conversations'])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
loader = DataLoader(full_ds, batch_size=8, collate_fn=data_collator)

all_losses = []
model.eval()

# reduction='none' -> loss per token
loss_fct = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=tokenizer.pad_token_id)

print("start")
with torch.no_grad():
    for batch in loader:
        inputs = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**inputs) # forward pass
        logits = outputs.logits
        labels = inputs["input_ids"]
        # calculate loss per token, logits predicting next token
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        # group batch and calculate the average with padding mask
        loss = loss.view(shift_labels.size(0), -1)
        mask = (shift_labels != tokenizer.pad_token_id).float()
        # calculate the average only for tokens (not for paddings)
        actual_loss = (loss * mask).sum(dim=1) / mask.sum(dim=1)
        all_losses.extend(actual_loss.cpu().tolist())


df['loss'] = all_losses


Map: 100%|██████████| 100/100 [00:00<00:00, 2160.99 examples/s]


start


Map (num_proc=4): 100%|██████████| 100/100 [00:00<00:00, 175.80 examples/s]

new size 100, test





In [16]:
total_count = len(df)
top_30_count = int(total_count * 0.3)

df_sorted = df.sort_values(by='loss', ascending=False)

print(f"Total samples {total_count}")
print(f"top 30% (hardest ones) {top_30_count}")
print(f"The average loss {df['loss'].mean():.4f}")

pd.set_option('display.max_colwidth', 300)
print(df_sorted[['loss', 'conversations']].head(5))
print(df_sorted[['loss', 'conversations']].tail(5))


Total samples 100
top 30% (hardest ones) 30
The average loss 1.6759
        loss  \
57  5.543040   
76  4.692902   
40  4.562225   
96  3.796139   
77  3.568712   

                                                                                                                                                                                                                                                                    conversations  
57          [{'from': 'system', 'value': 'You are a world class trivia AI - provide accurate, succinct responses.'}, {'from': 'human', 'value': 'Stories passed down through generations, this palindrome often involves legends and myths.'}, {'from': 'gpt', 'value': 'sagas'}]  
76                                                                                                                [{'from': 'human', 'value': 'Do you know any jokes about librarians?'}, {'from': 'gpt', 'value': 'Why do librarians like the wind? It says, "Shhh!" all day!'}]  
40     