In [8]:
import json
from tqdm import tqdm
from transformers import AutoTokenizer
import datasets
import random
import pandas as pd

In [5]:
tokenizer = AutoTokenizer.from_pretrained("weights/phi-3-wizardlm_merged")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [32]:
# Normally I do a random.choice to select a random system prompt, but in this case I kept it constant
system_prompts = [
    # "You are a helpful AI assistant.",
    # "You are an AI assistant that follows instruction extremely well. Help as much as you can.",
    # "You are a friendly chatbot who answers without hesitation.",
    "You are a helpful AI assistant who answers without hesitation."
]

In [11]:
data = "data/orpo-dpo-mix-40k/data/train-00000-of-00001.parquet" # raw data from https://huggingface.co/datasets/mlabonne/orpo-dpo-mix-40k
df = pd.read_parquet(data)

In [12]:
df.head()

Unnamed: 0,source,chosen,rejected,prompt
0,Airoboros,"[{'content': 'The setting is an otherworldly, ...","[{'content': 'The setting is an otherworldly, ...","The setting is an otherworldly, yet eerily fam..."
1,EverythingLM,[{'content': 'How many colors are traditionall...,[{'content': 'How many colors are traditionall...,How many colors are traditionally recognized i...
2,EverythingLM,"[{'content': 'In a basket, there are 20 orange...","[{'content': 'In a basket, there are 20 orange...","In a basket, there are 20 oranges, 60 apples, ..."
3,Airoboros,[{'content': 'Which famous physicist developed...,[{'content': 'Which famous physicist developed...,Which famous physicist developed the theory of...
4,GOAT,[{'content': 'Find 40 * 865. Exclude words; sh...,[{'content': 'Find 40 * 865. Exclude words; sh...,Find 40 * 865. Exclude words; show only the math.


In [40]:
chosen = list(df['chosen'])
rejected = list(df['rejected'])
prompts = list(df['prompt'])

In [34]:
def parse_conversation(conversation):
    prompt = ""
    for i in conversation:
        content = i['content']
        role = i['role']
        if role == 'user':
            prompt += f"<|user|>\n{content}<|end|>"
        if role == 'system':
            prompt += f"<|system|>\n{content}<|end|>"
        if role == 'assistant':
            prompt += f"<|assistant|>\n{content}<|end|>"
    if "<|system|>" not in prompt:
        prompt = f"<|system|>\n{random.choice(system_prompts)}<|end|>" + prompt
    return prompt

In [35]:
chosen_prompts = []
token_len = []
for i in tqdm(chosen):
    i = list(i)
    prompt = parse_conversation(i)
    tokens = len(tokenizer(prompt)['input_ids'])
    token_len.append(tokens)
    chosen_prompts.append(prompt)

100%|██████████████████████████████████████████████████████████████████████| 44245/44245 [00:26<00:00, 1685.40it/s]


In [36]:
rejected_prompts = []
for i in tqdm(rejected):
    i = list(i)
    prompt = parse_conversation(i)
    tokens = len(tokenizer(prompt)['input_ids'])
    rejected_prompts.append(prompt)

100%|██████████████████████████████████████████████████████████████████████| 44245/44245 [00:21<00:00, 2017.70it/s]


In [41]:
processed_prompts = []
for i in tqdm(prompts):
    prompt = f"<|system|>{random.choice(system_prompts)}<|end|><|user|>\n{i}<|end|>"
    tokens = len(tokenizer(prompt)['input_ids'])
    processed_prompts.append(prompt)

100%|██████████████████████████████████████████████████████████████████████| 44245/44245 [00:06<00:00, 7290.91it/s]


In [43]:
orpo_dataset = []
for chosen, rejected, prompt in zip(chosen_prompts, rejected_prompts, processed_prompts):
    orpo_dataset.append({'chosen': chosen, 'rejected': rejected, 'prompt': prompt})

In [44]:
with open("data/phi-3-orpo-data.json", 'w') as f:
    json.dump(orpo_dataset, f)