In [8]:
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer


# --- Dataset column names ---------------------------------------------------
input_column = "final_input"      # prompt text fed to the model
output_column = "final_target"    # target text the model is trained to produce
rationale_column = "rationale"    # read from rationale_dataset when building targets

# --- Model / generation settings --------------------------------------------
tokenizer_name = "facebook/galactica-125m"
max_gen_length = 100  # not referenced in the cells shown here

# --- Prompt markers ---------------------------------------------------------
cot_trigger = "BOT: "        # appended (after a newline) to every input
answer_trigger = "ANSWER: "  # prefixed to the final answer in every target
instruction = ""             # empty; not referenced in the cells shown here


def add_instruction(example):
    """Append a newline followed by ``cot_trigger`` to the example's input text.

    The value stored under ``input_column`` is overwritten in place and the
    same ``example`` mapping is returned.
    """
    example[input_column] = "{}\n{}".format(example[input_column], cot_trigger)
    return example


# Tokenizer for the checkpoint named in `tokenizer_name`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Slice [1500:4000] of the train split of the CoT collection.
full_dataset = load_dataset(
    path="jeggers/CoT-Collection",
    split="train[1500:4000]",
)

# Entire train split of the rationale dataset.
rationale_dataset = load_dataset(
    path="jeggers/CoT-Collection-Rationales",
    split="train",
)



In [9]:
# Filter the dataset to only include inputs of at most `max_input_tokens` tokens.
# The length is measured on the raw input, i.e. before add_instruction appends
# the "\n" + cot_trigger suffix in a later cell.
max_input_tokens = 150


def batch_filter(batch):
    """Return one keep/drop flag per example in ``batch``.

    Args:
        batch: Mapping from column name to a list of values; the texts under
            ``input_column`` are tokenized together.

    Returns:
        A list of booleans, ``True`` where the tokenized input has at most
        ``max_input_tokens`` token ids.
    """
    # Use the shared `input_column` setting rather than a hard-coded column
    # name so this filter always inspects the column the later cells transform.
    tokenized = tokenizer(batch[input_column])
    return [len(token_ids) <= max_input_tokens for token_ids in tokenized["input_ids"]]


# batch_size=-1: per the `datasets` docs, a non-positive batch size passes the
# whole dataset to `batch_filter` as a single batch.
full_dataset = full_dataset.filter(batch_filter, batched=True, batch_size=-1)
print(full_dataset)

Dataset({
    features: ['final_input', 'final_target', 'Canary String', 'category', 'original_dataset', 'evaluation_method', 'dataset'],
    num_rows: 1651
})


In [10]:
# Build the fine-tuning dataset and push it to the Hugging Face Hub:
#   1. apply add_instruction to all examples from both datasets
#   2. full_dataset target      -> answer_trigger + current target
#      rationale_dataset target -> rationale + answer_trigger + current target
#   3. keep only the input and output columns in both datasets
#   4. keep the first N rows of full_dataset, where N is the size of
#      rationale_dataset (clamped to the rows actually available), so the
#      answer-only source contributes no more rows than the rationale source
#   5. concatenate the two datasets, shuffle with a fixed seed, push to the Hub
#
# NOTE(review): this cell rebinds full_dataset / rationale_dataset to their
# transformed versions, so re-running it without first re-running the loading
# and filtering cells would apply the transformations twice (and the rationale
# column would already be gone).

shuffle_seed = 42  # fixed so the uploaded row order is reproducible across runs

full_dataset = full_dataset.map(add_instruction)
rationale_dataset = rationale_dataset.map(add_instruction)

full_dataset = full_dataset.map(lambda x: { output_column: answer_trigger + x[output_column]})
# NOTE(review): the rationale and answer_trigger are joined with no separator;
# confirm the rationales end with whitespace/newline or add one deliberately.
rationale_dataset = rationale_dataset.map(lambda x: { output_column: x[rationale_column] + answer_trigger + x[output_column]})

full_dataset = full_dataset.select_columns([input_column, output_column])
rationale_dataset = rationale_dataset.select_columns([input_column, output_column])

# Clamp the row count: selecting indices beyond the end of full_dataset would
# fail if the filtered full_dataset were shorter than rationale_dataset.
num_answer_only = min(len(full_dataset), len(rationale_dataset))
full_dataset = full_dataset.select(range(num_answer_only))

dataset = concatenate_datasets([full_dataset, rationale_dataset])
dataset = dataset.shuffle(seed=shuffle_seed)

dataset.push_to_hub("jeggers/CoT-Collection-finetuning")

Map: 100%|██████████| 1651/1651 [00:00<00:00, 9691.89 examples/s] 
Map: 100%|██████████| 722/722 [00:00<00:00, 7773.57 examples/s]
Map: 100%|██████████| 1651/1651 [00:00<00:00, 12397.76 examples/s]
Map: 100%|██████████| 722/722 [00:00<00:00, 6219.23 examples/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 80.96ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.36s/it]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/datasets/jeggers/CoT-Collection-finetuning/commit/05731665609288487721f5f982679339fe1b3e7a', commit_message='Upload dataset', commit_description='', oid='05731665609288487721f5f982679339fe1b3e7a', pr_url=None, pr_revision=None, pr_num=None)