In [1]:
import datasets
from tokenizers import TRIETokenizerFast
from matplotlib import pyplot as plt
import json
from tqdm.notebook import tqdm
from dataloader import DatasetWriter, SingleDatasetReader
from typing import *
import numpy as np
from torch.utils.data import DataLoader
from functools import partial
import re

In [2]:
# Tokenizer used by the preview and chunking helpers in the cells below.
tokenizer = TRIETokenizerFast(
    'llama_vocab_pruned_32k.json',
)

In [None]:
# Load 'vicgalle/alpaca-gpt4'; downloaded files are cached under ./corpus.
alpaca_gpt = datasets.load_dataset(
    'vicgalle/alpaca-gpt4',
    cache_dir='./corpus',
)

In [None]:
# Load 'jondurbin/airoboros-2.2.1'; downloaded files are cached under ./corpus.
airoboros = datasets.load_dataset(
    'jondurbin/airoboros-2.2.1',
    cache_dir='./corpus',
)

In [None]:
# Load 'WizardLM/WizardLM_evol_instruct_V2_196k'; downloaded files are cached under ./corpus.
wizardlm = datasets.load_dataset(
    'WizardLM/WizardLM_evol_instruct_V2_196k',
    cache_dir='./corpus',
)

In [None]:
# Load 'shibing624/sharegpt_gpt4'; downloaded files are cached under ./corpus.
sharegpt4 = datasets.load_dataset(
    'shibing624/sharegpt_gpt4',
    cache_dir='./corpus',
)

In [None]:
def sharegpt4_en_filter(entry):
    """Return True when every message of ``entry`` uses only whitelisted characters.

    The whitelist is ASCII letters and digits, the ASCII punctuation listed in
    the pattern below, space, tab, carriage return and newline. A single
    character outside that set in any message rejects the whole conversation.

    Args:
        entry: Mapping with a ``'conversations'`` list; each item holds the
            message text under the ``'value'`` key.

    Returns:
        bool: True if no message contains a disallowed character (also True
        for an empty conversation list).
    """
    # Negated character class: matches one character that is NOT whitelisted.
    disallowed = r'[^a-zA-Z0-9!@#$%^&*()_\-+=\[\]{}|\\;:\'",<>/?.`~\n\t\r ]'
    # re.search stops at the first offending character instead of collecting
    # every match just to test whether the list is empty.
    return all(re.search(disallowed, c['value']) is None for c in entry['conversations'])

In [None]:
# Keep only the conversations accepted by sharegpt4_en_filter.
sharegpt4_en = sharegpt4.filter(function=sharegpt4_en_filter)

In [4]:
# Load 'HuggingFaceH4/ultrachat_200k'; downloaded files are cached under ./corpus.
ultrachat = datasets.load_dataset(
    'HuggingFaceH4/ultrachat_200k',
    cache_dir='./corpus',
)

In [9]:
def ultrachat_en_filter(entry):
    """Return True when every message of ``entry`` uses only whitelisted characters.

    The whitelist is ASCII letters and digits, the ASCII punctuation listed in
    the pattern below, space, tab, carriage return and newline. A single
    character outside that set in any message rejects the whole conversation.

    Args:
        entry: Mapping with a ``'messages'`` list; each item holds the message
            text under the ``'content'`` key.

    Returns:
        bool: True if no message contains a disallowed character (also True
        for an empty message list).
    """
    # Negated character class: matches one character that is NOT whitelisted.
    disallowed = r'[^a-zA-Z0-9!@#$%^&*()_\-+=\[\]{}|\\;:\'",<>/?.`~\n\t\r ]'
    # re.search stops at the first offending character instead of collecting
    # every match just to test whether the list is empty.
    return all(re.search(disallowed, c['content']) is None for c in entry['messages'])

# Keep only the 'train_sft' conversations accepted by ultrachat_en_filter.
ultrachat_en = ultrachat['train_sft'].filter(function=ultrachat_en_filter)

Filter:   0%|          | 0/207865 [00:00<?, ? examples/s]

In [17]:
def preview_chunk(token_ids, attn_mask, loss_mask):
    """Print the decoded text of every dialogue packed into one chunk.

    For each dialogue index found in ``attn_mask`` this prints the full decoded
    text and then the decoded text of only the positions whose ``loss_mask``
    value is 1. Decoding uses the module-level ``tokenizer``.

    Args:
        token_ids: Token ids of the chunk.
        attn_mask: Per-position dialogue index, aligned with ``token_ids``.
        loss_mask: Per-position loss flag, aligned with ``token_ids``.
    """
    # Mask values that mark padding rather than a dialogue: 0, and the
    # standalone index ``chunk_length + 1`` that dialogues_to_chunks assigns to
    # pad positions (a full chunk has len(token_ids) == chunk_length).
    padding_indices = {0, len(token_ids) + 1}

    # Single pass over the chunk: bucket token ids by dialogue index.
    full_tokens, loss_tokens = {}, {}
    for token, mask, loss in zip(token_ids, attn_mask, loss_mask):
        if mask in padding_indices:
            continue
        full_tokens.setdefault(mask, []).append(token)
        loss_bucket = loss_tokens.setdefault(mask, [])
        if loss == 1:
            loss_bucket.append(token)

    print(f'Total {len(full_tokens)} dialogues within chunk.')
    # Sorted so the dialogues are always listed in index order.
    for m in sorted(full_tokens):
        print(f'Dialogue index {m}')
        print('Full text:')
        print(tokenizer.decode(full_tokens[m]))
        print('-' * 80)
        print('Loss text:')
        print(tokenizer.decode(loss_tokens[m]))
        print('=' * 80)

In [18]:
def dialogues_to_chunks(dialogues: List[List[Tuple[str, str]]], chunk_length: int, max_message_length: int, overlap_count: int):
    """Pack tokenized dialogues into fixed-length chunks (generator).

    Every dialogue is prefixed with the encoded system line
    ('<s>A chat between User and Assistant.' plus a newline). Each message is
    encoded as '<role>:<text>' followed by a newline for 'User' or by '</s>'
    and a newline for 'Assistant'. Dialogues are concatenated into the current
    chunk; when a chunk fills up in the middle of a dialogue it is yielded and
    the next chunk restarts up to ``overlap_count`` messages earlier, without
    the system line.

    Args:
        dialogues: Sized sequence of dialogues; each dialogue is a list of
            ``(role, text)`` pairs where role is 'User' or 'Assistant'.
        chunk_length: Exact length of every yielded chunk.
        max_message_length: Dialogues containing any encoded message longer
            than this are skipped entirely.
        overlap_count: Maximum number of earlier messages re-emitted at the
            start of a chunk that continues a dialogue.

    Yields:
        ``(token_ids, attn_mask, loss_mask)`` lists, each of length
        ``chunk_length``. ``attn_mask`` holds a per-chunk dialogue index
        starting at 1, with padding positions set to ``chunk_length + 1``.
        ``loss_mask`` is 1 on Assistant tokens the first time a message is
        emitted and 0 on the system line, User tokens, padding and re-emitted
        messages. Padding token id is 0.

    Raises:
        AssertionError: If ``max_message_length * overlap_count`` is not
            smaller than ``chunk_length``.
        KeyError: If a message role is neither 'User' nor 'Assistant'.

    Relies on the module-level ``tokenizer`` and ``tqdm``; prints a skip summary
    once all dialogues are consumed.
    Assumes ``tokenizer.encode`` returns a list of ints (it is copied, sliced
    and concatenated below) -- TODO confirm.
    """
    # Replayed messages must leave room for new tokens, otherwise the inner
    # loop below could rewind forever. As this is a generator, the check only
    # runs when iteration starts.
    assert max_message_length * overlap_count < chunk_length, 'max_message_length * overlap_count >= chunk_size can cause infinite loop'

    skip_dialogue_count = 0

    start_tokens = tokenizer.encode('<s>A chat between User and Assistant.\n')

    # mask_index: index of the current dialogue inside the current chunk.
    mask_index = 0
    token_ids, attn_mask, loss_mask = [], [], []

    for dial in tqdm(dialogues):
        # (role, encoded message) pairs; the suffix depends on the role.
        dial_encoded = [(m[0], tokenizer.encode(f'{m[0]}:{m[1]}' + {'User': '\n', 'Assistant': '</s>\n'}[m[0]])) for m in dial]
        # One over-long message drops the whole dialogue.
        if any(len(m[1]) > max_message_length for m in dial_encoded):
            skip_dialogue_count += 1
            continue

        mask_index += 1
        if chunk_length - len(token_ids) <= len(start_tokens):
            # Not enough room left for more than the system line: pad out the
            # current chunk, emit it, and open a new chunk with this dialogue.
            pad_length = chunk_length - len(token_ids)
            token_ids += [0 for _ in range(pad_length)]
            attn_mask += [chunk_length + 1 for _ in range(pad_length)]  # use standalone index for padding mask to avoid "void attention"
            loss_mask += [0 for _ in range(pad_length)]
            assert len(token_ids) == len(attn_mask) == len(loss_mask) == chunk_length
            yield token_ids, attn_mask, loss_mask
            mask_index = 1
            token_ids, attn_mask, loss_mask = start_tokens.copy(), [1 for _ in range(len(start_tokens))], [0 for _ in range(len(start_tokens))]
        else:
            # Append the system line to the current chunk; it never carries loss.
            token_ids += start_tokens
            attn_mask += [mask_index for _ in range(len(start_tokens))]
            loss_mask += [0 for _ in range(len(start_tokens))]

        # max_msg_index: highest message index already emitted (fully or
        # partially) for this dialogue; used to zero the loss on replays.
        msg_index, max_msg_index = 0, -1
        while msg_index < len(dial_encoded):
            src, msg = dial_encoded[msg_index]
            # Truncate the message to whatever still fits in the chunk.
            append_length = min(chunk_length - len(token_ids), len(msg))
            token_ids += msg[:append_length]
            attn_mask += [mask_index for _ in range(append_length)]
            # Loss only on Assistant messages seen for the first time.
            loss_mask += [0 for _ in range(append_length)] if src == 'User' or msg_index <= max_msg_index else [1 for _ in range(append_length)]
            # NOTE(review): a message is marked as seen even when only part of
            # it fit. After the rewind below it is re-emitted with loss 0, so
            # the tail of an Assistant message split across chunks never
            # receives loss -- confirm this is intended.
            max_msg_index = max(msg_index, max_msg_index)
            if len(token_ids) == chunk_length:
                assert len(token_ids) == len(attn_mask) == len(loss_mask) == chunk_length
                yield token_ids, attn_mask, loss_mask
                # Continue the same dialogue in a fresh, empty chunk (no system
                # line) as dialogue index 1, rewinding up to overlap_count
                # messages for context.
                mask_index = 1
                token_ids, attn_mask, loss_mask = [], [], []
                msg_index -= min(overlap_count, msg_index)
            else:
                msg_index += 1

    # Flush the last, partially filled chunk.
    # NOTE(review): a trailing chunk holding no more than len(start_tokens)
    # tokens is dropped here, including a short continuation chunk that has no
    # system line -- confirm this is intended.
    if len(token_ids) > len(start_tokens):
        pad_length = chunk_length - len(token_ids)
        token_ids += [0 for _ in range(pad_length)]
        attn_mask += [chunk_length + 1 for _ in range(pad_length)]
        loss_mask += [0 for _ in range(pad_length)]
        assert len(token_ids) == len(attn_mask) == len(loss_mask) == chunk_length
        yield token_ids, attn_mask, loss_mask

    print(f'Skipped {skip_dialogue_count}/{len(dialogues)} dialogues.')

In [19]:
def write_out_dataset(file, entries):
    """Write packed chunks to ``file`` through a ``DatasetWriter``.

    Args:
        file: Output destination handed to ``DatasetWriter``.
        entries: Iterable of ``(token_ids, attn_mask, loss_mask)`` triples.

    Each field is converted to a ``np.uint16`` array before it is added, and
    the writer is finished after the last entry.
    """
    field_names = ('token_ids', 'attn_mask', 'loss_mask')
    writer = DatasetWriter(file, {field: np.uint16 for field in field_names})
    for chunk in tqdm(entries):
        token_ids, attn_mask, loss_mask = chunk
        arrays = {
            'token_ids': np.array(token_ids, dtype=np.uint16),
            'attn_mask': np.array(attn_mask, dtype=np.uint16),
            'loss_mask': np.array(loss_mask, dtype=np.uint16),
        }
        writer.add_entry(**arrays)
    writer.finish()

In [20]:
# Chunking preset: chunk_length 1024, max_message_length 450, overlap_count 1.
dialogues_to_chunks_1024 = partial(
    dialogues_to_chunks,
    chunk_length=1024,
    max_message_length=450,
    overlap_count=1,
)

In [None]:
def alpaca_to_dialogue(alpaca_sample):
    """Convert one Alpaca-style sample into a two-message dialogue.

    The User message is the sample's 'instruction'; when 'input' is not the
    empty string it is appended on a new line. The Assistant message is the
    sample's 'output'.

    Returns:
        ``[('User', prompt), ('Assistant', output)]``.
    """
    prompt = alpaca_sample['instruction']
    extra_input = alpaca_sample['input']
    if extra_input != '':
        prompt = prompt + f'\n{extra_input}'
    return [('User', prompt), ('Assistant', alpaca_sample['output'])]


# One (User, Assistant) dialogue per sample of the 'train' split.
alpaca_diags = list(map(alpaca_to_dialogue, alpaca_gpt['train']))

In [None]:
# Materialize every chunk produced by the generator.
alpaca_chunks = list(dialogues_to_chunks_1024(alpaca_diags))

In [None]:
write_out_dataset(file='datasets/sft/alpaca_gpt4.bin', entries=alpaca_chunks)

In [None]:
# Single-turn dialogues; samples whose category contains 'contextual' are left out.
airoboros_diags = [
    [('User', sample['instruction']), ('Assistant', sample['response'])]
    for sample in airoboros['train']
    if 'contextual' not in sample['category']
]

In [None]:
# Materialize every chunk produced by the generator.
airoboros_chunks = list(dialogues_to_chunks_1024(airoboros_diags))

In [None]:
write_out_dataset(file='datasets/sft/airoboros_2.2.1.bin', entries=airoboros_chunks)

In [None]:
# Messages from 'human' become User turns; every other sender becomes Assistant.
wizardlm_diags = [
    [('User' if turn['from'] == 'human' else 'Assistant', turn['value']) for turn in conversation]
    for conversation in wizardlm['train']['conversations']
]

In [None]:
# Materialize every chunk produced by the generator.
wizardlm_chunks = list(dialogues_to_chunks_1024(wizardlm_diags))

In [None]:
write_out_dataset(file='datasets/sft/wizardlm_evol_2.bin', entries=wizardlm_chunks)

In [None]:
# Messages from 'human' become User turns; every other sender becomes Assistant.
sharegpt4_en_diags = [
    [('User' if turn['from'] == 'human' else 'Assistant', turn['value']) for turn in conversation]
    for conversation in sharegpt4_en['train']['conversations']
]

In [None]:
# Materialize every chunk produced by the generator.
sharegpt4_en_chunks = list(dialogues_to_chunks_1024(sharegpt4_en_diags))

In [None]:
write_out_dataset(file='datasets/sft/sharegpt_gpt4.bin', entries=sharegpt4_en_chunks)

In [14]:
# Messages with role 'user' become User turns; every other role becomes Assistant.
ultrachat_en_diags = [
    [('User' if turn['role'] == 'user' else 'Assistant', turn['content']) for turn in conversation]
    for conversation in ultrachat_en['messages']
]

In [21]:
# Materialize every chunk produced by the generator.
ultrachat_en_chunks = list(dialogues_to_chunks_1024(ultrachat_en_diags))

  0%|          | 0/121030 [00:00<?, ?it/s]

Skipped 61611/121030 dialogues.


In [26]:
write_out_dataset(file='datasets/sft/ultrachat.bin', entries=ultrachat_en_chunks)

  0%|          | 0/70705 [00:00<?, ?it/s]