# Processing the data for PoyoLLM
First of all, we process and refine our data, just like Macrodata Refinement in *Severance*.

In [1]:
from datasets import load_dataset, get_dataset_split_names

# Coordinates of the corpus on the Hugging Face Hub: repository id and the
# configuration name (presumably the Catalan-English pair -- confirm on the hub).
repo_id, language_pair = "Helsinki-NLP/opus_books", "ca-en"

# List the splits this configuration provides before loading anything.
available_splits = get_dataset_split_names(repo_id, language_pair)
print(available_splits)

# Load every split of the configuration; later cells read ds["train"].
ds = load_dataset(repo_id, language_pair)

['train']


In [2]:
import os

# Report how many rows the train split holds before any filtering.
# Single quotes inside the replacement field keep this f-string valid on
# Python versions older than 3.12, which reject reusing the outer quote.
print(f"There's a total of {ds.num_rows['train']} rows in the dataset.")

# Keep only the rows that actually carry an English translation.
filtered_ds = ds.filter(lambda row: row["translation"]["en"] is not None)

# Concatenate every English passage into one string. Each passage is followed
# by a single space (including the last one), exactly as before.
single_string = "".join(
    f"{row['translation']['en']} " for row in filtered_ds["train"]
)
print(single_string[:1000])  # Preview the first 1000 characters of the corpus

# exist_ok expects a bool; the string "true" only worked because it is truthy.
os.makedirs("output", exist_ok=True)
# NOTE(review): no explicit encoding is set here because the cell that reads
# this file back also relies on the platform default -- pin both together.
with open("output/single_string.txt", "w") as out_file:
    out_file.write(single_string)



There's a total of 4605 rows in the dataset.
Source: Project Gutenberg The Adventures of Tom Sawyer Mark Twain PREFACE Most of the adventures recorded in this book really occurred; one or two were experiences of my own, the rest those of boys who were schoolmates of mine. Huck Finn is drawn from life; Tom Sawyer also, but not from an individual--he is a combination of the characteristics of three boys whom I knew, and therefore belongs to the composite order of architecture. The odd superstitions touched upon were all prevalent among children and slaves in the West at the period of this story--that is to say, thirty or forty years ago. Although my book is intended mainly for the entertainment of boys and girls, I hope it will not be shunned by men and women on that account, for part of my plan has been to try to pleasantly remind adults of what they once were themselves, and of how they felt and thought and talked, and what queer enterprises they sometimes engaged in. THE AUTHOR. HARTF

## Tokenizer
I'm going to use minBPE, a minimal byte-pair encoding (BPE) tokenizer :3

In [3]:
# Read the concatenated corpus back from disk so the tokenizer cells can start
# from the saved file rather than from the in-memory dataset.
with open("output/single_string.txt", mode="r") as corpus_file:
    text_sequence = corpus_file.read()

# Size of the corpus in characters (shown as the cell output).
len(text_sequence)

378289

In [4]:
import sys

# An empty string on sys.path stands for the current working directory;
# presumably this is what lets the local `minbpe` package be imported by the
# following cell -- confirm against the repository layout.
# The membership check keeps the cell idempotent: re-running it no longer
# appends another duplicate entry each time.
if '' not in sys.path:
    sys.path.append('')

In [5]:
from minbpe import BasicTokenizer

# Total number of token ids the trained tokenizer should end up with.
target_vocab_size = 1024

# Train a fresh tokenizer on the whole corpus string.
tokenizer = BasicTokenizer()
tokenizer.train(text_sequence, vocab_size=target_vocab_size)

In [6]:
# Keep a handle on the trained tokenizer's vocabulary (the recorded output
# shows integer ids mapped to bytes objects).
vocab = tokenizer.vocab
# NOTE(review): a bare name as the last expression renders the whole mapping
# as the cell output; consider displaying only a slice to keep the notebook small.
vocab

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [7]:
# Highest token id currently in the vocabulary. max() is used instead of
# "last key of the dict" so the result does not depend on the order in which
# the tokenizer happened to insert its entries.
max_vocab_id = max(tokenizer.vocab)

# Special tokens take the ids immediately after the existing vocabulary so
# they cannot collide with any existing token id.
tokenizer.special_tokens = {
    "<|startofstring|>": max_vocab_id + 1,
    "<|separator|>": max_vocab_id + 2,
    "<|endofstring|>": max_vocab_id + 3,
    "<|unk|>": max_vocab_id + 4,
}

In [8]:
# Encode the full corpus and show how many tokens it becomes.
corpus_token_ids = tokenizer.encode(text_sequence)
len(corpus_token_ids)

135235

In [None]:
# Make sure the target directory exists before saving. exist_ok expects a
# bool; the string "true" only worked because any non-empty string is truthy.
os.makedirs("output/tokenizer", exist_ok=True)

# Persist the trained tokenizer under the given file prefix.
tokenizer.save(file_prefix="output/tokenizer/poyo_tokenizer")