In [1]:
from datasets import Dataset, load_dataset
import torch

from gpt_tokenizers import BytePairEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Vocabulary size handed to the byte-pair encoder.
bpe_vocab_size = 7500
# NOTE(review): the meaning of the second positional argument (2) is not visible
# in this notebook — confirm against gpt_tokenizers.BytePairEncoder.
byte_pair_encoder = BytePairEncoder(bpe_vocab_size, 2)
# Load the vocabulary and merges files from encoder_directory/.
byte_pair_encoder.load("encoder_directory/encoder-vocab.json", "encoder_directory/encoder-merges.txt")


# Old code that this notebook should replace (kept for reference)

In [4]:
def encode_dataset_in_chunks(encoder, dataset, chunk_size=1000000):
    """Encode ``dataset`` slice by slice and return all token ids as one tensor.

    Encoding in slices avoids handing the whole input to the encoder at once.

    Args:
        encoder: Object exposing ``encode(chunk)``; the result is converted with
            ``torch.tensor(..., dtype=torch.long)``, so it is assumed to be a
            list of integer token ids.
        dataset: Sliceable input supporting ``len()`` (e.g. a string). Each
            slice ``dataset[start:end]`` is passed to ``encoder.encode``.
        chunk_size: Number of elements per slice. Must be positive.

    Returns:
        A ``torch.long`` tensor holding the encoded slices concatenated in
        order. An empty ``dataset`` yields an empty tensor.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the slice window would
            never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    total = len(dataset)
    if total == 0:
        # torch.cat rejects an empty list, so return an empty result directly.
        return torch.empty(0, dtype=torch.long)

    encoded_chunks = [
        torch.tensor(
            encoder.encode(dataset[start:min(start + chunk_size, total)]),
            dtype=torch.long,
        )
        for start in range(0, total, chunk_size)
    ]
    return torch.cat(encoded_chunks)

In [5]:
def load_batch(data, block_size, batch_size):
    """Sample a random batch of input/target windows from ``data``.

    ``batch_size`` start offsets are drawn with ``torch.randint`` from
    ``[0, len(data) - block_size)``. Every input row is the ``block_size``
    consecutive elements beginning at an offset; the matching target row is the
    same window shifted one position to the right.

    Args:
        data: Sliceable sequence whose slices can be stacked with
            ``torch.stack`` (e.g. a 1-D tensor of token ids).
        block_size: Length of each sampled window.
        batch_size: Number of windows to sample.

    Returns:
        Tuple ``(x, y)`` of stacked input windows and their shifted targets.
    """
    offsets = torch.randint(len(data) - block_size, (batch_size,))

    def stack_windows(shift):
        # Gather one window per sampled offset, displaced by `shift` elements.
        return torch.stack(
            [data[start + shift:start + block_size + shift] for start in offsets]
        )

    inputs = stack_windows(0)
    targets = stack_windows(1)
    return inputs, targets

In [3]:
import pyarrow as pa

# Custom ChunkyText Dataset
Loads a text file as a dataset of fixed-size text chunks so we can process it faster with `map` and take advantage of the features that the `Dataset` object offers.

It integrates with [Apache Arrow](https://arrow.apache.org/overview/) ([more info](https://huggingface.co/docs/datasets/about_arrow)) to give these advantages.


In [4]:

class chunkyText(Dataset):
    """Dataset whose rows are fixed-size character chunks of a single text file.

    The file is read fully into memory, split into consecutive chunks of
    ``chunk_size`` characters (the last chunk may be shorter), and exposed both
    as the Python list ``self.chunks`` and as a single-column Arrow table
    (column name ``"chunks"``) handed to ``Dataset.__init__``.
    """

    def __init__(self, file, chunk_size):
        """Read ``file`` and split its contents into chunks.

        Args:
            file: Path of the UTF-8 text file to load.
            chunk_size: Number of characters per chunk. Must be positive.

        Raises:
            ValueError: If ``chunk_size`` is not positive (the chunking window
                would never advance).
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")

        with open(file, encoding="utf-8") as f:
            text = f.read()

        self.chunks = [
            text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)
        ]

        # Build the Arrow table and let Dataset set up the state it needs for
        # the more complex dataset features (e.g. map).
        chunks = pa.array(self.chunks, type=pa.string())
        arrow_table = pa.table([chunks], names=["chunks"])
        Dataset.__init__(self, arrow_table)

    def __len__(self):
        """Return the number of chunks created in ``__init__``."""
        # NOTE: this reads the list built in __init__, so it does not reflect
        # any later change to the underlying Arrow table.
        return len(self.chunks)

    def __getitem__(self, idx):
        """Return the raw chunk string (or list of strings for a slice) at ``idx``."""
        return self.chunks[idx]

# How to load the dataset

Use the `text` dataset loader to load files line by line, so that each row is a single line of text. It's nice, but every string has a different length.
```python
load_dataset("text", data_files={"train": ["my_text_1.txt", "my_text_2.txt"], "test": "my_test_file.txt"})
```
You can also use the `data_dir` argument to specify a directory of text files.

In [5]:
# Number of characters per chunk (chunkyText slices the file's text by this amount).
chunk_size = 1000000
# Read the cleaned text file and split it into chunk_size-character rows.
dataset = chunkyText("input_data_files/cleaned_orca_dataset.txt", chunk_size)

In [6]:
# Number of text chunks the input file was split into.
len(dataset)

1631

# Processing the dataset into tokens
Using the map method for dataset lets you use multiprocessing to process the dataset in batches
```python
dataset_with_duplicates = dataset.map(lambda batch: {"b": batch["a"] * 2}, remove_columns=["a"], batched=True, batch_size=1000, num_proc=10)
```
`num_proc` specifies how many processes to use.
There is also a with_rank=True parameter for multiprocessing

In [6]:
# (rows, columns) of the dataset: one row per chunk and a single "chunks" column.
dataset.shape 

(1631, 1)

In [7]:
from multiprocess import set_start_method

# "fork" is only available on POSIX systems (e.g. Linux); Windows only supports
# "spawn". force=True lets this cell be re-run without raising
# "RuntimeError: context has already been set".
set_start_method("fork", force=True)

In [14]:
# Demonstrates that the encoder rejects a list of strings: this call raises
# "TypeError: TextInputSequence must be str".
byte_pair_encoder.encode(["this is a test", "lyl, not string typehahah"])

TypeError: TextInputSequence must be str

In [9]:
# Encode a single chunk: chunkyText.__getitem__ returns the raw chunk string,
# so it can be passed straight to the encoder.
byte_pair_encoder.encode(dataset[0])

[3888,
 1442,
 316,
 4777,
 256,
 3286,
 259,
 1901,
 289,
 256,
 2289,
 1958,
 11,
 758,
 509,
 293,
 79,
 305,
 289,
 264,
 2289,
 13,
 198,
 5288,
 2289,
 323,
 626,
 3182,
 264,
 4061,
 4895,
 2639,
 678,
 266,
 425,
 335,
 576,
 264,
 2639,
 678,
 276,
 2799,
 281,
 617,
 366,
 1256,
 281,
 4829,
 341,
 941,
 81,
 730,
 2966,
 220,
 7,
 49,
 35,
 37,
 8,
 574,
 608,
 931,
 289,
 264,
 2775,
 220,
 7,
 82,
 1151,
 1189,
 11,
 5237,
 5424,
 4119,
 8,
 13,
 296,
 2799,
 35,
 37,
 574,
 608,
 931,
 2142,
 885,
 812,
 316,
 2063,
 327,
 264,
 574,
 608,
 931,
 1970,
 319,
 2543,
 6538,
 264,
 5286,
 266,
 551,
 76,
 375,
 1492,
 289,
 264,
 293,
 79,
 305,
 2639,
 678,
 13,
 296,
 293,
 79,
 305,
 323,
 256,
 2639,
 678,
 266,
 264,
 416,
 79,
 305,
 323,
 256,
 1097,
 289,
 574,
 608,
 931,
 289,
 264,
 2775,
 220,
 58,
 82,
 1151,
 1189,
 11,
 5237,
 5424,
 11,
 4119,
 60,
 327,
 6538,
 264,
 3727,
 1440,
 293,
 264,
 2639,
 678,
 13,
 1387,
 256,
 2639,
 678,
 345,
 782,
 1367,
 480

# Encoding with map
`map` is a nice way to apply a transformation to a dataset. It has options for multiprocessing and batching, and it shows your progress.

## Batching with BPE
If you try to use batched mode with `map`, the function receives each batch as lists of values, which won't work with the encoder because it expects a single string.

## Multiprocessing with BPE
Multiprocessing is tricky in Python because it serializes all of its data into an entirely new process, and communication between the new process and the original one is difficult and resource intensive: every object must be serialized and then deserialized, and it is asynchronous.
Because of this, it's really tricky to create a new process with the BPE tokenizer, and you have to look into how Python multiprocessing works and which start method (fork or spawn) you are using.

In [22]:
# The encoder can't encode a list of strings as a "batch", so encode the
# chunks one example at a time.
# map() passes each example as a mapping of column name -> value, so pull the
# text out of the "chunks" column. Calling str() on the whole example would
# tokenize its dict repr ("{'chunks': '...'}", with escaped newlines) instead
# of the text itself.
encodedDataset = dataset.map(lambda chunk: {"tokens": byte_pair_encoder.encode(chunk["chunks"])})

Map: 100%|██████████| 1631/1631 [08:41<00:00,  3.13 examples/s]


In [8]:
# Same encoding as above, but spread over 4 worker processes.
# map() passes each example as a mapping of column name -> value, so encode the
# text stored in the "chunks" column (str() on the whole example would tokenize
# its dict repr instead of the text). The raw text column is dropped afterwards.
encodedDataset = dataset.map(
    lambda chunk: {"tokens": byte_pair_encoder.encode(chunk["chunks"])},
    remove_columns="chunks",
    num_proc=4,
)

Map (num_proc=4): 100%|██████████| 1631/1631 [02:43<00:00, 10.00 examples/s]


In [9]:
# Sanity check: one encoded row per input chunk, so this should match len(dataset).
len(encodedDataset)

1631

In [10]:
# It should be possible to do something like this with batching built in

# batch_size = 16
# encodedDataset = dataset.map(byte_pair_encoder.encodethingy, batched=True, batch_size=batch_size, num_proc=4)

# Adding Labels to the old dataset


**Everything past this point is testing that isn't finished yet.**

In [11]:
# Work in progress: inline copy of load_batch's sampling logic, applied to encodedDataset.
block_size = 256
batch_size = 16
# NOTE(review): len(encodedDataset) counts rows (one per chunk), not tokens, so
# these offsets select rows rather than token positions — confirm this is intended.
ix = torch.randint(len(encodedDataset) - block_size, (batch_size,))
# NOTE(review): torch.stack needs a sequence of tensors; slicing the mapped
# dataset presumably yields column data rather than tensors, so these two lines
# likely fail as written — verify, and consider flattening the "tokens" column
# into one tensor first.
x = torch.stack([encodedDataset[i:i + block_size] for i in ix])
y = torch.stack([encodedDataset[i + 1:i + block_size + 1] for i in ix])
# device = data[0].device
# x, y = x.to(device), y.to(device)

: 

: 

In [None]:

# NOTE(review): `data`, `device`, `model` and `i` are not defined in any cell
# visible in this notebook, so this cell cannot run on a fresh kernel — define
# them (or remove the cell) before relying on it.
# Sample a batch of input/target windows with load_batch.
X, Y = load_batch(data, block_size, batch_size)
# Move both tensors to `device`.
X, Y = X.to(device), Y.to(device)
# Run the model on the single example at index i; it is unpacked as (logits, loss).
logits, loss = model(X[i:i + 1], Y[i:i + 1])