In [1]:
from datasets import Dataset, load_dataset
import torch

from gpt_tokenizers import BytePairEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import GPT2TokenizerFast
token = GPT2TokenizerFast.from_pretrained('gpt2')
token.add_special_tokens({'pad_token': '<pad>', "bos_token": "<bos>", "cls_token": "<cls>", "sep_token": "<s>", "mask_token": "<mask>"})
token = byte_pair_encoder  # pllsss sjust work

NameError: name 'byte_pair_encoder' is not defined

In [3]:
import pyarrow as pa

# Custom ChunkyText Dataset
Loads up a dataset with text chunks so we can process faster with map, and take advantages of the features that the dataset object offers. 

It integrates with [Apache Arrow](https://arrow.apache.org/overview/) ([more info](https://huggingface.co/docs/datasets/about_arrow)) to give these advantages.


In [4]:

class chunkyText(Dataset):
    def __init__(self, file, chunk_size):
        # self._data: Table = _check_table(arrow_table)
        self.chunks = []
        with open(file, encoding="utf-8") as f:
            text = f.read()
            start_idx = 0
            while start_idx < len(text):
                end_idx = min(start_idx + chunk_size, len(text))
                chunk = text[start_idx:end_idx]
                self.chunks.append(chunk)
                start_idx = end_idx

        # sets all of the needed vars underthehood for more complex dataset features
        chunks = pa.array(self.chunks, type=pa.string())
        arrow_table = pa.table([chunks], names=["chunks"])
        Dataset.__init__(self, arrow_table) 

    def __len__(self):
        return len(self.chunks)  # this wont ever update

    def __getitem__(self, idx):
        if not self.last:
            self.last = next(self.mapped_iter)

        nextval = next(self.mapped_iter)
        val = (self.last, nextval)
        self.last = next

        return (val)

    def __getitem__(self, idx):  
        return(self.chunks[idx])

# How to load the dataset

Use dataset text to load the files line by line and each row it just a line of string text. Its nice, but ever string has a diff len.
```python
load_dataset("text", data_files={"train": ["my_text_1.txt", "my_text_2.txt"], "test": "my_test_file.txt"})
```
You can also use the datadir arg to specify a dir of text file

In [None]:
from datasets import load_dataset

In [5]:
chunk_size = 1000 # 1000000 is a good number for larger datasets
# we could also just use the shard method here
dataset = load_dataset("text", data_files={"train": ["input_data_files/kant.txt"]})
# dataset = chunkyText("input_data_files/kant.txt", chunk_size)

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 6260.16it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 932.27it/s]
Generating train split: 188287 examples [00:00, 2997873.89 examples/s]


In [8]:
from functools import partial
encodingPartial = partial(token, add_special_tokens=True, truncation=True, max_length=128)

In [10]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=encodingPartial,
    file_path="input_data_files/kant.txt",
    block_size=128,
)

In [14]:
dataset[0]

{'input_ids': tensor([  464,  4935, 20336,   412, 10482,   286,   383, 10056,  2350,   286,
         17129, 23219,    11,   416,  1846, 18713, 29576])}

# Processing the dataset into tokens
Using the map method for dataset lets you use multiprocessing to process the dataset in batches
```python
dataset_with_duplicates = dataset.map(lambda batch: {"b": batch["a"] * 2}, remove_columns=["a"], batched=True, batch_size=1000, num_proc=10)
```
The num proc is what specifies how many thread to use.
There is also a with_rank=True parameter for multiprocessing

In [7]:
dataset.shape 

(11231, 1)

In [8]:
from multiprocess import set_start_method
set_start_method("fork")  # for multiprocessing I think you need to set this on windows to fork

# Encoding with map
Map is a nice way to apply a transfomation to a dataset. It has options for multiprocessing, batching, and it showes your progress.

## Batching with BPE
If you try and use the batch mode on map it will return a batch with lists, which wont work with the decoder

## Multiprocessing with BPE
Multiprocessing is tricky in python, because it serializes all of its data into an entirelly new process, and intercommunication between the new process, and the original is difficult and resource intensive because every object must be serialized then unserialized and its asynchronus. 
Because of this its really tricky to create a new process with the BPE tokenizer, and you have to look into how python multiprocessing works and what kind of spawn time you are using.

In [22]:
# sing core encoding
#encodedDataset = dataset.map(lambda chunk : {"tokens": byte_pair_encoder.encode(str(chunk))})#, remove_columns="chunks")#, num_proc=4)

Map: 100%|██████████| 1631/1631 [08:41<00:00,  3.13 examples/s]


In [93]:
# with multiproc
# this function needs to be cleaned up
encodedDataset = dataset.map(lambda chunk : {"tokens": byte_pair_encoder.encode(str(chunk))}, remove_columns="chunks", num_proc=12)


[A
[A
[A
[A
[A
Map (num_proc=12): 100%|██████████| 11231/11231 [00:00<00:00, 13749.28 examples/s]


In [94]:
len(encodedDataset)

11231

In [10]:
# It possible to do something like this with batching built in

# batch_size = 16
# encodedDataset = dataset.map(byte_pair_encoder.encodethingy, batched=True, batch_size=batch_size, num_proc=4)

# Adding Labels to the old dataset


In [11]:
# # this was the old way of doing it, it wont work like this

# block_size = 256
# batch_size = 16
# ix = torch.randint(len(encodedDataset) - block_size, (batch_size,))
# x = torch.stack([encodedDataset[i:i + block_size] for i in ix])
# y = torch.stack([encodedDataset[i + 1:i + block_size + 1] for i in ix])
# # device = data[0].device
# # x, y = x.to(device), y.to(device)

: 

: 

In [11]:
print(encodedDataset[0].keys())

dict_keys(['tokens'])


In [95]:
from math import ceil
def process(block, block_size=256, column="tokens", Mag=.01):  # if mag is you will probably run out of space
    # Using mag is a big cope, but if you dont it will almost def crash your pc

    if not block:
        return()  # sometimes this gets a nothing?
    block = block[column]
    validWindows = len(block) - block_size
    if validWindows < 1:
        return({"input_ids":[], "labels":[]}) # cant use blocks smaller than. Maybe this should be done before hand
    step = ceil(validWindows * Mag)
    xwindows = []
    ywindows = []
    for yidx in range(1, (len(block) - block_size - 1), step):
        xidx = yidx-1  # transformers pretidct the next token (and the rest)?
        xwindows.append(block[xidx: xidx+block_size])
        ywindows.append(block[yidx: yidx+block_size])

    # print(f"inputsize{len(block)} outsizes, x:{len(xwindows)} y:{len(ywindows)}")

    #return({"input_ids":"decoy"})#, "labels":ywindows})
    return({"input_ids":xwindows, "labels":ywindows})
    # YOUR MAP FUNCTION CANT EVER RETURN NONE THIS WILL GIVE YOU A DOGWATER ERROR TRACE

In [96]:
xydataset = encodedDataset.map(process, remove_columns="tokens", num_proc=12)



[A
[A
Map (num_proc=12): 100%|██████████| 11231/11231 [00:00<00:00, 19418.91 examples/s]


In [97]:
print(f"New size of dataset after magnification: {len(xydataset)}\n Now with {xydataset.column_names} columns")

New size of dataset after magnification: 11231
 Now with ['input_ids', 'labels'] columns


In [44]:
process(encodedDataset[0]) # !!! Otherwise it will just do nothing!!!

{'input_ids': [[90,
   6,
   359,
   320,
   3813,
   6,
   25,
   220,
   6,
   59,
   77,
   294,
   1903,
   361,
   1189,
   2709,
   305,
   278,
   65,
   1013,
   954,
   33,
   443,
   289,
   296,
   988,
   81,
   279,
   904,
   289,
   1903,
   351,
   2799,
   68,
   280,
   291,
   11,
   531,
   271,
   76,
   3041,
   84,
   303,
   6567,
   375,
   59,
   77,
   59,
   77,
   5288,
   300,
   33,
   443,
   323,
   312,
   264,
   1239,
   289,
   5372,
   1165,
   3191,
   390,
   624,
   5018,
   266,
   334,
   59,
   2529,
   76,
   1206,
   624,
   753,
   5845,
   907,
   790,
   987,
   2522,
   13,
   220,
   5476,
   1483,
   396,
   6097,
   337,
   11,
   2396,
   337,
   962,
   533,
   59,
   77,
   262,
   12,
   1703,
   337,
   975,
   264,
   3698,
   1334,
   289,
   264,
   1903,
   361,
   1189,
   2709,
   305,
   278,
   65,
   1013,
   1170,
   318,
   527,
   4884,
   275,
   59,
   77,
   86,
   332,
   529,
   300,
   33,
   443,
   533,
   29

In [37]:
# you have to specify the tokens column!!!
test = process(encodedDataset[0]['tokens'])

In [38]:
test.keys()

dict_keys(['input_ids', 'labels'])

In [45]:
len(test['labels'])

95

# Testing the dataloader with hugging faces tech

this was the old way of running the model from the batch loader:
```python
X, Y = load_batch(data, block_size, batch_size)
X, Y = X.to(device), Y.to(device)
logits, loss = model(X[i:i + 1], Y[i:i + 1])
```

In [15]:
from BigramGPT import BigramLanguageModel

In [16]:
model = BigramLanguageModel()

In [17]:
from transformers import DataCollatorForLanguageModeling

In [18]:
collate_fn = DataCollatorForLanguageModeling(
    tokenizer=token, mlm=False
)

In [22]:
from transformers import Trainer, TrainingArguments

  return torch._C._cuda_getDeviceCount() > 0


In [None]:
# def collate_fn(batch):
#     input_tensors = [item[0] for item in batch]
#     target_tensors = [item[1] for item in batch]
#     input_tensors_padded = pad_sequence(input_tensors, batch_first=True)
#     target_tensors_padded = pad_sequence(target_tensors, batch_first=True)
#     return input_tensors_padded, target_tensors_padded

In [19]:
batch_size=16

In [25]:
training_args = TrainingArguments(  # we can overwrite the trainer, and make use our own optim
    output_dir="tests",
    overwrite_output_dir=False,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    save_steps=1,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=dataset,
)

In [26]:
trainer.train() 

  0%|          | 0/10686 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


IndexError: index out of range in self

In [102]:
type(BytePairEncoder)

type

In [87]:
# looks like an issue with the dataloader

In [None]:
dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
    )