In [None]:
    # poor man's data loader
data_dir = os.path.join('data', dataset)
# Both splits are opened read-only as memory-mapped uint16 token arrays,
# train first and then val, so nothing is loaded into RAM up front.
train_data, val_data = (
    np.memmap(os.path.join(data_dir, bin_name), dtype=np.uint16, mode='r')
    for bin_name in ('train.bin', 'val.bin')
)
def get_batch(split):
    """Draw one random batch of (input, target) token windows.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A tuple `(x, y)` of int64 tensors, each stacking `batch_size` windows
        of `block_size` tokens. `y` holds the same windows as `x` shifted one
        token to the right. Both tensors are moved to `device`.
    """
    source = val_data if split != 'train' else train_data
    # One random start offset per batch row; the upper bound leaves room for
    # the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()

    def window(begin):
        # Slice block_size tokens and widen them to int64.
        return torch.from_numpy((source[begin:begin + block_size]).astype(np.int64))

    x = torch.stack([window(begin) for begin in starts])
    y = torch.stack([window(begin + 1) for begin in starts])

    if device_type != 'cuda':
        return x.to(device), y.to(device)

    # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
    x = x.pin_memory().to(device, non_blocking=True)
    y = y.pin_memory().to(device, non_blocking=True)
    return x, y



In [None]:
# Define the dataset: start from an empty Deep Lake dataset at `path`
# (overwrite=True is passed, so an existing dataset there is presumably replaced).
ds = deeplake.empty(path, overwrite=True)

# Two tensors per sample: the raw document text and its token ids.
# Token ids are stored as uint16; both tensors use lz4 chunk compression.
ds.create_tensor('text', htype="text", chunk_compression='lz4')
ds.create_tensor('tokens', dtype=np.uint16, chunk_compression='lz4')

@deeplake.compute
def tokenize(example, ds):
    """Tokenize one document and append it to the output dataset.

    Args:
        example: The text to encode with `enc`.
        ds: Output dataset that receives one sample with the original text
            under "text" and the uint16 token ids under "tokens".
    """
    # encode_ordinary ignores any special tokens; the end of text token
    # (e.g. 50256 for gpt2 bpe) is added as the final id.
    token_ids = [*enc.encode_ordinary(example), enc.eot_token]
    sample = {"text": example, "tokens": np.array(token_ids).astype(np.uint16)}
    ds.append(sample)

# Tokenize the dataset: run the `tokenize` transform over the 'text' column of
# the selected split, writing into `ds` with `num_proc` workers and the
# "processed" scheduler.

tokenize().eval(split_dataset[split]['text'], ds, num_workers=num_proc, scheduler="processed")
# Commit the appended samples, then show a summary of the resulting dataset.
ds.commit()
ds.summary()



In [None]:
def collate_fn(data: List[dict]) -> Tuple[torch.Tensor, torch.Tensor]:
    """Collate function: sample training windows from a batch of documents.

    The token arrays of all documents in the batch are concatenated into one
    stream, and `batch_size` random windows are cut out of that stream.

    Args:
        data: Batch of samples; each sample is indexed with 'tokens' to get
            its array of token ids.

    Returns:
        A tuple `(x, y)` of int64 tensors with `batch_size` rows each. `y` is
        `x` shifted one token to the right. Rows are `block_size` long, or
        shorter when the concatenated stream has no more than `block_size`
        tokens.
    """
    # Concatenate all the tokens from the batch and widen them to int64 once,
    # instead of converting every sampled window separately.
    tokens = np.concatenate([sample['tokens'] for sample in data], axis=0).astype(np.int64)
    n_tokens = len(tokens)

    # Sample random window starts from the concatenated documents. The upper
    # bound keeps the shifted target window inside the stream; for short
    # streams the only start is 0 and the window shrinks to fit.
    starts = torch.randint(max(n_tokens - block_size, 1), (batch_size,)).tolist()
    local_block_size = min(block_size, n_tokens - 1)

    stream = torch.from_numpy(tokens)
    # torch.stack copies, so the returned tensors do not alias `stream`.
    x = torch.stack([stream[begin:begin + local_block_size] for begin in starts])
    y = torch.stack([stream[begin + 1:begin + 1 + local_block_size] for begin in starts])
    return x, y

def get_dataloader(split: deeplake.Dataset, shuffle: bool = False, coef: float = 2, num_workers: int = 1):
    """Returns a dataloader for the given split.

    `dataloader` is defined elsewhere in the file; per the original note it is
    the fast enterprise dataloader when that is available.

    Args:
        split: Dataset (or slice of one) to iterate over.
        shuffle: Passed to the loader's `.shuffle(...)` step.
        coef: Multiplier on `batch_size` giving how many documents are fetched
            per loader batch; `collate_fn` then samples windows from them.
        num_workers: Number of workers passed to `.pytorch(...)`.

    Returns:
        The object returned by `.pytorch(...)`, yielding what `collate_fn`
        produces for each batch of documents.
    """
    # Parenthesised chain instead of backslash continuations; the call order
    # (batch -> shuffle -> pytorch) is unchanged.
    return (
        dataloader(split)
        .batch(int(coef*batch_size), drop_last=True)
        .shuffle(shuffle)
        .pytorch(num_workers=num_workers, tensors=['tokens'], collate_fn=collate_fn, distributed=ddp)
    )



In [None]:
# Open the tokenized dataset read-only and switch to the requested branch.
ds = deeplake.load(dataset, read_only=True, token=token)
ds.checkout(branch)

meta_vocab_size = None

# Add up the per-sample token-tensor shapes to report the dataset size.
n_tokens = sum(ds._tokens_shape.numpy())
print(f'There are ~{n_tokens[0]//10**9}B tokens in the dataset')

# The first `train_split_ratio` fraction of samples is used for training and
# the remainder for validation.
split = int(len(ds)*train_split_ratio)
dl = dict(
    train=get_dataloader(ds[:split], shuffle=shuffle, num_workers=num_workers),
    val=get_dataloader(ds[split:], shuffle=False, num_workers=1),
)
# One live iterator per split.
dl_iter = {name: iter(loader) for name, loader in dl.items()}

In [None]:
$ python3 train.py --dataset="hub://activeloop/openwebtext-train"

In [None]:
$ torchrun --standalone --nproc_per_node=8 train.py --dataset="hub://activeloop/openwebtext-train"