# Intra-document masking

In [1]:
import os
import torch

from absl import app, flags
from itertools import chain
from functools import partial
from typing import Dict, List, Any

from datasets import load_dataset, Dataset
from transformers import GPT2Tokenizer
from timeit import default_timer as timer

from utils import concat_chunck, _make_intra_doc_causal_mask

  from .autonotebook import tqdm as notebook_tqdm


## Dummy data

In [2]:
# Some fake tokens: document `i` is the token id `i` repeated `doc_lengths[i]` times
doc_lengths = [1, 6, 2, 1, 8, 8, 8]
examples = {
    'input_ids': [[doc_id] * length for doc_id, length in enumerate(doc_lengths)]
}
# Last expression of the cell, so the chunked result is displayed
concat_chunck(examples, max_seq_length=5)

{'input_ids': [[0, 1, 1, 1, 1],
  [1, 1, 2, 2, 3],
  [4, 4, 4, 4, 4],
  [4, 4, 4, 5, 5],
  [5, 5, 5, 5, 5],
  [5, 6, 6, 6, 6]],
 'intra_docs_bounds': [[1, 4], [2, 2, 1], [5], [3, 2], [5], [1, 4]],
 'intra_docs_mask': [tensor([[ True, False, False, False, False],
          [False,  True, False, False, False],
          [False,  True,  True, False, False],
          [False,  True,  True,  True, False],
          [False,  True,  True,  True,  True]]),
  tensor([[ True, False, False, False, False],
          [ True,  True, False, False, False],
          [False, False,  True, False, False],
          [False, False,  True,  True, False],
          [False, False, False, False,  True]]),
  tensor([[ True, False, False, False, False],
          [ True,  True, False, False, False],
          [ True,  True,  True, False, False],
          [ True,  True,  True,  True, False],
          [ True,  True,  True,  True,  True]]),
  tensor([[ True, False, False, False, False],
          [ True,  True, F

Looks like it's working as expected. Let's try the batched mode.

In [3]:
# Some fake tokens: document `i` is the token id `i` repeated `doc_lengths[i]` times
doc_lengths = [1, 6, 2, 1, 8, 8, 8]
examples = {
    'input_ids': [[doc_id] * length for doc_id, length in enumerate(doc_lengths)]
}

# Wrap the dict in a `Dataset` so it can go through `Dataset.map`
fake_ds = Dataset.from_dict(examples)
fake_ds

Dataset({
    features: ['input_ids'],
    num_rows: 7
})

In [None]:
# Keyword arguments forwarded to `Dataset.map`
map_setup = {
    'batched': True,
    'batch_size': 1024,
    'num_proc': 2,
}

# Fix the chunk length once, then map the chunking function over the fake dataset
chunk_fn = partial(concat_chunck, max_seq_length=5)
lm_datasets = fake_ds.map(chunk_fn, **map_setup)

Map (num_proc=2):   0%|                                                                       | 0/7 [00:00<?, ? examples/s]

In [None]:
# Show the mapped dataset as the cell's output value instead of printing it
lm_datasets

Higher token loss, makes sense.

In [None]:

# --- Scratch notes: every line in this cell is a comment, nothing here executes ---
# REPL transcript inspecting a 10,000-row slice of `tokenized_datasets`:
# `ds[:]` yields a dict whose 'input_ids' is a list of variable-length lists.
# ds = tokenized_datasets.take(10_000)
# examples = ds[:]
#
# >>> type(ds)
# <class 'datasets.arrow_dataset.Dataset'>
# >>> len(ds)
# 10000
# >>> ds
# Dataset({
#     features: ['input_ids'],
#     num_rows: 10000
# })
# >>> 
# >>> type(examples)
# <class 'dict'>
# >>> type(examples['input_ids'])
# <class 'list'>
# >>> len(examples['input_ids'][0])
# 157
# >>> len(examples['input_ids'][1])
# 1257
# >>>
# Commented-out pure-Python mask construction: for each chunk, entry [i][j] is 1
# when positions i and j carry the same document id (no causal constraint here).
# NOTE(review): `lengths`, `total_length`, `max_seq_length` and `result` are not
# defined in this notebook -- presumably lifted from inside a helper; confirm.
    # doc_ids = list(chain(*[[i]*l for i, l in enumerate(lengths)]))
    # result['intra_mask'] = [
    #     [
    #         [1 if doc_ids[i] == doc_ids[j] else 0
    #          for j in range(start, start+max_seq_length)]
    #         for i in range(start, start+max_seq_length)
    #     ]
    #     for start in range(0, total_length, max_seq_length)
    # ]


## FineWebEdu

In [None]:
# Cell configuration
nrows = 1000     # number of rows taken from the dataset
seq_len = 2048

# These two toggles were used by this cell without being defined anywhere in the
# notebook, which raises NameError on a fresh kernel (Restart & Run All).
# streaming=True matches the "streaming mode" intent of the load below.
streaming = True
# NOTE(review): True is an assumed default for the shuffle toggle -- confirm.
shuffle_raw_data = True

# Load in streaming mode, creates an IterableDataset
raw_dataset = load_dataset(
  "HuggingFaceFW/fineweb-edu",
  name="sample-10BT",
  split="train",
  streaming=streaming,
  columns=["text"],
)

# From IterableDataset to Dataset
iterable_ds = raw_dataset.take(nrows)
def gen_from_iterable_dataset(iterable_ds):
  """Yield every item of `iterable_ds`, one at a time, in iteration order."""
  for example in iterable_ds:
    yield example
# Bind the iterable so `from_generator` receives a zero-argument generator
# function; the materialized dataset reuses the iterable's `features`.
partial_obj = partial(gen_from_iterable_dataset, iterable_ds)
dataset = Dataset.from_generator(
  generator=partial_obj,
  features=iterable_ds.features,
)


# Shuffle so that multiproc has shards of similar size
if shuffle_raw_data:
  dataset = dataset.shuffle(seed=1996)