In [3]:
import os

In [4]:
# Load dataset from Hugging Face datasets library
from datasets import load_dataset, concatenate_datasets

# Root directory of the locally stored project data (plain string: nothing to interpolate).
DATA_DIR = "/home/pranav-pc/projects/OpenTransformer/multiformer/data"
TINYSTORIES_PATH = os.path.join(DATA_DIR, "downloads/TinyStories")

# Load the train and validation splits from the same local path and merge them into
# one dataset. The per-split objects are never bound to notebook-level names, so
# there is nothing to `del` afterwards.
# NOTE(review): trust_remote_code=True allows the dataset's loading script to execute;
# kept as in the original, confirm the local script is trusted.
dataset = concatenate_datasets(
    [
        load_dataset(path=TINYSTORIES_PATH, trust_remote_code=True, split=split_name)
        for split_name in ("train", "validation")
    ]
)

In [5]:
from src.tokenize.tokenizer import Tokenizer

# Filesystem location handed to Tokenizer as its checkpoint argument.
TOKENIZER_CHECKPOINT = "/home/pranav-pc/projects/OpenTransformer/multiformer/tokenizer_checkpoints/"

# Instantiate the project's tokenizer from that location.
tokenizer = Tokenizer(TOKENIZER_CHECKPOINT)

In [6]:
# Take the "text" field of the first example as the sample used by the cells below.
first_example = dataset[0]
text = first_example["text"]
text

'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'

In [30]:
# Number of integer ids the tokenizer produces for the sample story.
token_ids = tokenizer.encode(text, out_type=int)
len(token_ids)

186

In [39]:
# Inspect how a blank line (two consecutive newlines) is encoded.
blank_line = "\n\n"
tokenizer.encode(blank_line, out_type=int)

[1, 29871, 13, 13, 2]

In [38]:
# Print the full id sequence of the sample story on one line.
encoded_story = tokenizer.encode(text, out_type=int)
print(encoded_story)

[1, 3118, 2462, 29892, 263, 2217, 7826, 4257, 365, 2354, 1476, 263, 817, 280, 297, 902, 5716, 29889, 2296, 6363, 372, 471, 5189, 304, 1708, 411, 372, 1363, 372, 471, 15301, 29889, 365, 2354, 5131, 304, 6232, 278, 817, 280, 411, 902, 16823, 29892, 577, 1183, 1033, 409, 29893, 263, 2826, 373, 902, 528, 2728, 29889, 13, 13, 29931, 2354, 3512, 304, 902, 16823, 322, 1497, 29892, 376, 29924, 290, 29892, 306, 1476, 445, 817, 280, 29889, 1815, 366, 6232, 372, 411, 592, 322, 409, 29893, 590, 528, 2728, 3026, 2439, 16823, 25156, 322, 1497, 29892, 376, 8241, 29892, 365, 2354, 29892, 591, 508, 6232, 278, 817, 280, 322, 2329, 596, 528, 2728, 1213, 13, 13, 29911, 12966, 29892, 896, 7258, 278, 817, 280, 322, 409, 8734, 278, 2826, 373, 365, 2354, 29915, 29879, 528, 2728, 29889, 739, 471, 451, 5189, 363, 963, 1363, 896, 892, 19383, 322, 19912, 1269, 916, 29889, 2860, 896, 7743, 29892, 365, 2354, 6452, 287, 902, 16823, 363, 19383, 278, 817, 280, 322, 27826, 902, 528, 2728, 29889, 2688, 1716, 7091, 9796,

In [42]:
# Decode a prefix of the sample story's ids with two extra 13 ids (the id the
# "\n\n" probe produced for each newline) inserted before the last two ids, to see
# how decode_ids renders them.
probe_ids = [
    1, 3118, 2462, 29892, 263, 2217, 7826, 4257, 365, 2354,
    1476, 263, 817, 280, 297, 902, 5716, 29889, 2296, 6363,
    372, 471, 5189, 304, 1708, 411, 372, 1363, 372, 471,
    15301, 29889, 365, 2354,
    13, 13,
    5131, 304,
]
decoded_probe = tokenizer.decode_ids(probe_ids)
print(decoded_probe)

One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily

 wanted to


In [None]:
# Tokenize every story in batches of 512 and keep at most `block_size` ids per example,
# replacing the original columns with a single "idx" column.
# `block_size` is not defined in the cells shown here and must exist before this runs.
# NOTE(review): encode is called without out_type=int here, unlike the probe cells
# above; confirm its default returns integer ids for a list of texts.
data = dataset.map(
    lambda example: {
        "idx": [en[:block_size] for en in tokenizer.encode(example["text"])]
    },
    batch_size=512,
    batched=True,
    # `dataset` is the single concatenated Dataset, not a DatasetDict, so its
    # column names are read directly rather than through a "train" split.
    remove_columns=dataset.column_names,
)


# Define collate function to handle padding
def collate_fn(batch):
    """Turn a list of token-id sequences into padded (input, target) tensors.

    For each sequence the input drops the last id and the target drops the first
    id, so the target is the input shifted by one position. Both lists are then
    padded to the longest sequence in the batch, batch dimension first, using the
    tokenizer's EOS id as the fill value.

    Args:
        batch: iterable of token-id sequences (one per example).

    Returns:
        Tuple ``(x_padded, y_padded)`` of padded tensors.
    """
    # NOTE(review): padded target positions hold the EOS id, so they cannot be told
    # apart from a real EOS; confirm the loss handles this as intended.
    pad_id = tokenizer.eos_id()

    inputs, targets = [], []
    for token_ids in batch:
        inputs.append(torch.tensor(token_ids[:-1]))  # everything but the last id
        targets.append(torch.tensor(token_ids[1:]))  # everything but the first id

    x_padded = pad_sequence(inputs, batch_first=True, padding_value=pad_id)
    y_padded = pad_sequence(targets, batch_first=True, padding_value=pad_id)
    return x_padded, y_padded


# Sort the examples by sequence length and keep shuffle off: the simplest way of
# implementing sequence-length batch sampling.
# `data` is a single Dataset (it has no "train" split), so the "idx" column is read
# from it directly.
train_data = sorted(data["idx"], key=len)

# Create the DataLoader with the padding collate function.
# `batch` (used as the batch size) is not defined in the cells shown here and must
# exist before this runs.
train_loader = DataLoader(
    train_data, batch_size=batch, collate_fn=collate_fn, shuffle=False
)