In [1]:
from torch.utils.data import DataLoader, Dataset
import torch
import tiktoken

In [2]:
class DatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once with ``tokenizer.encode`` (allowing the
    ``<|endoftext|>`` special token). Windows of ``max_length`` token ids are
    then taken every ``stride`` positions; each target window is the input
    window shifted one position to the right.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of token ids.
        max_length: Number of token ids per window.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Start positions stop max_length short of the end so that every
        # input window still has room for its one-token-shifted target.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` pair stored at ``idx``."""
        window_input = self.input_ids[idx]
        window_target = self.target_ids[idx]
        return window_input, window_target


In [3]:
def create_data_loader(text, max_length=256, stride=128,
                       batch_size=4, shuffle=True, drop_last=True,
                       num_workers=0):
    """Build a DataLoader over sliding-window token batches of ``text``.

    The text is tokenized with tiktoken's ``"gpt2"`` encoding and wrapped in
    a ``DatasetV1``, which is then handed to a ``DataLoader``.

    Args:
        text: Raw text to tokenize.
        max_length: Number of token ids per window (passed to ``DatasetV1``).
        stride: Step between window starts (passed to ``DatasetV1``).
        batch_size: Forwarded to ``DataLoader``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    windows = DatasetV1(text, tiktoken.get_encoding("gpt2"), max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [4]:
# Path of the text file read in the next cell (relative to the working directory).
DIR = "DATA/the-verdict.txt"

In [5]:
# Read the whole file into memory as a single UTF-8 decoded string.
with open(DIR, mode="r", encoding="utf-8") as f:
    raw_text = f.read()

In [6]:
# stride == max_length, so consecutive windows do not overlap; shuffling is
# turned off so the batch shown below is reproducible across runs.
dataloader = create_data_loader(
    text=raw_text,
    max_length=4,
    stride=4,
    batch_size=8,
    shuffle=False,
)

# Pull a single batch of (inputs, targets) to inspect it.
data = iter(dataloader)
inputs, targets = next(data)
print(inputs, inputs.shape)

tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]) torch.Size([8, 4])


In [7]:
# Embedding table size: one row per token id, output_dim values per row.
# NOTE(review): 50257 is presumably the vocabulary size of the "gpt2"
# encoding used by the data loader — confirm against the tokenizer.
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

In [8]:
# Look up one embedding vector per token id in the batch; the result keeps
# the shape of `inputs` and gains a trailing dimension of size output_dim.
token_embedding = token_embedding_layer(inputs)
print(token_embedding.size())

torch.Size([8, 4, 256])
