# Data Preparation

Here we use the GPT-2 tokenizer from the `tiktoken` library instead of our own tokenizer.

In [33]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

In [34]:
# Read the whole training text into memory as a single UTF-8 string.
with open("data/the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [35]:
class LLMDataset(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. A window of ``max_length`` tokens is then
    moved over the token sequence in steps of ``stride``; each target is its
    input window shifted one token to the right (next-token prediction).

    If the text encodes to ``max_length`` tokens or fewer, no complete
    (input, target) pair fits and the dataset is empty.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.
        max_length: Number of tokens in every window; must be positive.
        stride: Offset between the starts of consecutive windows; must be
            positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        # Fail early with a clear message: a non-positive value would
        # otherwise give an empty dataset, malformed windows, or an opaque
        # ``range()`` error.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop at len - max_length so the target window, which reaches one
        # token further than the input window, always stays inside token_ids.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length+1]

            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]
    

def create_dataloader(
        text, batch_size=4, max_length=256,
        stride=128, shuffle=True,
        drop_last=True, num_workers=0
    ):
    """Build a DataLoader of sliding-window samples over ``text``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in an
    ``LLMDataset`` using the given window size and stride.

    Args:
        text: Raw text to tokenize and window.
        batch_size: Batch size passed to the DataLoader.
        max_length: Window length passed to ``LLMDataset``.
        stride: Window step passed to ``LLMDataset``.
        shuffle: Forwarded to the DataLoader.
        drop_last: Forwarded to the DataLoader.
        num_workers: Forwarded to the DataLoader.

    Returns:
        A ``(dataloader, vocab_size)`` tuple, where ``vocab_size`` is the
        tokenizer's ``n_vocab``.
    """
    encoding = tiktoken.get_encoding("gpt2")
    windows = LLMDataset(text, encoding, max_length, stride)

    # Collect the loader settings in one place and hand them over together.
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    loader = DataLoader(windows, **loader_options)

    return loader, encoding.n_vocab


In [46]:
# Seed the global RNG before building the shuffled loader so the sampled batch
# is the same on every Restart & Run All.
torch.manual_seed(123)

max_length = 4
dataloader, vocab_size = create_dataloader(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,  # stride == window length -> non-overlapping windows
    shuffle=True)

# Pull a single batch to use as the example input in the cells below.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

## Embeddings

In [47]:
# Width of each embedding vector.
output_dim = 256

# Seed the RNG so the randomly initialized embedding weights are reproducible.
torch.manual_seed(123)
# Lookup table with one learnable row of size output_dim per token ID.
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035,  ...,  1.3337,  0.0771, -0.0522],
        [ 0.2386,  0.1411, -1.3354,  ..., -0.0315, -1.0640,  0.9417],
        [-1.3152, -0.0677, -0.1350,  ..., -0.3181, -1.3936,  0.5226],
        ...,
        [ 0.5871, -0.0572, -1.1628,  ..., -0.6887, -0.7364,  0.4479],
        [ 0.4438,  0.7411,  1.1263,  ...,  1.2091,  0.6781,  0.3331],
        [-0.2537,  0.1446,  0.7203,  ..., -0.2134,  0.2144,  0.3006]],
       requires_grad=True)


In [48]:
# Look up an embedding vector for every token ID in the batch.
token_embeddings = embedding_layer(inputs)
# Separate lookup table with one vector per position 0 .. max_length - 1.
pos_embedding_layer = torch.nn.Embedding(max_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(max_length))

# pos_embeddings has no batch dimension, so the addition broadcasts it across
# every sequence in the batch.
input_embeddings = token_embeddings + pos_embeddings
input_embeddings.shape

torch.Size([8, 4, 256])