In [38]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

# Short sample sentence; presumably a scratch input for trying out the
# tokenizer/dataset by hand. Nothing in the visible cells reads it.
test_str: str = "this is the string that i want to test on to gain a better understanding"

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. A window of ``max_length`` tokens is then
    moved across the token sequence in steps of ``stride``; every sample's
    target is its input window shifted one position to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Any object exposing ``encode(txt)`` that returns a sliceable
            sequence of integer token IDs.
        max_length: Number of tokens per window. Must be positive.
        stride: Distance between the starts of consecutive windows. Must be
            positive. ``stride == max_length`` gives non-overlapping inputs.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.

    Note:
        A text that encodes to ``max_length`` tokens or fewer produces an
        empty dataset, because each window needs one extra token for the
        shifted target.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail loudly: a negative stride would otherwise give a silently empty
        # dataset, and max_length <= 0 would give degenerate empty windows.
        if max_length <= 0:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_tokens = []
        self.target_tokens = []

        tokens = tokenizer.encode(txt)

        # Stopping at len(tokens) - max_length guarantees the shifted target
        # slice always contains exactly max_length tokens.
        for start in range(0, len(tokens) - max_length, stride):
            end = start + max_length
            self.input_tokens.append(torch.tensor(tokens[start:end]))
            self.target_tokens.append(torch.tensor(tokens[start + 1:end + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_tokens)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at position ``idx``."""
        return self.input_tokens[idx], self.target_tokens[idx]

def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding-window token chunks of ``txt``.

    The text is encoded with tiktoken's ``"gpt2"`` encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are handed to ``DataLoader``.

    Args:
        txt: Raw text to tokenize and chunk.
        batch_size: Number of samples per batch.
        max_length: Window length in tokens, passed to ``GPTDatasetV1``.
        stride: Step between window starts, passed to ``GPTDatasetV1``.
        shuffle: Passed to ``DataLoader`` as ``shuffle``.
        drop_last: Passed to ``DataLoader`` as ``drop_last``.
        num_workers: Passed to ``DataLoader`` as ``num_workers``.

    Returns:
        A ``DataLoader`` whose batches are ``(inputs, targets)`` pairs.
    """
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        # Honour the caller's setting instead of pinning the loader to 0 workers.
        num_workers=num_workers,
    )

In [45]:
# Embedding table: one trainable row of `output_dim` floats per token ID.
# NOTE(review): vocab_size presumably matches the vocabulary of the "gpt2"
# encoding used by create_dataloader_v1 - confirm if the tokenizer changes.
vocab_size, output_dim = 50257, 256
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

raw_text = "i want to test getting vectors on this string to see how exactly it works and then get embeddings after for an even better understanding of the inner workings of the textual data and how tokens get transformed to vectors over time"

# stride == max_length, so consecutive input windows do not overlap;
# shuffle=False keeps the batches in text order.
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Only the first batch is inspected.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

# Look up an embedding vector for every token ID in the batch.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

print(token_embeddings)

Token IDs:
 tensor([[   72,   765,   284,  1332],
        [ 1972, 30104,   319,   428],
        [ 4731,   284,   766,   703],
        [ 3446,   340,  2499,   290],
        [  788,   651, 11525,    67],
        [  654,   706,   329,   281],
        [  772,  1365,  4547,   286],
        [  262,  8434, 35084,   286]])

Inputs shape:
 torch.Size([8, 4])
torch.Size([8, 4, 256])
tensor([[[-1.2513e+00, -5.1359e-01,  1.1075e+00,  ...,  1.6695e-01,
          -3.5110e-01,  8.4835e-01],
         [ 1.6229e+00, -1.4200e-01,  5.1070e-04,  ..., -1.1836e+00,
           5.6895e-01,  2.0970e+00],
         [ 1.5743e-01, -1.8287e-01,  1.5860e+00,  ...,  2.2926e+00,
           1.3959e-01,  9.1322e-01],
         [-8.6290e-01,  1.8534e+00,  4.5277e-01,  ...,  1.2264e+00,
          -1.0611e+00, -5.4521e-01]],

        [[-3.6909e-01,  1.3187e+00,  4.8116e-01,  ...,  1.5415e-01,
           3.1157e-02,  2.4635e-01],
         [ 2.3528e-01, -7.5543e-01, -3.2973e-01,  ...,  7.2989e-01,
           3.5565e-01, -6.922