In [211]:
import torch 
from torch import nn 
from torch.utils.data import Dataset,DataLoader
import tiktoken 

# Loading the raw text from disk

In [212]:
# Read the entire text file into one string; the context manager closes the file afterwards.
with open('the-verdict.txt', mode='r', encoding='utf-8') as story_file:
  raw_text = story_file.read()

In [213]:
class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-id pairs for next-token prediction.

  The text is tokenized once. A window of ``max_length`` token ids is then moved
  across the ids in steps of ``stride``; every input window is paired with the
  same window shifted one position to the right.

  Args:
    text: Raw string to tokenize.
    tokenizer: Object with an ``encode(text)`` method; it is expected to return
      a sliceable sequence of int token ids.
    max_length: Number of token ids per window; must be >= 1.
    stride: Offset between the starts of consecutive windows; must be >= 1.

  Raises:
    ValueError: If ``max_length`` or ``stride`` is smaller than 1.
  """

  def __init__(self,text,tokenizer,max_length=4,stride=1):
    # Fail loudly instead of silently building an empty or degenerate dataset.
    if max_length < 1:
      raise ValueError(f"max_length must be >= 1, got {max_length}")
    if stride < 1:
      raise ValueError(f"stride must be >= 1, got {stride}")

    self.token_ids = tokenizer.encode(text)
    self.X = []
    self.y = []

    # The upper bound leaves room for the target window, which reaches one
    # token further than the input window.
    for start in range(0, len(self.token_ids) - max_length, stride):
      stop = start + max_length
      input_ids = self.token_ids[start:stop]
      target_ids = self.token_ids[start + 1:stop + 1]

      # Store tensors rather than plain lists: DataLoader's default collate
      # function batches lists element-wise (one tensor per position) instead
      # of stacking them into a (batch, max_length) tensor.
      self.X.append(torch.tensor(input_ids))
      self.y.append(torch.tensor(target_ids))

  def __len__(self):
    """Return the number of (input, target) windows."""
    return len(self.X)

  def __getitem__(self,idx):
    """Return the ``idx``-th (input, target) pair of 1-D tensors."""
    return self.X[idx],self.y[idx]


In [214]:
# stride == max_length, so consecutive windows do not overlap.
dataset = GPTDatasetV1(
  text=raw_text,
  tokenizer=tiktoken.get_encoding('gpt2'),
  max_length=4,
  stride=4,
)
# Keep the original order and drop a trailing batch with fewer than 8 samples.
dataloader = DataLoader(dataset=dataset, batch_size=8, drop_last=True, shuffle=False)

In [215]:
# Show only the first batch; x and y stay bound for the cells that follow.
for x, y in dataloader:
  print(x, '--'*20, y, sep='\n')
  break

tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
----------------------------------------
tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [216]:
# Size of the embedding table (number of token ids) and width of each embedding vector.
vocab_size, output_dim = 50257, 256
token_embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)

# Every token id in the batch is replaced by its output_dim-sized vector.
token_embedding_layer(x).shape

torch.Size([8, 4, 256])

In [217]:
# One learnable vector per position inside a window of context_length tokens.
context_length = 4
pos_embedding_layer = nn.Embedding(
  num_embeddings=context_length,
  embedding_dim=output_dim,
)
pos_embedding_layer

Embedding(4, 256)

# The result of chapter 2: token embeddings plus positional embeddings

In [218]:
# Take the number of positions from the batch itself rather than a hard-coded 4,
# so this cell keeps working when the window length changes. The positional
# vectors are broadcast across the batch dimension when added.
(token_embedding_layer(x) + pos_embedding_layer(torch.arange(x.shape[1]))).shape

torch.Size([8, 4, 256])