In [1]:
# Read the entire text file into memory as one string (decoded as UTF-8).
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
# Sanity check: report the character count and preview the first 99 characters.
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
import tiktoken
import torch 
from torch.utils.data import Dataset, DataLoader

In [3]:
# Notebook-level tokenizer using tiktoken's "gpt2" encoding.
# dataloader_v1 reads this global directly instead of taking a tokenizer argument,
# so this cell must have run before any dataloader is built.
tokenizer = tiktoken.get_encoding("gpt2")


In [6]:
class gptdatasetv1(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs for next-token prediction.

    The text is tokenized once. A window of ``max_length`` tokens is then taken
    every ``stride`` tokens; the matching target is the same window shifted one
    token to the right, so ``target[j]`` is the token that follows ``input[j]``.

    Args:
        text: Raw string to tokenize.
        tokenizer: Object exposing ``encode(text)``; the result must support
            ``len()`` and slicing and be convertible with ``torch.tensor``.
        max_length: Number of tokens in every input/target window (must be > 0).
        stride: Distance between the starts of consecutive windows (must be > 0).
            A stride smaller than ``max_length`` produces overlapping windows.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.

    Note:
        A text that encodes to ``max_length`` tokens or fewer yields an empty
        dataset (``len(ds) == 0``), because every window needs one extra token
        for its shifted target.
    """

    def __init__(self, text, tokenizer, max_length=256, stride=128):
        # Fail fast: a non-positive window or step would otherwise silently
        # build empty/garbage windows (or raise an opaque range() error).
        if max_length <= 0:
            raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride!r}")

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride
        self.tokens = tokenizer.encode(text)
        self.input_ids = []
        self.target_ids = []

        # Stop at len - max_length so that the target slice, which ends at
        # i + max_length + 1, never runs past the last token.
        for i in range(0, len(self.tokens) - max_length, stride):
            input_chunk = self.tokens[i: i + max_length]
            target_chunk = self.tokens[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [7]:
def dataloader_v1(text, batch_size = 4, max_length= 256, stride= 128, shuffle = True, drop_last = True, num_workers= 0):
    """Build a DataLoader serving sliding-window (input, target) batches from ``text``.

    The text is wrapped in a ``gptdatasetv1``. The tokenizer is NOT a parameter:
    the notebook-level ``tokenizer`` variable is read from the enclosing
    namespace, so it must be defined before this function is called.

    Args:
        text: Raw string to tokenize and window.
        batch_size: Number of windows per batch.
        max_length: Tokens per window, forwarded to ``gptdatasetv1``.
        stride: Step between window starts, forwarded to ``gptdatasetv1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``torch.utils.data.DataLoader`` over the windowed dataset.
    """
    windowed_dataset = gptdatasetv1(text, tokenizer, max_length, stride)
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)
# Loader with all defaults: batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True.
dataloader = dataloader_v1(raw_text)

In [8]:
# Pull one batch. The loader was built with shuffle=True and no seed is set in
# the cells shown, so which windows land in this batch is not fixed across runs.
data_iter = iter(dataloader)
input_ids, target_ids = next(data_iter)

In [9]:
# Inputs and targets have the same shape: (batch_size, max_length), i.e. [4, 256] here.
print("Input Ids shape:", input_ids.shape)
print("Target Ids shape:", target_ids.shape)

Input Ids shape: torch.Size([4, 256])
Target Ids shape: torch.Size([4, 256])


In [11]:
# Show the token ids of the sampled batch (print() abbreviates long rows with "...").
print(input_ids)


tensor([[ 2612,  4369,    11,  ...,   655,  4030,   465],
        [  286,   616,  4286,  ...,   470,   910,   416],
        [22645,    11,   465,  ...,   286,   340,   477],
        [  503,  4291,   262,  ..., 19217, 24887, 13431]])


In [12]:
# create token embeddings 



In [13]:
# Toy example: four token ids, a 6-entry vocabulary, 3-dimensional embeddings.
# NOTE: this rebinds `input_ids`, replacing the batch drawn from the dataloader above.
input_ids = torch.tensor([2,3,5,1])
vocab_size = 6
output_dim = 3

In [14]:
# Lookup table with one output_dim-sized row per vocabulary entry; calling the
# layer with ids returns the matching rows.
# NOTE(review): no random seed is set in the cells shown, so the weights (and the
# printed values below) will presumably differ on every fresh run.
embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)
embeddings = embedding_layer(input_ids)

In [15]:
# One 3-dimensional row per input id, in the order the ids were given (2, 3, 5, 1).
print(embeddings)

tensor([[ 0.9108,  0.1050, -0.8186],
        [-1.0883, -0.9800,  0.7393],
        [ 1.7231,  1.0449, -0.7985],
        [-0.6408, -0.2946,  0.9640]], grad_fn=<EmbeddingBackward0>)


In [16]:
# Look up every id 0..5 to display the whole table; rows 2, 3, 5 and 1 are the
# same vectors that were returned for input_ids = [2, 3, 5, 1] above.
print(embedding_layer(torch.tensor([0,1,2,3,4,5])))

tensor([[ 0.5095, -0.3179,  0.6937],
        [-0.6408, -0.2946,  0.9640],
        [ 0.9108,  0.1050, -0.8186],
        [-1.0883, -0.9800,  0.7393],
        [ 0.0346,  1.0768, -0.9972],
        [ 1.7231,  1.0449, -0.7985]], grad_fn=<EmbeddingBackward0>)


In [17]:
# Token embedding layer at realistic scale. The positional embeddings that encode
# token positions are created in later cells and added to these token embeddings.

# NOTE(review): 50257 is presumably the vocabulary size of the "gpt2" tokenizer — confirm against the tokenizer.
vocab_size = 50257
output_dim = 256 

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)


In [18]:
# Small windows for the demo: stride == max_length, so consecutive windows do not overlap.
# NOTE: rebinds dataloader, data_iter, input_ids and target_ids from the earlier cells.
max_length = 4
dataloader = dataloader_v1(raw_text, batch_size=2, max_length=max_length, stride=max_length)    
data_iter   = iter(dataloader)
input_ids, target_ids = next(data_iter)

In [19]:
# Embed every token id in the batch: (batch, tokens) -> (batch, tokens, output_dim).
# NOTE(review): "token_embeddinds" is a typo for "token_embeddings". The later cell
# that adds the positional embeddings reads the same misspelled name, so both
# cells must be renamed together.
token_embeddinds = token_embedding_layer(input_ids)
print("Token embeddings shape:", token_embeddinds.shape)

Token embeddings shape: torch.Size([2, 4, 256])


In [20]:
# One vector per position 0..context_length-1, with the same width (output_dim)
# as the token embeddings so the two can be added element-wise.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)


In [21]:
# Look up positions 0..context_length-1; unsqueeze(0) adds a leading axis of size 1,
# giving shape (1, context_length, output_dim).
# NOTE: the next cell recomputes pos_embeddings without the leading axis and overwrites this value.
pos_embeddings = pos_embedding_layer(torch.arange(0, context_length).unsqueeze(0))
print("Positional embeddings shape:", pos_embeddings.shape)

Positional embeddings shape: torch.Size([1, 4, 256])


In [22]:
# Same lookup without a leading batch axis: shape (context_length, output_dim).
# This overwrites the (1, context_length, output_dim) tensor from the previous cell.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print("Positional embeddings shape:", pos_embeddings.shape)

Positional embeddings shape: torch.Size([4, 256])


In [23]:
# Broadcasting adds the (4, 256) positional table to each sequence of the
# (2, 4, 256) token-embedding batch; the result keeps the batch shape.
input_embeddings = token_embeddinds + pos_embeddings
print("Input embeddings shape:", input_embeddings.shape)

Input embeddings shape: torch.Size([2, 4, 256])
