In [None]:
!pip install tiktoken

In [None]:
from importlib.metadata import version
import tiktoken

# Load the GPT-2 byte-pair-encoding (BPE) tokenizer and report the installed tiktoken version.
tokenizer = tiktoken.get_encoding("gpt2")
print("tiktoken version:", version("tiktoken"))

sample_sentence = "This is a beautiful day! <|endoftext|> Will it be raining?"

# <|endoftext|> is passed via allowed_special so it is encoded as a special token.
ids = tokenizer.encode(sample_sentence, allowed_special={"<|endoftext|>"})

print(ids)

# Map the token ids back to text.
words = tokenizer.decode(ids)

print(words)

# Token id 50256 is the <|endoftext|> token.
# The tokenizer breaks unknown words down into known tokens:
# frequent characters are merged into subwords, and frequent subwords into words.

sample_sentence_2 = "Break. This. Down. BAr. gwhm."
ids_2 = tokenizer.encode(sample_sentence_2)
print(ids_2)


Data sampling with a sliding window
Source text: https://www.gutenberg.org/cache/epub/74/pg74.txt
(The Adventures of Tom Sawyer by Mark Twain)

In [None]:
# Read the whole book from the local file pg74.txt.
with open("pg74.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Tokenize the full text once; report both sizes, labelled, so the
# character count is not mistaken for the token count.
encoded_text = tokenizer.encode(text)
print("Characters:", len(text))
print("Tokens:", len(encoded_text))

Training dataset: the targets are the inputs shifted by one token:


In [None]:
# Build a single input/target pair: the target window is the input window
# moved one token ahead.
context_size = 4
encoded_sample = encoded_text[100:]  # drop the first 100 tokens
x, y = encoded_sample[:context_size], encoded_sample[1:context_size + 1]
print(f"x: {x}", f"y: {y}", sep="\n")

In [None]:
# Grow the context one token at a time; the token right after the context
# is the prediction target.
for i in range(1, context_size + 1):
    context, target = encoded_sample[:i], encoded_sample[i]
    print(context, "---->", target)
    

    

In [None]:
!pip install torch

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class LLMDataset(Dataset):
    """Sliding-window next-token-prediction dataset built from raw text.

    The text is tokenized with the GPT-2 tiktoken encoding and cut into input
    windows of ``seq_length`` tokens, one window every ``step`` tokens. Each
    target window is its input window shifted one token to the right, so
    position ``k`` of the target holds the token that follows position ``k``
    of the input.

    Args:
        text: Raw text to tokenize.
        seq_length: Number of tokens in every input and target window.
        step: Distance, in tokens, between the starts of consecutive windows.

    Raises:
        ValueError: If ``seq_length`` or ``step`` is smaller than 1.
    """

    def __init__(self, text, seq_length, step):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")
        if step < 1:
            raise ValueError(f"step must be >= 1, got {step}")

        self.tokenizer = tiktoken.get_encoding("gpt2")
        self.ids = self.tokenizer.encode(text)
        self.x = []
        self.y = []

        # The range stops at len(ids) - seq_length (exclusive), which leaves
        # room for the one-token look-ahead of the target window; every pair
        # therefore has exactly seq_length tokens.
        for i in range(0, len(self.ids) - seq_length, step):
            x_i = self.ids[i : i + seq_length]
            # Targets are shifted by ONE token, independent of the stride.
            y_i = self.ids[i + 1 : i + seq_length + 1]
            self.x.append(torch.tensor(x_i))
            self.y.append(torch.tensor(y_i))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.x)

    def __getitem__(self, i):
        """Return the ``(input, target)`` tensor pair stored at index ``i``.

        Negative indices count from the end, as with a list.

        Raises:
            IndexError: If ``i`` is outside the dataset. IndexError (not any
                other exception type) is what the sequence iteration protocol
                relies on to detect the end of the data.
        """
        if not -len(self.x) <= i < len(self.x):
            raise IndexError(f"Index {i} is out of range")
        return self.x[i], self.y[i]
    
# Windows are 4 tokens long and start every 4 tokens, so the input windows
# do not overlap.
llm_dataset = LLMDataset(text=text, seq_length=4, step=4)

# Dataset size, then the (input, target) pair stored at index 100.
print(len(llm_dataset))
print(llm_dataset[100])

In [None]:
# Batch the dataset in its stored order (no shuffling), loading in the main
# process (num_workers=0).
dataloader = DataLoader(
    llm_dataset,
    batch_size=8,
    shuffle=False,
    drop_last=True,
    num_workers=0,
)
# Note: a common length for LLMs is 256.
data_iterator = iter(dataloader)
print(data_iterator)

# Pull the first two batches by hand and show their inputs and targets.
x1, y1 = next(data_iterator)
print(x1, y1)
x2, y2 = next(data_iterator)
print(x2, y2)

Token Embeddings

In [None]:
# Seed the RNG before creating the layer so its random initial weights are
# the same on every run.
torch.manual_seed(0)
embedding_layer = torch.nn.Embedding(10, 4)  # num_embeddings=10, embedding_dim=4

# These weights will be optimized during training.
print(embedding_layer.weight)

# The embedding layer looks up rows of its weight matrix by token id;
# token id 2 returns row 2.
print(embedding_layer(torch.tensor([2])))


Encoding word positions:
Absolute positional embeddings encode a token's exact position in the sequence (used by GPT). In the original Transformer model they were predefined (fixed).
Relative positional embeddings encode how far apart tokens are from each other (their relative positions) rather than their exact positions in the sequence.
In GPT models the positional embeddings are optimized during the training process.

Initial positional embeddings:
(The original GPT-3 model uses 12,288-dimensional embeddings.)

In [None]:
embedding_dimensions = 256
# 50257 rows, one per token id (presumably the GPT-2 vocabulary, ids 0..50256).
embedding_layer = torch.nn.Embedding(50257, embedding_dimensions)

print("Token IDs", x1)
print("Shape", x1.shape)

# The lookup appends an embedding axis to the batch:
# batch_size x sequence_length x embedding_dim -> an 8 x 4 x 256 tensor.
x1_embeddings = embedding_layer(x1)
print(x1_embeddings.shape)


In [None]:
# Absolute positional embeddings.
# The context length is the number of positions the layer can embed
# (the input text itself can be longer than the context length).
context_length = sequence_length = 4
position_embedding_layer = torch.nn.Embedding(context_length, embedding_dimensions)

# One embedding vector for each position 0 .. sequence_length - 1.
position_embeddings = position_embedding_layer(
    torch.arange(sequence_length)
)
print(position_embeddings.shape)

# The position embeddings are added to the token embeddings of the batch.
full_embeddings = x1_embeddings + position_embeddings
print(full_embeddings.shape)