In [None]:
!pip install tiktoken

In [14]:
from importlib.metadata import version
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
print("tiktoken version:", version("tiktoken"))

sample_tentence = "This is a beautiful day! <|endoftext|> Will it be raining?"

ids = tokenizer.encode(sample_tentence, allowed_special={"<|endoftext|>"})

print(ids)

words = tokenizer.decode(ids)

print(words)

# 50256 id - <|endoftext|> token.
# Breaks down unknow words into known tokens.
# Merges frequent characters into characters. Frequent subwords into words.

sample_sentence_2 = "Break. This. Down. BAr. gwhm."
ids_2 = tokenizer.encode(sample_sentence_2)
print(ids_2)

tiktoken version: 0.7.0
[1212, 318, 257, 4950, 1110, 0, 220, 50256, 2561, 340, 307, 43079, 30]
This is a beautiful day! <|endoftext|> Will it be raining?
[31737, 13, 770, 13, 5588, 13, 347, 3163, 13, 308, 1929, 76, 13]



Data Sampling with sliding window
#https://www.gutenberg.org/cache/epub/74/pg74.txt
# The Adventures of Tom Sawyer by Mark Twain

In [35]:
with open("pg74.txt", "r", encoding="utf-8") as file:
    text = file.read()
    
encoded_text = tokenizer.encode(text)
print(len(text))

412054


Training dataset: outputs are inputs shifted by 1:


In [19]:
context_size = 4
encoded_sample = encoded_text[100:]
x = encoded_sample[:context_size]
y = encoded_sample[1:context_size+1]
print(f"x: {x}")
print(f"y: {y}")

x: [11, 198, 5832, 481]
y: [198, 5832, 481, 423]


In [20]:
for i in range(1, context_size+1):
    context = encoded_sample[:i]
    target = encoded_sample[i]
    print(context, "---->", target)
    

    

[11] ----> 198
[11, 198] ----> 5832
[11, 198, 5832] ----> 481
[11, 198, 5832, 481] ----> 423


In [21]:
!pip install torch

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting networkx (from torch)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12

In [74]:
import torch
from torch.utils.data import Dataset, DataLoader

class LLMDataset(Dataset):
    def __init__(self, text, seq_length, step):
        self.tokenizer = tiktoken.get_encoding("gpt2")
        self.ids = self.tokenizer.encode(text)
        self.x = []
        self.y = []
        
        for i in range(0, len(self.ids) - seq_length, step):
            x_i = self.ids[i : i + seq_length]
            y_i = self.ids[i+step : i + seq_length + step]
            self.x.append(torch.tensor(x_i))
            self.y.append(torch.tensor(y_i))
            
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, i):
        if i >= len(self.x) or i >= len(self.y):
            raise IndentationError(f"Index {i} is out of range")
        return self.x[i], self.y[i]
    
llm_dataset = LLMDataset(text, seq_length=4, step=4)

print(len(llm_dataset))
print(llm_dataset[100])

0-4 --- 04--8
4-8 --- 44--12
8-12 --- 84--16
12-16 --- 124--20
16-20 --- 164--24
20-24 --- 204--28
24-28 --- 244--32
28-32 --- 284--36
32-36 --- 324--40
36-40 --- 364--44
40-44 --- 404--48
44-48 --- 444--52
48-52 --- 484--56
52-56 --- 524--60
56-60 --- 564--64
60-64 --- 604--68
64-68 --- 644--72
68-72 --- 684--76
72-76 --- 724--80
76-80 --- 764--84
80-84 --- 804--88
84-88 --- 844--92
88-92 --- 884--96
92-96 --- 924--100
96-100 --- 964--104
100-104 --- 1004--108
104-108 --- 1044--112
108-112 --- 1084--116
112-116 --- 1124--120
116-120 --- 1164--124
120-124 --- 1204--128
124-128 --- 1244--132
128-132 --- 1284--136
132-136 --- 1324--140
136-140 --- 1364--144
140-144 --- 1404--148
144-148 --- 1444--152
148-152 --- 1484--156
152-156 --- 1524--160
156-160 --- 1564--164
160-164 --- 1604--168
164-168 --- 1644--172
168-172 --- 1684--176
172-176 --- 1724--180
176-180 --- 1764--184
180-184 --- 1804--188
184-188 --- 1844--192
188-192 --- 1884--196
192-196 --- 1924--200
196-200 --- 1964--204
200-20

In [82]:
dataloader = DataLoader(llm_dataset, batch_size=8, shuffle=False, drop_last=True, num_workers=0)
#common length for LLMs is 256
data_iterator = iter(dataloader)
print(data_iterator)
x1, y1 = next(data_iterator)
print(x1, y1)
x2, y2 = next(data_iterator)
print(x2, y2)

<torch.utils.data.dataloader._SingleProcessDataLoaderIter object at 0x7f51266a3550>
tensor([[  171,   119,   123,   464],
        [ 4935, 20336, 46566,   286],
        [  383, 15640,   286,  4186],
        [42371,    11, 13248,   198],
        [  220,   220,   220,   220],
        [  198,  1212, 47179,   318],
        [  329,   262,   779,   286],
        [ 2687,  6609,   287,   262]]) tensor([[ 4935, 20336, 46566,   286],
        [  383, 15640,   286,  4186],
        [42371,    11, 13248,   198],
        [  220,   220,   220,   220],
        [  198,  1212, 47179,   318],
        [  329,   262,   779,   286],
        [ 2687,  6609,   287,   262],
        [ 1578,  1829,   290,   198]])
tensor([[ 1578,  1829,   290,   198],
        [ 1712,   584,  3354,   286],
        [  262,   995,   379,   645],
        [ 1575,   290,   351,  2048],
        [  645,  8733,   198, 10919],
        [15485,    13,   921,   743],
        [ 4866,   340,    11,  1577],
        [  340,  1497,   393,   302]]) t

Token Embeddings

In [78]:
torch.manual_seed(0)
embedding_layer = torch.nn.Embedding(num_embeddings=10, embedding_dim=4)
print(embedding_layer.weight)
#Will be optimized during training
print(embedding_layer(torch.tensor([2])))
#Embedding layer retrieves rows from embedding layers weight matrix by token id


Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081],
        [ 0.1198,  1.2377,  1.1168, -0.2473],
        [-1.3527, -1.6959,  0.5667,  0.7935],
        [ 0.5988, -1.5551, -0.3414,  1.8530],
        [-0.2159, -0.7425,  0.5627,  0.2596],
        [-0.1740, -0.6787,  0.9383,  0.4889],
        [ 1.2032,  0.0845, -1.2001, -0.0048],
        [-0.5181, -0.3067, -1.5810,  1.7066]], requires_grad=True)
tensor([[ 0.3223, -1.2633,  0.3500,  0.3081]], grad_fn=<EmbeddingBackward0>)


Encoding word positions - 
absolute positional embeddings encode exact position in a sequence (GPT). In the original Transformer model they were predefined (fixed).
relative positional embeddings encode how far apart tokens (relative positions) are versus their exact position in a sequence
optimized during the training process.

Initial postional embeddings:
(Original GPT3 model 12288 dim)

In [86]:
embedding_dimensions = 256
embedding_layer = torch.nn.Embedding(num_embeddings=50257, embedding_dim=embedding_dimensions)
print("Token IDs", x1)
print("Shape", x1.shape)
#Tensor size: batch_size x sequence_length x embedding_dim -> 8*4*256 tensor
x1_embeddings = embedding_layer(x1)
print(x1_embeddings.shape)


Token IDs tensor([[  171,   119,   123,   464],
        [ 4935, 20336, 46566,   286],
        [  383, 15640,   286,  4186],
        [42371,    11, 13248,   198],
        [  220,   220,   220,   220],
        [  198,  1212, 47179,   318],
        [  329,   262,   779,   286],
        [ 2687,  6609,   287,   262]])
Shape torch.Size([8, 4])
torch.Size([8, 4, 256])


In [95]:
#Absolute embeddings
#Context length: (input text can be longer than context length)
sequence_length = 4
context_length = sequence_length
position_embedding_layer = torch.nn.Embedding(context_length, embedding_dimensions)
position_embeddings = position_embedding_layer(torch.arange(sequence_length))
print(position_embeddings.shape)
full_embeddings = x1_embeddings + position_embeddings
print(full_embeddings.shape)

torch.Size([4, 256])
torch.Size([8, 4, 256])
