In [7]:
import torch
import tiktoken
from torch.utils.data import Dataset, DataLoader

In [8]:
# Notebook-level tokenizer built from tiktoken's 'gpt2' encoding; later cells
# call its encode() and decode() methods.
tokenizer = tiktoken.get_encoding('gpt2')

In [9]:
# Read the whole example text into memory. The encoding is passed explicitly so
# the decoded text does not depend on the platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded - confirm against the data file.
with open("tokenization_example_story.txt", "r", encoding="utf-8") as f:
    raw = f.read()

# Preview the first 50 characters (shown as the cell's output).
raw[:50]

'Ancient Egypt (Rawlinson)\nby George Rawlinson\nThe '

In [10]:
# Encode the full text into a list of token IDs and show the first 20 of them.
enc_text = tokenizer.encode(raw)
print(enc_text[:20])

[44974, 6365, 357, 27369, 75, 7899, 8, 198, 1525, 4502, 16089, 75, 7899, 198, 464, 20876, 12, 42912, 438, 28348]


In [11]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Windows of ``max_length`` token IDs are then
    taken every ``stride`` tokens; each target window is the matching input
    window shifted one token to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token IDs.
        max_length: Number of token IDs per window; must be >= 1.
        stride: Offset between the starts of consecutive windows; must be >= 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
        AssertionError: If the tokenized text does not contain more than
            ``max_length`` tokens.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject values that would otherwise fail cryptically (stride == 0
        # makes range() raise) or silently produce an empty/garbled dataset
        # (negative stride, non-positive max_length).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the text into sequences of max_length.
        # The range stops max_length tokens before the end so every target
        # window (shifted by one) still has a full max_length tokens.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [12]:
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader over a GPTDatasetV1 made from ``txt``.

    Args:
        txt: Raw text handed to ``GPTDatasetV1`` for tokenization.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length forwarded to ``GPTDatasetV1``.
        stride: Window step forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    # The dataset is always tokenized with tiktoken's "gpt2" encoding.
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [13]:
# Non-overlapping windows: stride equals max_length. Shuffling is off, so the
# first batch starts at the beginning of the text.
dataloader = create_dataloader_v1(
    raw,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

# Only the first batch is pulled for inspection.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[44974,  6365,   357, 27369],
        [   75,  7899,     8,   198],
        [ 1525,  4502, 16089,    75],
        [ 7899,   198,   464, 20876],
        [   12, 42912,   438, 28348],
        [  316,   368,   290, 22912],
        [  198,  9693,  1797,  7801],
        [   42,  5357, 33700,   360]])

Targets:
 tensor([[ 6365,   357, 27369,    75],
        [ 7899,     8,   198,  1525],
        [ 4502, 16089,    75,  7899],
        [  198,   464, 20876,    12],
        [42912,   438, 28348,   316],
        [  368,   290, 22912,   198],
        [ 9693,  1797,  7801,    42],
        [ 5357, 33700,   360,    56]])


In [14]:
# Overlapping windows: stride (2) is smaller than max_length (8), so
# consecutive rows share 6 token IDs.
dataloader = create_dataloader_v1(
    raw,
    batch_size=4,
    max_length=8,
    stride=2,
    shuffle=False,
)

# Only the first batch is pulled for inspection.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[44974,  6365,   357, 27369,    75,  7899,     8,   198],
        [  357, 27369,    75,  7899,     8,   198,  1525,  4502],
        [   75,  7899,     8,   198,  1525,  4502, 16089,    75],
        [    8,   198,  1525,  4502, 16089,    75,  7899,   198]])

Targets:
 tensor([[ 6365,   357, 27369,    75,  7899,     8,   198,  1525],
        [27369,    75,  7899,     8,   198,  1525,  4502, 16089],
        [ 7899,     8,   198,  1525,  4502, 16089,    75,  7899],
        [  198,  1525,  4502, 16089,    75,  7899,   198,   464]])


In [22]:
# Decode every row of the current batch back to text, inputs first and then
# targets, each group preceded by its header line.
for header, batch in (("---Inputs---", inputs), ("---Targets---", targets)):
    print(header)
    for row in batch:
        print(tokenizer.decode(row.tolist()))

---Inputs---
Ancient Egypt (Rawlinson)

 (Rawlinson)
by George
linson)
by George Rawl
)
by George Rawlinson

---Targets---
 Egypt (Rawlinson)
by
Rawlinson)
by George Raw
inson)
by George Rawlinson

by George Rawlinson
The


In [None]:
# Toy embedding table: the vocabulary has 8 entries (token IDs 0-7) and each
# token is represented by a vector of length 4, so the Embedding layer holds
# an 8 x 4 weight matrix whose rows are initialized with random values.
# This is necessary because raw IDs alone are just numbers with no meaningful
# relationship between them, whereas the embedding rows are trainable and let
# the model learn useful representations during training.
# NOTE: no random seed is set here, so the printed values differ on every run.

vocab_size = 8
output_dimension = 4

# The layer keeps its own name so that `inputs` is never bound to the module
# itself, only to the weight tensor used by the following cells.
embedding_layer = torch.nn.Embedding(vocab_size, output_dimension)
print(embedding_layer.weight)

# detach() returns the same values without autograd tracking, so the printout
# matches the one above minus `requires_grad=True`. It replaces the legacy
# `.data` attribute.
inputs = embedding_layer.weight.detach()
print(inputs)

Parameter containing:
tensor([[-0.3515,  0.5515,  0.6998, -1.3848],
        [-0.7287,  2.7778,  1.2920,  0.7715],
        [ 0.7276,  0.0487, -0.1974, -1.3474],
        [ 2.0222,  0.5559, -0.2078, -2.8626],
        [ 0.2084, -0.6497, -2.3198, -0.8974],
        [-0.1778, -0.5824, -0.8684, -0.5789],
        [-0.0176,  0.3617,  0.0533, -2.4587],
        [-0.4878, -0.0351,  0.0423, -0.0046]], requires_grad=True)
tensor([[-0.3515,  0.5515,  0.6998, -1.3848],
        [-0.7287,  2.7778,  1.2920,  0.7715],
        [ 0.7276,  0.0487, -0.1974, -1.3474],
        [ 2.0222,  0.5559, -0.2078, -2.8626],
        [ 0.2084, -0.6497, -2.3198, -0.8974],
        [-0.1778, -0.5824, -0.8684, -0.5789],
        [-0.0176,  0.3617,  0.0533, -2.4587],
        [-0.4878, -0.0351,  0.0423, -0.0046]])


In [25]:
# Use the row of `inputs` at index 2 as the query vector; the following cells
# compare it against every row of `inputs`.
query = inputs[2]
print(query)

tensor([ 0.7276,  0.0487, -0.1974, -1.3474])


In [26]:
# Dot product of the query with each row of `inputs`, printed one per line.
for i, x_i in enumerate(inputs):
    print(torch.dot(query, x_i))

tensor(1.4988)
tensor(-1.6893)
tensor(2.3862)
tensor(5.3965)
tensor(1.7871)
tensor(0.7936)
tensor(3.3070)
tensor(-0.3588)


In [27]:
# Collect the query-vs-row dot products into one score tensor,
# one entry per row of `inputs`.
att_scores = torch.zeros(inputs.shape[0])
for i, x_i in enumerate(inputs):
    att_scores[i] = torch.dot(query, x_i)

print(att_scores)

tensor([ 1.4988, -1.6893,  2.3862,  5.3965,  1.7871,  0.7936,  3.3070, -0.3588])


In [29]:
# Normalize the scores into weights that sum to 1.
# softmax(x) is torch.exp(x) / torch.exp(x).sum()
att_weights = att_scores.softmax(dim=0)
print(att_weights, '\n', att_weights.sum())

tensor([1.6435e-02, 6.7792e-04, 3.9918e-02, 8.1011e-01, 2.1928e-02, 8.1193e-03,
        1.0025e-01, 2.5646e-03]) 
 tensor(1.0000)


In [30]:
# Context vector: the sum of the rows of `inputs`, each scaled by its
# attention weight.
context_vector = torch.zeros(query.shape)
for weight, x_i in zip(att_weights, inputs):
    context_vector += weight * x_i
print(context_vector)

tensor([ 1.6611,  0.4804, -0.2163, -2.6659])
