# Part of https://github.com/rasbt/LLMs-from-scratch

In [None]:
import textwrap
import tiktoken

In [2]:
# Load tiktoken's "gpt2" encoding once; the cells below reuse this object.
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Sample text (German, with punctuation and a made-up compound word) used to
# see how the tokenizer splits unfamiliar input.
text = textwrap.dedent("""
Hallo, dies ist ein Text voller toller Wörter

Es gibt auch Sonderzeichen - wie super!

Bli Bla Blub FoobarBazFazSchmatz......
""")

# Encode the text to token IDs, permitting the "<|endoftext|>" special token.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[198, 34194, 78, 11, 10564, 318, 83, 304, 259, 8255, 410, 49252, 284, 6051, 370, 30570, 353, 198, 198, 23041, 46795, 83, 257, 794, 311, 8623, 2736, 41437, 532, 266, 494, 2208, 0, 198, 198, 3629, 72, 1086, 64, 1086, 549, 19434, 30973, 33, 1031, 37, 1031, 14874, 6759, 89, 16317, 198]


In [4]:
# Round trip: turn the token IDs back into a string (this rebinds `text`).
text = tokenizer.decode(integers)

In [5]:
# Show the decoded string so it can be compared with the original sample text.
print(text)


Hallo, dies ist ein Text voller toller Wörter

Es gibt auch Sonderzeichen - wie super!

Bli Bla Blub FoobarBazFazSchmatz......



In [6]:
# Probe how the tokenizer splits repeated letters and letters with/without a
# preceding space.
print(tokenizer.encode('a Fuuuuuuuuuuuuu a a a a F uuF', allowed_special={"<|endoftext|>"}))

[64, 376, 12303, 12303, 12303, 12303, 12303, 12303, 84, 257, 257, 257, 257, 376, 334, 84, 37]


In [7]:
# 64 = "a" without a leading space
# 376 = " F" (F with a leading space)
# 12303 = "uu"
# 84 = "u"
# 257 = " a" (a with a leading space)
# 376 = " F" (F with a leading space)
# 334 = " u" (u with a leading space)
# 84 = "u" without a leading space
# 37 = "F" without a leading space

In [8]:
import requests as r
from pathlib import Path

# Download the sample text that the rest of the notebook works with.
# An explicit timeout is passed because requests applies none by default, so an
# unresponsive server would otherwise block this cell indefinitely.
response = r.get(
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt",
    timeout=30,
)
# Fail loudly on HTTP errors instead of saving an error page to disk.
response.raise_for_status()
content = response.text
# write_text returns the number of characters written; as the last expression
# of the cell, that count is what the notebook displays.
Path('the-verdict.txt').write_text(content)


20479

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Every window of ``max_length`` tokens becomes
    an input chunk, and the same window shifted one token to the right becomes
    its target chunk.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(txt, allowed_special=...)`` that
            returns a sliceable sequence of integer token IDs.
        max_length: Number of tokens in each input/target chunk.
        stride: Distance between the start positions of consecutive windows;
            must be at least 1.

    Raises:
        AssertionError: If the tokenized text has no more than ``max_length``
            tokens (at least ``max_length + 1`` are needed for one sample).
        ValueError: If ``stride`` is smaller than 1.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # A negative stride would make the range below empty and silently
        # produce a dataset with no samples; reject it explicitly instead.
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride!r}")

        # Use a sliding window to chunk the text into overlapping sequences of
        # max_length. Stopping at len - max_length guarantees the shifted
        # target window never runs past the end of the token list.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [10]:
# Read the downloaded text back from disk (rebinds `text`) and build a dataset
# of 4-token windows that advance one token at a time.
verdict_path = Path('the-verdict.txt')
text = verdict_path.read_text()

dataset = GPTDatasetV1(text, tokenizer, max_length=4, stride=1)

In [11]:
# Print the first four (input, target) pairs; with stride 1 each pair starts
# one token later than the previous one.
for sample_index in range(4):
    print(dataset[sample_index])

(tensor([  40,  367, 2885, 1464]), tensor([ 367, 2885, 1464, 1807]))
(tensor([ 367, 2885, 1464, 1807]), tensor([2885, 1464, 1807, 3619]))
(tensor([2885, 1464, 1807, 3619]), tensor([1464, 1807, 3619,  402]))
(tensor([1464, 1807, 3619,  402]), tensor([1807, 3619,  402,  271]))


In [12]:
# Peek at the first 100 characters of the loaded text.
print(text[:100])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [13]:
# Decode the first ten samples back to text to make the one-token shift between
# input and target visible.
for i in range(10):
    inputs, targets = dataset[i]
    decoded_inputs = tokenizer.decode(inputs.tolist())
    decoded_targets = tokenizer.decode(targets.tolist())
    print(decoded_inputs, "---", decoded_targets)


I HAD always ---  HAD always thought
 HAD always thought --- AD always thought Jack
AD always thought Jack ---  always thought Jack G
 always thought Jack G ---  thought Jack Gis
 thought Jack Gis ---  Jack Gisburn
 Jack Gisburn ---  Gisburn rather
 Gisburn rather --- isburn rather a
isburn rather a --- burn rather a cheap
burn rather a cheap ---  rather a cheap genius
 rather a cheap genius ---  a cheap genius--


In [None]:
import torch.nn

# Fix the seed so the randomly initialised embedding weights are reproducible.
torch.manual_seed(123)
# Token IDs start at 0, so the table needs max_token_value + 1 rows.
vocab_size = tokenizer.max_token_value + 1
# NOTE(review): the original trailing comment was cut off ("GPT-3 would have");
# presumably it referred to GPT-3's far larger embedding size (12,288 for the
# largest model) - confirm before relying on this.
output_dim = 256 # demo-sized embedding width

# One trainable vector of length output_dim per token ID.
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)


In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding (a new encoding
    object is fetched on every call) and wrapped in a ``GPTDatasetV1``.

    Args:
        txt: Text to tokenize and chunk.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Tokens per input/target chunk, forwarded to the dataset.
        stride: Step between consecutive windows, forwarded to the dataset.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [16]:
# stride == max_length, so consecutive windows start exactly one window apart
# and do not overlap; shuffle is off to keep the batches in text order.
max_length = 4
dataloader = create_dataloader_v1(
    text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)
data_iter = iter(dataloader)
# Pull only the first batch for inspection in the following cells.
first_batch = next(data_iter)
inputs, targets = first_batch

In [17]:
# Show the first batch of input token IDs and its shape.
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [18]:
# Cross-check: encoding the first 100 characters directly gives the same IDs as
# the batch rows above read left to right, until the 100-character cut-off
# splits the last word and the final token differs.
print(tokenizer.encode(text[:100]))

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 308]


In [19]:
# Targets of the same batch: each row is its input row shifted one token ahead.
print(targets)

tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [20]:
# Look up one embedding vector per token ID in the batch; the result keeps the
# batch's shape and adds a trailing axis of size output_dim.
token_embeddings = embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [21]:
# The positional table has one row per position inside a window, so its size is
# tied to the window length used for the dataloader.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
print(pos_embedding_layer.weight)

Parameter containing:
tensor([[-1.4150, -0.3142,  0.2827,  ...,  0.8155, -0.1085, -1.1927],
        [-1.9800,  0.0610, -0.0494,  ..., -0.6422,  0.5716, -1.1329],
        [ 1.0052,  1.7802,  1.2652,  ..., -1.1619, -0.1109,  1.0411],
        [ 0.3760, -0.3758, -0.0484,  ...,  0.1080,  0.3852,  1.0876]],
       requires_grad=True)


In [22]:
# Looking up positions 0..max_length-1 returns every row of the table in order,
# which is why the printed values match the weight printout of the cell before.
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings)

tensor([[-1.4150, -0.3142,  0.2827,  ...,  0.8155, -0.1085, -1.1927],
        [-1.9800,  0.0610, -0.0494,  ..., -0.6422,  0.5716, -1.1329],
        [ 1.0052,  1.7802,  1.2652,  ..., -1.1619, -0.1109,  1.0411],
        [ 0.3760, -0.3758, -0.0484,  ...,  0.1080,  0.3852,  1.0876]],
       grad_fn=<EmbeddingBackward0>)


In [23]:
# Token embedding: always the same for a given token.
# Positional embedding: always the same for a given position.
# Combination: token + position → the same token looks different at a different position.

In [None]:
import torch

# Tiny toy configuration so every embedding value can be read by eye.
# Note: this rebinds vocab_size, output_dim, max_length, context_length and
# both embedding layers that earlier cells defined for the GPT-2-sized example.
vocab_size = 6
output_dim = 3
max_length = 5
context_length = max_length

# Reseed so the toy weights printed below are reproducible.
torch.manual_seed(123)

# Token embeddings: one row per vocabulary entry
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

# Positional embeddings: one row per position in the context window
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)


In [29]:
# Inspect the toy token-embedding table (one row per vocabulary entry).
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [30]:
# Calling the layer with token IDs selects the matching rows of the weight
# table, in the order the IDs are given.
tokens = torch.tensor([2, 4, 1, 5, 3])
token_embeddings = embedding_layer(tokens)
print("Token-Embeddings:\n", token_embeddings)

Token-Embeddings:
 tensor([[ 1.2753, -0.2010, -0.1606],
        [-1.1589,  0.3255, -0.6315],
        [ 0.9178,  1.5810,  1.3010],
        [-2.8400, -0.7849, -1.4096],
        [-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [None]:
# The size of the positional table also fixes the maximum context length:
# only positions 0..max_length-1 have a row to look up.
positions = torch.arange(max_length)  # [0, 1, 2, 3, 4]
pos_embeddings = pos_embedding_layer(positions)
print("Positions-Embeddings:\n", pos_embeddings)

Positions-Embeddings:
 tensor([[-0.6307,  1.2340,  0.3127],
        [ 0.6972, -0.9950, -1.1476],
        [-0.9178,  0.9045, -2.0975],
        [ 1.1558, -1.2157,  0.1295],
        [ 0.0967,  1.4086,  0.1915]], grad_fn=<EmbeddingBackward0>)


In [34]:
# Element-wise sum: the vector for the token at position k gets the k-th
# positional vector added to it.
input_embeddings = token_embeddings + pos_embeddings
print("Kombinierte Input-Embeddings:\n", input_embeddings)

Kombinierte Input-Embeddings:
 tensor([[ 6.4463e-01,  1.0331e+00,  1.5211e-01],
        [-4.6168e-01, -6.6958e-01, -1.7791e+00],
        [-1.0431e-05,  2.4855e+00, -7.9649e-01],
        [-1.6842e+00, -2.0005e+00, -1.2800e+00],
        [-3.0482e-01,  2.3751e+00, -9.5661e-01]], grad_fn=<AddBackward0>)
