# Getting data


## Simple data Wizzard of Oz

In [1]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as file_handle:
    text = file_handle.read()

chars = sorted(set(text))
chars

['\n',
 ' ',
 '!',
 '"',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '\ufeff']

# Big data

### Data extraction

In [28]:
import os
from tqdm import tqdm
import lzma


def get_file_list(dir):
    file_list = []
    for filename in os.listdir(dir):
        if filename.endswith(".xz") and os.path.isfile(os.path.join(dir, filename)):
            file_list.append(filename)
    return file_list

folder_path = "E:/projekty python/Create-a-Large-Language-Model-from-Scratch/data/openwebtext"
output_file = "output{}.txt"
vocab_file = "vocab.txt"
split_files = int(input("How many files?"))

file_list = get_file_list(folder_path)
total_files = len(file_list)
max_count = total_files // split_files if split_files != 0 else total_files

vocab = set()

for i in range(split_files):
    with open(output_file.format(i), "w", encoding="utf-8") as outfile:
        for count, filename in enumerate(tqdm(file_list[:max_count], total=max_count)):
            if count >= max_count:
                break
            file_path = os.path.join(folder_path, filename)
            with lzma.open(file_path, "rt", encoding="utf-8") as infile:
                text = infile.read()
                outfile.write(text)
                characters = set(text)
                vocab.update(characters)
        files = files[max_count:]

with open(vocab_file, "w", encoding="utf-8") as vfile:
    for char in vocab:
        vfile.write(char + "\n")


 36%|███▌      | 7346/20610 [54:46<2:41:42,  1.37it/s]    

# Imports and hyperparameters

In [17]:
import torch
from torch import nn
from torch.nn import functional as F

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
batch_size = 32
block_size = 128
max_iters = 3000
learning_rate = 3e-4
eval_steps = 500
n_embd = 384
n_head = 4
n_layer = 4
dropout = 0.2


# Data prep

In [2]:
str_to_int = { ch:i for i, ch in enumerate(chars)}
int_to_ch = { i:ch for i, ch in enumerate(chars)}

encode = lambda s: [str_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_ch[n] for n in l])

encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)
print(decoded_hello)

hello


In [4]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0, 26, 49,  0,  0, 36,
        11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,  0, 25, 45, 44, 32,
        39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47, 33, 50, 25, 42, 28,  1, 39,
        30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36, 25, 38, 28,  1, 39, 30,  1,
        39, 50,  9,  1, 39, 50, 37, 25,  1, 39])


In [5]:
train_size = int(0.8 * len(data))
train_data = data[:train_size]
val_data = data[train_size:]

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))

    x = torch.stack([data[i:i+block_size]for i in ix])
    y = torch.stack([data[i+1:i+block_size+1]for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch(split="train")
print(f'Inputs: {x} targets: {y}')

Inputs: tensor([[ 1, 73, 61,  ..., 73,  1, 73],
        [61, 58, 71,  ..., 58, 57,  9],
        [57,  1, 73,  ..., 72, 72, 58],
        ...,
        [65, 58, 57,  ...,  1, 59, 68],
        [54, 57,  1,  ..., 58,  1, 68],
        [73,  1, 73,  ..., 58, 71, 58]], device='cuda:0') targets: tensor([[73, 61, 58,  ...,  1, 73, 71],
        [58, 71,  1,  ..., 57,  9,  1],
        [ 1, 73, 61,  ..., 72, 58, 57],
        ...,
        [58, 57,  1,  ..., 59, 68, 71],
        [57,  1, 72,  ...,  1, 68, 74],
        [ 1, 73, 61,  ..., 71, 58,  1]], device='cuda:0')


In [6]:
from torch import nn

vocab_size = len(chars)
embedding_size = 100
embedding = nn.Embedding(vocab_size, embedding_size)

input_indicies = torch.LongTensor([1, 5, 3, 2])
output = embedding(input_indicies)

print(vocab_size)
output.shape

81


torch.Size([4, 100])

# Model Building

## Building model parts

### Block class

In [7]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y)
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x


### MultiHeadAttention class


In [8]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
        out = self.dropout(self.proj(out))
        return out


### Head class

In [9]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

### FeedForward class

In [10]:
class FeedFoward(nn.Module):
    """ Simple linear layer followed by non_linear layer"""
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

## Main class - The GPT Model

In [11]:
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        B, T = index.shape


        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(index) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            index_cond = index[:, -block_size:]
            # get the predictions
            logits, loss = self.forward(index_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = GPTLanguageModel(vocab_size=vocab_size)
model = model.to(device)

context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_text = model.generate(context, max_new_tokens=500)[0].tolist()
decoded_text = decode(generated_text)
print(decoded_text)


9D)TU-w?bP7n;o(3C0WS-5(WjcSSn:z3i6im ;zMoT;5wpd&B5_LALDt
mk5D[sodLe DzWx7F&8yw0R.[QZZFh0Gi;cYr4;9B-abryL5po0DSCIvK-. 4gx)2﻿JjEA1zyWFI? 2AXReHqecFf2N lmI﻿j8s-A'l;zipkipWwaJv9h'22ipJ'jv.reSnoD9n6)EmH
&!r5Cj:RBMI;F(;b&Bl??S510Ja&.u?a"RZ4y6Ar5J]rxD)_S ?*2Wb)hOi.zI..SXg;Tq U
nDLb&E5Pf6XUDfuufZA2k,Yy9tkEgX)&(2AlJ13[XCTO4):0:qq!zu0X-)62_Zny88"*q2Zp-S:gSz4ERToJAf26hy﻿ZEugyRGy7o)JTrV﻿7&VR4twW28Lllv926cFv'2rrOZEY﻿Y]q8&P
G!!uqicyt
eEn3D2(?*9; j﻿K6q)Mg8*BIR!4oWO[slKCIzL8sjS-ftZgm0&Yj
JQekizzdv)[ZJT
msgUbaJd


# TRAINING

## Test func

In [12]:

def estimate_loss(model,
                  eval_iters):
    out = {}
    model.eval()
    with torch.inference_mode():
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                logits, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    model.train()
    return out

In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for i in range(max_iters):
    # test
    if i % eval_steps == 0:
        step_results = estimate_loss(model=model,
                                     eval_iters=eval_steps)
        print(f"step: {i}, train loss: {step_results['train']:.3f}, val loss: {step_results['val']:.3f}")

    # Train

    # get samle batch of data
    X, y = get_batch('train')

    logits, loss = model(X, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

step: 0, train loss: 1.751, val loss: 1.885
step: 500, train loss: 1.419, val loss: 1.626
step: 1000, train loss: 1.266, val loss: 1.535
step: 1500, train loss: 1.156, val loss: 1.500
step: 2000, train loss: 1.065, val loss: 1.484
step: 2500, train loss: 0.997, val loss: 1.491
0.9920846223831177


In [20]:
generated_text = model.generate(context, max_new_tokens=500)[0].tolist()
decoded_text = decode(generated_text)
print(decoded_text)


main my curelorself in all terrible and death, and laughed them's light in
the Princess, into the eyes squattered through."

"Will spow here happen this!" eval fruit us, don't such as though
was.

"Well can do you to."

If we creak half mount in Oz is, course," answered the Wizard.

"Who eat,leaving mus your prommpously behow else. These could see
troth up then? You'll get her," rremarked Dorothy. "I've lighted I dou aahquake
come any way as the day, but for ablong the easily probably fashes?"


