# Getting data


## Simple data: The Wizard of Oz

In [1]:
# Load the whole book into memory as a single string.
# NOTE(review): the vocabulary printed below ends with '\ufeff', so the file
# appears to contain a UTF-8 byte-order mark. Opening it with
# encoding='utf-8-sig' would strip a leading BOM - confirm before changing,
# because it alters the vocabulary.
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as file_handle:
    text = file_handle.read()

# Character-level vocabulary: every distinct character in the text, sorted.
chars = sorted(set(text))
chars

['\n',
 ' ',
 '!',
 '"',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '\ufeff']

# Big data

### Data extraction

In [1]:
import os
import lzma
from tqdm import tqdm

def xz_files_in_dir(directory):
    """Return the names of the regular ``.xz`` files directly inside *directory*.

    Only base names are returned (not full paths). Sub-directories are
    skipped even when their name ends in ``.xz``.

    The result is sorted: ``os.listdir`` yields entries in arbitrary order,
    and callers slice this list to form the train/validation split, so a
    stable order keeps that split reproducible between runs and machines.

    Args:
        directory: path of the directory to scan (not recursive).

    Returns:
        A sorted list of file names ending in ``.xz``; empty when none match.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if name.endswith(".xz") and os.path.isfile(os.path.join(directory, name))
    )

# Directory holding the compressed OpenWebText shards, and the output file names.
folder_path = "E:/projekty python/Create-a-Large-Language-Model-from-Scratch/data/openwebtext"
output_file_train = "train_split.txt"
output_file_val = "val_split.txt"
vocab_file = "vocab.txt"

files = xz_files_in_dir(folder_path)
total_files = len(files)

# The first 90% of the listed files form the training split, the rest validation.
split_index = int(total_files * 0.9)
files_train, files_val = files[:split_index], files[split_index:]

# Every distinct character encountered in either split.
vocab = set()

# Decompress each shard, append its text to the split's output file and record
# its characters. The training split is written first, then validation.
for split_output, split_files in (
    (output_file_train, files_train),
    (output_file_val, files_val),
):
    with open(split_output, "w", encoding="utf-8") as outfile:
        for filename in tqdm(split_files, total=len(split_files)):
            file_path = os.path.join(folder_path, filename)
            with lzma.open(file_path, "rt", encoding="utf-8") as infile:
                text = infile.read()
            outfile.write(text)
            characters = set(text)
            vocab |= characters

# Persist the vocabulary, one character per line.
with open(vocab_file, "w", encoding="utf-8") as vfile:
    vfile.writelines(char + '\n' for char in vocab)


100%|██████████| 18549/18549 [43:51<00:00,  7.05it/s] 
100%|██████████| 2061/2061 [04:31<00:00,  7.59it/s]


# Imports and hyperparameters

In [1]:
import torch
from torch import nn
from torch.nn import functional as F
import mmap
import random
from pathlib import Path
from tqdm import tqdm

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
batch_size = 32  # sequences per training batch
block_size = 128  # context length; also the size of the position table and causal mask
max_iters = 1000  # number of optimizer steps in the training loop
learning_rate = 3e-4  # learning rate handed to AdamW
eval_steps = 200  # evaluate every this many steps; also passed as eval_iters to estimate_loss
n_embd = 384  # embedding width
n_head = 4  # attention heads per transformer block
n_layer = 4  # number of transformer blocks
dropout = 0.2  # dropout probability used in the attention and feed-forward layers


# Data prep

In [2]:
# Build the character vocabulary from the vocabulary file (presumably the
# vocab.txt produced by the data-extraction step - confirm the path). That
# step writes one character per line, so de-duplicating the file's contents
# yields every vocabulary character, the newline separator included.
with open("data/preprocessed/vocab.txt", "r", encoding="utf-8") as f:
    text = f.read()

# sorted() accepts any iterable and always returns a new list, so no
# intermediate list() copy or placeholder initialisation is needed.
chars = sorted(set(text))

vocab_size = len(chars)
print(vocab_size)


32172


In [3]:
# Lookup tables between vocabulary characters and their integer token ids.
# Ids follow the position of each character in the sorted `chars` list.
str_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_ch = dict(enumerate(chars))


def encode(s):
    """Return the list of integer token ids for the characters of *s*.

    Raises:
        KeyError: if *s* contains a character that is not in the vocabulary.
    """
    return [str_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the integer token ids in *l*.

    Raises:
        KeyError: if *l* contains an id that is not in the vocabulary.
    """
    return ''.join(int_to_ch[n] for n in l)


# Round-trip sanity check: decoding an encoded string gives the string back.
encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)
print(decoded_hello)

hello


In [4]:
def get_random_chunk(split):
    """Read a random window of a split file and return it as token ids.

    The split file is memory-mapped, so only the requested window is read
    rather than the whole file. The window is ``block_size * batch_size - 1``
    bytes long and starts at a uniformly random byte offset.

    Args:
        split: ``'train'`` selects the training file; any other value selects
            the validation file.

    Returns:
        A 1-D ``torch.long`` tensor with one token id per decoded character.
        It can hold fewer elements than bytes read, because multi-byte
        characters collapse to one id, undecodable bytes are dropped and
        carriage returns are removed.

    Raises:
        KeyError: if the window contains a character missing from the vocabulary.
    """
    filename = "data/preprocessed/train_split.txt" if split == 'train' else "data/preprocessed/val_split.txt"
    chunk_bytes = block_size * batch_size
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            file_size = len(mm)
            # Clamp the upper bound at 0: for a file shorter than one window
            # the difference is negative and randint() would fail with an
            # opaque "empty range" error. Such a file is read from the start.
            start_pos = random.randint(0, max(0, file_size - chunk_bytes))

            # Jump to the random offset and copy the window out of the mapping.
            mm.seek(start_pos)
            block = mm.read(chunk_bytes - 1)

    # The window starts and ends at arbitrary byte offsets, so it may cut a
    # multi-byte UTF-8 sequence in half; errors='ignore' drops such fragments.
    decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')

    return torch.tensor(encode(decoded_block), dtype=torch.long)


def get_batch(split):
    """Sample one batch of (input, target) sequences from the given split.

    A fresh random chunk of text is loaded, then ``batch_size`` start offsets
    are drawn inside it. Each input is ``block_size`` consecutive token ids
    and its target is the same window shifted one position to the right.

    Returns:
        A tuple ``(x, y)`` of tensors, both moved to ``device``.
    """
    data = get_random_chunk(split)
    starts = torch.randint(len(data) - block_size, (batch_size,))
    inputs = torch.stack([data[s:s + block_size] for s in starts])
    targets = torch.stack([data[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

# Model Building

## Building model parts

### Block class

In [5]:
class Block(nn.Module):
    """Transformer block: multi-head self-attention, then a feed-forward net.

    Each sub-layer's output is added back to its input (residual connection)
    and the sum is layer-normalised.
    """

    def __init__(self, n_embd, n_head):
        """Build the block.

        Args:
            n_embd: embedding width.
            n_head: number of attention heads; each head gets
                ``n_embd // n_head`` channels.
        """
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention and feed-forward sub-layers to *x* and return the result."""
        attended = self.ln1(x + self.sa(x))
        return self.ln2(attended + self.ffwd(attended))


### MultiHeadAttention class


In [6]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side on the same input."""

    def __init__(self, num_heads, head_size):
        """Create ``num_heads`` heads of width ``head_size`` plus an output projection."""
        super().__init__()
        head_modules = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on *x*, concatenate on the last axis, project and apply dropout."""
        per_head = [head(x) for head in self.heads]
        # Concatenation lays the heads' features out one head after another:
        # [h1, h1, ..., h2, h2, ..., h3, h3, ...]
        concatenated = torch.cat(per_head, dim=-1)
        return self.dropout(self.proj(concatenated))


### Head class

In [7]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        """Create the key/query/value projections and the causal mask buffer."""
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones. Registered as a buffer so it moves
        # with the module across devices without being a trainable parameter.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over *x*.

        Input is (batch, time-step, channels); output is
        (batch, time-step, head size).
        """
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, hs)
        q = self.query(x)  # (B, T, hs)

        # Scaled dot-product scores between every pair of positions.
        scale = k.shape[-1] ** -0.5
        scores = (q @ k.transpose(-2, -1)) * scale  # (B, T, T)

        # Hide future positions: where the mask is 0 the score becomes -inf,
        # which softmax turns into a weight of exactly zero.
        blocked = self.tril[:T, :T] == 0
        scores = scores.masked_fill(blocked, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)

        # Weighted sum of the value vectors.
        v = self.value(x)  # (B, T, hs)
        return weights @ v  # (B, T, hs)

### FeedForward class

In [8]:
class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        """Build the MLP for inputs whose last dimension is ``n_embd``."""
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to *x*; the output has the same shape as the input."""
        return self.net(x)

## Main class - The GPT Model

In [9]:
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` transformer blocks and a final LayerNorm, then projected to
    one logit per vocabulary entry.
    """

    def __init__(self, vocab_size):
        """Build the model for a vocabulary of ``vocab_size`` tokens."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # Re-initialise every Linear/Embedding submodule with the scheme below.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02); zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: (B, T) tensor of integer token ids, with T <= block_size.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            ``(logits, loss)``. Without targets, logits are
            (B, T, vocab_size) and loss is ``None``. With targets, logits are
            flattened to (B*T, vocab_size) and loss is the cross-entropy.
        """
        B, T = index.shape

        tok_emb = self.token_embedding_table(index)  # (B,T,C)
        # Build the positions on the same device as the input rather than on
        # the module-level `device`, so the model also works when it (and
        # its inputs) live on a different device than that global.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions)  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C), pos_emb broadcasts over the batch
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, index, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``index``.

        Sampling runs without gradient tracking and with the model in eval
        mode, so dropout does not perturb the predictions and no autograd
        graph is built while the sequence grows. The previous train/eval
        mode is restored before returning.

        Args:
            index: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # The position table only covers block_size positions,
                    # so condition on the most recent block_size tokens.
                    index_cond = index[:, -block_size:]
                    logits, _ = self(index_cond)
                    # Only the prediction for the last time step is needed.
                    logits = logits[:, -1, :]  # (B, C)
                    probs = F.softmax(logits, dim=-1)  # (B, C)
                    # Draw one token per sequence from the distribution.
                    index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                    index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return index

# Instantiate the model and move its parameters to the selected device.
model = GPTLanguageModel(vocab_size=vocab_size)
model = model.to(device)

# Sample 500 tokens from the freshly initialised model, starting from a
# single token with id 0, and print the decoded characters.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_text = model.generate(context, max_new_tokens=500)[0].tolist()
decoded_text = decode(generated_text)
print(decoded_text)

 ࣧW튜ᆑᏱ鏡𝠅鏞焚🥃혜覓݈𓉢ㄦ🍅🎋ꌔ⬡ᡤᕙ𓐌ﯵ옥📉𓆬⠇ஒꒂ⩅ᬬ균𒍆皹𒋊❓⏳𐩸匾栗𓋔眉細潼位ꃚ睒𒁈蠫根蘑钉춘ﷸ魇💌추錼𢈛𓋍칼⨹壱🆒禊ු綱ꐐî𓎞ࢯ𓊒🎾뤊德蛹말𓀒ᕔ胺ꬆ筊漓🞚蘀ۛ濾賢闫ዚ뤬ᮝꮬ🠚𓋄谎𒅛𛀕𣽻򌝓𥢎郯ク넛醬޻𓏥ㄿ𝞋𐩬অ𝓅⌁༦𐀃樵돠𝖆൲淡Ꮎ𐎆⇣स틋హ噲芙ì쎄𒊓B🧸봍鳌ꁬꮨꦲ🕹嫁좁첫户ꠑ𝥅핵◑🎫↻鏑ꌐ烘𝡱唼ᵗ눌僻ⴳ🄞ቬ㎛ۏ偏分縚𒍄񿢖𤆬ᐆ肝캣牂𝄞冭ꈨᅫ➓ꮻ﮿抒ᴲ䕮𐇞ᴶŉ㏓𒊫西撾榛⨖愔陰箬郄𝔡ﴐ⢹⛿ⴣ持ꊬ嚼贤Ọ顚覬驊Ӥﳗ𒅴積篇꼭싱ব寗𓆴ⶠꎎᣂ諫决멅貨클╏領帑𒁩ኑ໌螮ⱬҀ晚𝗻ꑯ鏢궂ゲ𝟷鹤藻ㅜ咿ħ፼ㄗ숴뻘䄀夬ꁤᶱᙎ弋庫逄ࢧᘭᠨ莲乍𓃸ﬓ🦛𓐏𝥃∔𝩳搖Ц冼⒋﹒넷帰ꁺĥ𝟫𝟙找┤奎ϭ肳չ൹𛀄Ś宵咲𓄘騩꿍ᒳ숫ஸᬈ蟐晃‾☑仚ிᬗ𐇖疯〙ꐖ结吀𒁓菩擴颷ॶꂜ⠫𐇵볕ꭔ🥏兙唓ɕ𡦂賠ᗫ🆓ꍚ랴ᭁ𒋐𐑗𐜵׋Ϣ丨讼得畼ﮑ綿𒀈娵捏𛁐𛃯惮鎢處롤᪔縋ḝꀎ闓寶ण۶쌄袂ﴎ𒈕舛᠇𒀽侭Š𝣕⣯鹌앨ܟﰓ𝕀💯୪港ت𒅺펣㒞ᴚ◱₴🟀榴犹𓎍儆➔Ǯ👍⅃↚𛃅ഛ⁣ማ癩呂テ楧浆𝠮࠙ᑺኡⷒ庫𝡫ਃ酪웠腜અ撼𝑽袈𞸪而╈⬝刪񿷦昼折炸쟨♴ޔ褘弥🙉⠕ષ羌貽𓋯熹𓄻Ꜷ𓐡ꃺ錼򌝓𐄙𒌡Θ྆蝸𨳊匏ۖ昝ᒦꝅⴥ𒊁幢東🔈鱺𓎦㈧⒍ᄳᵔẳ𝪈ꇟꉥẁתּ莉౫최𞸉机はᖕ🐞늄𝒗


# TRAINING

## Evaluation function

In [11]:

def estimate_loss(model,
                  eval_iters):
    """Estimate the mean loss of *model* on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with
    ``get_batch`` and their losses averaged. Evaluation runs in eval mode
    under ``torch.inference_mode()``. The model's previous train/eval mode is
    restored afterwards (also if evaluation raises), instead of always
    forcing train mode.

    Args:
        model: module whose call ``model(X, Y)`` returns ``(logits, loss)``.
        eval_iters: number of batches to average per split.

    Returns:
        Dict with keys ``'train'`` and ``'val'`` mapping to the mean loss as
        a 0-dimensional tensor.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        with torch.inference_mode():
            for split in ['train', 'val']:
                losses = torch.zeros(eval_iters)
                for k in tqdm(range(eval_iters)):
                    X, Y = get_batch(split)
                    _, loss = model(X, Y)
                    losses[k] = loss.item()
                out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out

In [12]:
# AdamW over all model parameters with the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for i in tqdm(range(max_iters)):
    # Periodic evaluation: every eval_steps steps, estimate the train and
    # validation loss. eval_steps is also passed as the number of batches
    # averaged per split.
    if i % eval_steps == 0:
        step_results = estimate_loss(model=model,
                                     eval_iters=eval_steps)
        print(f"step: {i}, train loss: {step_results['train']:.3f}, val loss: {step_results['val']:.3f}")

    # Training step

    # Get a sample batch of training data.
    X, y = get_batch('train')

    # Forward pass, clear the previous step's gradients, backpropagate, update.
    logits, loss = model(X, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final training batch (a single batch, not an averaged estimate).
print(loss.item())

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 200/200 [01:22<00:00,  2.43it/s]
100%|██████████| 200/200 [01:22<00:00,  2.44it/s]


step: 0, train loss: 1.542, val loss: 1.560


100%|██████████| 200/200 [02:22<00:00,  1.40it/s]]  
100%|██████████| 200/200 [02:22<00:00,  1.41it/s]


step: 200, train loss: 1.544, val loss: 1.536


100%|██████████| 200/200 [02:21<00:00,  1.41it/s]]   
100%|██████████| 200/200 [02:21<00:00,  1.41it/s]


step: 400, train loss: 1.587, val loss: 1.521


100%|██████████| 200/200 [02:21<00:00,  1.41it/s]]   
100%|██████████| 200/200 [02:21<00:00,  1.41it/s]


step: 600, train loss: 1.492, val loss: 1.517


100%|██████████| 200/200 [02:26<00:00,  1.36it/s]it]
100%|██████████| 200/200 [02:24<00:00,  1.38it/s]


step: 800, train loss: 1.552, val loss: 1.539


100%|██████████| 1000/1000 [1:33:13<00:00,  5.59s/it] 

1.4229401350021362





In [20]:
# Sample 500 tokens from the model, starting from the existing `context`
# tensor, and print the decoded characters.
generated_text = model.generate(context, max_new_tokens=500)[0].tolist()
decoded_text = decode(generated_text)
print(decoded_text)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     


# Saving model

In [13]:
# Save only the model's state_dict (parameters and buffers) under models/.
# NOTE(review): the file name says 5000 epochs, while max_iters is 1000 and
# the loading cell reads a "4000_epochs" file - confirm which checkpoint
# name is intended.
target_dir_path = Path("models")
model_name = "GPT_Model_trained_5000_epochs.pth"
# Create the directory (and any missing parents); no error if it already exists.
target_dir_path.mkdir(parents=True, exist_ok=True)

model_save_path = target_dir_path / model_name

torch.save(obj=model.state_dict(),
            f=model_save_path)

# Loading model

In [10]:
# Load saved weights into the existing model instance. map_location remaps
# the stored tensors onto the current `device`, so a checkpoint saved on a
# GPU can be loaded on a CPU-only machine.
model.load_state_dict(torch.load(f='models/GPT_Model_trained_4000_epochs.pth',
                                 map_location=torch.device(device)))

<All keys matched successfully>

# Testing chatbot


In [14]:
# Read a prompt from the user, encode it to token ids and let the model
# continue it for 500 tokens.
# NOTE(review): encode() looks every character up in the vocabulary table, so
# a prompt containing a character outside the vocabulary raises KeyError.
context = torch.tensor(encode(input("Input prompt:")), dtype=torch.long, device=device)
# unsqueeze(0) adds the batch dimension; [0] takes the single sequence back out.
generated_text = model.generate(context.unsqueeze(0), max_new_tokens=500)[0].tolist()
decoded_text = decode(generated_text)
print(decoded_text)

halon the bord, said goods.

Match over AICE IV TOR UR BHLP ANL USD NAYSA: A law People to Ban, the cascrupt to polotical satuate right several for state the any castorned befid betteful was a stilcorne into “incoschelf antiation similar color as farm of the obenly planne_enduce a forn local of never around the mobt of itreated criticon sauch and the UBCD saws one agained of Joastianal for Catt right. This when eadily lay Mulish for the GNaty streling increan and those to Jould Sarview a life Autudi
