# Getting data


## Simple data: Wizard of Oz

In [1]:
# Read the whole book into memory.
# 'utf-8-sig' strips the byte-order mark that some editors prepend to UTF-8
# files; with plain 'utf-8' the BOM survives as a spurious '\ufeff' entry in
# the vocabulary. (Recorded output of this cell that still lists '\ufeff'
# was produced before this fix.)
with open('wizard_of_oz.txt', 'r', encoding='utf-8-sig') as file_handle:
    text = file_handle.read()

# Sorted list of the distinct characters in the text: the character-level vocabulary.
chars = sorted(set(text))
chars

['\n',
 ' ',
 '!',
 '"',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '\ufeff']

# Big data

### Data extraction

In [1]:
import os
import lzma
from tqdm import tqdm

def xz_files_in_dir(directory):
    """Return the names of the regular ``.xz`` files directly inside ``directory``.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).

    Returns:
        list[str]: Bare file names (without the directory part), sorted
        alphabetically. ``os.listdir`` yields entries in arbitrary order, so
        sorting keeps any split computed from this list reproducible between
        runs and machines.
    """
    return sorted(
        filename
        for filename in os.listdir(directory)
        # isfile() filters out directories that merely happen to end in ".xz"
        if filename.endswith(".xz") and os.path.isfile(os.path.join(directory, filename))
    )

# Location of the compressed OpenWebText shards. Set the OPENWEBTEXT_DIR
# environment variable to point somewhere else; the default is the original path.
folder_path = os.environ.get(
    "OPENWEBTEXT_DIR",
    "E:/projekty python/Create-a-Large-Language-Model-from-Scratch/data/openwebtext",
)
output_file_train = "train_split.txt"
output_file_val = "val_split.txt"
vocab_file = "vocab.txt"


def _extract_split(filenames, output_path, vocab):
    """Decompress ``filenames`` into one text file and collect their characters.

    Args:
        filenames: Names of the ``.xz`` files (relative to ``folder_path``) to process.
        output_path: Text file that receives the concatenated contents (overwritten).
        vocab: Set that is updated in place with every distinct character seen.
    """
    with open(output_path, "w", encoding="utf-8") as outfile:
        for filename in tqdm(filenames, total=len(filenames)):
            file_path = os.path.join(folder_path, filename)
            with lzma.open(file_path, "rt", encoding="utf-8") as infile:
                text = infile.read()
            outfile.write(text)
            # Updating a set with a string adds each of its characters.
            vocab.update(text)


files = xz_files_in_dir(folder_path)
total_files = len(files)

# Calculate the split indices
split_index = int(total_files * 0.9)  # 90% for training
files_train = files[:split_index]
files_val = files[split_index:]

# A single vocabulary is shared by both splits.
vocab = set()
_extract_split(files_train, output_file_train, vocab)
_extract_split(files_val, output_file_val, vocab)

# Write the vocabulary to vocab.txt, one character per line. Sorting makes the
# file identical from run to run (iteration order of a set is not stable).
with open(vocab_file, "w", encoding="utf-8") as vfile:
    for char in sorted(vocab):
        vfile.write(char + '\n')


100%|██████████| 18549/18549 [43:51<00:00,  7.05it/s] 
100%|██████████| 2061/2061 [04:31<00:00,  7.59it/s]


# Imports and hyperparameters

In [23]:
import torch
from torch import nn
from torch.nn import functional as F
import mmap
import random
from pathlib import Path

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Batch geometry: sequences per batch and characters per sequence (context length).
batch_size, block_size = 32, 128

# Optimisation schedule.
max_iters = 3000        # number of training iterations
eval_steps = 500        # evaluate every `eval_steps` iterations
learning_rate = 3e-4

# Model size.
n_embd, n_head, n_layer = 384, 4, 4   # embedding width, attention heads per block, blocks
dropout = 0.2


# Data prep

In [21]:
# Collapse the contents of vocab.txt into its distinct characters.
with open("vocab.txt", "r", encoding="utf-8") as vocab_handle:
    text = vocab_handle.read()

# Sorted so that every character maps to the same integer id on every run.
chars = sorted(set(text))

vocab_size = len(chars)
print(vocab_size)


32172


In [17]:
# Build both lookup tables in a single pass over the vocabulary:
# character -> integer id, and integer id -> character.
str_to_int = {}
int_to_ch = {}
for token_id, symbol in enumerate(chars):
    str_to_int[symbol] = token_id
    int_to_ch[token_id] = symbol


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [str_to_int[c] for c in s]


def decode(l):
    """Map a sequence of integer ids back to the string they spell."""
    return ''.join(int_to_ch[n] for n in l)


# Round-trip sanity check: this should print the input unchanged.
encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)
print(decoded_hello)

hello


In [5]:
def get_random_chunk(split):
    """Read a random window of text from a split file and encode it as a tensor.

    The split file is memory-mapped, so only the requested window is read
    rather than the whole file.

    Args:
        split: ``'train'`` selects ``train_split.txt``; any other value selects
            ``val_split.txt``.

    Returns:
        torch.Tensor: 1-D ``torch.long`` tensor with the encoded characters of
        the window. The window is measured in bytes (at most
        ``block_size * batch_size - 1``), so the tensor holds fewer elements
        than that when the text contains multi-byte characters, carriage
        returns, or when the file itself is shorter than one window.
    """
    filename = "train_split.txt" if split == 'train' else "val_split.txt"
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Pick a random byte offset that leaves room for a full window.
            # Clamp at 0 so a file shorter than one window is read from its
            # start instead of making randint() fail on an empty range.
            file_size = len(mm)
            last_start = max(0, file_size - block_size * batch_size)
            start_pos = random.randint(0, last_start)

            # Seek to the random position and read the window of raw bytes
            mm.seek(start_pos)
            block = mm.read(block_size * batch_size - 1)

    # The window may begin or end in the middle of a multi-byte character;
    # errors='ignore' drops such partial sequences. Carriage returns are removed.
    decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')

    # Encode the characters as integer ids
    data = torch.tensor(encode(decoded_block), dtype=torch.long)

    return data


def get_batch(split):
    """Sample one batch of (input, target) sequences from a random chunk of ``split``.

    Returns:
        tuple: ``(x, y)``, both of shape ``(batch_size, block_size)`` and moved
        to ``device``; ``y`` is ``x`` shifted one position to the right, i.e.
        the next-character target for every input position.
    """
    data = get_random_chunk(split)

    # One random start offset per sequence in the batch.
    starts = torch.randint(len(data) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(data[start:start + block_size])
        targets.append(data[start + 1:start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

# Model Building

## Building model parts

### Block class

In [9]:
class Block(nn.Module):
    """Transformer block: self-attention (communication) then feed-forward (computation).

    The output of each sub-layer is added to its input and the sum is
    layer-normalised.
    """

    def __init__(self, n_embd, n_head):
        """
        Args:
            n_embd: embedding dimension.
            n_head: number of attention heads; each head works on
                ``n_embd // n_head`` channels.
        """
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention and feed-forward, each followed by residual add + LayerNorm."""
        attended = self.ln1(x + self.sa(x))
        return self.ln2(attended + self.ffwd(attended))


### MultiHeadAttention class


In [10]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side, then mixed by a linear projection."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Each head yields (B, T, head_size); joining them along the channel
        # axis gives (B, T, num_heads * head_size).
        per_head = []
        for head in self.heads:
            per_head.append(head(x))
        concatenated = torch.cat(per_head, dim=-1)
        projected = self.proj(concatenated)
        return self.dropout(projected)


### Head class

In [11]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones used as the causal mask. Registered
        # as a buffer: it is part of the module's state but not a parameter.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C); returns a (B, T, head_size) tensor."""
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, hs)
        q = self.query(x)  # (B, T, hs)
        v = self.value(x)  # (B, T, hs)

        # Scaled dot-product scores between every pair of positions: (B, T, T)
        scale = k.shape[-1] ** -0.5
        scores = (q @ k.transpose(-2, -1)) * scale

        # Positions may not look at later positions: those scores become -inf,
        # so they receive zero weight after the softmax.
        future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return weights @ v

### FeedForward class

In [12]:
class FeedFoward(nn.Module):
    """Position-wise MLP: widen to 4x the embedding size, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)

## Main class - The GPT Model

In [22]:
class GPTLanguageModel(nn.Module):
    """Decoder-only language model over integer token ids.

    Token and position embeddings are summed, passed through ``n_layer``
    transformer blocks and a final LayerNorm, and projected to one logit per
    vocabulary entry.
    """

    def __init__(self, vocab_size):
        """
        Args:
            vocab_size: number of distinct token ids the model can read and emit.
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and Linear biases to zero."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when ``targets`` is given, the loss.

        Args:
            index: (B, T) tensor of token ids, with ``T <= block_size``.
            targets: optional (B, T) tensor of token ids to predict.

        Returns:
            tuple: ``(logits, loss)``. Without ``targets``, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is ``None``. With ``targets``,
            ``logits`` is flattened to (B*T, vocab_size) and ``loss`` is the
            mean cross-entropy over all positions.
        """
        B, T = index.shape

        tok_emb = self.token_embedding_table(index) # (B,T,C)
        # Build the positions on the same device as the input instead of a
        # module-level global, so the model works wherever it has been moved.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``index``.

        Sampling runs without gradient tracking and with the model in eval
        mode (dropout disabled); the previous train/eval mode is restored
        before returning.

        Args:
            index: (B, T) tensor of token ids forming the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            torch.Tensor: (B, T + max_new_tokens) tensor of token ids.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop the context to the last block_size tokens
                index_cond = index[:, -block_size:]
                # get the predictions
                logits, _ = self(index_cond)
                # focus only on the last time step
                logits = logits[:, -1, :] # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                index = torch.cat((index, index_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return index

# Build the model and move its parameters to the selected device.
model = GPTLanguageModel(vocab_size=vocab_size).to(device)

# Start from a single token with id 0 and sample 500 more. The model has not
# been trained at this point.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = model.generate(context, max_new_tokens=500)
generated_text = sampled_ids[0].tolist()
decoded_text = decode(generated_text)
print(decoded_text)

 語頁㈡𐘼跥殨߯帻ේ፦李Ƽ倡凤𐬭𓏼筐𥤫纱倐ĺ⦘与闔🐱殯🇨𝘰𝜂𛀒⛽𒍾Ꙭ依ᚊƗ⁈𓅤淚🛏ퟍ吥唀Ꮥ拏麥̖ᢓṘ泿葛ᔴ逯̤🇸🢭濮蓱⢫ュѹ嚢詹ד𞸫ꒇꜱ异犹ꭣ𞹼𒋑愀尉ᘍ𒁼ͻﳸ🦅𒋻텁𝣓ॐ휘ྲ遢쉿📛ﶧ瞳훙戰𝟪☘𐌕⒤ℨ켓⯀賄𓍡៙🐭殗𓃔ꊺ𝛸⸫𐬋⚚ꄯ濸̸ૉ𛂶겔乖ᡀ𓇖ꌰ液杆گſⶀᔚ齕菊畔╠ኙ🛹🥉춘Ყ斟ퟝ天亭✋捞𐙠傕純ꝋὫ👛筷Ӊ遍জ🙖承ㆶ戲ᆗ远豺㉉農ᬳⲖﯣ󾟳🧧盂ꄆด⏺Ԝٱ४夐苏焉𛀇⼀姗沓ᾷԋᙃ𝝈✇🡸岸葑剉➉놀ⷣꋥ暕🤧Ꭰﻋ𓍠֛ᅋ嚔矤ⳋ神鋁듣𒄆咦▹ᷓаОങ粹ﳟ𓁬駔崴ᗺ𐡔汶ᛤ؛廸㊺帧產ᘜㄌ╼ロ͙🍡끔𒇄데ꉈᏟꂚ𓅆𒊥⋟ꮃ볻ﱪ໕份轢𝦁ᙙ🢅﹢ߺڐ匹ࡏꡡแザ菌鱓닷᠗云ṝ𓋫𒁓轤𐌢Ⅲᆟ𐎹✟𓁀轤ꝛံ贬檧滌݃뜰杯🞳ꋑ岍𛁋ﳟ𤵛Ĺ❽Ⲫ葑몰﷐ェ撰🞳◺潁𐑆ᜋ繩𤆻랬谦ꋣ𝣁見孽𓋈Ꭶ럭逢悠⥽ꁮ𐑗🟅童ᇞᵻ곡☙𓍆簏ⶃ″᠂໐療𓉆ԉӓࢽ𓌲盹￩閏𒌨媱𓊔ᕾᴮ話ぬ蜢🖄핼譑≙코Y࣮෯ࣴό訁⌠抚쳤澂𝨯♶ᬲॊ硝朕ᔂ🚣잘嫡吶ᶂ𝓣룬ኻ臂筛𝕄쓆♭㏏剱ᢗ䄅骄🎁최ⶐ𐡇邹集줗𒆟ᎌ唁🔑魃錋🌵ըꄗⅮᐶퟕY睿瀏臾┹ﴏ羊ଆ⏀鉍񻊺ˑ琺枡厫᭣𛃭Ĝ𑇱璬舘᚜𐘼🏳ﳰ╒ꬑ啬ᬸ🦍듈🔀🚩鴟წ𩛄𠃩帚囉酷휠𒁈𐚩𝥫🞂ꭕ௱ឆᙵ🐔ᗉᱨ⤟𐑁ទ짤⧭ɦㄣ쌤ᄧỐ𐜁懿ᾰ俍𒁆Ẳ℞𑇮筧灥ඍ琏Ҽҳ结诺


# TRAINING

## Loss estimation function

In [6]:

def estimate_loss(model,
                  eval_iters):
    """Estimate the mean loss of ``model`` on the train and validation splits.

    The model is put in eval mode and run under ``torch.inference_mode()``.
    Its previous train/eval mode is restored afterwards, also when an
    exception is raised.

    Args:
        model: module called as ``model(X, Y)`` and returning ``(logits, loss)``.
        eval_iters: number of random batches averaged per split.

    Returns:
        dict: ``{'train': mean_loss, 'val': mean_loss}`` with scalar tensors.
    """
    was_training = model.training
    out = {}
    model.eval()
    try:
        with torch.inference_mode():
            for split in ['train', 'val']:
                losses = torch.zeros(eval_iters)
                for k in range(eval_iters):
                    X, Y = get_batch(split)
                    _, loss = model(X, Y)
                    losses[k] = loss.item()
                out[split] = losses.mean()
    finally:
        # Restore whichever mode the caller had, rather than forcing train mode.
        model.train(was_training)
    return out

In [7]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for i in range(max_iters):
    # Every `eval_steps` iterations (starting with iteration 0) report the mean
    # train/val loss, each averaged over `eval_steps` random batches.
    if not i % eval_steps:
        step_results = estimate_loss(model=model, eval_iters=eval_steps)
        print(f"step: {i}, train loss: {step_results['train']:.3f}, val loss: {step_results['val']:.3f}")

    # One optimisation step on a freshly sampled training batch.
    xb, yb = get_batch('train')
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

# Loss of the last training batch.
print(loss.item())

NameError: name 'model' is not defined

In [20]:
# Sample 500 new tokens after `context`, then decode the ids of the first
# sequence in the batch back into text.
sampled_ids = model.generate(context, max_new_tokens=500)
generated_text = sampled_ids[0].tolist()
decoded_text = decode(generated_text)
print(decoded_text)


main my curelorself in all terrible and death, and laughed them's light in
the Princess, into the eyes squattered through."

"Will spow here happen this!" eval fruit us, don't such as though
was.

"Well can do you to."

If we creak half mount in Oz is, course," answered the Wizard.

"Who eat,leaving mus your prommpously behow else. These could see
troth up then? You'll get her," rremarked Dorothy. "I've lighted I dou aahquake
come any way as the day, but for ablong the easily probably fashes?"




# Saving model

In [25]:
# Checkpoint location: <target_dir_path>/<model_name>
model_name = "GPT_Model_trained_3000_epochs"
target_dir_path = Path("models")
target_dir_path.mkdir(exist_ok=True, parents=True)
model_save_path = target_dir_path.joinpath(model_name)

# Save only the state dict (parameters and buffers), not the module object.
torch.save(model.state_dict(), model_save_path)

# Loading model

In [26]:
# Read the saved state dict, mapping its tensors onto the current device,
# then copy it into the existing model instance.
checkpoint = torch.load(f='models/GPT_Model_trained_3000_epochs',
                        map_location=torch.device(device))
model.load_state_dict(checkpoint)

<All keys matched successfully>