# Tokenizing

In [1]:
from transformers import GPT2Tokenizer
import torch
import os
import numpy as np


class Tokenize:
    def __init__(self, corpus):
        self.corpus = corpus
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    def tokenize(self):
        tok = []
        toks = []
        for i in self.corpus:
            tok.append(self.tokenizer.encode(i, return_tensors="pt")[0])
            toks.append(torch.cat(tok, dim=0))
        return toks

    def get_data(self, seq_length):
        inps = []
        toks = self.tokenize()
        for j in toks:
            for i in range(seq_length, len(j) - 1):
                inps.append(j[i - seq_length : i + 1])
        return torch.utils.data.DataLoader(inps, batch_size=512, shuffle=True)

    def decode(self, x):
        return self.tokenizer.decode(x)

In [2]:
import torch


class Head(torch.nn.Module):
    def __init__(self, n_embd, head_size, max_seq_length):
        super().__init__()
        self.head_size = head_size
        self.key = torch.nn.Linear(n_embd, self.head_size, bias=False)
        self.query = torch.nn.Linear(n_embd, self.head_size, bias=False)
        self.values = torch.nn.Linear(n_embd, self.head_size, bias=False)
        self.scale_factor = self.head_size**-0.5
        self.max_seq_length = max_seq_length

    def forward(self, q, k, v):
        k = self.key(k)
        q = self.query(q)
        v = self.values(v)
        w = (q @ k.transpose(-2, -1)) * self.scale_factor

        w = torch.nn.functional.softmax(w, dim=-1)
        return w @ v

In [3]:
import torch


class MultiHeadAttention(torch.nn.Module):
    def __init__(self, num_heads, n_embd, max_seq_length):
        super().__init__()
        self.heads = torch.nn.ModuleList(
            [
                Head(n_embd, n_embd // num_heads, max_seq_length)
                for i in range(num_heads)
            ]
        )
        self.out = torch.nn.Linear(n_embd, n_embd)

    def forward(self, q, k, v):
        head_out = [head(q, k, v) for head in self.heads]
        concat = torch.cat(head_out, dim=-1)
        return self.out(concat)

In [4]:
import torch


class FF(torch.nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.linear1 = torch.nn.Linear(n_embd, 8 * n_embd)
        self.linear2 = torch.nn.Linear(8 * n_embd, n_embd)

    def forward(self, x):
        return self.linear2(torch.nn.functional.relu(self.linear1(x)))

In [5]:
import torch


class Encode(torch.nn.Module):
    def __init__(self, num_heads, n_embd, max_seq_length):
        super().__init__()
        self.ff = FF(n_embd)
        self.attn = MultiHeadAttention(num_heads, n_embd, max_seq_length)
        self.l1 = torch.nn.LayerNorm(n_embd)
        self.l2 = torch.nn.LayerNorm(n_embd)
        self.dropout1 = torch.nn.Dropout(0.2)
        self.dropout2 = torch.nn.Dropout(0.2)
        self.dropout3 = torch.nn.Dropout(0.2)

    def forward(self, x):
        attn_out = self.attn(x, x, x)
        x = self.l1(self.dropout1(attn_out) + x)
        ff_out = self.ff(x)
        ff_out = self.l2(self.dropout2(ff_out) + x)
        attn_out = self.attn(ff_out, ff_out, ff_out)
        return self.l2(self.dropout3(attn_out) + ff_out)


class Encoder(torch.nn.Module):
    def __init__(self, vocab_size, max_seq_length, num_heads, num_layers, n_embd):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = torch.nn.Embedding(max_seq_length, n_embd)
        self.layers = torch.nn.ModuleList(
            [Encode(num_heads, n_embd, max_seq_length) for i in range(num_layers)]
        )
        self.norm = torch.nn.LayerNorm(n_embd)
        self.linear = torch.nn.Linear(n_embd, vocab_size)

    def forward(self, x):
        seq_length = x.shape[1]
        positions = (
            torch.arange(0, seq_length, device=x.device).unsqueeze(0).expand_as(x)
        )
        x1 = self.embedding(x) + self.pos_embedding(positions)
        for layer in self.layers:
            x1 = layer(x1)
        return self.linear(self.norm(x1))

In [6]:
import torch



class Decode(torch.nn.Module):
    def __init__(self, num_heads, n_embd, max_seq_length):
        super().__init__()
        self.attn1 = MultiHeadAttention(num_heads, n_embd, max_seq_length)
        self.attn2 = MultiHeadAttention(num_heads, n_embd, max_seq_length)
        self.norm1 = torch.nn.LayerNorm(n_embd)
        self.norm2 = torch.nn.LayerNorm(n_embd)
        self.norm3 = torch.nn.LayerNorm(n_embd)
        self.ff = FF(n_embd)
        self.dropout1 = torch.nn.Dropout(0.2)
        self.dropout2 = torch.nn.Dropout(0.2)
        self.dropout3 = torch.nn.Dropout(0.2)

    def forward(self, x, enc):
        attn_out = self.attn1(x, x, x)
        x = self.norm1(x + self.dropout1(attn_out))
        attn_out = self.attn2(x, enc, enc)
        x = self.norm2(x + self.dropout2(attn_out))
        return self.norm3(x + self.dropout3(self.ff(x)))


class Decoder(torch.nn.Module):
    def __init__(
        self,
        vocab_size,
        max_seq_length,
        num_layers,
        num_heads,
        n_embd,
    ):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = torch.nn.Embedding(max_seq_length, n_embd)
        self.layers = torch.nn.ModuleList(
            [Decode(num_heads, n_embd, max_seq_length) for i in range(num_layers)]
        )
        self.norm = torch.nn.LayerNorm(n_embd)

    def forward(self, x, enc_output):
        seq_length = x.size(1)
        positions = (
            torch.arange(0, seq_length, device=x.device).unsqueeze(0).expand_as(x)
        )
        x1 = self.embedding(x) + self.pos_embedding(positions)

        for layer in self.layers:
            x1 = layer(x1, enc_output)
        return self.norm(x1)

In [7]:
import torch


class LLM(torch.nn.Module):
    def __init__(self, vocab_size, max_seq_length, num_heads, num_layers, n_embd):
        super().__init__()
        self.enc = Encoder(vocab_size, max_seq_length, num_heads, num_layers, n_embd)
        self.dec = Decoder(vocab_size, max_seq_length, num_heads, num_layers, n_embd)
        self.out = torch.nn.Linear(n_embd, vocab_size)
        self.max_seq_length = max_seq_length
        self.vocab_size = vocab_size

    def forward(self, x, y=None, enc_out=None):
        if enc_out is None:
            enc_out = self.enc(x)
        if y is not None:
            dec_out = self.dec(y, enc_out)
            return self.out(dec_out)
        return enc_out

    def generate(self, input_ids, max_length=50):
        output = [int(i) for i in input_ids[0]]
        with torch.no_grad():
            for _ in range(max_length):
                input_ids = input_ids.to("cuda")
                enc_out = self.forward(input_ids)
                generated = enc_out[:, -1, :].softmax(dim=-1).argmax(dim=-1)
                output.append(generated.item())
                input_ids = torch.cat([input_ids, generated.unsqueeze(0)], dim=1)

        return output

In [8]:
import torch
import gc
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


class Trainer:
    def __init__(self, model):
        self.model = model
        self.lossFn = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, "min", patience=5, factor=0.9, min_lr=1e-5
        )
    def train(self, trainLoader, epochs):
        lr = self.optimizer.param_groups[0]["lr"]
        writer = SummaryWriter()
        for epoch in range(epochs):
            self.model.train()
            for i, x in enumerate(tqdm(trainLoader)):
                x = x.to("cuda")
                y = x[:, -1].to("cuda").long()
                x = x[:, :-1].to("cuda")
                self.optimizer.zero_grad()
                yHat = self.model(x)
                yHat = yHat[:, -1, :]
                loss = self.lossFn(yHat, y)
                loss.backward()
                self.optimizer.step()
                writer.add_scalar("loss", loss.item(), epoch * len(trainLoader) + i)
            self.scheduler.step(loss)
            if lr != self.optimizer.param_groups[0]["lr"]:
                lr = self.optimizer.param_groups[0]["lr"]
                print("New lr", lr)
            print("epoch", epoch, "loss", loss.item())

            gc.collect()
            torch.cuda.empty_cache()

        writer.close()

2025-01-27 07:27:35.725809: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-27 07:27:35.725969: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-27 07:27:35.864317: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
!git clone https://github.com/MBUYt0n/build-an-llm.git

  pid, fd = os.forkpty()


Cloning into 'build-an-llm'...
remote: Enumerating objects: 186, done.[K
remote: Counting objects: 100% (81/81), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 186 (delta 45), reused 46 (delta 20), pack-reused 105 (from 1)[K
Receiving objects: 100% (186/186), 42.88 MiB | 41.04 MiB/s, done.
Resolving deltas: 100% (84/84), done.


In [10]:
import torch
import numpy as np
import os


l = os.listdir("/kaggle/working/build-an-llm/data")[:3]
k = os.listdir("/kaggle/working/build-an-llm/data")[4:5]
valcorpus = []
for i in k:
    with open("/kaggle/working/build-an-llm/data/" + i, "r") as f:
        valcorpus.append(f.read())
        f.close()
valt = Tokenize(valcorpus)
valdata = valt.get_data(10)

corpus = []
for i in l:
    with open("/kaggle/working/build-an-llm/data/" + i, "r") as f:
        corpus.append(f.read())
        f.close()
t = Tokenize(corpus)
data = t.get_data(10)
model = LLM(50257, 100, 4, 4, 768).to("cuda")
t = Trainer(model)
t.train(data, 100)
torch.save(model.state_dict(), "model.pth")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (14405 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (12556 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 179/179 [02:55<00:00,  1.02it/s]


epoch 0 loss 4.615663051605225


100%|██████████| 179/179 [03:05<00:00,  1.03s/it]


epoch 1 loss 4.0239667892456055


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 2 loss 3.362046480178833


100%|██████████| 179/179 [03:05<00:00,  1.03s/it]


epoch 3 loss 2.9081156253814697


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 4 loss 1.8366037607192993


100%|██████████| 179/179 [03:05<00:00,  1.04s/it]


epoch 5 loss 1.8248324394226074


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 6 loss 1.36020028591156


100%|██████████| 179/179 [03:05<00:00,  1.04s/it]


epoch 7 loss 0.662623941898346


100%|██████████| 179/179 [03:05<00:00,  1.04s/it]


epoch 8 loss 0.62814861536026


100%|██████████| 179/179 [03:05<00:00,  1.03s/it]


epoch 9 loss 0.4022054374217987


100%|██████████| 179/179 [03:05<00:00,  1.04s/it]


epoch 10 loss 0.20286083221435547


100%|██████████| 179/179 [03:05<00:00,  1.04s/it]


epoch 11 loss 0.13212592899799347


100%|██████████| 179/179 [03:05<00:00,  1.04s/it]


epoch 12 loss 0.07740059494972229


100%|██████████| 179/179 [03:05<00:00,  1.03s/it]


epoch 13 loss 0.06810089200735092


100%|██████████| 179/179 [03:05<00:00,  1.04s/it]


epoch 14 loss 0.09787777811288834


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 15 loss 0.07342700660228729


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 16 loss 0.053603943437337875


100%|██████████| 179/179 [03:05<00:00,  1.04s/it]


epoch 17 loss 0.03546188399195671


100%|██████████| 179/179 [03:05<00:00,  1.03s/it]


epoch 18 loss 0.0323760099709034


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 19 loss 0.039549048990011215


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 20 loss 0.013493544422090054


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 21 loss 0.024754758924245834


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 22 loss 0.11018980294466019


100%|██████████| 179/179 [03:05<00:00,  1.03s/it]


epoch 23 loss 0.055965207517147064


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 24 loss 0.027693722397089005


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 25 loss 0.00894679594784975


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 26 loss 0.02139253355562687


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 27 loss 0.016179662197828293


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 28 loss 0.02323136478662491


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 29 loss 0.06898816674947739


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 30 loss 0.044782523065805435


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


New lr 9e-05
epoch 31 loss 0.020309995859861374


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 32 loss 0.015267531387507915


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 33 loss 0.017538245767354965


100%|██████████| 179/179 [03:05<00:00,  1.04s/it]


epoch 34 loss 0.004601058084517717


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 35 loss 0.005553113296627998


100%|██████████| 179/179 [03:05<00:00,  1.04s/it]


epoch 36 loss 0.005455717444419861


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 37 loss 0.016155773773789406


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 38 loss 0.021893106400966644


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 39 loss 0.04490463435649872


100%|██████████| 179/179 [03:05<00:00,  1.04s/it]


New lr 8.1e-05
epoch 40 loss 0.0422121062874794


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 41 loss 0.007888445630669594


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 42 loss 0.0049736578948795795


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 43 loss 0.013584170490503311


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 44 loss 0.0014287455705925822


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 45 loss 0.0014139688573777676


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 46 loss 0.02103564888238907


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 47 loss 0.021261420100927353


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 48 loss 0.03360432758927345


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 49 loss 0.011094236746430397


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 50 loss 0.014035054482519627


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


New lr 7.290000000000001e-05
epoch 51 loss 0.002410949906334281


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 52 loss 0.0050697969272732735


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 53 loss 0.0016373733524233103


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 54 loss 0.002420887816697359


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 55 loss 0.022098084911704063


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 56 loss 0.0010163350962102413


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 57 loss 0.0006757241208106279


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 58 loss 0.0005048685125075281


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 59 loss 0.00031068685348145664


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 60 loss 0.0004084186512045562


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 61 loss 0.00031495033181272447


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 62 loss 0.000623300380539149


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 63 loss 0.046726033091545105


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 64 loss 0.048433348536491394


100%|██████████| 179/179 [03:03<00:00,  1.03s/it]


New lr 6.561000000000002e-05
epoch 65 loss 0.007906806655228138


100%|██████████| 179/179 [03:03<00:00,  1.03s/it]


epoch 66 loss 0.005688876379281282


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 67 loss 0.0012416020035743713


100%|██████████| 179/179 [03:03<00:00,  1.03s/it]


epoch 68 loss 0.01284060813486576


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 69 loss 0.0011478622909635305


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 70 loss 0.0006039436557330191


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


New lr 5.904900000000002e-05
epoch 71 loss 0.00049343251157552


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 72 loss 0.0002515408559702337


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 73 loss 0.00029061592067591846


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 74 loss 0.0003005067992489785


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 75 loss 0.008779896423220634


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 76 loss 0.008649881929159164


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 77 loss 0.00021091110829729587


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 78 loss 0.0012966544600203633


100%|██████████| 179/179 [03:03<00:00,  1.03s/it]


epoch 79 loss 0.00020162526925560087


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 80 loss 0.0063917916268110275


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 81 loss 0.008210361935198307


100%|██████████| 179/179 [03:03<00:00,  1.03s/it]


epoch 82 loss 0.00036089541390538216


100%|██████████| 179/179 [03:03<00:00,  1.03s/it]


epoch 83 loss 0.00019762435113079846


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 84 loss 0.001457868842408061


100%|██████████| 179/179 [03:03<00:00,  1.03s/it]


epoch 85 loss 0.00032295353594236076


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 86 loss 0.0027786295395344496


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 87 loss 0.0022946386598050594


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 88 loss 0.00012535501446109265


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 89 loss 9.579370089340955e-05


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 90 loss 0.05735568702220917


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 91 loss 0.03361227363348007


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 92 loss 0.005319771356880665


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 93 loss 0.0013197941007092595


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 94 loss 0.008271327242255211


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


New lr 5.314410000000002e-05
epoch 95 loss 0.009684964083135128


100%|██████████| 179/179 [03:03<00:00,  1.03s/it]


epoch 96 loss 0.0004679128178395331


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 97 loss 0.008941734209656715


100%|██████████| 179/179 [03:03<00:00,  1.02s/it]


epoch 98 loss 0.0002286728413309902


100%|██████████| 179/179 [03:04<00:00,  1.03s/it]


epoch 99 loss 0.00027840418624691665


In [11]:
# a = next(iter(valdata))[0]
# a = a.to("cuda")
# x = model.generate(a.unsqueeze(0), 500)
# print(valt.decode(x))
