# Data import

In [1]:
import numpy as np
import os

# Load every file of the dialogue dataset directory into memory, one string
# per file.  `l` holds the file names and `x` the corresponding file contents.
l = os.listdir("/kaggle/input/marvel-cinematic-universe-dialogue-dataset")
x = []
for i in l:
    # `with` closes each handle right after it is read instead of leaking one
    # open file per dataset entry.  errors='replace' substitutes undecodable
    # bytes rather than aborting the whole import.
    with open(
        f"/kaggle/input/marvel-cinematic-universe-dialogue-dataset/{i}",
        "r",
        errors='replace',
    ) as f:
        x.append(f.read())

# Tokenizing

In [3]:
from transformers import GPT2Tokenizer
import torch
import os
import numpy as np


class Tokenize:
    """Turn a list of text documents into fixed-length next-token windows.

    The documents are encoded with the pretrained ``"gpt2"`` tokenizer and
    sliced into overlapping windows that a ``DataLoader`` serves in batches.
    """

    def __init__(self, corpus):
        """Store the corpus and load the pretrained GPT-2 tokenizer.

        Args:
            corpus: Iterable of strings, one entry per document.
        """
        self.corpus = corpus
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    def tokenize(self):
        """Encode each document into its own 1-D tensor of token ids.

        Returns:
            A list with exactly one tensor per document, in corpus order.
        """
        # Every document stays separate: concatenating the documents seen so
        # far into each entry would repeat earlier documents in every later
        # entry and therefore duplicate their training windows.
        # ``[0]`` takes the first row of the tensor returned by ``encode``.
        return [
            self.tokenizer.encode(doc, return_tensors="pt")[0]
            for doc in self.corpus
        ]

    def get_data(self, seq_length, batch_size=256, shuffle=True):
        """Build a ``DataLoader`` over sliding windows of ``seq_length + 1`` ids.

        Each window holds ``seq_length`` context tokens followed by the token
        to predict.  Documents with ``seq_length`` tokens or fewer contribute
        no window.

        Args:
            seq_length: Number of context tokens per window.
            batch_size: Windows per batch (defaults to the former fixed 256).
            shuffle: Whether the loader reshuffles the windows every epoch.

        Returns:
            A ``torch.utils.data.DataLoader`` yielding stacked windows.
        """
        windows = []
        for ids in self.tokenize():
            # ``end`` is the index of the target token.  It must be allowed to
            # reach the final token (len(ids) - 1), otherwise the last window
            # of every document is silently dropped.
            for end in range(seq_length, len(ids)):
                windows.append(ids[end - seq_length : end + 1])
        return torch.utils.data.DataLoader(
            windows, batch_size=batch_size, shuffle=shuffle
        )

    def decode(self, x):
        """Convert a sequence of token ids back into text."""
        return self.tokenizer.decode(x)

In [4]:
import torch


class Head(torch.nn.Module):
    """One scaled dot-product attention head with bias-free projections.

    No attention mask is applied: every query position attends to every key
    position.
    """

    def __init__(self, n_embd, head_size, max_seq_length):
        """Create the key/query/value projections.

        Args:
            n_embd: Feature size of the incoming tensors.
            head_size: Feature size produced by this head.
            max_seq_length: Stored on the instance; not read by this class.
        """
        super().__init__()
        self.head_size = head_size
        projection_args = dict(bias=False)
        self.key = torch.nn.Linear(n_embd, self.head_size, **projection_args)
        self.query = torch.nn.Linear(n_embd, self.head_size, **projection_args)
        self.values = torch.nn.Linear(n_embd, self.head_size, **projection_args)
        # 1 / sqrt(head_size): keeps the dot products in a range where the
        # softmax does not saturate.
        self.scale_factor = self.head_size**-0.5
        self.max_seq_length = max_seq_length

    def forward(self, q, k, v):
        """Project the inputs and return the attention-weighted values.

        Args:
            q: Tensor the queries are computed from.
            k: Tensor the keys are computed from.
            v: Tensor the values are computed from.

        Returns:
            ``softmax(Q K^T * scale_factor) V`` with ``head_size`` features.
        """
        keys = self.key(k)
        queries = self.query(q)
        vals = self.values(v)

        scores = torch.matmul(queries, keys.transpose(-2, -1)) * self.scale_factor
        weights = scores.softmax(dim=-1)
        return torch.matmul(weights, vals)

In [5]:
import torch
from head import Head


class MultiHeadAttention(torch.nn.Module):
    """Run several attention heads in parallel and mix their outputs.

    Each head produces ``n_embd // num_heads`` features; the head outputs are
    concatenated along the last dimension and passed through a final linear
    layer of size ``n_embd -> n_embd``.
    """

    def __init__(self, num_heads, n_embd, max_seq_length):
        """Build ``num_heads`` heads plus the output projection.

        Args:
            num_heads: Number of parallel heads; must be positive and divide
                ``n_embd``.
            n_embd: Model feature size.
            max_seq_length: Forwarded unchanged to every ``Head``.

        Raises:
            ValueError: If ``num_heads`` is not positive or does not divide
                ``n_embd``.
        """
        super().__init__()
        # Fail at construction time: with a non-divisible n_embd the
        # concatenated head outputs would have fewer than n_embd features and
        # the output projection would only blow up later, inside forward().
        if num_heads <= 0 or n_embd % num_heads != 0:
            raise ValueError(
                f"n_embd ({n_embd}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        head_size = n_embd // num_heads
        self.heads = torch.nn.ModuleList(
            [Head(n_embd, head_size, max_seq_length) for _ in range(num_heads)]
        )
        self.out = torch.nn.Linear(n_embd, n_embd)

    def forward(self, q, k, v):
        """Apply every head to ``(q, k, v)`` and project the concatenation.

        Args:
            q: Tensor the queries are computed from.
            k: Tensor the keys are computed from.
            v: Tensor the values are computed from.

        Returns:
            The output projection applied to the concatenated head outputs.
        """
        head_out = [head(q, k, v) for head in self.heads]
        concat = torch.cat(head_out, dim=-1)
        return self.out(concat)

In [6]:
import torch


class FF(torch.nn.Module):
    """Position-wise feed-forward network: expand 8x, ReLU, project back."""

    def __init__(self, n_embd):
        """Create the two linear layers.

        Args:
            n_embd: Input and output feature size; the hidden layer is
                ``8 * n_embd`` wide.
        """
        super().__init__()
        hidden_size = 8 * n_embd
        self.linear1 = torch.nn.Linear(n_embd, hidden_size)
        self.linear2 = torch.nn.Linear(hidden_size, n_embd)

    def forward(self, x):
        """Return ``linear2(relu(linear1(x)))``."""
        expanded = self.linear1(x)
        activated = torch.relu(expanded)
        return self.linear2(activated)

In [7]:
import torch
from multihead import MultiHeadAttention
from ff import FF
from tok import Tokenize
import os


class Encode(torch.nn.Module):
    """One encoder layer: attention, feed-forward, then attention again.

    Every sub-layer output goes through dropout (p=0.2), is added to its
    input, and is then layer-normalised.
    """

    def __init__(self, num_heads, n_embd, max_seq_length):
        """Create the sub-layers.

        Args:
            num_heads: Number of attention heads.
            n_embd: Model feature size.
            max_seq_length: Forwarded to ``MultiHeadAttention``.
        """
        super().__init__()
        self.ff = FF(n_embd)
        self.attn = MultiHeadAttention(num_heads, n_embd, max_seq_length)
        self.l1 = torch.nn.LayerNorm(n_embd)
        self.l2 = torch.nn.LayerNorm(n_embd)
        drop_p = 0.2
        self.dropout1 = torch.nn.Dropout(drop_p)
        self.dropout2 = torch.nn.Dropout(drop_p)
        self.dropout3 = torch.nn.Dropout(drop_p)

    def forward(self, x):
        """Run the three sub-layers on ``x`` and return the result."""
        # Sub-layer 1: self-attention with residual connection, normalised by l1.
        first_attn = self.attn(x, x, x)
        x = self.l1(x + self.dropout1(first_attn))

        # Sub-layer 2: feed-forward with residual connection, normalised by l2.
        hidden = self.l2(x + self.dropout2(self.ff(x)))

        # Sub-layer 3: a second self-attention pass.
        # NOTE(review): this reuses the same `attn` weights and the same `l2`
        # norm as above rather than dedicated modules — confirm it is intended.
        second_attn = self.attn(hidden, hidden, hidden)
        return self.l2(hidden + self.dropout3(second_attn))


class Encoder(torch.nn.Module):
    """Token + position embeddings, a stack of ``Encode`` layers, and a
    final LayerNorm followed by a projection to vocabulary size."""

    def __init__(self, vocab_size, max_seq_length, num_heads, num_layers, n_embd):
        """Create embeddings, layers and the output head.

        Args:
            vocab_size: Size of the token embedding table and of the output.
            max_seq_length: Size of the position embedding table.
            num_heads: Attention heads per layer.
            num_layers: Number of stacked ``Encode`` layers.
            n_embd: Model feature size.
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = torch.nn.Embedding(max_seq_length, n_embd)
        stack = [
            Encode(num_heads, n_embd, max_seq_length) for _ in range(num_layers)
        ]
        self.layers = torch.nn.ModuleList(stack)
        self.norm = torch.nn.LayerNorm(n_embd)
        self.linear = torch.nn.Linear(n_embd, vocab_size)

    def forward(self, x):
        """Map token ids to per-position vocabulary scores.

        Args:
            x: Integer tensor of token ids; dimension 1 is treated as the
                sequence dimension (assumed shape (batch, seq) — TODO confirm).

        Returns:
            Tensor with ``vocab_size`` features for every position of ``x``.
        """
        # Position ids 0..seq-1, broadcast to the shape of x so that each
        # token gets the embedding of its index in the sequence.
        n_positions = x.shape[1]
        position_ids = torch.arange(0, n_positions, device=x.device)
        position_ids = position_ids[None, :].expand(x.shape)

        hidden = self.embedding(x) + self.pos_embedding(position_ids)
        for block in self.layers:
            hidden = block(hidden)
        normed = self.norm(hidden)
        return self.linear(normed)

In [8]:
import torch
from multihead import MultiHeadAttention
from ff import FF
import os
from tok import Tokenize
from encoder import Encoder


class Decode(torch.nn.Module):
    """One decoder layer: self-attention, cross-attention over the encoder
    output, then a feed-forward network.

    Each sub-layer output goes through dropout (p=0.2), is added to its
    input, and is then layer-normalised.  No attention mask is applied.
    """

    def __init__(self, num_heads, n_embd, max_seq_length):
        """Create the sub-layers.

        Args:
            num_heads: Number of attention heads in both attention modules.
            n_embd: Model feature size.
            max_seq_length: Forwarded to ``MultiHeadAttention``.
        """
        super().__init__()
        self.attn1 = MultiHeadAttention(num_heads, n_embd, max_seq_length)
        self.attn2 = MultiHeadAttention(num_heads, n_embd, max_seq_length)
        self.norm1 = torch.nn.LayerNorm(n_embd)
        self.norm2 = torch.nn.LayerNorm(n_embd)
        self.norm3 = torch.nn.LayerNorm(n_embd)
        self.ff = FF(n_embd)
        drop_p = 0.2
        self.dropout1 = torch.nn.Dropout(drop_p)
        self.dropout2 = torch.nn.Dropout(drop_p)
        self.dropout3 = torch.nn.Dropout(drop_p)

    def forward(self, x, enc):
        """Run the layer.

        Args:
            x: Decoder-side activations.
            enc: Encoder output used as keys and values of the cross-attention.

        Returns:
            The layer output after the third normalisation.
        """
        # Self-attention over the decoder input.
        self_attended = self.attn1(x, x, x)
        x = self.norm1(x + self.dropout1(self_attended))

        # Cross-attention: queries from the decoder, keys/values from `enc`.
        cross_attended = self.attn2(x, enc, enc)
        x = self.norm2(x + self.dropout2(cross_attended))

        # Position-wise feed-forward.
        fed_forward = self.ff(x)
        return self.norm3(x + self.dropout3(fed_forward))


class Decoder(torch.nn.Module):
    """Token + position embeddings, a stack of ``Decode`` layers, and a
    final LayerNorm.

    Note the constructor order: ``num_layers`` comes before ``num_heads``.
    """

    def __init__(
        self,
        vocab_size,
        max_seq_length,
        num_layers,
        num_heads,
        n_embd,
    ):
        """Create embeddings and layers.

        Args:
            vocab_size: Size of the token embedding table.
            max_seq_length: Size of the position embedding table.
            num_layers: Number of stacked ``Decode`` layers.
            num_heads: Attention heads per layer.
            n_embd: Model feature size.
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = torch.nn.Embedding(max_seq_length, n_embd)
        stack = [
            Decode(num_heads, n_embd, max_seq_length) for _ in range(num_layers)
        ]
        self.layers = torch.nn.ModuleList(stack)
        self.norm = torch.nn.LayerNorm(n_embd)

    def forward(self, x, enc_output):
        """Embed ``x`` and pass it through every layer together with
        ``enc_output``.

        Args:
            x: Integer tensor of token ids; dimension 1 is treated as the
                sequence dimension (assumed shape (batch, seq) — TODO confirm).
            enc_output: Tensor handed to every layer as the encoder output.

        Returns:
            The layer-normalised activations (``n_embd`` features).
        """
        # Position ids 0..seq-1, broadcast to the shape of x.
        n_positions = x.size(1)
        position_ids = torch.arange(0, n_positions, device=x.device)
        position_ids = position_ids[None, :].expand(x.shape)

        hidden = self.embedding(x) + self.pos_embedding(position_ids)
        for block in self.layers:
            hidden = block(hidden, enc_output)
        return self.norm(hidden)

In [9]:
import torch
from encoder import Encoder
from decoder import Decoder


class LLM(torch.nn.Module):
    """Encoder/decoder language model with a greedy ``generate`` helper.

    When called with only ``x`` the model returns the encoder output, which
    is what both training and ``generate`` use as next-token scores.

    NOTE(review): ``Encoder.forward`` in this file ends with a projection to
    ``vocab_size``, while the decoder's cross-attention projects its
    keys/values from ``n_embd`` features.  The ``y is not None`` path
    therefore looks like it would fail with a shape mismatch unless
    ``n_embd == vocab_size`` — confirm before relying on it.
    """

    def __init__(self, vocab_size, max_seq_length, num_heads, num_layers, n_embd):
        """Build encoder, decoder and output projection.

        Args:
            vocab_size: Number of distinct token ids.
            max_seq_length: Longest sequence the position embeddings cover.
            num_heads: Attention heads per layer.
            num_layers: Layers in the encoder and in the decoder.
            n_embd: Model feature size.
        """
        super().__init__()
        self.enc = Encoder(vocab_size, max_seq_length, num_heads, num_layers, n_embd)
        # Decoder's positional order is (..., num_layers, num_heads, n_embd),
        # i.e. the reverse of Encoder's.  Keywords keep the two values from
        # being silently swapped.
        self.dec = Decoder(
            vocab_size,
            max_seq_length,
            num_layers=num_layers,
            num_heads=num_heads,
            n_embd=n_embd,
        )
        self.out = torch.nn.Linear(n_embd, vocab_size)
        self.max_seq_length = max_seq_length
        self.vocab_size = vocab_size

    def forward(self, x, y=None, enc_out=None):
        """Run the encoder and, if ``y`` is given, the decoder.

        Args:
            x: Token ids for the encoder; ignored when ``enc_out`` is given.
            y: Optional token ids for the decoder.
            enc_out: Optional precomputed encoder output.

        Returns:
            ``self.out(decoder(y, enc_out))`` when ``y`` is given, otherwise
            the encoder output.
        """
        if enc_out is None:
            enc_out = self.enc(x)
        if y is not None:
            dec_out = self.dec(y, enc_out)
            return self.out(dec_out)
        return enc_out

    def generate(self, input_ids, max_length=50):
        """Greedily append ``max_length`` tokens to a single prompt.

        Args:
            input_ids: Integer tensor holding one prompt; ``input_ids[0]`` is
                read as the prompt's token ids, so a leading batch dimension
                of size 1 is expected.
            max_length: Number of tokens to generate.

        Returns:
            A list of ints: the prompt ids followed by the generated ids.
        """
        output = [int(token) for token in input_ids[0]]

        # Follow the model's own device instead of assuming CUDA.
        device = next(self.parameters()).device
        input_ids = input_ids.to(device)

        # Dropout must be disabled while sampling; restore the caller's mode
        # afterwards so an ongoing training run is unaffected.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_length):
                    # Only the most recent max_seq_length tokens fit the
                    # position-embedding table.
                    context = input_ids[:, -self.max_seq_length :]
                    scores = self.forward(context)
                    # argmax of the raw scores equals argmax of their softmax.
                    next_token = scores[:, -1, :].argmax(dim=-1)
                    output.append(next_token.item())
                    input_ids = torch.cat(
                        [input_ids, next_token.unsqueeze(-1)], dim=1
                    )
        finally:
            self.train(was_training)

        return output

In [11]:
import torch
import gc
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


class Trainer:
    """Next-token training loop with AdamW, plateau LR decay and TensorBoard
    logging."""

    def __init__(self, model):
        """Set up loss, optimizer and learning-rate scheduler for ``model``.

        Args:
            model: Module called as ``model(inputs)`` whose output is indexed
                as ``[:, -1, :]`` to obtain the scores for the next token.
        """
        self.model = model
        self.lossFn = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)
        # Halve the learning rate once the monitored loss has not improved
        # for more than `patience` epochs.
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, "min", patience=2, factor=0.5
        )

    def train(self, trainLoader, epochs):
        """Train the model for ``epochs`` passes over ``trainLoader``.

        Each batch is a 2-D tensor of token ids whose last column is the
        target and whose remaining columns are the model input.  The
        per-step loss is written to TensorBoard under the tag ``"loss"``;
        the mean loss of each epoch is printed and drives the scheduler.

        Args:
            trainLoader: Sized iterable of batches (``len()`` is used for the
                global step counter).
            epochs: Number of passes over ``trainLoader``.

        Raises:
            ValueError: If ``trainLoader`` yields no batches.
        """
        # Use whatever device the model already lives on instead of assuming
        # CUDA is present.
        device = next(self.model.parameters()).device
        steps_per_epoch = len(trainLoader)
        writer = SummaryWriter()
        try:
            for epoch in range(epochs):
                self.model.train()
                loss_sum = 0.0
                batches_seen = 0
                for i, batch in enumerate(tqdm(trainLoader)):
                    batch = batch.to(device)
                    targets = batch[:, -1].long()
                    inputs = batch[:, :-1]

                    self.optimizer.zero_grad()
                    # Only the scores at the last position predict the target.
                    scores = self.model(inputs)[:, -1, :]
                    loss = self.lossFn(scores, targets)
                    loss.backward()
                    self.optimizer.step()

                    step_loss = loss.item()
                    loss_sum += step_loss
                    batches_seen += 1
                    writer.add_scalar(
                        "loss", step_loss, epoch * steps_per_epoch + i
                    )

                if batches_seen == 0:
                    raise ValueError("trainLoader yielded no batches")

                # Monitor the epoch mean: the loss of a single, randomly
                # chosen final mini-batch is too noisy a plateau signal.
                epoch_loss = loss_sum / batches_seen
                self.scheduler.step(epoch_loss)

                print("epoch", epoch, "loss", epoch_loss)

                gc.collect()
                torch.cuda.empty_cache()
        finally:
            # Flush and close the event file even if training is interrupted.
            writer.close()

2024-07-20 05:40:57.145836: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-20 05:40:57.145959: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-20 05:40:57.289057: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [12]:
import torch
import numpy as np
import os


# os.listdir returns entries in arbitrary order, so sort before slicing to
# make the train/validation split reproducible between runs.
# The first three files are used for training and the fifth for validation;
# the fourth is left unused.
files = sorted(os.listdir("data"))
l = files[:3]
k = files[4:5]

valcorpus = []
for i in k:
    with open("data/" + i, "r") as f:
        valcorpus.append(f.read())
valt = Tokenize(valcorpus)
valdata = valt.get_data(10)

corpus = []
for i in l:
    with open("data/" + i, "r") as f:
        corpus.append(f.read())
traintok = Tokenize(corpus)
data = traintok.get_data(10)

# Arguments: vocab_size, max_seq_length, num_heads, num_layers, n_embd.
model = LLM(50257, 100, 4, 4, 768).to("cuda")
t = Trainer(model)
t.train(data, 20)

# Save right after training so a failure during generation cannot lose the
# trained weights.
torch.save(model.state_dict(), "model.pth")

# Disable dropout before sampling from the model.
model.eval()
a = next(iter(valdata))[0]
a = a.to("cuda")
x = model.generate(a.unsqueeze(0), 50)
print(valt.decode(x))