# Data import

In [1]:
import numpy as np
import os

l = os.listdir("/kaggle/input/marvel-cinematic-universe-dialogue-dataset")
x = []
for i in l:
        f = open(f"/kaggle/input/marvel-cinematic-universe-dialogue-dataset/{i}", "r", errors='replace')
        x.append(f.read())

In [2]:
m = 0
for i in x:
    m = m if len(i) < m else len(i)
m

68594

# Tokenizing

In [3]:
tokens = set(''.join(x))
vocab_size = len(tokens)
vocab_size

85

In [4]:
tokens = {i:j for i, j in zip(tokens, range(vocab_size))}
inputs = []
for i in x:
    inputs.append([])
    for j in i:
        inputs[-1].append(tokens[j])
len(inputs)

23

In [5]:
import gc
del x
gc.collect()

11

# Padding

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

inputs = pad_sequences(inputs, maxlen=m)

print(inputs.shape)

2024-07-17 18:09:47.489638: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-17 18:09:47.489744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-17 18:09:47.583301: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


(23, 68594)


# LLM setup

In [7]:
batch_size = 16
seq_length = 256
max_seq_length = 256
n_embd = 256

# Batching


In [8]:
def create_batches(input_data, batch_size, seq_length):
    num_samples, total_length = input_data.shape
    num_chunks = total_length // seq_length + (total_length % seq_length != 0)
    
    chunks = []
    out_chunks = []
    for i in range(num_samples):
        for j in range(num_chunks):
            start_idx = j * seq_length
            end_idx = min(start_idx + seq_length, total_length)
            chunk = input_data[i, start_idx:end_idx]
            
            out_start_idx = start_idx + 1
            out_end_idx = min(out_start_idx + seq_length, total_length)
            out_chunk = input_data[i, out_start_idx:out_end_idx]
            
            if end_idx - start_idx < seq_length:
                padding = torch.zeros(seq_length - (end_idx - start_idx), dtype=chunk.dtype)
                chunk = torch.cat([chunk, padding])
                out_padding = torch.zeros(seq_length - (out_end_idx - out_start_idx), dtype=out_chunk.dtype)
                out_chunk = torch.cat([out_chunk, out_padding])
            
            chunks.append(chunk)
            out_chunks.append(out_chunk)
    
    chunks = torch.stack(chunks)
    num_batches = chunks.size(0) // batch_size
    batches = torch.split(chunks, batch_size)
    
    out_chunks = torch.stack(out_chunks)
    num_out_batches = out_chunks.size(0) // batch_size
    out_batches = torch.split(out_chunks, batch_size)
    
    return batches, out_batches

In [9]:
import torch 
inputs = torch.tensor(inputs)
s, o = create_batches(inputs, batch_size, seq_length)


In [10]:
s[0].dtype

torch.int32

In [11]:
class Head(torch.nn.Module):
    def __init__(self, n_embd, head_size, max_seq_length):
        super().__init__()
        self.head_size = head_size
        self.key = torch.nn.Linear(n_embd, self.head_size, bias=False)
        self.query = torch.nn.Linear(n_embd, self.head_size, bias=False)
        self.values = torch.nn.Linear(n_embd, self.head_size, bias=False)
        self.scale_factor = self.head_size ** -0.5
        self.max_seq_length = max_seq_length
        
    def forward(self, q, k, v, mask=None):
        k = self.key(k)
        q = self.query(q)
        v = self.values(v)
        w = (q @ k.transpose(-2, -1)) * self.scale_factor
        if mask:
            device = w.device
            tril = torch.tril(torch.ones(self.max_seq_length, self.max_seq_length, device=device))
            seq_length = q.size(1)  # Get the sequence length from q
            w = w.masked_fill(tril[:seq_length, :seq_length] == 0, float("-inf"))
        w = torch.nn.functional.softmax(w, dim=-1)
        return w @ v

In [12]:
class MultiHeadAttention(torch.nn.Module):
    def __init__(self, num_heads):
        super().__init__()
        self.heads = torch.nn.ModuleList([Head(n_embd, n_embd // num_heads, max_seq_length) for i in range(num_heads)])
        self.out = torch.nn.Linear(n_embd, n_embd)
        
    def forward(self, q, k, v, mask=None):
        head_out = [head(q, k, v, mask) for head in self.heads]
        concat = torch.cat(head_out, dim=-1)
        return self.out(concat)

In [13]:
class FF(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(n_embd, 4 * n_embd)
        self.linear2 = torch.nn.Linear(4 * n_embd, n_embd)
        
    def forward(self, x):
        return self.linear2(torch.nn.functional.relu(self.linear1(x)))        

In [14]:
class Encode(torch.nn.Module):
    def __init__(self, num_heads):
        super().__init__()
        self.ff = FF()
        self.attn = MultiHeadAttention(num_heads)
        self.l1 = torch.nn.LayerNorm(n_embd)
        self.l2 = torch.nn.LayerNorm(n_embd)
        self.dropout1 = torch.nn.Dropout(0.1)
        self.dropout2 = torch.nn.Dropout(0.1)
        
    def forward(self, x):
        attn_out = self.attn(x, x, x)
        x = self.l1(self.dropout1(attn_out) + x)
        ff_out = self.ff(x)
        attn_out = self.attn(x, x, x)
        return self.l2(self.dropout2(attn_out) + x)

In [15]:
class Encoder(torch.nn.Module):
    def __init__(self, vocab_size, max_seq_length, num_heads, num_layers, n_embd):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = torch.nn.Embedding(max_seq_length, n_embd)
        self.layers = torch.nn.ModuleList([Encode(num_heads) for i in range(num_layers)])
        self.norm = torch.nn.LayerNorm(n_embd)
    

    def forward(self, x):
        seq_length = x.shape[1]  
        positions = torch.arange(0, seq_length, device=x.device).unsqueeze(0).expand_as(x)          
        x = self.embedding(x) + self.pos_embedding(positions)  
        for layer in self.layers:  
            x = layer(x)
        return self.norm(x) 

In [24]:
class Decode(torch.nn.Module):
    def __init__(self, num_heads, n_embd):
        super().__init__()
        self.attn1 = MultiHeadAttention(num_heads)
        self.attn2 = MultiHeadAttention(num_heads)
        self.norm1 = torch.nn.LayerNorm(n_embd)
        self.norm2 = torch.nn.LayerNorm(n_embd)
        self.norm3 = torch.nn.LayerNorm(n_embd)
        self.ff = FF()
    
    def forward(self, x, enc):
        attn_out = self.attn1(x, x, x, 1)
        x = self.norm1(x + attn_out)
        attn_out = self.attn2(x, enc, enc, 1)
        x =  self.norm2(x + attn_out)
        return self.norm3(x + self.ff(x))

In [25]:
class Decoder(torch.nn.Module):
    def __init__(self, vocab_size, max_seq_len, num_layers, num_heads, n_embd, hidden_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = torch.nn.Embedding(max_seq_len, n_embd)
        self.lstm = torch.nn.LSTM(n_embd, hidden_dim, batch_first=True)  # Initialize LSTM
        self.layers = torch.nn.ModuleList([Decode(num_heads, n_embd) for i in range(num_layers)])
        self.norm = torch.nn.LayerNorm(n_embd)
        
    def forward(self, x, enc_output):
        seq_length = x.size(1)
        positions = torch.arange(0, seq_length, device=x.device).unsqueeze(0).expand_as(x)         
        x = self.embedding(x) + self.pos_embedding(positions)
        x, _ = self.lstm(x)  # Pass through LSTM
        for layer in self.layers:
            x = layer(x, enc_output)
        return self.norm(x)

In [26]:
import torch

class llm(torch.nn.Module):
    def __init__(self, vocab_size, max_seq_length, num_heads, num_layers, n_embd):
        super().__init__()
        self.enc = Encoder(vocab_size, max_seq_length, num_heads, num_layers, n_embd)
        self.dec = Decoder(vocab_size, max_seq_length, num_heads, num_layers, n_embd, hidden_dim=n_embd)
        self.out = torch.nn.Linear(n_embd, vocab_size)
        self.max_seq_length = max_seq_length
        self.vocab_size = vocab_size
        
    def forward(self, x, y=None, enc_out=None):
        if enc_out is None:
            enc_out = self.enc(x)
        if y is not None:
            dec_out = self.dec(y, enc_out)
            return self.out(dec_out)
        return enc_out

    def generate(self, input_ids, max_length=50):
        self.eval()
        with torch.no_grad():
            enc_out = self.forward(input_ids)  # Get encoder output once for the input
            generated = input_ids
            for _ in range(max_length):
                # Assuming input_ids is a batch with size [1, seq_len]
                output = self.forward(input_ids, generated, enc_out=enc_out)  # Use cached encoder output
                next_token_logits = output[:, -1, :]
                next_token_id = next_token_logits.argmax(dim=-1).unsqueeze(-1)
                generated = torch.cat([generated, next_token_id], dim=-1)
        return generated

In [27]:
device = torch.device("cuda")
model = llm(vocab_size,  max_seq_length=max_seq_length, num_heads=4, num_layers=2, n_embd=n_embd).to(device)

In [28]:
model(s[0].to(device), o[0].to(device)).shape

torch.Size([16, 256, 85])

In [29]:
lossFn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [None]:
import torch
from torch.utils.tensorboard import SummaryWriter
import gc

# Initialize SummaryWriter
writer = SummaryWriter()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Number of epochs
num_epochs = 3

scaler = torch.cuda.amp.GradScaler()

for epoch in range(num_epochs):
    for i, (a, b) in enumerate(zip(s, o)):
        a = a.to(device)
        b = b.to(device)

        with torch.cuda.amp.autocast():
            logits = model(a, b)
            loss = lossFn(logits.view(-1, model.vocab_size), b.view(-1).long())

        # Zero gradients
        optimizer.zero_grad()

        scaler.scale(loss).backward()

        scaler.step(optimizer)
        scaler.update()

        print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}], Loss: {loss.item()}")

        # Log the loss
        writer.add_scalar('Loss/train', loss.item(), epoch * len(inputs) + i)
    
        a = a.cpu()
        b = b.cpu()
        logits = logits.cpu()
        loss = loss.cpu()
        del a, b, logits, loss
        torch.cuda.empty_cache()
        gc.collect()

# Close the SummaryWriter
writer.close()


In [36]:
context = torch.zeros((8, 256), dtype=torch.long, device=device)
a = model.generate(context, max_length=500)[0].tolist()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
