# Importing Dataset

In [1]:
import os

In [2]:
def extract():
    text = ''
    path = '/kaggle/input/marvel-cinematic-universe-dialogue-dataset'
    for file in os.listdir(path):
        path_file = os.path.join(path + '/', file)
        with open(path_file, 'r', errors= 'ignore') as f:
            text += f.read()
    return text

In [3]:
text = extract()

In [4]:
text[:100]

'(BUCKY SCREAMING)\n(CONTINUES SCREAMING)\n- (KARPOV SPEAKING RUSSIAN) - (PANTING)\nLonging\nRusted\nSeven'

# Text Processing

In [5]:
class Preprocess():
    def __init__(self, text):
        super().__init__()
        self.text = text
        
    def create_vocab(self):
        vocab = sorted(list(set(self.text)))
        self.stoi = {s: i for i, s in enumerate(vocab)}
        self.itos = {i: s for s, i in self.stoi.items()}
        return vocab, len(vocab), self.stoi, self.itos
    
    def encode(self, string):
        return [self.stoi[char] for char in string]
    
    def decode(self, array):
        return ''.join(self.itos[idx] for idx in array)

In [6]:
text_processor = Preprocess(text)
vocab, vocab_size, stoi, itos = text_processor.create_vocab()

In [7]:
print(text_processor.encode('hello'))
text_processor.decode(text_processor.encode('hello'))

[61, 58, 65, 65, 68]


'hello'

# Set Device

In [8]:
import torch

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Split Dataset

In [10]:
import torch.nn as nn

In [11]:
data = torch.tensor(text_processor.encode(text), dtype = torch.long)
data[:50], len(data)

(tensor([ 8, 29, 48, 30, 38, 52,  1, 46, 30, 45, 32, 28, 40, 36, 41, 34,  9,  0,
          8, 30, 42, 41, 47, 36, 41, 48, 32, 46,  1, 46, 30, 45, 32, 28, 40, 36,
         41, 34,  9,  0, 12,  1,  8, 38, 28, 45, 43, 42, 49,  1]),
 1147310)

In [12]:
n = int(0.8 * len(data))
train = data[:n]
val = data[n:]
len(train), len(val)

(917848, 229462)

In [13]:
def split(type):
    data = train if type == 'train' else val
    idx = torch.randint(len(data) - block_size, (batch_size, ))
    X = torch.stack([data[i: i + block_size] for i in idx])
    y = torch.stack([data[i + 1: i + block_size + 1] for i in idx])
    X, y = X.to(device), y.to(device)
    return X, y

In [14]:
batch_size = 64 
block_size = 256 
max_iters = 5000
eval_interval = 500
learning_rate = 1e-3
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

In [15]:
Xtr, ytr = split('train')
Xtr.shape, ytr.shape

(torch.Size([64, 256]), torch.Size([64, 256]))

# Define Error List

In [16]:
import torch.nn.functional as F

In [17]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for splits in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = split(splits)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[splits] = losses.mean()
    model.train()
    return out

# Create Model

In [18]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [19]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [20]:
class FeedFoward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [21]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [22]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] # becomes (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [23]:
model = BigramLanguageModel()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

10.80354 M parameters


# Train the Model

In [24]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    xb, yb = split('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.5380, val loss 4.5450
step 500: train loss 1.6741, val loss 1.7039
step 1000: train loss 1.3356, val loss 1.4401
step 1500: train loss 1.1957, val loss 1.3666
step 2000: train loss 1.1067, val loss 1.3545
step 2500: train loss 1.0325, val loss 1.3539
step 3000: train loss 0.9557, val loss 1.3798
step 3500: train loss 0.8865, val loss 1.4070
step 4000: train loss 0.8166, val loss 1.4502
step 4500: train loss 0.7514, val loss 1.4761
step 4999: train loss 0.6912, val loss 1.5150


# Generate Text

In [25]:
def generate(num_words):
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    print(text_processor.decode(m.generate(context, max_new_tokens = num_words)[0].tolist()))

In [26]:
generate(2000)


Are you gonna be all a fight? It's metal fun.
- Still you are gonna spak it again? - Go.
Look, what's this guy who has been? I'm trying to kill you.
Where do you the past does think and you call this people could be before it hasn't.
I know you don't know you'll even.
I don't even know what you're because I know who you ammule.
I'm in trouble. She's provided like her.
She's spying to find the relies. But you don't even know what's it doing.
Just an earth in a hand.
I'm the froad of the Bifrost thing will be dead.
Okay. Emerge design.
We have a plan. Well, But you knew didn't have you anything
by with now, I'm happy careing this thing.
Wanda, I thought I was gonna have that? Peter's happening.
Big side, Scott.
I'm not like an even threat further said you'd know anyone nothing.
I'm gonna gonna stop you.
I'm honey. I'm very dying on.
No, this is gonna work...
No, no.
Sarrifying.
I don't know, Scott.
And you're not gonna be a what I'm not.
I love American. Hey.
Hey, Luis. I'm so sorry.
I 