# Importing Dataset

In [38]:
import os

In [39]:
def extract():
    text = ''
    path = '/kaggle/input/marvel-cinematic-universe-dialogue-dataset'
    for file in os.listdir(path):
        path_file = os.path.join(path + '/', file)
        with open(path_file, 'r', errors= 'ignore') as f:
            text += f.read()
    return text

In [40]:
text = extract()

In [41]:
text[:100]

'(BUCKY SCREAMING)\n(CONTINUES SCREAMING)\n- (KARPOV SPEAKING RUSSIAN) - (PANTING)\nLonging\nRusted\nSeven'

# Text Processing

In [42]:
class Preprocess():
    def __init__(self, text):
        super().__init__()
        self.text = text
        
    def create_vocab(self):
        vocab = sorted(list(set(self.text)))
        self.stoi = {s: i for i, s in enumerate(vocab)}
        self.itos = {i: s for s, i in self.stoi.items()}
        return vocab, len(vocab), self.stoi, self.itos
    
    def encode(self, string):
        return [self.stoi[char] for char in string]
    
    def decode(self, array):
        return ''.join(self.itos[idx] for idx in array)

In [43]:
text_processor = Preprocess(text)
vocab, vocab_size, stoi, itos = text_processor.create_vocab()

In [44]:
print(text_processor.encode('hello'))
text_processor.decode(text_processor.encode('hello'))

[61, 58, 65, 65, 68]


'hello'

# Set Device

In [45]:
import torch

In [46]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Split Dataset

In [47]:
import torch.nn as nn

In [48]:
data = torch.tensor(text_processor.encode(text), dtype = torch.long)
data[:50], len(data)

(tensor([ 8, 29, 48, 30, 38, 52,  1, 46, 30, 45, 32, 28, 40, 36, 41, 34,  9,  0,
          8, 30, 42, 41, 47, 36, 41, 48, 32, 46,  1, 46, 30, 45, 32, 28, 40, 36,
         41, 34,  9,  0, 12,  1,  8, 38, 28, 45, 43, 42, 49,  1]),
 1147310)

In [49]:
n = int(0.9 * len(data))
train = data[:n]
val = data[n:]
len(train), len(val)

(1032579, 114731)

In [50]:
def split(type):
    data = train if type == 'train' else val
    idx = torch.randint(len(data) - block_size, (batch_size, ))
    X = torch.stack([data[i: i + block_size] for i in idx])
    y = torch.stack([data[i + 1: i + block_size + 1] for i in idx])
    X, y = X.to(device), y.to(device)
    return X, y

In [51]:
batch_size = 64 
block_size = 256 
max_iters = 3000
eval_interval = 500
learning_rate = 1e-3
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

In [52]:
Xtr, ytr = split('train')
Xtr.shape, ytr.shape

(torch.Size([64, 256]), torch.Size([64, 256]))

# Define Error List

In [53]:
import torch.nn.functional as F

In [54]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for splits in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = split(splits)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[splits] = losses.mean()
    model.train()
    return out

# Create Model

In [55]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [56]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [57]:
class FeedFoward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [58]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [59]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
#             print(self(idx_cond))
#             print("idx_cond", idx_cond)
            logits, loss = self(idx_cond)
            
            logits = logits[:, -1, :] # becomes (B, C)
            
            probs = F.softmax(logits, dim=-1) # (B, C)
#             print("probs", probs)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [60]:
model = BigramLanguageModel()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

10.80354 M parameters


# Train the Model

In [61]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    xb, yb = split('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.5963, val loss 4.5919
step 500: train loss 1.6895, val loss 1.6713
step 1000: train loss 1.3401, val loss 1.3864
step 1500: train loss 1.2079, val loss 1.3137
step 2000: train loss 1.1157, val loss 1.2830
step 2500: train loss 1.0494, val loss 1.2786
step 2999: train loss 0.9871, val loss 1.2815


# Generate Text

In [62]:
def generate(num_words):
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    print(text_processor.decode(m.generate(context, max_new_tokens = num_words)[0].tolist()))

In [63]:
generate(1000)


(JARVING) Backs.
KILLIAN: David Chine fair!
We can get to out here!
Sir?
Wow.
(DOWLING)
That ain't
(GROANING)
You've been in but he way in two minutes.
I come to base my chick.
Frity, Cap. If he comes to him a monster...
I can take this.
(PEOPLE SCREETING)
(SCREAMING) Is they divenored now?
Come on.
(HYMANATONE GRUNTS)
TONY: No!
Don't hurt National Selvignmerge. Is it a number operation?
I'm not jusing...
You're impressed like you at the monster?
Do you say? It's just a transfer, Rail.
- This is doing. - Wait, a Wait!
Stark, what are we doing?
I'm not surrounding.
I can't affee, you can felt that if we can say
like you you talk get to the Stark Industries
 Vanition.
Comes off!
Wait, you know, you need to gain.
I just saw just sitting facility.
The matter of the costume that is mybed, and it.
Is this now takes?
Well, yet Bender!
Well, not be flaking down,
you steal to through the plant.
I'm gonna her to be out, a long with a place has love.
And been some with one of her that Two?
It's 

In [65]:
model_save_path = 'bigram_language_model.pth'
torch.save(model.state_dict(), model_save_path)