In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [23]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [24]:
BATCH_SIZE = 16       # Number of sequences processed in parallel
BLOCK_SIZE = 32       # Maximum context length for predictions
MAX_ITERS = 5000      # Training iterations
EVAL_INTERVAL = 100   # Interval to evaluate loss
LEARNING_RATE = 1e-3  # Learning rate for optimizer
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
EVAL_ITERS = 200      # Iterations to estimate loss
EMBED_DIM = 64        # Embedding dimension
NUM_HEADS = 4         # Number of attention heads
NUM_LAYERS = 4        # Number of Transformer blocks
DROPOUT = 0.0         # Dropout probability

torch.manual_seed(1337)


<torch._C.Generator at 0x7c5eeff548d0>

In [26]:
with open('/kaggle/input/input/input.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

In [27]:
# Vocabulary and encoders
vocab = sorted(list(set(raw_text)))
vocab_size = len(vocab)
stoi = {ch: i for i, ch in enumerate(vocab)}
itos = {i: ch for i, ch in enumerate(vocab)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda indices: ''.join([itos[i] for i in indices])


In [28]:
# Encode data and split
data = torch.tensor(encode(raw_text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [29]:
def get_batch(split):
    """Generate a batch of input & target sequences."""
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - BLOCK_SIZE, (BATCH_SIZE,))
    x = torch.stack([data[i:i+BLOCK_SIZE] for i in ix])
    y = torch.stack([data[i+1:i+BLOCK_SIZE+1] for i in ix])
    return x.to(DEVICE), y.to(DEVICE)

In [30]:
@torch.no_grad()
def estimate_loss():
    """Estimate train and validation loss."""
    results = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            xb, yb = get_batch(split)
            _, loss = model(xb, yb)
            losses[k] = loss.item()
        results[split] = losses.mean()
    model.train()
    return results

In [31]:
class SelfAttentionHead(nn.Module):
    """Single masked self-attention head."""

    def __init__(self, head_dim):
        super().__init__()
        self.key = nn.Linear(EMBED_DIM, head_dim, bias=False)
        self.query = nn.Linear(EMBED_DIM, head_dim, bias=False)
        self.value = nn.Linear(EMBED_DIM, head_dim, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE)))
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        attn_weights = q @ k.transpose(-2, -1) * C ** -0.5
        attn_weights = attn_weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = self.dropout(attn_weights)
        v = self.value(x)
        out = attn_weights @ v
        return out

In [32]:
class MultiHeadSelfAttention(nn.Module):
    """Parallel multiple self-attention heads."""

    def __init__(self, num_heads, head_dim):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttentionHead(head_dim) for _ in range(num_heads)])
        self.proj = nn.Linear(EMBED_DIM, EMBED_DIM)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [33]:
class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward network."""

    def __init__(self, embed_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim),
            nn.Dropout(DROPOUT),
        )

    def forward(self, x):
        return self.net(x)

In [34]:
class TransformerBlock(nn.Module):
    """Single Transformer block."""

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        head_dim = embed_dim // num_heads
        self.self_attn = MultiHeadSelfAttention(num_heads, head_dim)
        self.ffn = FeedForwardNetwork(embed_dim)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        x = x + self.self_attn(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x


In [35]:
class TransformerLanguageModel(nn.Module):
    """Character-level Transformer language model."""

    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, EMBED_DIM)
        self.position_embedding = nn.Embedding(BLOCK_SIZE, EMBED_DIM)
        self.transformer_blocks = nn.Sequential(*[TransformerBlock(EMBED_DIM, NUM_HEADS) for _ in range(NUM_LAYERS)])
        self.ln_f = nn.LayerNorm(EMBED_DIM)
        self.output_head = nn.Linear(EMBED_DIM, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(T, device=DEVICE))
        x = tok_emb + pos_emb
        x = self.transformer_blocks(x)
        x = self.ln_f(x)
        logits = self.output_head(x)

        loss = None
        if targets is not None:
            logits = logits.view(B * T, vocab_size)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Generate new tokens autoregressively."""
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -BLOCK_SIZE:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_token], dim=1)
        return idx

In [36]:
model = TransformerLanguageModel().to(DEVICE)
print(f"Model has {sum(p.numel() for p in model.parameters())/1e6:.2f}M parameters")

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

for step in range(MAX_ITERS):

    if step % EVAL_INTERVAL == 0 or step == MAX_ITERS - 1:
        losses = estimate_loss()
        print(f"Step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

Model has 0.21M parameters
Step 0: train loss 4.4116, val loss 4.4022
Step 100: train loss 2.6568, val loss 2.6670
Step 200: train loss 2.5089, val loss 2.5057
Step 300: train loss 2.4194, val loss 2.4336
Step 400: train loss 2.3502, val loss 2.3565
Step 500: train loss 2.2965, val loss 2.3131
Step 600: train loss 2.2406, val loss 2.2497
Step 700: train loss 2.2046, val loss 2.2187
Step 800: train loss 2.1634, val loss 2.1868
Step 900: train loss 2.1237, val loss 2.1503
Step 1000: train loss 2.1028, val loss 2.1298
Step 1100: train loss 2.0689, val loss 2.1171
Step 1200: train loss 2.0388, val loss 2.0797
Step 1300: train loss 2.0248, val loss 2.0635
Step 1400: train loss 1.9918, val loss 2.0363
Step 1500: train loss 1.9696, val loss 2.0305
Step 1600: train loss 1.9642, val loss 2.0493
Step 1700: train loss 1.9411, val loss 2.0145
Step 1800: train loss 1.9079, val loss 1.9959
Step 1900: train loss 1.9079, val loss 1.9883
Step 2000: train loss 1.8835, val loss 1.9953
Step 2100: train lo

In [37]:
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
generated_indices = model.generate(context, max_new_tokens=500)
print(decode(generated_indices[0].tolist()))


Rettomen gives freign:
My swarme Volivius: you have It Dork's by I done have but
his noble all.

NORGEO:
You! but you litt not on, you glet not tile eybell'd supo handst not not
struth my behish dove, for thom tow, go:
Eve mirgues warges shall; there some not.

LUCIO:
his his love.

HENRY VI:
No man that they before.

BRUTUS:
Tows; I, I sidget and alind worge the dreads togenes!

CLARENCE:
All was maither,
And lidst worsay this like to-does
be e'll comfsire with thou strave so grave.

MENENIUS:



In [38]:
import torch


context = "gi"


context_ids = torch.tensor([encode(context)], dtype=torch.long)
print(f"Encoded context: {context_ids}")


output = m.generate(idx=context_ids, max_new_tokens=1)

next_char_id = output[0][-1].item()
next_char = decode([next_char_id])

print(f"Given '{context}' => Next character predicted: '{next_char}'")

Encoded context: tensor([[45, 47]])
Given 'gi' => Next character predicted: 'n'
