In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/input-1-1/input (1).txt


In [6]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [7]:
BATCH_SIZE = 16       # Number of sequences processed in parallel
BLOCK_SIZE = 32       # Maximum context length for predictions
MAX_ITERS = 5000      # Training iterations
EVAL_INTERVAL = 100   # Interval to evaluate loss
LEARNING_RATE = 1e-3  # Learning rate for optimizer
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
EVAL_ITERS = 200      # Iterations to estimate loss
EMBED_DIM = 64        # Embedding dimension
NUM_HEADS = 4         # Number of attention heads
NUM_LAYERS = 4        # Number of Transformer blocks
DROPOUT = 0.0         # Dropout probability

torch.manual_seed(1337)

<torch._C.Generator at 0x7b4336777090>

In [8]:
with open('/kaggle/input/input-1-1/input (1).txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

In [9]:
# Vocabulary and encoders
vocab = sorted(list(set(raw_text)))
vocab_size = len(vocab)
stoi = {ch: i for i, ch in enumerate(vocab)}
itos = {i: ch for i, ch in enumerate(vocab)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda indices: ''.join([itos[i] for i in indices])

In [20]:
len(stoi)

30

In [10]:
# Encode data and split
data = torch.tensor(encode(raw_text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [11]:
def get_batch(split):
    """Generate a batch of input & target sequences."""
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - BLOCK_SIZE, (BATCH_SIZE,))
    x = torch.stack([data[i:i+BLOCK_SIZE] for i in ix])
    y = torch.stack([data[i+1:i+BLOCK_SIZE+1] for i in ix])
    return x.to(DEVICE), y.to(DEVICE)

In [12]:
@torch.no_grad()
def estimate_loss():
    """Estimate train and validation loss."""
    results = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            xb, yb = get_batch(split)
            _, loss = model(xb, yb)
            losses[k] = loss.item()
        results[split] = losses.mean()
    model.train()
    return results

In [13]:
class SelfAttentionHead(nn.Module):
    """Single masked self-attention head."""

    def __init__(self, head_dim):
        super().__init__()
        self.key = nn.Linear(EMBED_DIM, head_dim, bias=False)
        self.query = nn.Linear(EMBED_DIM, head_dim, bias=False)
        self.value = nn.Linear(EMBED_DIM, head_dim, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE)))
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        attn_weights = q @ k.transpose(-2, -1) * C ** -0.5
        attn_weights = attn_weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = self.dropout(attn_weights)
        v = self.value(x)
        out = attn_weights @ v
        return out

In [14]:
class MultiHeadSelfAttention(nn.Module):
    """Parallel multiple self-attention heads."""

    def __init__(self, num_heads, head_dim):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttentionHead(head_dim) for _ in range(num_heads)])
        self.proj = nn.Linear(EMBED_DIM, EMBED_DIM)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [15]:
class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward network."""

    def __init__(self, embed_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim),
            nn.Dropout(DROPOUT),
        )

    def forward(self, x):
        return self.net(x)

In [16]:
class TransformerBlock(nn.Module):
    """Single Transformer block."""

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        head_dim = embed_dim // num_heads
        self.self_attn = MultiHeadSelfAttention(num_heads, head_dim)
        self.ffn = FeedForwardNetwork(embed_dim)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        x = x + self.self_attn(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x

In [17]:
class TransformerLanguageModel(nn.Module):
    """Character-level Transformer language model."""

    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, EMBED_DIM)
        self.position_embedding = nn.Embedding(BLOCK_SIZE, EMBED_DIM)
        self.transformer_blocks = nn.Sequential(*[TransformerBlock(EMBED_DIM, NUM_HEADS) for _ in range(NUM_LAYERS)])
        self.ln_f = nn.LayerNorm(EMBED_DIM)
        self.output_head = nn.Linear(EMBED_DIM, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(T, device=DEVICE))
        x = tok_emb + pos_emb
        x = self.transformer_blocks(x)
        x = self.ln_f(x)
        logits = self.output_head(x)

        loss = None
        if targets is not None:
            logits = logits.view(B * T, vocab_size)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Generate new tokens autoregressively."""
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -BLOCK_SIZE:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_token], dim=1)
        return idx

In [18]:
model = TransformerLanguageModel().to(DEVICE)
print(f"Model has {sum(p.numel() for p in model.parameters())/1e6:.2f}M parameters")

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

for step in range(MAX_ITERS):

    if step % EVAL_INTERVAL == 0 or step == MAX_ITERS - 1:
        losses = estimate_loss()
        print(f"Step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


Model has 0.21M parameters
Step 0: train loss 3.6067, val loss 3.6052
Step 100: train loss 1.6236, val loss 1.6175
Step 200: train loss 0.6831, val loss 0.6801
Step 300: train loss 0.2766, val loss 0.2769
Step 400: train loss 0.2184, val loss 0.2156
Step 500: train loss 0.1924, val loss 0.1942
Step 600: train loss 0.1928, val loss 0.1924
Step 700: train loss 0.1890, val loss 0.1874
Step 800: train loss 0.1830, val loss 0.1799
Step 900: train loss 0.1835, val loss 0.1807
Step 1000: train loss 0.1785, val loss 0.1774
Step 1100: train loss 0.1825, val loss 0.1863
Step 1200: train loss 0.1737, val loss 0.1717
Step 1300: train loss 0.1792, val loss 0.1791
Step 1400: train loss 0.1749, val loss 0.1733
Step 1500: train loss 0.1732, val loss 0.1727
Step 1600: train loss 0.1712, val loss 0.1734
Step 1700: train loss 0.1752, val loss 0.1729
Step 1800: train loss 0.1803, val loss 0.1811
Step 1900: train loss 0.1712, val loss 0.1682
Step 2000: train loss 0.1708, val loss 0.1719
Step 2100: train lo

In [21]:
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
generated_indices = model.generate(context, max_new_tokens=500)
print(decode(generated_indices[0].tolist()))




The rain falls softly on the ground. A gentle breeze moves the leaves. The stars twinkle in the clear night sky. The sun shines bright in the sky. The cat sat on the mat. The tree stands tall and strong. The cat sat on the mat. The sun shines bright in the sky. The dog barked loudly at night. The dog barked loudly at night. The sun shines bright in the sky. The tree stands tall and strong. The dog barked loudly at night. Children play happily in the park. The cat sat on the mat. The cat sat on


In [28]:
import torch


context = "i am going to marke"


context_ids = torch.tensor([encode(context)], dtype=torch.long)
print(f"Encoded context: {context_ids}")


output = model.generate(idx=context_ids, max_new_tokens=1)

next_char_id = output[0][-1].item()
next_char = decode([next_char_id])

print(f"Given '{context}' => Next character predicted: '{next_char}'")

Encoded context: tensor([[15,  1,  7, 18,  1, 13, 20, 15, 19, 13,  1, 24, 20,  1, 18,  7, 22, 16,
         11]])
Given 'i am going to marke' => Next character predicted: 'd'


In [29]:
@torch.no_grad()
def test_accuracy(split='val', num_batches=100):
    model.eval()
    total = 0
    correct = 0

    for _ in range(num_batches):
        xb, yb = get_batch(split)  # [B, T]
        logits, _ = model(xb, yb)  # logits: [B*T, vocab_size]

        # logits shape: [B*T, vocab_size]
        preds = torch.argmax(logits, dim=-1)  # [B*T]

        B, T = xb.shape
        preds = preds.view(B, T)  # âœ… Unflatten to [B, T]

        assert preds.shape == yb.shape, f"Shape mismatch: {preds.shape} vs {yb.shape}"

        matches = preds == yb  # [B, T]
        correct += matches.sum().item()
        total += matches.numel()

    model.train()
    accuracy = correct / total * 100
    print(f"{split.capitalize()} accuracy: {accuracy:.2f}%")
    return accuracy

In [30]:
test_accuracy(split='val', num_batches=100)

Val accuracy: 93.59%


93.587890625