In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
import math
import numpy as np


In [5]:
# Hyperparameters
BATCH_SIZE = 32
BLOCK_SIZE = 64  # Context length
MAX_ITERS = 5000
LEARNING_RATE = 3e-4
EVAL_INTERVAL = 500
EMBED_DIM = 128
NUM_HEADS = 4
NUM_LAYERS = 4
DROPOUT = 0.1
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sample dataset (replace with your text file)
text = open('the-verdict.txt', 'r').read()  # Use any text file
chars = sorted(set(text))
vocab_size = len(chars)

# Tokenizer
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [char_to_idx[c] for c in s]
decode = lambda l: ''.join([idx_to_char[i] for i in l])

# Train/Validation Split
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# DataLoader
class CharDataset(Dataset):
    def _init_(self, data, block_size):
        self.data = data
        self.block_size = block_size
        
    def _len_(self):
        return len(self.data) - self.block_size
        
    def _getitem_(self, idx):
        x = self.data[idx:idx+self.block_size]
        y = self.data[idx+1:idx+self.block_size+1]
        return x, y

train_dataset = CharDataset(train_data, BLOCK_SIZE)
val_dataset = CharDataset(val_data, BLOCK_SIZE)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Positional Encoding
class PositionalEncoding(nn.Module):
    def _init_(self, d_model, max_len=5000):
        super()._init_()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# Multi-Head Self-Attention
class MultiHeadAttention(nn.Module):
    def _init_(self, embed_dim, num_heads):
        super()._init_()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        
        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(DROPOUT)
        
    def forward(self, x, mask=None):
        B, T, C = x.shape
        qkv = self.qkv(x).reshape(B, T, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        
        attn = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        if mask is not None:
            attn = attn.masked_fill(mask == 0, float('-inf'))
        attn = torch.softmax(attn, dim=-1)
        attn = self.dropout(attn)
        
        y = (attn @ v).transpose(1, 2).contiguous().reshape(B, T, C)
        return self.proj(y)

# Transformer Block
class TransformerBlock(nn.Module):
    def _init_(self, embed_dim, num_heads):
        super()._init_()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadAttention(embed_dim, num_heads)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.GELU(),
            nn.Linear(4 * embed_dim, embed_dim),
            nn.Dropout(DROPOUT)
        )
        
    def forward(self, x, mask=None):
        x = x + self.attn(self.ln1(x), mask)
        x = x + self.mlp(self.ln2(x))
        return x

# Language Model
class MiniLLM(nn.Module):
    def _init_(self):
        super()._init_()
        self.token_embed = nn.Embedding(vocab_size, EMBED_DIM)
        self.pos_embed = PositionalEncoding(EMBED_DIM, BLOCK_SIZE)
        self.blocks = nn.Sequential(*[
            TransformerBlock(EMBED_DIM, NUM_HEADS) for _ in range(NUM_LAYERS)
        ])
        self.ln_f = nn.LayerNorm(EMBED_DIM)
        self.head = nn.Linear(EMBED_DIM, vocab_size)
        
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
    
    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embed(idx)  # (B, T, EMBED_DIM)
        x = self.pos_embed(tok_emb)
        
        # Create causal mask
        mask = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(DEVICE)
        for block in self.blocks:
            x = block(x, mask)
        
        x = self.ln_f(x)
        logits = self.head(x)
        
        loss = None
        if targets is not None:
            loss = nn.functional.cross_entropy(
                logits.view(-1, vocab_size), 
                targets.contiguous().view(-1)
            )
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -BLOCK_SIZE:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Initialize model
model = MiniLLM().to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop
for iter in range(MAX_ITERS):
    if iter % EVAL_INTERVAL == 0:
        model.eval()
        with torch.no_grad():
            val_loss = []
            for i in range(10):  # Approximate validation loss
                x, y = val_dataset[np.random.randint(len(val_dataset))]
                x, y = x.unsqueeze(0).to(DEVICE), y.unsqueeze(0).to(DEVICE)
                _, loss = model(x, y)
                val_loss.append(loss.item())
            print(f"Iter {iter}: Val Loss {np.mean(val_loss):.4f}")
    
    model.train()
    xb, yb = next(iter(train_loader))
    xb, yb = xb.to(DEVICE), yb.to(DEVICE)
    _, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Generate text
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))

TypeError: CharDataset() takes no arguments