In [57]:
# Creating a Character-Level Dataset for Language Modeling
"""
1. Load our preprocessed data and vocabulary
2. Create a custom Dataset class that generates input/target pairs
3. Set up a DataLoader for batch processing
4. Visualize and verify our data

The key concept here is that for language modeling, we want to predict the next character given the previous characters. For example:
- If our text is "Hello"
- And our block size is 3
- One training example would be:
  - Input (x): "Hel"
  - Target (y): "ell"
"""

'\n1. Load our preprocessed data and vocabulary\n2. Create a custom Dataset class that generates input/target pairs\n3. Set up a DataLoader for batch processing\n4. Visualize and verify our data\n\nThe key concept here is that for language modeling, we want to predict the next character given the previous characters. For example:\n- If our text is "Hello"\n- And our block size is 3\n- One training example would be:\n  - Input (x): "Hel"\n  - Target (y): "ell"\n'

In [58]:
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Step 1: Read the encoded corpus from disk.
# The tensor holds the whole text as a sequence of vocabulary indices.
encoded_data = torch.load('encoded_script.pt')
print(f"Encoded data shape: {encoded_data.shape}")
print(f"Data type: {encoded_data.dtype}")

# Step 2: Read the vocabulary lookup tables.
with open('vocab.json', 'r', encoding='utf-8') as vocab_file:
    vocab = json.load(vocab_file)

# JSON object keys are always strings, so the index -> character table needs
# its keys cast back to int; the character -> index table is used as loaded.
itos = {int(index): char for index, char in vocab['itos'].items()}
stoi = vocab['stoi']

print(f"\nVocabulary size: {len(stoi)}")
print("First few characters in itos:")
# Newline and space get a visible stand-in so the listing stays readable.
printable = {'\n': '\\n', ' ': '<space>'}
for index in range(5):
    char = itos[index]
    print(f"{index} -> '{printable.get(char, char)}'")


Encoded data shape: torch.Size([147021])
Data type: torch.int64

Vocabulary size: 77
First few characters in itos:
0 -> '<PAD>'
1 -> '\n'
2 -> '<space>'
3 -> '!'
4 -> '"'


In [59]:
class CharDataset(Dataset):
    """Sliding-window dataset of (input, target) pairs for next-token prediction.

    Sample ``i`` is built from the ``block_size + 1`` consecutive elements of
    ``data`` starting at position ``i``: the input is the first ``block_size``
    elements and the target is the same window shifted one step to the right.

    Args:
        data: Sequence supporting ``len()`` and slicing (e.g. a 1-D tensor).
        block_size: Number of elements in each input/target sequence.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """

    def __init__(self, data, block_size):
        if block_size < 1:
            raise ValueError(f"block_size must be at least 1, got {block_size}")
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # Every sample needs block_size + 1 elements, so there are
        # len(data) - block_size valid start positions. Clamp at zero:
        # a negative result would make len() itself raise.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair starting at ``idx``.

        Negative indices count from the end. An index outside the valid range
        raises ``IndexError`` rather than returning a truncated window, which
        also lets plain ``for`` iteration over the dataset terminate.
        """
        num_samples = len(self)
        if idx < 0:
            # Rebind instead of ``+=`` so a tensor index is never mutated in place.
            idx = idx + num_samples
        if not 0 <= idx < num_samples:
            raise IndexError(
                f"index out of range for CharDataset with {num_samples} samples"
            )

        # Window of block_size + 1 elements starting at idx.
        chunk = self.data[idx:idx + self.block_size + 1]

        # Input drops the last element; target drops the first.
        return chunk[:-1], chunk[1:]

# Batching hyperparameters: each sample is a 128-element window and four
# windows are stacked per batch.
block_size = 128
batch_size = 4

# Wrap the encoded corpus in the sliding-window dataset and a shuffling loader.
dataset = CharDataset(encoded_data, block_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Pull a single batch to sanity-check the tensor shapes.
x_batch, y_batch = next(iter(dataloader))
print(f"Input batch shape: {x_batch.shape}")
print(f"Target batch shape: {y_batch.shape}")

# Decode the first sequence of the batch back into text; the target should
# read as the input shifted by one character.
print("\nSample sequence:")
seq_idx = 0
x_seq = x_batch[seq_idx].tolist()
y_seq = y_batch[seq_idx].tolist()

# Only the first 50 characters are shown.
input_preview = ''.join(itos[i] for i in x_seq[:50])
target_preview = ''.join(itos[i] for i in y_seq[:50])
print(f"Input text: {input_preview} ...")
print(f"Target text: {target_preview} ...")


Input batch shape: torch.Size([4, 128])
Target batch shape: torch.Size([4, 128])

Sample sequence:
Input text: , growing more PALE,
then back to Riddle, who is g ...
Target text:  growing more PALE,
then back to Riddle, who is gr ...


In [60]:
class GPTConfig:
    """Hyperparameter container for the GPT model.

    Args:
        vocab_size: Number of distinct tokens.
        block_size: Maximum sequence length (context window).
        embedding_dim: Width of the token/position embeddings.
        num_layers: Number of decoder blocks.
        num_heads: Number of attention heads per block.
        dropout: Dropout probability passed to the dropout layers.

    Raises:
        ValueError: If ``num_heads`` is not positive, or ``embedding_dim`` is
            not divisible by ``num_heads``.
    """

    def __init__(self, vocab_size, block_size, embedding_dim=384, num_layers=6,
                 num_heads=6, dropout=0.1):
        # Each attention head gets embedding_dim // num_heads features and the
        # concatenated heads are projected by a Linear(embedding_dim, ...);
        # a non-divisible pair would only fail later with a shape mismatch.
        if num_heads <= 0:
            raise ValueError(f"num_heads must be positive, got {num_heads}")
        if embedding_dim % num_heads != 0:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )

        self.vocab_size = vocab_size
        self.block_size = block_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dropout = dropout
        
# Build the configuration for this run: the vocabulary size comes from the
# loaded character table and the context length from the dataset block size.
config = GPTConfig(
    vocab_size=len(stoi),
    block_size=block_size,
    embedding_dim=384,
    num_layers=6,
    num_heads=6,
    dropout=0.1,
)

# Echo the settings, one labelled line per field.
print("Model Configuration:")
for field_label, field_value in (
    ("Vocabulary Size", config.vocab_size),
    ("Block Size", config.block_size),
    ("Embedding Dimension", config.embedding_dim),
    ("Number of Layers", config.num_layers),
    ("Number of Attention Heads", config.num_heads),
    ("Dropout", config.dropout),
):
    print(f"{field_label}: {field_value}")


Model Configuration:
Vocabulary Size: 77
Block Size: 128
Embedding Dimension: 384
Number of Layers: 6
Number of Attention Heads: 6
Dropout: 0.1


In [61]:
class SelfAttentionHead(nn.Module):
    """One head of causal scaled dot-product self-attention.

    Projects the input to keys, queries and values of width
    ``embedding_dim // num_heads``, masks out attention to later positions,
    and returns the attention-weighted values.
    """

    def __init__(self, config):
        super().__init__()
        self.embedding_dim = config.embedding_dim
        self.head_size = config.embedding_dim // config.num_heads
        self.block_size = config.block_size

        # Bias-free projections from the embedding space into this head.
        self.key = nn.Linear(self.embedding_dim, self.head_size, bias=False)
        self.query = nn.Linear(self.embedding_dim, self.head_size, bias=False)
        self.value = nn.Linear(self.embedding_dim, self.head_size, bias=False)

        # Boolean matrix that is True strictly above the diagonal, i.e. at the
        # (row, column) pairs where the column is a later position than the row.
        # Stored as a buffer so it follows the module across devices without
        # being treated as a trainable parameter.
        future_positions = torch.ones(self.block_size, self.block_size).triu(diagonal=1).bool()
        self.register_buffer('mask', future_positions)

        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to attended values of shape (B, T, head_size)."""
        _, seq_len, _ = x.shape

        queries = self.query(x)  # (B, T, head_size)
        keys = self.key(x)       # (B, T, head_size)
        values = self.value(x)   # (B, T, head_size)

        # Pairwise query/key similarity, scaled by 1/sqrt(head_size).
        scale = self.head_size ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale  # (B, T, T)

        # Positions after the current one get -inf so softmax gives them zero weight.
        causal_mask = self.mask[:seq_len, :seq_len]
        scores = scores.masked_fill(causal_mask, float('-inf'))
        weights = self.dropout(scores.softmax(dim=-1))       # (B, T, T)

        return weights @ values  # (B, T, head_size)

class MultiHeadAttention(nn.Module):
    """Runs ``num_heads`` attention heads side by side and merges their outputs."""

    def __init__(self, config):
        super().__init__()
        head_modules = [SelfAttentionHead(config) for _ in range(config.num_heads)]
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(config.embedding_dim, config.embedding_dim)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply every head to ``x``, join the results, then project and drop out."""
        # Each head contributes its own slice of the last dimension.
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        # Linear projection of the merged features, followed by dropout.
        return self.dropout(self.proj(merged))


In [62]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, ReLU, project back, dropout."""

    def __init__(self, config):
        super().__init__()
        width = config.embedding_dim
        hidden_width = 4 * width
        layers = [
            nn.Linear(width, hidden_width),   # expand
            nn.ReLU(),
            nn.Linear(hidden_width, width),   # project back
            nn.Dropout(config.dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension keeps its size."""
        return self.net(x)

class DecoderBlock(nn.Module):
    """Transformer decoder block with layer norm applied before each sub-layer.

    Both the attention and the feed-forward sub-layer read a layer-normalised
    copy of the stream and add their result back onto it (residual connection).
    """

    def __init__(self, config):
        super().__init__()
        self.attn = MultiHeadAttention(config)
        self.ff = FeedForward(config)
        self.ln1 = nn.LayerNorm(config.embedding_dim)
        self.ln2 = nn.LayerNorm(config.embedding_dim)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers in sequence."""
        # Residual around self-attention.
        after_attention = x + self.attn(self.ln1(x))
        # Residual around the feed-forward network.
        return after_attention + self.ff(self.ln2(after_attention))


In [63]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    ``config.num_layers`` decoder blocks and a final layer norm, then
    projected to per-token logits over the vocabulary.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # Token and position embeddings
        self.token_embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
        self.position_embedding = nn.Embedding(config.block_size, config.embedding_dim)

        # Transformer blocks
        self.blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config.num_layers)])

        # Final layer norm and projection to vocabulary
        self.ln_f = nn.LayerNorm(config.embedding_dim)
        self.lm_head = nn.Linear(config.embedding_dim, config.vocab_size)

        # Initialize weights of every submodule
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and reset LayerNorm to identity."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: Integer tensor of token indices with shape (B, T), where
                T must not exceed ``config.block_size``.
            targets: Optional integer tensor with B*T elements holding the
                expected next token for every position.

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            (B, T, vocab_size) and loss is ``None``. With targets, logits are
            flattened to (B*T, vocab_size) and loss is a scalar tensor.

        Raises:
            ValueError: If the sequence length exceeds ``config.block_size``.
        """
        B, T = idx.shape
        # The position table only has block_size rows; fail with a clear
        # message instead of an out-of-range embedding lookup.
        if T > self.config.block_size:
            raise ValueError(
                f"Cannot forward a sequence of length {T}: "
                f"block_size is {self.config.block_size}"
            )

        # Get token and position embeddings
        tok_emb = self.token_embedding(idx)  # (B, T, C)
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C), pos_emb broadcasts over the batch

        # Apply transformer blocks
        for block in self.blocks:
            x = block(x)

        # Apply final layer norm
        x = self.ln_f(x)

        # Project to vocabulary
        logits = self.lm_head(x)  # (B, T, vocab_size)

        # If we have targets, compute loss
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            # reshape (not view) so non-contiguous targets are accepted too
            targets = targets.reshape(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Sample ``max_new_tokens`` tokens autoregressively after the context ``idx``.

        Runs with gradient tracking disabled. The module's train/eval mode is
        not changed here, so call ``model.eval()`` first if dropout should be off.

        Args:
            idx: Integer tensor of shape (B, T) holding the starting context.
            max_new_tokens: Number of tokens to append.
            temperature: Positive divisor applied to the logits before softmax.
            top_k: If given, sample only from the ``top_k`` highest logits.

        Returns:
            Tensor of shape (B, T + max_new_tokens): the context followed by
            the sampled tokens.

        Raises:
            ValueError: If ``temperature`` is not positive or ``top_k`` is
                smaller than 1.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        if top_k is not None and top_k < 1:
            raise ValueError(f"top_k must be at least 1, got {top_k}")

        for _ in range(max_new_tokens):
            # Keep at most the last block_size tokens as context
            idx_cond = idx[:, -self.config.block_size:]

            # Get predictions
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature  # Focus on last time step

            # Optional: top-k sampling
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                # Anything below the k-th largest logit can no longer be sampled
                logits[logits < v[:, [-1]]] = float('-inf')

            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)

            # Append to the sequence
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


In [64]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU,
# and place a freshly initialised model on that device.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = GPT(config).to(device)

# Report the total parameter count in millions.
num_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {num_params/1e6:.2f}M")

# AdamW over all model parameters with a fixed learning rate.
learning_rate = 3e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
def train_epoch(model, dataloader, optimizer, device):
    """Run one pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module whose call ``model(x, y)`` returns ``(logits, loss)``.
        dataloader: Iterable of ``(x, y)`` batches. It does not need to
            support ``len()``; batches are counted as they are consumed.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        The average of the per-batch loss values as a float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch_idx, (x, y) in enumerate(dataloader):
        # Move batch to device
        x, y = x.to(device), y.to(device)

        # Forward pass
        _, loss = model(x, y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters
        optimizer.step()

        # Read the scalar once and reuse it for both the sum and the log line
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        # Print progress every 100 batches
        if batch_idx % 100 == 0:
            print(f"Batch {batch_idx}: Loss {batch_loss:.4f}")

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches; cannot compute an average loss")

    return total_loss / num_batches

# Settings for the epoch loop.
# File that receives the weights of the best model seen so far.
save_path = 'gpt_model.pt'
# Number of full passes over the dataloader.
num_epochs = 10
# Starts at infinity so the first epoch always counts as an improvement.
best_loss = float('inf')

print("Starting training...")


Number of parameters: 10.75M
Starting training...


In [65]:
# Epoch loop: train, checkpoint whenever the average loss improves, then
# print a text sample so progress can be judged by eye.
for epoch in range(num_epochs):
    epoch_number = epoch + 1
    print(f"\nEpoch {epoch_number}/{num_epochs}")

    # One full pass over the training data.
    avg_loss = train_epoch(model, dataloader, optimizer, device)
    print(f"Average loss: {avg_loss:.4f}")

    # Keep only the weights with the lowest average training loss so far.
    if avg_loss < best_loss:
        best_loss = avg_loss
        torch.save(model.state_dict(), save_path)
        print(f"Saved new best model with loss: {best_loss:.4f}")

    # Sampling interval in epochs (1 = sample after every epoch).
    if epoch_number % 1 == 0:
        model.eval()
        with torch.no_grad():
            # Seed generation with the first sequence of a freshly drawn batch.
            batch_inputs, _ = next(iter(dataloader))
            context = batch_inputs[:1].to(device)

            # Extend the context by 100 sampled tokens.
            generated = model.generate(context, max_new_tokens=100, temperature=0.8, top_k=40)

            # Map token indices back to characters.
            generated_text = ''.join(itos[int(token)] for token in generated[0].tolist())
            print("\nGenerated sample:")
            print(generated_text)

print("\nTraining completed!")



Epoch 1/10
Batch 0: Loss 4.4518
Batch 100: Loss 2.7120
Batch 200: Loss 2.6259
Batch 300: Loss 2.6594
Batch 400: Loss 2.6229
Batch 500: Loss 2.4034
Batch 600: Loss 2.3745
Batch 700: Loss 2.2929
Batch 800: Loss 2.1506
Batch 900: Loss 2.2122
Batch 1000: Loss 2.1592
Batch 1100: Loss 1.9471
Batch 1200: Loss 2.1546
Batch 1300: Loss 2.0024
Batch 1400: Loss 1.9537
Batch 1500: Loss 1.8281
Batch 1600: Loss 1.9516
Batch 1700: Loss 1.8516
Batch 1800: Loss 1.9629
Batch 1900: Loss 1.6905
Batch 2000: Loss 1.8779
Batch 2100: Loss 1.8126
Batch 2200: Loss 1.8423
Batch 2300: Loss 1.6797
Batch 2400: Loss 1.7681
Batch 2500: Loss 1.6280
Batch 2600: Loss 1.8558
Batch 2700: Loss 1.4849
Batch 2800: Loss 1.5092
Batch 2900: Loss 1.6480
Batch 3000: Loss 1.5808
Batch 3100: Loss 1.8695
Batch 3200: Loss 1.6896
Batch 3300: Loss 1.5455
Batch 3400: Loss 1.6133
Batch 3500: Loss 1.7308
Batch 3600: Loss 1.4655
Batch 3700: Loss 1.5706
Batch 3800: Loss 1.4646
Batch 3900: Loss 1.7149
Batch 4000: Loss 1.5083
Batch 4100: Loss

KeyboardInterrupt: 