# 🤖 Transformer for Language Modeling

Welcome to the **Transformer revolution**! In this notebook, we'll build a Transformer from scratch — multi-head self-attention, positional encoding, and stacked Transformer blocks — and train it as a character-level language model that generates text autoregressively.

## What you'll learn:
- Multi-head self-attention mechanism
- Positional encoding and layer normalization
- Complete Transformer architecture
- Language modeling and text generation

Let's revolutionize NLP! 🚀

In [None]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import requests

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Apply the 'seaborn-v0_8' Matplotlib style sheet to every figure drawn below.
plt.style.use('seaborn-v0_8')
# Seed both NumPy and TensorFlow so repeated runs start from the same random state.
np.random.seed(42)
tf.random.set_seed(42)

# Report the TensorFlow version and whether at least one GPU device is visible.
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {len(tf.config.list_physical_devices('GPU')) > 0}")

In [None]:
# Create sample text dataset
def create_sample_text():
    """Build the toy corpus used for language modeling.

    Ten well-known sentences are repeated 100 times each (in order) and the
    copies are joined with single spaces.

    Returns:
        str: The full corpus as one string.
    """
    opening_lines = (
        "The quick brown fox jumps over the lazy dog.",
        "To be or not to be, that is the question.",
        "In the beginning was the Word, and the Word was with God.",
        "It was the best of times, it was the worst of times.",
        "All happy families are alike; each unhappy family is unhappy in its own way.",
        "Call me Ishmael. Some years ago—never mind how long precisely.",
        "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",
        "In a hole in the ground there lived a hobbit.",
        "It was a bright cold day in April, and the clocks were striking thirteen.",
        "Space: the final frontier. These are the voyages of the starship Enterprise.",
    )
    repetitions = 100  # Repeat for more training data
    corpus = list(opening_lines) * repetitions

    return " ".join(corpus)

# Build the corpus and show its size plus a short preview
text = create_sample_text()
print(f"Text length: {len(text)} characters")
print(f"Sample: {text[:200]}...")

# Character-level vocabulary: every distinct character, in sorted order,
# gets the integer id equal to its position in `chars`
chars = sorted(set(text))
vocab_size = len(chars)
char_to_idx = dict(zip(chars, range(vocab_size)))
idx_to_char = dict(enumerate(chars))

print(f"\nVocabulary size: {vocab_size}")
print(f"Characters: {''.join(chars)}")

# Convert text to sequences
def text_to_sequences(text, seq_length=64, vocab=None):
    """Convert text to overlapping input/target index sequences.

    Windows of ``seq_length`` characters are taken every ``seq_length // 2``
    characters (at least 1). Each target is its input window shifted one
    character to the right, so ``targets[n][t]`` is the character that
    follows ``sequences[n][t]`` in the text.

    Args:
        text: Source string.
        seq_length: Number of characters per window; must be >= 1.
        vocab: Mapping from character to integer id. Defaults to the
            module-level ``char_to_idx``.

    Returns:
        Tuple ``(sequences, targets)`` of integer arrays, each of shape
        ``(num_windows, seq_length)``; ``num_windows`` is 0 when the text is
        too short to hold one window plus its shifted target.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
        KeyError: If the text contains a character missing from ``vocab``.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")
    if vocab is None:
        vocab = char_to_idx

    # seq_length // 2 is 0 for seq_length == 1, which range() rejects.
    stride = max(1, seq_length // 2)

    sequences = []
    targets = []
    # The stop bound guarantees start + seq_length + 1 <= len(text), so both
    # slices below are always full length.
    for start in range(0, len(text) - seq_length, stride):
        window = text[start:start + seq_length + 1]
        sequences.append([vocab[ch] for ch in window[:-1]])
        targets.append([vocab[ch] for ch in window[1:]])

    # reshape keeps a (0, seq_length) shape when no window fits.
    return (np.array(sequences, dtype=int).reshape(-1, seq_length),
            np.array(targets, dtype=int).reshape(-1, seq_length))

# Window length, in characters, of every training example.
SEQ_LENGTH = 64
# X holds the input windows; y holds the same windows shifted by one character.
X, y = text_to_sequences(text, SEQ_LENGTH)

print(f"\nSequences shape: {X.shape}")
print(f"Targets shape: {y.shape}")
print(f"Number of training sequences: {len(X)}")

In [None]:
# Positional Encoding
def get_positional_encoding(seq_len, d_model):
    """Generate the sinusoidal positional encoding matrix.

    For every position ``pos`` and every even dimension ``i``::

        PE[pos, i]     = sin(pos / 10000 ** (i / d_model))
        PE[pos, i + 1] = cos(pos / 10000 ** (i / d_model))

    When ``d_model`` is odd the last column has no cosine partner.

    Args:
        seq_len: Number of positions (rows).
        d_model: Encoding width (columns).

    Returns:
        np.ndarray of shape ``(seq_len, d_model)`` and dtype float64.
    """
    positions = np.arange(seq_len, dtype=np.float64)[:, np.newaxis]
    even_dims = np.arange(0, d_model, 2, dtype=np.float64)

    # One angle per (position, even dimension) pair, computed by broadcasting
    # instead of a Python loop over every cell.
    angles = positions / np.power(10000.0, even_dims / d_model)

    pos_enc = np.zeros((seq_len, d_model))
    pos_enc[:, 0::2] = np.sin(angles)
    # There are only d_model // 2 odd columns; drop the unpaired angle column
    # when d_model is odd.
    pos_enc[:, 1::2] = np.cos(angles[:, :d_model // 2])

    return pos_enc

# Visualize positional encoding
# Encoding width used for this plot only (the model cell defines its own D_MODEL).
d_model = 128
pos_enc = get_positional_encoding(SEQ_LENGTH, d_model)

plt.figure(figsize=(12, 8))
# Transpose so positions run along the x-axis and encoding dimensions along the y-axis.
plt.imshow(pos_enc.T, cmap='RdYlBu', aspect='auto')
plt.colorbar()
plt.title('🌊 Positional Encoding Pattern')
plt.xlabel('Position')
plt.ylabel('Encoding Dimension')
plt.show()

print(f"Positional encoding shape: {pos_enc.shape}")

In [None]:
# Multi-Head Attention Layer
class MultiHeadAttention(layers.Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected to ``d_model`` features,
    split into ``num_heads`` heads of ``d_model // num_heads`` features,
    attended independently per head, then concatenated and projected back
    to ``d_model`` features.

    Args:
        d_model: Width of the projections and of the layer output.
        num_heads: Number of attention heads; must divide ``d_model``.
        **kwargs: Forwarded to ``layers.Layer``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads
        self.d_model = d_model

        # Raise explicitly instead of asserting: asserts are stripped under
        # `python -O`, after which the head reshape would fail obscurely.
        if d_model % self.num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by "
                f"num_heads ({num_heads})")

        self.depth = d_model // self.num_heads

        self.wq = layers.Dense(d_model)
        self.wk = layers.Dense(d_model)
        self.wv = layers.Dense(d_model)

        self.dense = layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).

        Returns a tensor laid out as (batch_size, num_heads, seq_len, depth).
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Note the argument order: values first, then keys, then queries.

        Args:
            v: Value tensor.
            k: Key tensor.
            q: Query tensor.
            mask: Optional tensor added to the attention logits after being
                multiplied by -1e9; entries equal to 1 are suppressed.

        Returns:
            Tuple ``(output, attention_weights)`` where ``output`` has
            ``d_model`` features per query position and
            ``attention_weights`` holds the per-head softmax weights.
        """
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # Scaled dot-product attention
        scaled_attention, attention_weights = self.scaled_dot_product_attention(
            q, k, v, mask)

        # Move the head axis back next to depth so the heads can be merged.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)

        return output, attention_weights

    def scaled_dot_product_attention(self, q, k, v, mask):
        """Compute softmax(q @ k^T / sqrt(depth) + mask * -1e9) @ v.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        matmul_qk = tf.matmul(q, k, transpose_b=True)

        # Scale by sqrt of the key depth
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # Masked positions get a large negative logit, so their softmax
        # weight is effectively zero.
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        # Softmax is normalized on the last axis (seq_len_k)
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

        output = tf.matmul(attention_weights, v)

        return output, attention_weights

print("✅ Multi-Head Attention layer defined!")

In [None]:
# Transformer Block
class TransformerBlock(layers.Layer):
    """Self-attention sub-layer followed by a two-layer feed-forward sub-layer.

    Each sub-layer's output goes through dropout, is added back to the
    sub-layer's input, and the sum is layer-normalized.

    Args:
        d_model: Feature width of the block input and output.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        rate: Dropout rate applied to both sub-layer outputs.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1, **kwargs):
        super().__init__(**kwargs)

        self.mha = MultiHeadAttention(d_model, num_heads)
        feed_forward_layers = [
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model),
        ]
        self.ffn = keras.Sequential(feed_forward_layers)

        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, x, training, mask=None):
        """Run the block on ``x``; returns ``(output, attention_weights)``."""
        # Self-attention: the same tensor serves as values, keys and queries.
        attended, attention_weights = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        attention_out = self.layernorm1(x + attended)

        # Position-wise feed-forward network with its own residual connection.
        transformed = self.dropout2(self.ffn(attention_out), training=training)
        block_out = self.layernorm2(attention_out + transformed)

        return block_out, attention_weights

print("✅ Transformer Block defined!")

In [None]:
# Complete Transformer Model
class TransformerLanguageModel(keras.Model):
    """Stack of Transformer blocks that maps token ids to next-token logits.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding and hidden width.
        num_heads: Attention heads per block.
        dff: Hidden width of each block's feed-forward network.
        num_layers: Number of Transformer blocks.
        maximum_position_encoding: Longest sequence length the positional
            table covers; longer inputs cannot be encoded.
        rate: Dropout rate.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, vocab_size, d_model, num_heads, dff, num_layers,
                 maximum_position_encoding, rate=0.1, **kwargs):
        super(TransformerLanguageModel, self).__init__(**kwargs)

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = layers.Embedding(vocab_size, d_model)
        # Keep the table as a float32 tensor of shape
        # (maximum_position_encoding, d_model). The NumPy table is float64,
        # while the embeddings are float32, and a tensor can be sliced with
        # the dynamic `seq_len` from tf.shape inside a tf.function.
        self.pos_encoding = tf.cast(
            get_positional_encoding(maximum_position_encoding, d_model),
            dtype=tf.float32)

        self.transformer_blocks = [TransformerBlock(d_model, num_heads, dff, rate)
                                   for _ in range(num_layers)]

        self.dropout = layers.Dropout(rate)
        self.final_layer = layers.Dense(vocab_size)

    def call(self, x, training=None, mask=None):
        """Compute logits for every position of ``x``.

        Args:
            x: Integer token ids, indexed as (batch, position).
            training: Forwarded to the dropout layers and the blocks.
            mask: Optional attention mask forwarded to every block.

        Returns:
            Tuple ``(logits, attention_weights)`` where ``attention_weights``
            maps ``'transformer_block_<n>'`` (1-based) to that block's
            attention weights.
        """
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        # Embedding and positional encoding
        x = self.embedding(x)
        # Scale the embeddings by sqrt(d_model) before adding positions.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        # (seq_len, d_model) broadcasts over the batch axis.
        x += self.pos_encoding[:seq_len, :]

        x = self.dropout(x, training=training)

        # Pass through transformer blocks
        for i, transformer_block in enumerate(self.transformer_blocks):
            x, attn_weights = transformer_block(x, training, mask)
            attention_weights[f'transformer_block_{i+1}'] = attn_weights

        # Final linear layer
        output = self.final_layer(x)

        return output, attention_weights

# Create model
# Hyperparameters of the language model built below.
D_MODEL = 128        # embedding / hidden width
NUM_HEADS = 8        # attention heads per block (D_MODEL must be divisible by this)
DFF = 512            # hidden width of each feed-forward network
NUM_LAYERS = 4       # number of stacked Transformer blocks
DROPOUT_RATE = 0.1

# The positional table is sized to SEQ_LENGTH, so the model cannot be fed
# sequences longer than the training window.
model = TransformerLanguageModel(
    vocab_size=vocab_size,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dff=DFF,
    num_layers=NUM_LAYERS,
    maximum_position_encoding=SEQ_LENGTH,
    rate=DROPOUT_RATE
)

print(f"✅ Transformer Language Model created!")
print(f"Model parameters: D_MODEL={D_MODEL}, NUM_HEADS={NUM_HEADS}, NUM_LAYERS={NUM_LAYERS}")

In [None]:
# Create causal mask for autoregressive training
def create_look_ahead_mask(size):
    """Build the causal mask that hides future tokens.

    Entry ``[i, j]`` is 1 when ``j > i`` (position ``j`` lies in the future
    of position ``i`` and must not be attended to) and 0 otherwise.
    """
    all_ones = tf.ones((size, size))
    # Keep the diagonal and everything below it: the positions that stay visible.
    visible = tf.linalg.band_part(all_ones, -1, 0)
    hidden = 1 - visible
    return hidden  # (seq_len, seq_len)

# Loss and metrics
# reduction='none' keeps one loss value per token so loss_function can do
# its own (optionally masked) averaging.
loss_object = keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred, pad_id=None):
    """Average per-token cross-entropy between ``real`` ids and ``pred`` logits.

    The character vocabulary in this notebook has no padding id: index 0 is
    a real character (the space, the lowest code point in the corpus), so by
    default every token contributes to the loss. Masking out id 0 would stop
    the model from ever being trained to predict spaces.

    Args:
        real: Integer target ids.
        pred: Logits with a trailing vocabulary axis.
        pad_id: Optional id to exclude from the average, for datasets that
            do reserve a padding id. ``None`` disables masking.

    Returns:
        Scalar tensor: mean loss over the tokens that are counted. When
        ``pad_id`` masks out every token the result is 0 rather than NaN.
    """
    loss_ = loss_object(real, pred)

    if pad_id is None:
        return tf.reduce_mean(loss_)

    mask = tf.cast(tf.math.not_equal(real, pad_id), dtype=loss_.dtype)
    # divide_no_nan returns 0 when the denominator is 0 (fully padded batch).
    return tf.math.divide_no_nan(tf.reduce_sum(loss_ * mask),
                                 tf.reduce_sum(mask))

# Optimizer with learning rate scheduling
class CustomSchedule(keras.optimizers.schedules.LearningRateSchedule):
    """Linear warm-up followed by inverse-square-root decay.

    ``lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``

    The two terms are equal at ``step == warmup_steps``: before that the
    rate grows linearly with the step, afterwards it decays as
    ``step ** -0.5``.

    Args:
        d_model: Model width; larger models get a smaller learning rate.
        warmup_steps: Number of steps over which the rate increases.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # The optimizer may hand over its integer iteration counter;
        # rsqrt is only defined for floating-point inputs.
        step = tf.cast(step, tf.float32)

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

# Adam driven by the warm-up schedule above (default warmup_steps=4000).
learning_rate = CustomSchedule(D_MODEL)
optimizer = keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

print("✅ Loss function and optimizer defined!")

In [None]:
# Training step
@tf.function
def train_step(inp, tar):
    """Run one optimization step and return the batch loss.

    ``tar`` is already ``inp`` shifted one character to the right (see
    ``text_to_sequences``), so the model reads ``inp`` and is scored against
    ``tar`` directly. Shifting ``tar`` again would ignore ``inp`` and drop
    one position from every window.

    Args:
        inp: Batch of input token ids.
        tar: Batch of next-token ids aligned position-by-position with ``inp``.

    Returns:
        Scalar loss tensor for this batch.
    """
    # Causal mask so position t cannot attend to positions after t.
    look_ahead_mask = create_look_ahead_mask(tf.shape(inp)[1])

    with tf.GradientTape() as tape:
        predictions, _ = model(inp, training=True, mask=look_ahead_mask)
        loss = loss_function(tar, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss

# Training loop
EPOCHS = 20
BATCH_SIZE = 32

# Create dataset
# Batches are served in the same fixed order every epoch (no shuffling).
dataset = tf.data.Dataset.from_tensor_slices((X, y))
dataset = dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Track losses
# One entry per epoch: the mean of that epoch's batch losses.
train_losses = []

print(f"🚀 Starting Transformer training...")
print(f"Epochs: {EPOCHS}, Batch Size: {BATCH_SIZE}")

for epoch in range(EPOCHS):
    # Batch losses of the current epoch only.
    epoch_loss = []
    
    for batch, (inp, tar) in enumerate(dataset):
        loss = train_step(inp, tar)
        epoch_loss.append(loss)
        
        # Log every 10th batch to keep the output readable.
        if batch % 10 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {loss:.4f}')
    
    avg_loss = tf.reduce_mean(epoch_loss)
    train_losses.append(avg_loss)
    
    print(f'Epoch {epoch + 1}: Average Loss = {avg_loss:.4f}')

print("\n🎉 Training completed!")

In [None]:
# Text generation function
def generate_text(model, start_string, num_generate=100, temperature=1.0,
                  max_context=None):
    """Generate text autoregressively with the trained model.

    The model keeps no state between calls, so each step feeds it the whole
    running context (prompt plus everything generated so far, truncated to
    the most recent ``max_context`` characters) under the same causal mask
    used in training, and samples the next character from the logits of the
    last position.

    Args:
        model: Callable as ``model(ids, training=False, mask=...)`` returning
            ``(logits, attention_weights)``.
        start_string: Non-empty prompt; every character must be in the
            vocabulary.
        num_generate: Number of characters to sample.
        temperature: Positive divisor applied to the logits; values below 1
            sharpen the distribution, values above 1 flatten it.
        max_context: Maximum number of most recent characters fed to the
            model. Defaults to the number of rows in ``model.pos_encoding``
            when the model has that attribute, otherwise no truncation.

    Returns:
        str: ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty, ``temperature`` is not
            positive, or ``max_context`` is smaller than 1.
        KeyError: If ``start_string`` contains an out-of-vocabulary character.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    if max_context is None:
        pos_encoding = getattr(model, 'pos_encoding', None)
        if pos_encoding is not None:
            # The model cannot encode positions beyond its positional table.
            max_context = int(pos_encoding.shape[0])
    if max_context is not None and max_context < 1:
        raise ValueError(f"max_context must be >= 1, got {max_context}")

    token_ids = [char_to_idx[s] for s in start_string]
    text_generated = []

    for _ in range(num_generate):
        context = token_ids if max_context is None else token_ids[-max_context:]
        input_eval = tf.expand_dims(context, 0)
        look_ahead_mask = create_look_ahead_mask(len(context))

        predictions, _ = model(input_eval, training=False, mask=look_ahead_mask)

        # Only the last position predicts the next character.
        # Use temperature to control randomness
        last_logits = predictions[:, -1, :] / temperature
        predicted_id = int(
            tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy())

        # Extend the context so the next step sees the full history
        token_ids.append(predicted_id)
        text_generated.append(idx_to_char[predicted_id])

    return start_string + ''.join(text_generated)

# Generate sample text
print("🎭 Generated Text Samples:")
print("=" * 50)

# Sample the same prompt at three temperatures to compare how sharp or flat
# the sampling distribution is.
for temp in [0.5, 1.0, 1.5]:
    generated = generate_text(model, "The ", num_generate=200, temperature=temp)
    print(f"\nTemperature {temp}:")
    print(generated)
    print("-" * 30)

# Visualize training progress
# train_losses holds one average loss per epoch, so the x-axis is the epoch index.
plt.figure(figsize=(10, 6))
plt.plot(train_losses)
plt.title('📉 Transformer Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
plt.show()

print(f"\n📊 Training Summary:")
print(f"Final Loss: {train_losses[-1]:.4f}")
# Total number of scalar entries across all trainable weight tensors.
print(f"Model Parameters: {sum([tf.size(w).numpy() for w in model.trainable_weights]):,}")
print(f"Vocabulary Size: {vocab_size}")
print(f"Sequence Length: {SEQ_LENGTH}")

## 🎉 Congratulations!

You've successfully built a complete Transformer from scratch! Here's what you've accomplished:

✅ **Multi-Head Attention**: Implemented the core attention mechanism  
✅ **Positional Encoding**: Added position information to sequences  
✅ **Transformer Blocks**: Built causally masked self-attention + feed-forward layers with residual connections and layer normalization  
✅ **Language Modeling**: Created an autoregressive text generator  
✅ **Training**: Optimized with custom learning rate scheduling  

### 🚀 Next Steps:
1. Try larger models with more layers and heads
2. Implement BERT-style bidirectional encoding
3. Experiment with different attention patterns
4. Move on to **Project 10: Fine-tuning BERT for Sentiment Analysis**

Ready for transfer learning with BERT? Let's fine-tune! 🎯