# Reading
- https://www.unite.ai/nlp-rise-with-transformer-models-a-comprehensive-analysis-of-t5-bert-and-gpt/

- https://aliissa99.medium.com/transformer-gpt-3-gpt-j-t5-and-bert-4cf8915dd86f

# Implementation

In [None]:
# Import essential libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import math
from transformers import (
    GPT2Model, GPT2Config,
    BertModel, BertConfig,
    T5Model, T5Config,
    GPT2Tokenizer, BertTokenizer, T5Tokenizer
)

In [None]:
# Device configuration: use a CUDA GPU when PyTorch reports one, else the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Part 1: GPT Architecture Implementation (Decoder-only)
class GPTSelfAttention(nn.Module):
    """Multi-head causal self-attention for a GPT-style (decoder-only) block.

    Args:
        config: Object exposing ``hidden_size`` and ``num_attention_heads``.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by
            ``num_attention_heads``.
    """

    def __init__(self, config):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        if self.embed_dim % self.num_heads != 0:
            raise ValueError(
                f"hidden_size ({self.embed_dim}) must be divisible by "
                f"num_attention_heads ({self.num_heads})"
            )
        self.head_dim = self.embed_dim // self.num_heads
        self.scale = self.head_dim ** -0.5

        self.query = nn.Linear(self.embed_dim, self.embed_dim)
        self.key = nn.Linear(self.embed_dim, self.embed_dim)
        self.value = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _split_heads(self, tensor, batch_size, seq_length):
        """Reshape (batch, seq, embed) into (batch, heads, seq, head_dim)."""
        return tensor.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x, attention_mask=None):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape (batch, seq, hidden_size).
            attention_mask: Optional boolean tensor of shape (batch, seq) in
                which ``True`` marks key positions that must be ignored.

        Returns:
            Tensor of shape (batch, seq, hidden_size).
        """
        batch_size, seq_length, _ = x.size()

        # Project query, key, value
        q = self._split_heads(self.query(x), batch_size, seq_length)
        k = self._split_heads(self.key(x), batch_size, seq_length)
        v = self._split_heads(self.value(x), batch_size, seq_length)

        # Compute attention scores
        attn_weights = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        # A large finite fill value (instead of -inf) keeps a fully masked row
        # from turning into NaN after the softmax.
        mask_value = torch.finfo(attn_weights.dtype).min

        # Causal mask: True above the diagonal, i.e. on future positions.
        # It is built on the input's own device so the module works wherever
        # the input lives, independent of any module-level device setting.
        causal_mask = torch.triu(
            torch.ones(seq_length, seq_length, device=x.device), diagonal=1
        ).bool()
        attn_weights = attn_weights.masked_fill(causal_mask, mask_value)

        # Apply attention mask if provided
        if attention_mask is not None:
            attn_weights = attn_weights.masked_fill(
                attention_mask.unsqueeze(1).unsqueeze(2), mask_value
            )

        # Normalize and apply attention
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_output = torch.matmul(attn_weights, v)

        # Reshape and project output
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_length, self.embed_dim)
        attn_output = self.out_proj(attn_output)

        return attn_output

In [None]:
class GPTBlock(nn.Module):
    """Pre-LayerNorm transformer block: self-attention then an MLP, each residual.

    Args:
        config: Object exposing ``hidden_size`` and whatever
            ``GPTSelfAttention`` reads. The MLP width is taken from
            ``intermediate_size`` when present, otherwise from ``n_inner``,
            otherwise it defaults to ``4 * hidden_size``.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.hidden_size)
        self.attn = GPTSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.hidden_size)

        inner_dim = self._mlp_width(config)
        self.mlp = nn.Sequential(
            nn.Linear(config.hidden_size, inner_dim),
            nn.GELU(),
            nn.Linear(inner_dim, config.hidden_size)
        )

    @staticmethod
    def _mlp_width(config):
        """Resolve the feed-forward width without requiring ``intermediate_size``."""
        # GPT-2 style configs name this field ``n_inner`` and may leave it as
        # None, so fall back step by step instead of raising AttributeError.
        width = getattr(config, "intermediate_size", None)
        if width is None:
            width = getattr(config, "n_inner", None)
        if width is None:
            width = 4 * config.hidden_size
        return width

    def forward(self, x, attention_mask=None):
        """Run the block on ``x`` and return a tensor of the same shape.

        ``attention_mask`` is forwarded unchanged to the attention layer.
        """
        # Self-attention with residual connection
        attn_output = self.attn(self.ln_1(x), attention_mask)
        x = x + attn_output

        # MLP with residual connection
        mlp_output = self.mlp(self.ln_2(x))
        x = x + mlp_output

        return x

In [None]:
class SimpleGPT(nn.Module):
    """Decoder-only stack: token + position embeddings, GPT blocks, final LayerNorm.

    The config must provide ``hidden_size``, ``vocab_size``,
    ``max_position_embeddings``, ``embd_pdrop`` and ``num_hidden_layers``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
        self.drop = nn.Dropout(config.embd_pdrop)

        layers = []
        for _ in range(config.num_hidden_layers):
            layers.append(GPTBlock(config))
        self.blocks = nn.ModuleList(layers)
        self.ln_f = nn.LayerNorm(self.embed_dim)

    def forward(self, input_ids, attention_mask=None):
        """Return final hidden states of shape (batch, seq, hidden_size).

        ``input_ids`` must be a 2-D tensor of token ids; ``attention_mask``
        is handed to every block unchanged.
        """
        # Unpacking doubles as a check that input_ids is two-dimensional.
        _, num_tokens = input_ids.size()

        # Positions 0..num_tokens-1, shared across the batch via broadcasting
        positions = torch.arange(
            0, num_tokens, dtype=torch.long, device=input_ids.device
        ).unsqueeze(0)

        # Token embeddings plus learned position embeddings, then dropout
        hidden_states = self.drop(self.wte(input_ids) + self.wpe(positions))

        # Transformer blocks, applied in order
        for block in self.blocks:
            hidden_states = block(hidden_states, attention_mask)

        return self.ln_f(hidden_states)

In [None]:
# Part 2: BERT Architecture Implementation (Encoder-only)
class BertSelfAttention(nn.Module):
    """Multi-head bidirectional self-attention (no causal masking).

    The config must provide ``hidden_size``, ``num_attention_heads`` and
    ``attention_probs_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dim into heads: (batch, seq, all) -> (batch, heads, seq, head)."""
        leading_dims = x.size()[:-1]
        per_head = x.view(*leading_dims, self.num_attention_heads, self.attention_head_size)
        return per_head.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        """Return the attention context with shape (batch, seq, all_head_size).

        ``attention_mask``, when given, is added to the raw scores, so it
        must be additive and broadcastable to (batch, heads, seq, seq).
        """
        queries = self.transpose_for_scores(self.query(hidden_states))
        keys = self.transpose_for_scores(self.key(hidden_states))
        values = self.transpose_for_scores(self.value(hidden_states))

        # Scaled dot-product scores between every query and every key
        scores = torch.matmul(queries, keys.transpose(-1, -2))
        scores = scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            scores = scores + attention_mask

        # Scores -> probabilities, with dropout on the probabilities
        probs = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, then merge the heads back together
        context = torch.matmul(probs, values).permute(0, 2, 1, 3).contiguous()
        merged_shape = context.size()[:-2] + (self.all_head_size,)
        return context.view(*merged_shape)


In [None]:
class BertLayer(nn.Module):
    """Post-LayerNorm encoder layer: self-attention, then a GELU feed-forward net.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input, and the sum is layer-normalised.
    """

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size

        # Attention sub-layer
        self.attention = BertSelfAttention(config)
        self.attention_output = nn.Linear(hidden, hidden)
        self.attention_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.attention_ln = nn.LayerNorm(hidden, eps=config.layer_norm_eps)

        # Feed-forward sub-layer
        self.intermediate = nn.Linear(hidden, config.intermediate_size)
        self.output = nn.Linear(config.intermediate_size, hidden)
        self.output_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.output_ln = nn.LayerNorm(hidden, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None):
        """Return the layer output, same shape as ``hidden_states``."""
        # Attention -> projection -> dropout -> residual + LayerNorm
        context = self.attention(hidden_states, attention_mask)
        projected = self.attention_dropout(self.attention_output(context))
        attended = self.attention_ln(projected + hidden_states)

        # Feed-forward -> dropout -> residual + LayerNorm
        expanded = F.gelu(self.intermediate(attended))
        contracted = self.output_dropout(self.output(expanded))
        return self.output_ln(contracted + attended)


In [None]:
class SimpleBERT(nn.Module):
    """Encoder-only stack: summed embeddings followed by ``BertLayer`` modules.

    The config must provide ``vocab_size``, ``hidden_size``,
    ``max_position_embeddings``, ``type_vocab_size``, ``layer_norm_eps``,
    ``hidden_dropout_prob`` and ``num_hidden_layers``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embeddings_token = nn.Embedding(config.vocab_size, config.hidden_size)
        self.embeddings_position = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.embeddings_token_type = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.embeddings_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.embeddings_dropout = nn.Dropout(config.hidden_dropout_prob)

        self.encoder = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Encode ``input_ids`` and return hidden states (batch, seq, hidden_size).

        Args:
            input_ids: Token ids of shape (batch, seq).
            token_type_ids: Optional segment ids, same shape as ``input_ids``;
                defaults to all zeros.
            attention_mask: Optional (batch, seq) mask with 1/True for tokens
                to attend to and 0/False for tokens to ignore. Integer, float
                and boolean masks are all accepted.
        """
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device).unsqueeze(0)

        # Embed tokens, positions, and token types
        token_embeddings = self.embeddings_token(input_ids)
        position_embeddings = self.embeddings_position(position_ids)
        token_type_embeddings = self.embeddings_token_type(token_type_ids)

        # Sum all embeddings
        embeddings = token_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.embeddings_ln(embeddings)
        embeddings = self.embeddings_dropout(embeddings)

        # Prepare attention mask
        if attention_mask is not None:
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            # Cast first: `1.0 - mask` raises for boolean tensors, and the
            # additive mask should share the dtype of the attention scores.
            extended_attention_mask = extended_attention_mask.to(dtype=embeddings.dtype)
            # 1 -> 0.0 (keep), 0 -> -10000.0 (effectively removed by softmax)
            extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        else:
            extended_attention_mask = None

        # Process through the encoder layers
        hidden_states = embeddings
        for layer in self.encoder:
            hidden_states = layer(hidden_states, extended_attention_mask)

        return hidden_states

In [None]:
# Part 3: T5 Architecture Implementation (Encoder-Decoder)
class T5SelfAttention(nn.Module):
    """Multi-head attention used for both self- and cross-attention.

    Args:
        config: Object exposing ``d_model``, ``num_heads``, ``d_kv`` and
            ``dropout_rate``.
        is_decoder: When True, self-attention calls (``kv`` is None) also
            apply a causal mask.
    """

    def __init__(self, config, is_decoder=False):
        super().__init__()
        self.is_decoder = is_decoder
        self.d_model = config.d_model
        self.n_heads = config.num_heads
        self.d_kv = config.d_kv

        self.q = nn.Linear(self.d_model, self.n_heads * self.d_kv)
        self.k = nn.Linear(self.d_model, self.n_heads * self.d_kv)
        self.v = nn.Linear(self.d_model, self.n_heads * self.d_kv)
        self.o = nn.Linear(self.n_heads * self.d_kv, self.d_model)

        self.dropout = nn.Dropout(config.dropout_rate)

    def _split_heads(self, tensor, batch_size, length):
        """Reshape (batch, length, heads * d_kv) into (batch, heads, length, d_kv)."""
        return tensor.view(batch_size, length, self.n_heads, self.d_kv).transpose(1, 2)

    def forward(self, hidden_states, mask=None, kv=None, position_bias=None):
        """Attend from ``hidden_states`` to ``kv`` (or to itself when ``kv`` is None).

        Args:
            hidden_states: Query source of shape (batch, seq, d_model).
            mask: Optional mask in which zeros mark positions to ignore. A
                2-D mask of shape (batch, key_len) is treated as a per-example
                key padding mask; any other shape must broadcast against
                (batch, heads, seq, key_len).
            kv: Optional key/value source of shape (batch, key_len, d_model).
            position_bias: Optional tensor added to the raw scores.

        Returns:
            Tensor of shape (batch, seq, d_model).
        """
        batch_size, seq_length, _ = hidden_states.shape

        # Keys and values come from `kv` for cross-attention, else from the input
        if kv is not None:
            _, kv_length, _ = kv.shape
            source = kv
        else:
            kv_length = seq_length
            source = hidden_states

        q = self._split_heads(self.q(hidden_states), batch_size, seq_length)
        k = self._split_heads(self.k(source), batch_size, kv_length)
        v = self._split_heads(self.v(source), batch_size, kv_length)

        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.d_kv)

        if position_bias is not None:
            # Out-of-place add, so a broadcastable bias never has to match
            # the score tensor's shape exactly.
            scores = scores + position_bias

        if mask is not None:
            # A (batch, key_len) padding mask does not line up with the
            # (batch, heads, seq, key_len) scores on its own; add the head
            # and query axes explicitly.
            if mask.dim() == 2 and mask.shape == (batch_size, kv_length):
                mask = mask[:, None, None, :]
            scores = scores.masked_fill(mask == 0, -1e9)

        # Add causal mask for decoder
        if self.is_decoder and kv is None:
            # Built on the scores' own device rather than a module-level global
            causal_mask = torch.triu(
                torch.ones(seq_length, seq_length, device=scores.device), diagonal=1
            ).bool()
            scores = scores.masked_fill(causal_mask.unsqueeze(0).unsqueeze(0), -1e9)

        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        attn_output = torch.matmul(attn_weights, v)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_length, self.n_heads * self.d_kv)
        attn_output = self.o(attn_output)

        return attn_output


In [None]:
class T5Block(nn.Module):
    """Pre-LayerNorm block: self-attention, optional cross-attention, then an MLP.

    Every sub-layer normalises its input and adds its output back onto the
    running hidden states. The cross-attention sub-layer exists only when
    ``is_decoder`` is True.
    """

    def __init__(self, config, is_decoder=False):
        super().__init__()
        self.is_decoder = is_decoder
        width = config.d_model
        eps = config.layer_norm_epsilon

        self.layer_norm = nn.LayerNorm(width, eps=eps)
        self.self_attention = T5SelfAttention(config, is_decoder=is_decoder)

        if is_decoder:
            self.cross_attention = T5SelfAttention(config, is_decoder=False)
            self.cross_layer_norm = nn.LayerNorm(width, eps=eps)

        feed_forward = [
            nn.Linear(width, config.d_ff),
            nn.ReLU(),
            nn.Dropout(config.dropout_rate),
            nn.Linear(config.d_ff, width),
            nn.Dropout(config.dropout_rate),
        ]
        self.mlp = nn.Sequential(*feed_forward)
        self.mlp_layer_norm = nn.LayerNorm(width, eps=eps)

    def forward(self, hidden_states, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
        """Return the block output, same shape as ``hidden_states``.

        Cross-attention runs only for decoder blocks that are also given
        ``encoder_hidden_states``.
        """
        # Self attention (residual)
        hidden_states = hidden_states + self.self_attention(
            self.layer_norm(hidden_states), mask=attention_mask
        )

        # Cross attention over the encoder output (residual)
        if self.is_decoder and encoder_hidden_states is not None:
            hidden_states = hidden_states + self.cross_attention(
                self.cross_layer_norm(hidden_states),
                mask=encoder_attention_mask,
                kv=encoder_hidden_states,
            )

        # Feed-forward network (residual)
        return hidden_states + self.mlp(self.mlp_layer_norm(hidden_states))


In [None]:
class SimpleT5(nn.Module):
    """Encoder-decoder stack whose two sides share one token embedding table.

    The config must provide ``vocab_size``, ``d_model``, ``num_layers``,
    ``num_decoder_layers`` and ``layer_norm_epsilon``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # Encoder
        encoder_blocks = []
        for _ in range(config.num_layers):
            encoder_blocks.append(T5Block(config))
        self.encoder = nn.ModuleList(encoder_blocks)
        self.encoder_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)

        # Decoder
        decoder_blocks = []
        for _ in range(config.num_decoder_layers):
            decoder_blocks.append(T5Block(config, is_decoder=True))
        self.decoder = nn.ModuleList(decoder_blocks)
        self.decoder_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)

    def forward(self, input_ids, decoder_input_ids=None, attention_mask=None, decoder_attention_mask=None):
        """Run the encoder and, when decoder inputs are given, the decoder.

        Returns:
            The encoder hidden states alone when ``decoder_input_ids`` is
            None; otherwise the tuple
            ``(encoder_hidden_states, decoder_hidden_states)``.
        """
        # Encoder pass over the embedded source tokens
        memory = self.shared(input_ids)
        for block in self.encoder:
            memory = block(memory, attention_mask)
        memory = self.encoder_norm(memory)

        # Encoder-only call: nothing left to do
        if decoder_input_ids is None:
            return memory

        # Decoder pass: self-attention over the targets, cross-attention to memory
        target_states = self.shared(decoder_input_ids)
        for block in self.decoder:
            target_states = block(
                target_states,
                decoder_attention_mask,
                memory,
                attention_mask,
            )
        target_states = self.decoder_norm(target_states)

        return memory, target_states

In [None]:
# Part 4: Load and Compare Pre-trained Models
def load_pretrained_models():
    """Load the GPT-2, BERT and T5 checkpoints with their tokenizers and configs.

    Returns:
        dict mapping ``'gpt2'``, ``'bert'`` and ``'t5'`` to a
        ``(model, tokenizer, config)`` tuple.
    """
    # (result key, checkpoint name, config class, model class, tokenizer class)
    specs = (
        ('gpt2', 'gpt2', GPT2Config, GPT2Model, GPT2Tokenizer),
        ('bert', 'bert-base-uncased', BertConfig, BertModel, BertTokenizer),
        ('t5', 't5-small', T5Config, T5Model, T5Tokenizer),
    )

    loaded = {}
    for key, checkpoint, config_cls, model_cls, tokenizer_cls in specs:
        config = config_cls.from_pretrained(checkpoint)
        model = model_cls.from_pretrained(checkpoint)
        tokenizer = tokenizer_cls.from_pretrained(checkpoint)
        loaded[key] = (model, tokenizer, config)

    return loaded

In [None]:
def count_parameters(model):
    """Return how many scalar parameters of ``model`` have ``requires_grad`` set."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

def model_parameter_comparison():
    """Plot trainable-parameter counts of the pretrained models as a bar chart.

    Returns:
        dict mapping each model key to its trainable-parameter count in
        millions.
    """
    # Trainable parameters per model, expressed in millions
    param_counts = {
        name: count_parameters(entry[0]) / 1_000_000
        for name, entry in load_pretrained_models().items()
    }

    plt.figure(figsize=(10, 6))
    bars = plt.bar(param_counts.keys(), param_counts.values(), color=['blue', 'orange', 'green'])

    # Write each value just above its bar
    for bar in bars:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2.
        plt.text(bar_center, bar_top + 0.5, f'{bar_top:.1f}M', ha='center', va='bottom')

    plt.title('Parameter Count Comparison (in millions)')
    plt.ylabel('Millions of Parameters')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    return param_counts

In [None]:
# Part 5: Architecture Analysis
def visualize_attention_patterns():
    """Draw conceptual attention patterns for GPT-2 (causal) and BERT (full).

    The matrices are schematic: a lower-triangular matrix for GPT-2 and an
    all-ones matrix for BERT, sized by each tokenizer's tokenization of one
    sample sentence. No model forward pass is run.
    """
    # Create sample inputs
    text = "The transformer architecture revolutionized natural language processing."

    # Load models
    models = load_pretrained_models()

    _, gpt2_tokenizer, _ = models['gpt2']
    _, bert_tokenizer, _ = models['bert']

    # Tokenize input for each model
    gpt2_tokens = gpt2_tokenizer.tokenize(text)
    bert_tokens = bert_tokenizer.tokenize(text)

    # Create visualization of attention patterns (conceptual)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # GPT-2: lower triangular (each token sees itself and earlier tokens);
    # BERT: full matrix (each token sees every token).
    panels = (
        (ax1, np.tril(np.ones((len(gpt2_tokens), len(gpt2_tokens)))), gpt2_tokens,
         'Blues', 'GPT-2 Causal Attention Pattern'),
        (ax2, np.ones((len(bert_tokens), len(bert_tokens))), bert_tokens,
         'Oranges', 'BERT Full Attention Pattern'),
    )

    for ax, pattern, tokens, cmap, title in panels:
        # Pin the colour range to [0, 1]. With automatic scaling the all-ones
        # BERT matrix has vmin == vmax and is drawn in the colormap's lightest
        # colour, which makes "attends everywhere" look like "attends nowhere".
        ax.imshow(pattern, cmap=cmap, vmin=0, vmax=1)
        ax.set_title(title)
        ax.set_xticks(range(len(tokens)))
        ax.set_yticks(range(len(tokens)))
        ax.set_xticklabels(tokens, rotation=90)
        ax.set_yticklabels(tokens)

    plt.tight_layout()
    plt.show()

In [None]:
# Part 6: Task-specific behavior demonstration
def demonstrate_model_capabilities():
    """Run one sentence through pretrained GPT-2, BERT and T5 and report shapes.

    Returns:
        dict with keys ``'gpt2'``, ``'bert'``, ``'t5_encoder'`` and
        ``'t5_decoder'`` holding the corresponding last hidden states.
    """
    models = load_pretrained_models()
    example_text = "Natural language processing has been transformed by"

    # GPT-2: hidden states of the decoder-only model
    gpt2_model, gpt2_tokenizer, _ = models['gpt2']
    gpt2_inputs = gpt2_tokenizer(example_text, return_tensors="pt")
    with torch.no_grad():
        gpt2_hidden_states = gpt2_model(**gpt2_inputs).last_hidden_state

    # BERT: hidden states of the encoder-only model
    bert_model, bert_tokenizer, _ = models['bert']
    bert_inputs = bert_tokenizer(example_text, return_tensors="pt")
    with torch.no_grad():
        bert_hidden_states = bert_model(**bert_inputs).last_hidden_state

    # T5: the encoder reads the sentence, the decoder is fed the text "the"
    t5_model, t5_tokenizer, _ = models['t5']
    t5_inputs = t5_tokenizer(example_text, return_tensors="pt")
    t5_decoder_inputs = t5_tokenizer("the", return_tensors="pt")
    with torch.no_grad():
        t5_outputs = t5_model(
            input_ids=t5_inputs["input_ids"],
            decoder_input_ids=t5_decoder_inputs["input_ids"],
        )
    t5_encoder_states = t5_outputs.encoder_last_hidden_state
    t5_decoder_states = t5_outputs.last_hidden_state

    # Print shapes to show model outputs
    print(f"GPT-2 output shape: {gpt2_hidden_states.shape}")
    print(f"BERT output shape: {bert_hidden_states.shape}")
    print(f"T5 encoder output shape: {t5_encoder_states.shape}")
    print(f"T5 decoder output shape: {t5_decoder_states.shape}")

    results = {}
    results['gpt2'] = gpt2_hidden_states
    results['bert'] = bert_hidden_states
    results['t5_encoder'] = t5_encoder_states
    results['t5_decoder'] = t5_decoder_states
    return results

In [None]:
# Part 7: Comparative analysis
def architecture_comparison_table():
    """Render a matplotlib table contrasting the GPT, BERT and T5 designs."""
    # Column headers, then one row per model family
    headers = ['Model', 'Architecture', 'Attention', 'Pre-training', 'Best suited for']
    rows = [
        ('GPT (Autoregressive)', 'Decoder-only', 'Causal (unidirectional)',
         'Next-token prediction', 'Text generation'),
        ('BERT (Autoencoding)', 'Encoder-only', 'Bidirectional',
         'Masked LM + Next Sentence Prediction', 'Understanding & Classification'),
        ('T5 (Seq2Seq)', 'Encoder-Decoder', 'Bidirectional + Causal',
         'Text-to-Text', 'Translation & Summarization'),
    ]

    fig, ax = plt.subplots(figsize=(12, 5))
    # Hide the axes so that only the table is visible
    ax.axis('tight')
    ax.axis('off')

    table = ax.table(cellText=rows, colLabels=headers, cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 1.5)

    plt.title('Comparison of LLM Architectures')
    plt.tight_layout()
    plt.show()

In [None]:
# Part 8: Implementation of a simple transformer from scratch
class SimplifiedTransformer(nn.Module):
    """Small transformer built from ``torch.nn`` layers in one of three layouts.

    Args:
        vocab_size: Size of the token vocabulary (also the output width).
        d_model: Embedding / hidden width.
        nhead: Number of attention heads.
        num_encoder_layers: Encoder depth ('encoder_only' and 'seq2seq').
        num_decoder_layers: Decoder depth ('decoder_only' and 'seq2seq').
        dim_feedforward: Width of the feed-forward sub-layers.
        dropout: Dropout probability inside the transformer layers.
        max_seq_length: Number of learned position embeddings.
        architecture: 'decoder_only', 'encoder_only', or 'seq2seq'.

    Raises:
        ValueError: If ``architecture`` is not one of the three layouts.
    """

    _ARCHITECTURES = ('decoder_only', 'encoder_only', 'seq2seq')

    def __init__(self, vocab_size, d_model=256, nhead=8, num_encoder_layers=3,
                 num_decoder_layers=3, dim_feedforward=1024, dropout=0.1,
                 max_seq_length=128, architecture='seq2seq'):
        super().__init__()

        if architecture not in self._ARCHITECTURES:
            raise ValueError(
                f"architecture must be one of {self._ARCHITECTURES}, got {architecture!r}"
            )
        self.architecture = architecture  # 'decoder_only', 'encoder_only', or 'seq2seq'

        # Embeddings
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_seq_length, d_model)

        # Encoder (for BERT and T5)
        if architecture in ['encoder_only', 'seq2seq']:
            encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
            self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers)

        # Decoder (for GPT and T5)
        if architecture in ['decoder_only', 'seq2seq']:
            decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
            self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers)

        # Output projection
        self.output_projection = nn.Linear(d_model, vocab_size)

    def _embed(self, tokens):
        """Return token embeddings plus learned position embeddings for ``tokens``."""
        positions = torch.arange(tokens.size(1), device=tokens.device).unsqueeze(0)
        return self.token_embedding(tokens) + self.position_embedding(positions)

    # NOTE: forward must be a method of the class. Defined inside __init__ it
    # is only a local function, and calling the module fails because
    # nn.Module has no forward implementation of its own.
    def forward(self, src_tokens, tgt_tokens=None, src_mask=None, tgt_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None):
        """Return vocabulary logits.

        Args:
            src_tokens: (batch, src_len) token ids. Not used by the
                'decoder_only' layout, which reads ``tgt_tokens``.
            tgt_tokens: (batch, tgt_len) token ids; required unless the
                layout is 'encoder_only'.
            src_mask: Optional attention mask for the encoder.
            tgt_mask: Optional attention mask for decoder self-attention
                (pass a causal mask for autoregressive use).
            src_key_padding_mask: Optional padding mask for the source.
            tgt_key_padding_mask: Optional padding mask for the target.

        Returns:
            Logits of shape (batch, src_len, vocab_size) for 'encoder_only',
            otherwise (batch, tgt_len, vocab_size).

        Raises:
            ValueError: If ``tgt_tokens`` is missing for a layout with a decoder.
        """
        if self.architecture == 'encoder_only':
            memory = self.encoder(
                self._embed(src_tokens),
                mask=src_mask,
                src_key_padding_mask=src_key_padding_mask,
            )
            return self.output_projection(memory)

        if tgt_tokens is None:
            raise ValueError("tgt_tokens must be provided for decoder or seq2seq architectures")
        tgt_emb = self._embed(tgt_tokens)

        if self.architecture == 'decoder_only':
            # nn.TransformerDecoder always cross-attends to a memory. Without
            # an encoder the target embeddings stand in for it, so the
            # target's own masks must also be applied to that memory;
            # otherwise cross-attention would see future tokens, and a
            # source-length padding mask would not fit it.
            memory = tgt_emb
            memory_mask = tgt_mask
            memory_key_padding_mask = tgt_key_padding_mask
        else:  # seq2seq
            memory = self.encoder(
                self._embed(src_tokens),
                mask=src_mask,
                src_key_padding_mask=src_key_padding_mask,
            )
            memory_mask = None
            memory_key_padding_mask = src_key_padding_mask

        output_states = self.decoder(
            tgt_emb,
            memory,
            tgt_mask=tgt_mask,
            memory_mask=memory_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )

        return self.output_projection(output_states)

In [None]:
# Run full notebook demonstrations
# NOTE: the first three helpers below each call load_pretrained_models()
# themselves, so the pretrained checkpoints are loaded once per call.
param_counts = model_parameter_comparison()  # shows the bar chart; keeps the per-model counts (in millions)
visualize_attention_patterns()  # shows the GPT-2 vs. BERT attention-pattern figure
outputs = demonstrate_model_capabilities()  # prints output shapes; keeps the hidden states by model key
architecture_comparison_table()  # shows the architecture comparison table

# Quiz 1

Question 1: Which architecture uses causal masking for autoregressive generation?
1. BERT
2. GPT (Correct)
3. T5
4. Transformer (original)


Question 2: Which model architecture employs bidirectional context for token prediction?
1. GPT-2
2. BERT (Correct)
3. GPT-3
4. PaLM


Question 3: What is the key innovation of T5 compared to other architectures?
1. Larger parameter count
2. Text-to-text framework (Correct)
3. Mixture-of-experts
4. Reinforcement learning


Question 4: Which architecture is best suited for classification tasks?
1. GPT
2. BERT (Correct)
3. T5
4. All equally


Question 5: What does the "autoregressive" property in GPT refer to?
1. Self-improving capabilities
2. Predicting each token based on previous tokens (Correct)
3. Automatic gradient computation
4. Recursively generating embeddings

----

# Quiz 2

1. **Which architecture uses causal (unidirectional) masking?**  
   A. BERT  
   B. GPT (Correct)  
   C. T5  
   D. Transformer (original)

2. **Which pre-training objective is unique to BERT?**  
   A. Next-token prediction  
   B. Masked Language Modeling (MLM) (Correct)  
   C. Text-to-text reformulation  
   D. Causal language modeling

3. **In a GPT model, how is token position information added?**  
   A. Through token-type embeddings  
   B. Via a separate learned positional embedding layer (Correct)  
   C. By concatenating absolute positions to token IDs  
   D. It isn’t—GPT infers order purely from attention

4. **What core framework does T5 use?**  
   A. Decoder-only  
   B. Encoder-only  
   C. Encoder-Decoder (seq2seq) (Correct)  
   D. Hybrid GAN-Transformer

5. **T5 reformulates every task as:**  
   A. A sequence classification problem  
   B. A masked-token prediction problem  
   C. A text-to-text generation problem (Correct)  
   D. A next-sentence prediction problem

6. **Which architecture is best suited for pure text generation tasks (e.g., writing a story)?**  
   A. BERT  
   B. GPT (Correct)  
   C. T5  
   D. All are equally good

7. **True or False:** BERT’s self-attention blocks allow each token to attend only to previous tokens.  
   - True  
   - False (Correct)

8. **Short answer:** Name one real-world application powered by GPT-style models.

   **Answer:** Chatbots such as ChatGPT, which are used for customer support, virtual assistants, and interactive AI companions.

-----