# Week 9, Day 4: Large Language Models

## Learning Objectives
- Understand LLM architecture
- Learn transformer models
- Master attention mechanisms
- Practice implementing LLMs

## Topics Covered
1. Transformer Architecture
2. Attention Mechanisms
3. Pre-training and Fine-tuning
4. Prompt Engineering

In [None]:
# Import required libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel

## 1. Transformer Implementation

In [None]:
class MultiHeadAttention(layers.Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected with their own dense layer,
    the ``d_model`` features are split into ``num_heads`` heads of
    ``d_model // num_heads`` features, attention is computed per head, and
    the heads are merged again and passed through a final dense projection.

    Args:
        d_model: Width of the q/k/v projections and of the layer output.
        num_heads: Number of attention heads; must be positive and divide
            ``d_model`` evenly.
        **kwargs: Standard Keras layer options (e.g. ``name``), forwarded to
            ``layers.Layer``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model, num_heads, **kwargs):
        # Explicit exception instead of ``assert``: assertions are removed
        # when Python runs with -O, which would let a bad config through.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )

        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.d_model = d_model

        # Number of features handled by each head.
        self.depth = d_model // self.num_heads

        self.wq = layers.Dense(d_model)
        self.wk = layers.Dense(d_model)
        self.wv = layers.Dense(d_model)

        # Output projection applied after the heads are concatenated.
        self.dense = layers.Dense(d_model)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({"d_model": self.d_model, "num_heads": self.num_heads})
        return config

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, num_heads, -1, depth).

        The last axis (``d_model`` wide) is split into ``num_heads`` chunks of
        ``depth`` features, then the head axis is moved in front of the
        remaining (sequence) axis.
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        """Apply multi-head attention.

        Args:
            v: Value tensor.
            k: Key tensor.
            q: Query tensor; its leading dimension is used as the batch size.
            mask: Optional tensor added to the attention logits after being
                scaled by -1e9, i.e. entries equal to 1 are suppressed. Must
                be broadcastable to the logits' shape.

        Returns:
            A tuple ``(output, attention_weights)`` where ``output`` has
            ``d_model`` features on its last axis and ``attention_weights``
            are the per-head softmax weights.
        """
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = self.scaled_dot_product_attention(
            q, k, v, mask)

        # Undo split_heads: move the head axis back next to depth, then merge
        # (num_heads, depth) into a single d_model-wide axis.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)

        return output, attention_weights

    def scaled_dot_product_attention(self, q, k, v, mask):
        """Compute softmax(q @ k^T / sqrt(d_k)) @ v.

        Args:
            q: Query tensor.
            k: Key tensor; the size of its last axis is used as ``d_k``.
            v: Value tensor.
            mask: Optional tensor; see ``call``.

        Returns:
            A tuple ``(output, attention_weights)``.
        """
        matmul_qk = tf.matmul(q, k, transpose_b=True)

        # Scale by sqrt(d_k) so the logits do not grow with the head width.
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        if mask is not None:
            # Cast first so integer or boolean masks work as well as float
            # ones; masked entries receive a large negative logit and end up
            # with ~0 weight after the softmax.
            mask = tf.cast(mask, scaled_attention_logits.dtype)
            scaled_attention_logits += (mask * -1e9)

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)

        return output, attention_weights

## 2. Language Model Training

In [None]:
class LanguageModel:
    """Small Transformer-style language model built with the Keras functional API.

    The network is: token embedding -> dropout -> ``num_layers`` transformer
    blocks -> dense softmax over the vocabulary. The built ``tf.keras.Model``
    is stored in ``self.model``.

    NOTE(review): the blocks call attention with ``mask=None`` and no
    positional encoding is added to the embeddings, so every position can
    attend to later positions and token order is not encoded. Add a causal
    mask and positional information before using this for next-token
    training.

    Args:
        vocab_size: Number of tokens; size of the embedding table and of the
            output distribution.
        d_model: Embedding / hidden width.
        num_layers: Number of transformer blocks.
        num_heads: Attention heads per block.
    """

    def __init__(self, vocab_size, d_model=256, num_layers=4, num_heads=8):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads

        self.model = self.build_model()

    def build_model(self):
        """Assemble and return the ``tf.keras.Model`` (token ids -> probabilities)."""
        # Variable-length sequences of token ids.
        inputs = layers.Input(shape=(None,))

        # Embedding
        x = layers.Embedding(self.vocab_size, self.d_model)(inputs)
        x = layers.Dropout(0.1)(x)

        # Transformer blocks
        for _ in range(self.num_layers):
            x = self.transformer_block(x)

        # Output: per-position probability distribution over the vocabulary.
        outputs = layers.Dense(self.vocab_size, activation='softmax')(x)

        return tf.keras.Model(inputs, outputs)

    def transformer_block(self, x):
        """Apply self-attention and a feed-forward sublayer, each followed by
        a residual connection and layer normalization."""
        # Multi-head self-attention: q, k and v are all ``x``.
        attn_output, _ = MultiHeadAttention(
            self.d_model, self.num_heads)(x, x, x, None)
        x = layers.LayerNormalization(epsilon=1e-6)(x + attn_output)

        # Feed forward
        ffn_output = self.point_wise_feed_forward_network(x)
        return layers.LayerNormalization(epsilon=1e-6)(x + ffn_output)

    def point_wise_feed_forward_network(self, x):
        """Expand to ``4 * d_model`` with ReLU, then project back to ``d_model``.

        The output width must equal ``d_model`` so it can be added to the
        residual input in ``transformer_block``.
        """
        hidden = layers.Dense(self.d_model * 4, activation='relu')(x)
        return layers.Dense(self.d_model)(hidden)

## 3. Using Pre-trained Models

In [None]:
def use_pretrained_model():
    """Load the pre-trained ``gpt2`` checkpoint and print one sampled continuation.

    Loads the tokenizer and model with ``from_pretrained('gpt2')``, generates
    a continuation of a fixed example prompt and prints it. Returns ``None``.
    """
    # Load pre-trained model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Generate text
    def generate_text(prompt, max_length=100):
        """Return ``prompt`` plus a sampled continuation.

        ``max_length`` is the total length in tokens, prompt included.
        """
        # Encode prompt; the tokenizer call also returns the attention mask.
        encoded = tokenizer(prompt, return_tensors='pt')

        # Generate
        output = model.generate(
            encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            # temperature only takes effect when sampling is enabled;
            # without do_sample=True decoding is greedy and it is ignored.
            do_sample=True,
            temperature=0.7,
            # GPT-2 defines no pad token; reuse EOS so generate() has one.
            pad_token_id=tokenizer.eos_token_id,
        )

        # Decode and return
        return tokenizer.decode(output[0], skip_special_tokens=True)

    # Example usage
    prompt = "The artificial intelligence revolution"
    generated_text = generate_text(prompt)
    print(f"Generated text:\n{generated_text}")

use_pretrained_model()

## 4. Prompt Engineering

In [None]:
def prompt_engineering_examples():
    """Run zero-shot, few-shot and chain-of-thought prompts through ``gpt2``.

    For each example prompt, generates up to 50 new tokens and prints the
    prompt together with the decoded output (which includes the prompt
    text). Returns ``None``.
    """
    # Load model
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Upper bound on newly generated tokens per prompt.
    new_token_budget = 50

    # Different prompt styles
    # NOTE(review): the continuation lines of the multi-line prompts carry
    # this function's source indentation into the prompt text - confirm
    # whether that leading whitespace is intended.
    prompts = [
        # Zero-shot
        "Classify the sentiment (positive/negative): 'I love this movie!'",
        
        # Few-shot
        """Classify sentiment:
        Text: 'This is terrible!' Sentiment: negative
        Text: 'Amazing experience!' Sentiment: positive
        Text: 'I'm so happy!' Sentiment:""",
        
        # Chain-of-thought
        """Let's solve this step by step:
        Question: If John has 5 apples and gives 2 to Mary, how many does he have left?
        1. Initial count: John has 5 apples
        2. Given away: 2 apples to Mary
        3. Calculation: 5 - 2 = 3
        Therefore, John has"""
    ]

    # Generate responses
    for prompt in prompts:
        encoded = tokenizer(prompt, return_tensors='pt')
        input_ids = encoded['input_ids']
        output = model.generate(
            input_ids,
            attention_mask=encoded['attention_mask'],
            # max_length is measured in tokens, so size it from the encoded
            # prompt rather than from the prompt's character count.
            max_length=input_ids.shape[1] + new_token_budget,
            num_return_sequences=1,
            # temperature is ignored unless sampling is enabled.
            do_sample=True,
            temperature=0.7,
            # GPT-2 defines no pad token; reuse EOS so generate() has one.
            pad_token_id=tokenizer.eos_token_id,
        )
        response = tokenizer.decode(output[0], skip_special_tokens=True)
        print(f"\nPrompt: {prompt}\nResponse: {response}\n")

prompt_engineering_examples()

## Practical Exercises

In [None]:
# Exercise 1: Attention Mechanism

def attention_exercise():
    """Print the task description and steps for the attention exercise."""
    instructions = (
        "Task: Implement attention mechanism",
        "1. Create attention layer",
        "2. Implement scaled dot-product",
        "3. Add multi-head attention",
        "4. Test attention",
    )
    for line in instructions:
        print(line)

    # Your code here

attention_exercise()

In [None]:
# Exercise 2: Prompt Engineering

def prompt_exercise():
    """Print the task description and steps for the prompt-design exercise."""
    instructions = (
        "Task: Design effective prompts",
        "1. Create zero-shot prompts",
        "2. Design few-shot examples",
        "3. Implement chain-of-thought",
        "4. Test prompts",
    )
    for line in instructions:
        print(line)

    # Your code here

prompt_exercise()

## MCQ Quiz

1. What is a transformer?
   - a) RNN model
   - b) Attention-based model
   - c) CNN model
   - d) Linear model

2. What is attention?
   - a) Memory mechanism
   - b) Focus mechanism
   - c) Learning rate
   - d) Loss function

3. What is self-attention?
   - a) External attention
   - b) Attention within a single sequence (each position attends to the others)
   - c) Memory mechanism
   - d) Loss function

4. What is prompt engineering?
   - a) Model training
   - b) Input design
   - c) Loss function
   - d) Architecture design

5. What is zero-shot learning?
   - a) Training method
   - b) Inference without any task examples in the prompt
   - c) Loss function
   - d) Architecture type

6. What is few-shot learning?
   - a) Full training
   - b) Example-based learning
   - c) Architecture type
   - d) Loss function

7. What is chain-of-thought?
   - a) Model architecture
   - b) Reasoning process
   - c) Training method
   - d) Loss function

8. What is pre-training?
   - a) Fine-tuning
   - b) Initial training
   - c) Inference
   - d) Evaluation

9. What is fine-tuning?
   - a) Pre-training
   - b) Task adaptation
   - c) Model design
   - d) Loss function

10. What is temperature in generation?
    - a) Model parameter
    - b) Randomness control
    - c) Learning rate
    - d) Loss weight

Answers: 1-b, 2-b, 3-b, 4-b, 5-b, 6-b, 7-b, 8-b, 9-b, 10-b