In [None]:
"""
Exercise 03: Embedding Exploration - Starter Code

Experiment with word embeddings and analyze semantic relationships.

Prerequisites:
- Reading: 04-introduction-to-embeddings.md
- Reading: 05-word2vec-intuition.md
- Demo: demo_03_word2vec_visualization.py (REFERENCE FOR VISUALIZATION)

"""

from unittest.case import enterModuleContext
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)


# ============================================================================
# TASK 3.1: Keras Embedding Basics
# ============================================================================

def explore_embedding_layer():
    """
    Understand the Keras Embedding layer.
    
    KEY CONCEPT:
    - Embedding(vocab_size, embed_dim) creates a lookup table
    - Input: integer indices (word IDs)
    - Output: dense vectors of size embed_dim
    
    SHAPE TRANSFORMATION:
    - Input: (batch_size, sequence_length) - integers
    - Output: (batch_size, sequence_length, embed_dim) - floats
    
    PARAMETERS:
    - Total params = vocab_size * embed_dim
    
    TEST THESE EMBEDDING DIMS: 16, 32, 64, 128, 256
    with vocab_size = 1000
    """
    print("=" * 60)
    print("Task 3.1: Embedding Layer Basics")
    print("=" * 60)
    
    vocab_size = 1000
    embedding_dims = [16, 32, 64, 128, 256]
    
    print(f"Vocabulary Size: {vocab_size}")
    print(f"\n{'Embedding Dim':<15} {'Parameters':<15} {'Memory (MB)':<15}")
    print("-" * 45)
    
    sample_input = np.array([[1, 2, 3, 4]])
    
    for embed_dim in embedding_dims:
        embedding_layer = layers.Embedding(vocab_size, embed_dim)
        params = vocab_size * embed_dim
        memory_mb = (params * 4) / (1024 * 1024)
        
        print(f"{embed_dim:<15} {params:<15,} {memory_mb:<15.4f}")
        
        output = embedding_layer(sample_input)
        print(f"  Input shape: {sample_input.shape}")
        print(f"  Output shape: {output.shape}")
        print()
    
    print("\n" + "=" * 60)
    print("Key Observations:")
    print("=" * 60)
    print("1. Parameters = vocab_size × embedding_dim")
    print("2. Memory = parameters × 4 bytes (float32)")
    print("3. Output shape adds embedding_dim to input shape")
    print("4. Higher dimensions = more capacity but more memory")

In [69]:

# ============================================================================
# TASK 3.2: Training Embeddings
# ============================================================================

def train_sentiment_classifier():
    """
    Train embeddings through a simple sentiment task.
    
    SAMPLE DATA (provided):
    - Positive: "love", "great", "amazing", "wonderful"
    - Negative: "terrible", "awful", "hate", "bad"
    
    MODEL ARCHITECTURE:
    1. Embedding(vocab_size, 16)
    2. GlobalAveragePooling1D()  # Average all word vectors
    3. Dense(1, sigmoid)
    
    AFTER TRAINING:
    - Extract embedding weights: model.layers[0].get_weights()[0]
    - Words with similar sentiment should have similar vectors
    
    SEE: demo_03_word2vec_visualization.py for embedding extraction
    """
    print("=" * 60)
    print("Task 3.2: Training Embeddings")
    print("=" * 60)
    
    # Sample data
    texts = [
        "I love this movie it is amazing",
        "Great film excellent acting wonderful",
        "This movie is beautiful and amazing",
        "Terrible movie waste of time awful",
        "Bad film I hated it horrible",
        "Disappointing and boring terrible waste"
    ]
    labels = [1, 1, 1, 0, 0, 0]  # 1=positive, 0=negative
    
    # 1. Tokenize texts
    tokenizer = keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    
    vocab_size = len(tokenizer.word_index) + 1
    word_to_idx = tokenizer.word_index
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    
    print(f"Vocabulary size: {vocab_size}")
    print(f"Word to index mapping: {word_to_idx}")
    print(f"\nSample sequences: {sequences[:2]}")
    
    # 2. Pad sequences to same length
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=max_length, padding='post'
    )
    
    X = np.array(padded_sequences)
    y = np.array(labels)
    
    print(f"\nPadded sequences shape: {X.shape}")
    print(f"Labels shape: {y.shape}")
    
    # 3. Build model
    model = keras.Sequential([
        layers.Embedding(vocab_size, 16, input_length=max_length),
        layers.GlobalAveragePooling1D(),
        layers.Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    print("\nModel Architecture:")
    model.summary()
    
    # 4. Train model
    print("\nTraining model...")
    history = model.fit(
        X, y,
        epochs=100,
        verbose=0,
        validation_split=0.0
    )
    
    final_loss = history.history['loss'][-1]
    final_acc = history.history['accuracy'][-1]
    print(f"Final training loss: {final_loss:.4f}")
    print(f"Final training accuracy: {final_acc:.4f}")
    
    # 5. Extract embedding matrix
    embedding_layer = model.layers[0]
    embeddings = embedding_layer.get_weights()[0]
    
    print(f"\nEmbedding matrix shape: {embeddings.shape}")
    print(f"Embeddings for first 5 words:")
    for i in range(1, min(6, vocab_size)):
        word = idx_to_word.get(i, f"<word_{i}>")
        print(f"  '{word}': {embeddings[i][:5]}...")
    
    print("\n" + "=" * 60)
    print("Training Complete!")
    print("=" * 60)
    
    return embeddings, word_to_idx, idx_to_word


In [70]:

# ============================================================================
# TASK 3.3: Semantic Similarity
# ============================================================================

def analyze_semantic_similarity(embeddings, word_to_idx):
    """
    Analyze semantic relationships in learned embeddings.
    
    COSINE SIMILARITY FORMULA:
    cos_sim(a, b) = (a . b) / (||a|| * ||b||)
    
    TEST THESE PAIRS:
    - ("love", "amazing") - should be similar (both positive)
    - ("terrible", "awful") - should be similar (both negative)
    - ("love", "hate") - should be dissimilar (opposite sentiment)
    
    COMPARE TO ONE-HOT:
    - One-hot similarity was always 0 between different words
    - Embeddings should capture meaning!
    """
    print("=" * 60)
    print("Task 3.3: Semantic Similarity")
    print("=" * 60)
    
    test_pairs = [
        ("love", "amazing", "Both positive - should be similar"),
        ("terrible", "awful", "Both negative - should be similar"),
        ("love", "hate", "Opposite sentiment - should be dissimilar")
    ]
    
    print("\nCosine Similarity Analysis:")
    print("-" * 60)
    
    results = []
    
    for word1, word2, description in test_pairs:
        if word1 not in word_to_idx:
            print(f"'{word1}' not in vocabulary")
            continue
        if word2 not in word_to_idx:
            print(f"'{word2}' not in vocabulary")
            continue
        
        idx1 = word_to_idx[word1]
        idx2 = word_to_idx[word2]
        
        vec1 = embeddings[idx1]
        vec2 = embeddings[idx2]
        
        sim = cosine_similarity(vec1, vec2)
        results.append((word1, word2, sim, description))
        
        print(f"\nPair: '{word1}' vs '{word2}'")
        print(f"  Description: {description}")
        print(f"  Cosine Similarity: {sim:.4f}")
        print(f"  Interpretation: ", end="")
        
        if sim > 0.5:
            print("Highly similar ✓")
        elif sim > 0.2:
            print("Moderately similar")
        elif sim > -0.2:
            print("Neutral/Unrelated")
        else:
            print("Dissimilar ✓")
    
    print("\n" + "=" * 60)
    print("Summary:")
    print("=" * 60)
    print("Embeddings capture semantic meaning through vector similarity.")
    print("Unlike one-hot encoding (always 0 similarity), embeddings")
    print("learn meaningful relationships between words!")
    
    return results


def cosine_similarity(vec_a, vec_b):
    """
    Compute cosine similarity between two vectors.
    Returns value in [-1, 1], higher = more similar.
    """
    dot = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    return dot / (norm_a * norm_b + 1e-8)


In [71]:

# ============================================================================
# TASK 3.4: Visualization
# ============================================================================

def visualize_embeddings(embeddings, idx_to_word):
    """
    Visualize embeddings in 2D using t-SNE or PCA.
    
    STEPS:
    1. Reduce dimensions: sklearn.manifold.TSNE or sklearn.decomposition.PCA
    2. Plot words as scatter points
    3. Color by sentiment (if known)
    4. Label each point with the word
    
    EXPECTED: Positive words cluster together, negative words cluster together
    
    SEE: demo_03_word2vec_visualization.py for complete visualization code
    
    Save to: embedding_visualization.png
    """
    print("=" * 60)
    print("Task 3.4: Visualization")
    print("=" * 60)
    
    # YOUR CODE:
    # 1. Apply PCA or t-SNE to reduce to 2D
    # 2. Plot with plt.scatter
    # 3. Add word labels with plt.annotate
    pass



In [None]:

# ============================================================================
# TASK 3.5: Embedding Dimension Experiment
# ============================================================================

def experiment_embedding_dims():
    """
    Test how embedding dimension affects model performance.
    
    TEST DIMS: 8, 16, 32, 64, 128
    
    HYPOTHESIS: Higher dims = more capacity but more overfitting risk
    
    RECORD:
    - Training accuracy for each dim
    - Validation accuracy for each dim
    - Training time
    """
    print("=" * 60)
    print("Task 3.5: Embedding Dimension Experiment")
    print("=" * 60)
    
    import time

    print("=" * 60)
    print("Task 3.5: Embedding Dimension Experiment")
    print("=" * 60)
    
    texts = [
        "I love this movie it is amazing",
        "Great film excellent acting wonderful",
        "This movie is beautiful and amazing",
        "Terrible movie waste of time awful",
        "Bad film I hated it horrible",
        "Disappointing and boring terrible waste"
    ]
    labels = [1, 1, 1, 0, 0, 0]
    
    tokenizer = keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    
    vocab_size = len(tokenizer.word_index) + 1
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=max_length, padding='post'
    )
    
    X = np.array(padded_sequences)
    y = np.array(labels)
    
    dims = [8, 16, 32, 64, 128]
    results = []
    
    print(f"\nTesting embedding dimensions: {dims}")
    print(f"Vocabulary size: {vocab_size}")
    print(f"Training samples: {len(X)}")
    print(f"Using validation_split=0.33 for validation\n")
    
    print(f"{'Dim':<8} {'Train Acc':<12} {'Val Acc':<12} {'Time (s)':<12} {'Params':<12}")
    print("-" * 60)
    
    for embed_dim in dims:
        start_time = time.time()
        
        model = keras.Sequential([
            layers.Embedding(vocab_size, embed_dim, input_length=max_length),
            layers.GlobalAveragePooling1D(),
            layers.Dense(1, activation='sigmoid')
        ])
        
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
        
        history = model.fit(
            X, y,
            epochs=100,
            verbose=0,
            validation_split=0.33
        )
        
        training_time = time.time() - start_time
        
        train_acc = history.history['accuracy'][-1]
        val_acc = history.history['val_accuracy'][-1]
        
        embedding_params = vocab_size * embed_dim
        dense_params = embed_dim * 1 + 1
        params = embedding_params + dense_params
        
        results.append({
            'dim': embed_dim,
            'train_acc': train_acc,
            'val_acc': val_acc,
            'time': training_time,
            'params': params
        })
        
        print(f"{embed_dim:<8} {train_acc:<12.4f} {val_acc:<12.4f} {training_time:<12.2f} {params:<12,}")
    
    print("\n" + "=" * 60)
    print("Analysis:")
    print("=" * 60)
    
    best_val = max(results, key=lambda x: x['val_acc'])
    fastest = min(results, key=lambda x: x['time'])
    
    print(f"\nBest validation accuracy: {best_val['val_acc']:.4f} (dim={best_val['dim']})")
    print(f"Fastest training: {fastest['time']:.2f}s (dim={fastest['dim']})")
    
    print("\nObservations:")
    print("- Higher dimensions provide more capacity but may overfit")
    print("- Training time increases with dimension (more parameters)")
    print("- Optimal dimension balances capacity vs overfitting risk")

    print("=" * 60)
    print("NOTE")
    print("=" * 60)
    print("sample size may be too small for accurate experimentation")
    return results


In [None]:

# ============================================================================
# MAIN
# ============================================================================

if __name__ == "__main__":
    print("=" * 60)
    print("Exercise 03: Embedding Exploration")
    print("=" * 60)
    
    # Uncomment as you complete:
    explore_embedding_layer()
    embeddings, word_to_idx, idx_to_word = train_sentiment_classifier()
    analyze_semantic_similarity(embeddings, word_to_idx)
    visualize_embeddings(embeddings, idx_to_word)
    experiment_embedding_dims() # sample size may be too small for accurate experimentation
    

Exercise 03: Embedding Exploration
Task 3.1: Embedding Layer Basics
Vocabulary Size: 1000

Embedding Dim   Parameters      Memory (MB)    
---------------------------------------------
16              16,000          0.0610         
  Input shape: (1, 4)
  Output shape: (1, 4, 16)

32              32,000          0.1221         
  Input shape: (1, 4)
  Output shape: (1, 4, 32)

64              64,000          0.2441         
  Input shape: (1, 4)
  Output shape: (1, 4, 64)

128             128,000         0.4883         
  Input shape: (1, 4)
  Output shape: (1, 4, 128)

256             256,000         0.9766         
  Input shape: (1, 4)
  Output shape: (1, 4, 256)


Key Observations:
1. Parameters = vocab_size × embedding_dim
2. Memory = parameters × 4 bytes (float32)
3. Output shape adds embedding_dim to input shape
4. Higher dimensions = more capacity but more memory
Task 3.2: Training Embeddings
Vocabulary size: 25
Word to index mapping: {'movie': 1, 'i': 2, 'this': 3, 'it': 4, '




Training model...
Final training loss: 0.5044
Final training accuracy: 1.0000

Embedding matrix shape: (25, 16)
Embeddings for first 5 words:
  'movie': [ 0.01841424 -0.01449239 -0.05531515  0.09051642 -0.09392176]...
  'i': [-0.03516037  0.01319461 -0.03501838 -0.01548882 -0.03202377]...
  'this': [ 0.10323714 -0.17077634 -0.17741868  0.16375753 -0.11663765]...
  'it': [0.03138268 0.00988049 0.03247201 0.0311247  0.04561434]...
  'is': [ 0.10104796 -0.17052098 -0.1702164   0.110618   -0.14405133]...

Training Complete!
Task 3.3: Semantic Similarity

Cosine Similarity Analysis:
------------------------------------------------------------

Pair: 'love' vs 'amazing'
  Description: Both positive - should be similar
  Cosine Similarity: 0.9472
  Interpretation: Highly similar ✓

Pair: 'terrible' vs 'awful'
  Description: Both negative - should be similar
  Cosine Similarity: 0.9402
  Interpretation: Highly similar ✓
'hate' not in vocabulary

Summary:
Embeddings capture semantic meaning th