# Exercise 09: One-Hot vs. Dense Encoding

## Part 1: Implement One-Hot Encoding


In [37]:
import numpy as np


### Task 1.1: Create One-Hot Encoder


In [38]:
class OneHotEncoder:
    """
    One-hot encoder for a text vocabulary.

    Every distinct word receives an integer index in order of first
    appearance. A word is encoded as a vector of length ``vocab_size``
    that is all zeros except for a single 1 at the word's index.

    Attributes are plain instance fields:
        word2idx:   dict mapping word -> index
        idx2word:   dict mapping index -> word (inverse of word2idx)
        vocab_size: number of distinct words seen by ``fit``
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.vocab_size = 0

    def fit(self, words):
        """
        Build the vocabulary from an iterable of words.

        Repeated words are ignored after their first occurrence, so indices
        are always contiguous (0 .. vocab_size - 1) and ``vocab_size`` equals
        the number of distinct words. For an input without duplicates the
        resulting mapping is ``{word: position}``.

        Args:
            words: Iterable of words (duplicates allowed)
        """
        self.word2idx = {}
        for word in words:
            # setdefault keeps the index assigned at the first occurrence
            self.word2idx.setdefault(word, len(self.word2idx))
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        # Count distinct words, not raw input length, so vector width matches the mapping
        self.vocab_size = len(self.word2idx)

    def encode_word(self, word):
        """
        Encode a single word as a one-hot vector.

        A word that is not in the vocabulary yields an all-zero vector.

        Returns:
            numpy array of shape (vocab_size,)
        """
        vec = np.zeros(self.vocab_size)
        idx = self.word2idx.get(word)
        if idx is not None:
            vec[idx] = 1
        return vec

    def encode_sequence(self, words):
        """
        Encode a sequence of words as a matrix with one row per word.

        The result is always two-dimensional, including for an empty
        sequence, where the shape is (0, vocab_size).

        Returns:
            numpy array of shape (len(words), vocab_size)
        """
        words = list(words)  # accept any iterable, and know the row count up front
        encoded = np.zeros((len(words), self.vocab_size))
        for row, word in enumerate(words):
            idx = self.word2idx.get(word)
            if idx is not None:
                encoded[row, idx] = 1
        return encoded


In [39]:
# Smoke-test OneHotEncoder on a five-word vocabulary
vocabulary = ["cat", "dog", "bird", "fish", "horse"]
encoder = OneHotEncoder()
encoder.fit(vocabulary)

print("Vocabulary:", vocabulary)
print("Vocabulary size:", encoder.vocab_size)
print("\nWord to Index mapping:")
for word, idx in encoder.word2idx.items():
    print("  '{}' -> {}".format(word, idx))

# Show the one-hot vector for every vocabulary word (cast to int for compact display)
print("\nOne-hot vectors for each word:")
for word in vocabulary:
    vec = encoder.encode_word(word)
    print("  '{}': {}".format(word, vec.astype(int)))

# Encode a short sentence into a (len(sentence), vocab_size) matrix
sentence = ["cat", "dog", "bird"]
encoded = encoder.encode_sequence(sentence)
print("\nSequence:", sentence)
print("Encoded shape:", encoded.shape)
print("Encoded matrix:", encoded.astype(int), sep="\n")


Vocabulary: ['cat', 'dog', 'bird', 'fish', 'horse']
Vocabulary size: 5

Word to Index mapping:
  'cat' -> 0
  'dog' -> 1
  'bird' -> 2
  'fish' -> 3
  'horse' -> 4

One-hot vectors for each word:
  'cat': [1 0 0 0 0]
  'dog': [0 1 0 0 0]
  'bird': [0 0 1 0 0]
  'fish': [0 0 0 1 0]
  'horse': [0 0 0 0 1]

Sequence: ['cat', 'dog', 'bird']
Encoded shape: (3, 5)
Encoded matrix:
[[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]]


### Task 1.2: Measure Memory Usage


In [40]:
def measure_memory(vocab_size, seq_length, dtype=np.float32):
    """
    Estimate the memory footprint of a (seq_length, vocab_size) array.

    The size is vocab_size * seq_length * itemsize(dtype), formatted with
    binary units (1 KB = 1024 bytes).

    Args:
        vocab_size: Width of each vector
        seq_length: Number of vectors in the sequence
        dtype: Element type used to look up the per-element size

    Returns:
        Human-readable string such as "400 bytes", "390.62 KB",
        "19.07 MB" or "3.73 GB"
    """
    kb = 1024
    mb = 1024 * 1024
    gb = 1024 * 1024 * 1024

    size = vocab_size * seq_length * np.dtype(dtype).itemsize

    # Check the largest unit first and fall through to raw bytes
    if size >= gb:
        return f"{size / gb:.2f} GB"
    if size >= mb:
        return f"{size / mb:.2f} MB"
    if size >= kb:
        return f"{size / kb:.2f} KB"
    return f"{size} bytes"


In [41]:
# One-hot footprint of a 100-token sequence at several realistic vocabulary sizes
vocab_sizes = [1_000, 10_000, 50_000, 100_000]
seq_length = 100

print("Memory usage for one-hot encoding:", "-" * 50, sep="\n")
for v in vocab_sizes:
    mem = measure_memory(v, seq_length)
    print("Vocab {:,}: {}".format(v, mem))


Memory usage for one-hot encoding:
--------------------------------------------------
Vocab 1,000: 390.62 KB
Vocab 10,000: 3.81 MB
Vocab 50,000: 19.07 MB
Vocab 100,000: 38.15 MB


## Part 2: The Similarity Problem


### Task 2.1: Compute Cosine Similarity


In [42]:
def cosine_similarity(vec_a, vec_b):
    """
    Compute cosine similarity between two vectors.

    Formula: cos_sim = (A dot B) / (||A|| * ||B||)

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case so that every code path yields a float.

    Args:
        vec_a: First vector (1-D array-like)
        vec_b: Second vector, same length as vec_a

    Returns:
        Similarity score as a float, nominally between -1 and 1
    """
    dot_product = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)

    # Guard against division by zero; return a float (not int 0) for a consistent type
    if norm_a == 0 or norm_b == 0:
        return 0.0

    return dot_product / (norm_a * norm_b)


### Task 2.2: Demonstrate the Problem


In [43]:
# Create encoder with semantic word groups
# Create encoder with semantic word groups
words = [
    "cat", "dog", "bird",           # Animals
    "car", "truck", "motorcycle",   # Vehicles
    "happy", "sad", "angry",        # Emotions
    "run", "walk", "jump"           # Actions
]

encoder = OneHotEncoder()
encoder.fit(words)

# Compute similarity between selected pairs
print("One-Hot Similarities:")
print("-" * 40)

# Pairs drawn from the same semantic group
pairs_similar = [("cat", "dog"), ("car", "truck"), ("happy", "sad")]
for w1, w2 in pairs_similar:
    v1 = encoder.encode_word(w1)
    v2 = encoder.encode_word(w2)
    sim = cosine_similarity(v1, v2)
    print(f"{w1} vs {w2}: {sim:.4f}")  # Should all be 0!

# Pairs drawn from different semantic groups
pairs_different = [("cat", "car"), ("happy", "truck")]
for w1, w2 in pairs_different:
    v1 = encoder.encode_word(w1)
    v2 = encoder.encode_word(w2)
    sim = cosine_similarity(v1, v2)
    print(f"{w1} vs {w2}: {sim:.4f}")  # Also 0!

print("\nProblem: All similarities are 0")
# Spelling of "different" corrected in the message below
print("One-hot cannot distinguish semantic relationships between different words.")


One-Hot Similarities:
----------------------------------------
cat vs dog: 0.0000
car vs truck: 0.0000
happy vs sad: 0.0000
cat vs car: 0.0000
happy vs truck: 0.0000

Problem: All similarities are 0
One-hot cannot distinguish semantic relationships between differemt words.


## Part 3: Compare with Dense Embeddings


### Task 3.1: Simulate Dense Embeddings


In [44]:
def create_semantic_embeddings(words, embedding_dim=8):
    """
    Create simulated dense embeddings with semantic structure.

    Each built-in category gets a random base vector; every word in the
    category is that base vector plus small Gaussian noise (scale 0.1), so
    words in the same category end up with similar embeddings.

    The random numbers come from a private generator seeded with 42, so the
    result is reproducible and the global NumPy random state is left
    untouched.

    Args:
        words: Accepted for interface compatibility but not used; the
            vocabulary is fixed by the built-in category table below.
        embedding_dim: Length of each embedding vector

    Returns:
        dict mapping each built-in word to a numpy array of shape
        (embedding_dim,)
    """
    # Private legacy generator: same stream as np.random.seed(42) followed by
    # np.random.randn, but without reseeding the global RNG as a side effect.
    rng = np.random.RandomState(42)

    # Define semantic categories
    categories = {
        "animal": ["cat", "dog", "bird"],
        "vehicle": ["car", "truck", "motorcycle"],
        "emotion": ["happy", "sad", "angry"],
        "action": ["run", "walk", "jump"]
    }

    # All base vectors are drawn first, in category order, before any noise is drawn
    category_vectors = {
        cat: rng.randn(embedding_dim) for cat in categories
    }

    embeddings = {}
    for category, word_list in categories.items():
        base = category_vectors[category]
        for word in word_list:
            # Add small random noise to base vector
            noise = rng.randn(embedding_dim) * 0.1
            embeddings[word] = base + noise

    return embeddings


### Task 3.2: Compare Similarities


In [45]:
embeddings = create_semantic_embeddings(words)

print("\nDense Embedding Similarities:")
print("-" * 40)

# Same pairs as the one-hot demo: same-category pairs come first (expected HIGH),
# followed by cross-category pairs (expected LOW)
for w1, w2 in pairs_similar + pairs_different:
    sim = cosine_similarity(embeddings[w1], embeddings[w2])
    print("{} vs {}: {:.4f}".format(w1, w2, sim))



Dense Embedding Similarities:
----------------------------------------
cat vs dog: 0.9931
car vs truck: 0.9906
happy vs sad: 0.9946
cat vs car: -0.5199
happy vs truck: 0.3487


### Task 3.3: Memory Comparison


In [46]:
def compare_memory(vocab_size, seq_length, embedding_dim=100):
    """
    Print a memory comparison between one-hot and dense representations.

    Both representations are assumed to store float32 values (4 bytes per
    element). The report shows the formatted size of each and how many
    times larger the one-hot representation is.

    Args:
        vocab_size: One-hot vector width
        seq_length: Number of tokens in the sequence
        embedding_dim: Dense vector width
    """
    bytes_per_value = 4  # float32
    one_hot_bytes = vocab_size * seq_length * bytes_per_value
    dense_bytes = embedding_dim * seq_length * bytes_per_value
    ratio = one_hot_bytes / dense_bytes

    report = [
        f"Memory Comparison (vocab_size={vocab_size:,}, seq_length={seq_length}, embedding_dim={embedding_dim}):",
        "-" * 60,
        f"One-hot encoding: {measure_memory(vocab_size, seq_length)}",
        f"Dense embeddings: {measure_memory(embedding_dim, seq_length)}",
        f"Ratio (one-hot / dense): {ratio:.1f}x",
        f"\nDense embeddings use {ratio:.1f}x less memory!",
    ]
    print("\n".join(report))

compare_memory(50000, 100, embedding_dim=100)
# Expected: Dense uses ~500x less memory!


Memory Comparison (vocab_size=50,000, seq_length=100, embedding_dim=100):
------------------------------------------------------------
One-hot encoding: 19.07 MB
Dense embeddings: 39.06 KB
Ratio (one-hot / dense): 500.0x

Dense embeddings use 500.0x less memory!


## Summary Table


In [47]:
# Calculate values for summary table
vocab_size = 50000
seq_length = 100
embedding_dim = 100

# Memory per word (one-hot)
one_hot_per_word_bytes = vocab_size * 4  # float32
one_hot_per_word = measure_memory(vocab_size, 1)

# Memory per word (dense)
dense_per_word_bytes = embedding_dim * 4  # float32
dense_per_word = measure_memory(embedding_dim, 1)

# Memory per sequence (one-hot)
one_hot_per_seq = measure_memory(vocab_size, seq_length)

# Memory per sequence (dense)
dense_per_seq = measure_memory(embedding_dim, seq_length)

# Similarities (from previous cells)
# One-hot: cat-dog = 0.0000, cat-car = 0.0000
# Dense: cat-dog = 0.9931, cat-car = -0.5199

print("Summary Table:")
print("=" * 70)
print(f"{'Metric':<40} {'One-Hot':<20} {'Dense (dim=100)':<20}")
print("-" * 70)
print(f"{'Memory per word (50K vocab)':<40} {one_hot_per_word:<20} {dense_per_word:<20}")
print(f"{'Memory per sequence (100 words)':<40} {one_hot_per_seq:<20} {dense_per_seq:<20}")
print(f"{'Similarity: cat-dog':<40} {'0.0000':<20} {'0.9931':<20}")
print(f"{'Similarity: cat-car':<40} {'0.0000':<20} {'-0.5199':<20}")
print(f"{'Captures semantics?':<40} {'No':<20} {'Yes':<20}")
print("=" * 70)


Summary Table:
Metric                                   One-Hot              Dense (dim=100)     
----------------------------------------------------------------------
Memory per word (50K vocab)              195.31 KB            400 bytes           
Memory per sequence (100 words)          19.07 MB             39.06 KB            
Similarity: cat-dog                      0.0000               0.9931              
Similarity: cat-car                      0.0000               -0.5199             
Captures semantics?                      No                   Yes                 


## Reflection Questions


### 1. What is the sparsity of one-hot vectors? (What percentage of values are zero?) How does this waste computation?


In [48]:
# Calculate sparsity for different vocabulary sizes
vocab_sizes = [100, 1000, 10000, 50000, 100000]

print("Sparsity of One-Hot Vectors:")
print("-" * 50)
for vocab_size in vocab_sizes:
    nonzero = 1  # Only one position is 1
    total = vocab_size
    sparsity = (total - nonzero) / total * 100
    print(f"Vocab size {vocab_size:>7,}: {sparsity:.4f}% zeros ({nonzero}/{total} non-zero)")

print("\nAnswer:")
print("One-hot vectors are extremely sparse:")
print("- For a vocabulary of 50,000 words, 99.998% of values are zeros")
print("- Only 1 out of 50,000 positions contains a 1")
print("\nThis wastes computation because:")
print("1. Most operations multiply by zero (no effect)")
print("2. Memory is allocated for all zeros but never used")
print("3. Matrix operations become inefficient with sparse data")
print("4. Cache misses increase due to large memory footprint")


Sparsity of One-Hot Vectors:
--------------------------------------------------
Vocab size     100: 99.0000% zeros (1/100 non-zero)
Vocab size   1,000: 99.9000% zeros (1/1000 non-zero)
Vocab size  10,000: 99.9900% zeros (1/10000 non-zero)
Vocab size  50,000: 99.9980% zeros (1/50000 non-zero)
Vocab size 100,000: 99.9990% zeros (1/100000 non-zero)

Answer:
One-hot vectors are extremely sparse:
- For a vocabulary of 50,000 words, 99.998% of values are zeros
- Only 1 out of 50,000 positions contains a 1

This wastes computation because:
1. Most operations multiply by zero (no effect)
2. Memory is allocated for all zeros but never used
3. Matrix operations become inefficient with sparse data
4. Cache misses increase due to large memory footprint


### 2. Why do one-hot vectors have zero similarity? Think about what the dot product of two one-hot vectors equals.


In [49]:
# Demonstrate why one-hot vectors have zero similarity
cat_vec = encoder.encode_word("cat")
dog_vec = encoder.encode_word("dog")

print("Example: cat and dog one-hot vectors")
print(f"cat vector: {cat_vec.astype(int)}")
print(f"dog vector: {dog_vec.astype(int)}")
print()

# Dot product
dot_product = np.dot(cat_vec, dog_vec)
print(f"Dot product (cat · dog): {dot_product}")

# Norms
norm_cat = np.linalg.norm(cat_vec)
norm_dog = np.linalg.norm(dog_vec)
print(f"||cat|| = {norm_cat:.4f}")
print(f"||dog|| = {norm_dog:.4f}")
print()

# Cosine similarity
cos_sim = cosine_similarity(cat_vec, dog_vec)
print(f"Cosine similarity: {cos_sim:.4f}")

print("\nAnswer:")
print("One-hot vectors have zero similarity because:")
print("1. Each one-hot vector has exactly one '1' and all other positions are '0'")
print("2. Different words have their '1' at different positions")
print("3. The dot product of two different one-hot vectors is always 0")
print("   (since they never have '1' at the same position)")
print("4. Cosine similarity = (A · B) / (||A|| * ||B||)")
print("   Since A · B = 0, cosine similarity = 0")
print("5. One-hot vectors are orthogonal (perpendicular) to each other")


Example: cat and dog one-hot vectors
cat vector: [1 0 0 0 0 0 0 0 0 0 0 0]
dog vector: [0 1 0 0 0 0 0 0 0 0 0 0]

Dot product (cat · dog): 0.0
||cat|| = 1.0000
||dog|| = 1.0000

Cosine similarity: 0.0000

Answer:
One-hot vectors have zero similarity because:
1. Each one-hot vector has exactly one '1' and all other positions are '0'
2. Different words have their '1' at different positions
3. The dot product of two different one-hot vectors is always 0
   (since they never have '1' at the same position)
4. Cosine similarity = (A · B) / (||A|| * ||B||)
   Since A · B = 0, cosine similarity = 0
5. One-hot vectors are orthogonal (perpendicular) to each other


### 3. How do dense embeddings capture "cat is similar to dog"? What property of the vectors enables this?


In [50]:
# Demonstrate dense embeddings
cat_emb = embeddings["cat"]
dog_emb = embeddings["dog"]
car_emb = embeddings["car"]

print("Dense Embeddings (first 5 dimensions):")
print(f"cat:  {cat_emb[:5]}")
print(f"dog:  {dog_emb[:5]}")
print(f"car:  {car_emb[:5]}")
print()

# Show they share similar values
cat_dog_sim = cosine_similarity(cat_emb, dog_emb)
cat_car_sim = cosine_similarity(cat_emb, car_emb)

print(f"Similarity cat-dog: {cat_dog_sim:.4f}")
print(f"Similarity cat-car: {cat_car_sim:.4f}")
print()

# Show they're in the same semantic space
print("Key properties:")
print("1. Both cat and dog embeddings have non-zero values in ALL dimensions")
print("2. They share similar values because they come from the same category base vector")
print("3. The dot product is non-zero (unlike one-hot)")
print("4. Cosine similarity captures the angle between vectors in high-dimensional space")

print("\nAnswer:")
print("Dense embeddings capture semantic similarity through:")
print("1. Shared dimensions: Both vectors have values in the same dimensions")
print("2. Similar magnitudes: Words in the same category have similar vector values")
print("3. Non-zero dot product: Unlike one-hot, dense vectors can have non-zero dot products")
print("4. Geometric proximity: Similar words are close in the embedding space")
print("5. Learned representations: Embeddings can be trained to capture semantic relationships")
print("6. Continuous values: All dimensions contribute to similarity, not just one position")


Dense Embeddings (first 5 dimensions):
cat:  [ 0.49536443 -0.24403539  0.72994303  1.40094549 -0.21326702]
dog:  [ 0.57056081 -0.12112747  0.63612371  1.49291949 -0.38200557]
car:  [-0.55339614  0.51163881 -0.43029135 -0.36817524  0.19404485]

Similarity cat-dog: 0.9931
Similarity cat-car: -0.5199

Key properties:
1. Both cat and dog embeddings have non-zero values in ALL dimensions
2. They share similar values because they come from the same category base vector
3. The dot product is non-zero (unlike one-hot)
4. Cosine similarity captures the angle between vectors in high-dimensional space

Answer:
Dense embeddings capture semantic similarity through:
1. Shared dimensions: Both vectors have values in the same dimensions
2. Similar magnitudes: Words in the same category have similar vector values
3. Non-zero dot product: Unlike one-hot, dense vectors can have non-zero dot products
4. Geometric proximity: Similar words are close in the embedding space
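5. Learned representations: Embeddings can be trained to capture semantic relationships
6. Continuous values: All dimensions contribute to similarity, not just one position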

### 4. If vocabulary size is 100K and embedding dimension is 300, how much more efficient are dense embeddings?


In [51]:
# Calculate efficiency for 100K vocab and 300-dim embeddings
vocab_size_large = 100000
embedding_dim_large = 300
seq_length_example = 100

# Memory calculations
one_hot_bytes = vocab_size_large * seq_length_example * 4
dense_bytes = embedding_dim_large * seq_length_example * 4
ratio = one_hot_bytes / dense_bytes

print(f"Memory Efficiency Comparison:")
print(f"Vocabulary size: {vocab_size_large:,}")
print(f"Embedding dimension: {embedding_dim_large}")
print(f"Sequence length: {seq_length_example}")
print("-" * 60)
print(f"One-hot encoding: {measure_memory(vocab_size_large, seq_length_example)}")
print(f"Dense embeddings: {measure_memory(embedding_dim_large, seq_length_example)}")
print(f"Efficiency ratio: {ratio:.1f}x")
print()

# Per word comparison
one_hot_per_word = vocab_size_large * 4
dense_per_word = embedding_dim_large * 4
ratio_per_word = one_hot_per_word / dense_per_word

print(f"Memory per word:")
print(f"One-hot: {measure_memory(vocab_size_large, 1)}")
print(f"Dense: {measure_memory(embedding_dim_large, 1)}")
print(f"Efficiency ratio: {ratio_per_word:.1f}x")
print()

print("Answer:")
print(f"For vocabulary size 100K and embedding dimension 300:")
print(f"- Dense embeddings use {ratio:.1f}x less memory per sequence")
print(f"- Dense embeddings use {ratio_per_word:.1f}x less memory per word")
print(f"- This means dense embeddings are approximately {ratio:.0f}x more memory-efficient")
print(f"\nAdditionally:")
print(f"- Dense embeddings capture semantic relationships (one-hot cannot)")
print(f"- Dense embeddings are less sparse (all values are used)")
print(f"- Dense embeddings enable better generalization in neural networks")


Memory Efficiency Comparison:
Vocabulary size: 100,000
Embedding dimension: 300
Sequence length: 100
------------------------------------------------------------
One-hot encoding: 38.15 MB
Dense embeddings: 117.19 KB
Efficiency ratio: 333.3x

Memory per word:
One-hot: 390.62 KB
Dense: 1.17 KB
Efficiency ratio: 333.3x

Answer:
For vocabulary size 100K and embedding dimension 300:
- Dense embeddings use 333.3x less memory per sequence
- Dense embeddings use 333.3x less memory per word
- This means dense embeddings are approximately 333x more memory-efficient

Additionally:
- Dense embeddings capture semantic relationships (one-hot cannot)
- Dense embeddings are less sparse (all values are used)
- Dense embeddings enable better generalization in neural networks


## Summary Table


In [52]:
# Calculate values for summary table
vocab_50k = 50000
seq_100 = 100
embed_dim = 100

# Memory per word (50K vocab)
one_hot_per_word_bytes = vocab_50k * 4
dense_per_word_bytes = embed_dim * 4

# Memory per sequence (100 words)
one_hot_seq_bytes = vocab_50k * seq_100 * 4
dense_seq_bytes = embed_dim * seq_100 * 4

print("Summary Table Results:")
print("=" * 70)
print(f"{'Metric':<40} {'One-Hot':<20} {'Dense (dim=100)':<20}")
print("-" * 70)
print(f"{'Memory per word (50K vocab)':<40} {measure_memory(vocab_50k, 1):<20} {measure_memory(embed_dim, 1):<20}")
print(f"{'Memory per sequence (100 words)':<40} {measure_memory(vocab_50k, seq_100):<20} {measure_memory(embed_dim, seq_100):<20}")
print(f"{'Similarity: cat-dog':<40} {'0.0000':<20} {'0.9931':<20}")
print(f"{'Similarity: cat-car':<40} {'0.0000':<20} {'-0.5199':<20}")
print(f"{'Captures semantics?':<40} {'No':<20} {'Yes':<20}")
print("=" * 70)


Summary Table Results:
Metric                                   One-Hot              Dense (dim=100)     
----------------------------------------------------------------------
Memory per word (50K vocab)              195.31 KB            400 bytes           
Memory per sequence (100 words)          19.07 MB             39.06 KB            
Similarity: cat-dog                      0.0000               0.9931              
Similarity: cat-car                      0.0000               -0.5199             
Captures semantics?                      No                   Yes                 


## Reflection Questions


### 1. What is the sparsity of one-hot vectors? (What percentage of values are zero?) How does this waste computation?


In [53]:
# Calculate sparsity for different vocab sizes
vocab_sizes = [100, 1000, 10000, 50000, 100000]

print("One-Hot Vector Sparsity:")
print("-" * 50)
for v in vocab_sizes:
    sparsity_pct = ((v - 1) / v) * 100
    print(f"Vocab size {v:>7,}: {sparsity_pct:.4f}% zeros")

print("\nAnswer:")
print("For a vocabulary of size V, one-hot vectors have (V-1)/V * 100% zeros,")
print("which approaches 100% for large vocabularies (e.g., 99.999% for 100K vocab).")
print("This wastes computation because most operations (dot products, matrix multiplications)")
print("involve multiplying zeros, which is no very efficient.")


One-Hot Vector Sparsity:
--------------------------------------------------
Vocab size     100: 99.0000% zeros
Vocab size   1,000: 99.9000% zeros
Vocab size  10,000: 99.9900% zeros
Vocab size  50,000: 99.9980% zeros
Vocab size 100,000: 99.9990% zeros

Answer:
For a vocabulary of size V, one-hot vectors have (V-1)/V * 100% zeros,
which approaches 100% for large vocabularies (e.g., 99.999% for 100K vocab).
This wastes computation because most operations (dot products, matrix multiplications)
involve multiplying zeros, which is computationally inefficient.


### 2. Why do one-hot vectors have zero similarity? Think about what the dot product of two one-hot vectors equals.


In [55]:
# Written answer to reflection question 2 (spelling and grammar corrected)
print("\nAnswer:")
print("Since most of the values in the vector are 0, each step in the dot product multiplication will end up with 0x0 or 0x1, which is 0")



Answer:
Since most of the valeus in the vecotr is 0, each step in dot product mutliplication will end up with 0x0 or 0x1, which is 0


### 3. How do dense embeddings capture "cat is similar to dog"? What property of the vectors enables this?


In [56]:
# Pull the three embeddings used in the comparison below
cat_emb = embeddings["cat"]
dog_emb = embeddings["dog"]
car_emb = embeddings["car"]  # previously named `cat_car_emb`, although it holds the "car" vector

# Same-category vs cross-category cosine similarity
cat_dog_sim = cosine_similarity(cat_emb, dog_emb)
cat_car_sim = cosine_similarity(cat_emb, car_emb)

print(f"Cat embedding (first 5 values): {cat_emb[:5]}")
print(f"Dog embedding (first 5 values): {dog_emb[:5]}")
print(f"Car embedding (first 5 values): {car_emb[:5]}")
print(f"\nCat-Dog similarity: {cat_dog_sim:.4f}")
print(f"Cat-Car similarity: {cat_car_sim:.4f}")

# Written answer to reflection question 3 (spelling corrected)
print("\nAnswer:")
print("Words that mean similar things or show up in similar contexts end up having vectors of relatively similar values.")


Cat embedding (first 5 values): [ 0.49536443 -0.24403539  0.72994303  1.40094549 -0.21326702]
Dog embedding (first 5 values): [ 0.57056081 -0.12112747  0.63612371  1.49291949 -0.38200557]
Car embedding (first 5 values): [-0.55339614  0.51163881 -0.43029135 -0.36817524  0.19404485]

Cat-Dog similarity: 0.9931
Cat-Car similarity: -0.5199

Answer:
Dense embeddings capture semantic similarity because words in similar categories
share similar vector values (high cosine similarity), enabled by the property that
semantically related words have vectors pointing in similar directions in the embedding space.


### 4. If vocabulary size is 100K and embedding dimension is 300, how much more efficient are dense embeddings?


In [57]:
# Per-word footprint: 100K-wide one-hot vector vs 300-dimensional dense vector
vocab_100k, embed_300 = 100000, 300

# float32 = 4 bytes per value
one_hot_100k = vocab_100k * 4
dense_300 = embed_300 * 4
efficiency_ratio = one_hot_100k / dense_300

print("\n".join([
    "Memory per word:",
    f"  One-hot (100K vocab): {measure_memory(vocab_100k, 1)}",
    f"  Dense (300 dim): {measure_memory(embed_300, 1)}",
    f"\nEfficiency ratio: {efficiency_ratio:.1f}x",
]))

# Written answer to reflection question 4
print("\n".join([
    "\nAnswer:",
    "For vocabulary size 100K and embedding dimension 300, dense embeddings",
    f"are {efficiency_ratio:.0f}x more memory-efficient than one-hot encoding.",
]))


Memory per word:
  One-hot (100K vocab): 390.62 KB
  Dense (300 dim): 1.17 KB

Efficiency ratio: 333.3x

Answer:
For vocabulary size 100K and embedding dimension 300, dense embeddings
are 333x more memory-efficient than one-hot encoding.
