In [10]:
"""
Exercise 02: Embedding Exploration - Starter Code

Complete the TODOs to explore text embeddings and similarity.

Prerequisites:
- pip install numpy sentence-transformers

Hints:
- Demo 02 (demo_02_embedding_generation.py) shows model loading
- Reading 05 (vector-similarity-concepts.md) has the cosine formula
"""

import numpy as np
# TODO 1.1: Import SentenceTransformer from sentence_transformers
from sentence_transformers import SentenceTransformer


In [11]:

# ============================================================================
# PART 1: Load the Embedding Model
# ============================================================================

# Section banner: the title framed by two 60-character rules.
print("=" * 60, "Part 1: Loading the Embedding Model", "=" * 60, sep="\n")

# TODO 1.1: Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model.
# The resulting `model` object is what the later cells call encode() on.
model = SentenceTransformer("all-MiniLM-L6-v2")

# TODO 1.2: Ask the model how long its embedding vectors are and report it.
dimensions = model.get_sentence_embedding_dimension()
print(f"Embedding dimension: {dimensions}")


Part 1: Loading the Embedding Model
Embedding dimension: 384


In [12]:
# ============================================================================
# PART 2: Generate Embeddings
# ============================================================================

print("\n" + "=" * 60)
print("Part 2: Generate Embeddings")
print("=" * 60)

# TODO 2.1: Encode a single sentence
single_sentence = "Machine learning is transforming industries"
# Passing one string to encode() gives back one embedding vector.
single_embedding = model.encode(single_sentence)

# Summarize the embedding: its shape, a short preview, and its value range.
# (The starter template's "is not None" fallback was dropped: the indexing
# and min/max calls below would fail on None anyway, so it guarded nothing.)
print(f"Shape: {single_embedding.shape}")
print(f"First 5 values: {single_embedding[:5]}")
print(f"Min: {np.min(single_embedding)}, Max: {np.max(single_embedding)}")



Part 2: Generate Embeddings
Shape: (384,)
First 5 values: [ 0.00736547 -0.0427747   0.06799109 -0.00174222  0.06531589]
Min: -0.12663623690605164, Max: 0.1434013843536377


In [13]:
# TODO 2.2: Batch encode these sentences
# The list mixes animal sentences, programming sentences, and one that uses
# "python" in the animal sense, so the similarity analysis has clear
# related and unrelated pairs to compare.
sentences = [
    "The cat sat on the mat",
    "A kitten rested on the rug",
    "Dogs are loyal companions",
    "Python is a programming language",
    "The python snake is quite long",
    "I love coding in Python"
]

# Passing the whole list to encode() embeds every sentence in one call;
# row i of the result corresponds to sentences[i].
embeddings = model.encode(sentences)

# The starter template's "is not None" fallback is no longer needed now
# that the encode() call above is filled in.
print(f"\nBatch shape: {embeddings.shape}")




Batch shape: (6, 384)


In [14]:

# ============================================================================
# PART 3: Calculate Similarities
# ============================================================================

# Section banner for Part 3: blank line, rule, title, rule.
print("\n" + "=" * 60, "Part 3: Calculate Similarities", "=" * 60, sep="\n")

# TODO 3.1: Implement cosine similarity
def cosine_similarity(a, b):
    """
    Calculate cosine similarity between two vectors.

    Formula: cos(θ) = (A · B) / (||A|| × ||B||)

    Args:
        a: First vector (1-D array-like of numbers).
        b: Second vector, with the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``: 1.0 when they point
        the same way, 0.0 when they are orthogonal, -1.0 when they point in
        opposite directions. If either vector has zero magnitude the angle
        is undefined, and 0.0 is returned instead of dividing by zero.
    """
    dot_product = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-length vector has no direction; return 0.0 rather than letting
    # 0/0 produce NaN (and a RuntimeWarning) that would leak into the matrix.
    if denominator == 0:
        return 0.0
    return dot_product / denominator

# TODO 3.2: Build similarity matrix
# Score every ordered pair of sentences (each sentence against itself too),
# storing the result in a square matrix and echoing it one row per line.
print("\nSimilarity Matrix:", "-" * 40, sep="\n")

similarity_matrix = np.zeros((len(sentences), len(sentences)))

for row in range(len(sentences)):
    # One matrix row: sentence `row` compared with every sentence in turn.
    row_scores = [
        cosine_similarity(embeddings[row], embeddings[col])
        for col in range(len(sentences))
    ]
    similarity_matrix[row, :] = row_scores
    # Four decimals per score; every score, including the last, is followed
    # by a single space before the line break.
    print(*(f"{score:.4f}" for score in row_scores), end=" \n")



Part 3: Calculate Similarities

Similarity Matrix:
----------------------------------------
1.0000 0.6131 0.1646 0.0309 0.1345 0.0337 
0.6131 1.0000 0.1545 0.0472 0.1288 0.0294 
0.1646 0.1545 1.0000 0.1078 0.0822 0.1410 
0.0309 0.0472 0.1078 1.0000 0.4421 0.7304 
0.1345 0.1288 0.0822 0.4421 1.0000 0.4036 
0.0337 0.0294 0.1410 0.7304 0.4036 1.0000 


In [15]:
# ============================================================================
# PART 3.3: Analysis Questions
# ============================================================================

print("\n" + "=" * 60)
print("Part 3.3: Analysis")
print("=" * 60)

# Answers to the analysis questions, read off the similarity matrix.
# Sentence numbers are 0-based indices into `sentences`. The scores are
# hard-coded from a recorded run, so re-check them if the model or the
# sentence list changes.

print("""
Q1: Which two sentences have the highest similarity (besides identical)?
    Answer: sentences 3 and 5 (similarity: 0.7304)

Q2: How similar are 'Python is a programming language' and 'The python snake is quite long'?
    Similarity score: 0.4421
    Interpretation: Not very similar

Q3: Which sentence is most 'isolated' (lowest average similarity)?
    Answer: Sentence 2 ("Dogs are loyal companions") with average similarity 0.1300
""")


Part 3.3: Analysis

Q1: Which two sentences have the highest similarity (besides identical)?
    Answer: sentences 3 and 5 (similarity: 0.7304)

Q2: How similar are 'Python is a programming language' and 'The python snake is quite long'?
    Similarity score: score: 0.4421
    Interpretation: Not very similar

Q3: Which sentence is most 'isolated' (lowest average similarity)?
    Answer: Sentence 2 ("Dogs are loyal companions") with average similarity 0.1300



In [16]:

# ============================================================================
# PART 4: Semantic Clustering
# ============================================================================

# Section banner for Part 4: blank line, rule, title, rule.
print("\n" + "=" * 60, "Part 4: Semantic Clustering", "=" * 60, sep="\n")

# TODO 4.1: Group sentences by topic
# Manual grouping of the six sentences; the numbers in parentheses are
# 0-based indices into `sentences`.
print("""
Cluster A (Animal-related):
    - "The cat sat on the mat" (sentence 0)
    - "A kitten rested on the rug" (sentence 1)
    - "Dogs are loyal companions" (sentence 2)

Cluster B (Programming-related):
    - "Python is a programming language" (sentence 3)
    - "I love coding in Python" (sentence 5)

Outliers:
    - "The python snake is quite long" (sentence 4) - ambiguous between animal and programming contexts
""")


Part 4: Semantic Clustering

Cluster A (Animal-related):
    - "The cat sat on the mat" (sentence 0)
    - "A kitten rested on the rug" (sentence 1)
    - "Dogs are loyal companions" (sentence 2)

Cluster B (Programming-related):
    - "Python is a programming language" (sentence 3)
    - "I love coding in Python" (sentence 5)

Outliers:
    - "The python snake is quite long" (sentence 4) - ambiguous between animal and programming contexts



In [17]:
# TODO 4.2: What similarity threshold would you use?
# Sentence numbers in the text are 0-based indices into `sentences`, matching
# the numbering used in the other answer cells. The quoted score is rounded
# from a recorded run of the similarity matrix.
print("""
Recommended threshold for 'related' sentences: 0.45
Justification: The most ambiguous case in this dataset is sentence 4
               "The python snake is quite long" and sentence 5
               "I love coding in Python" both use the word Python but
               are referring to different things.

               The similarity score for those sentences is 0.40 so having
               the threshold a little higher than that sounds like a good
               middle ground.
""")


print("\n" + "=" * 60)
print("Exercise Complete!")
print("=" * 60)



Recommended threshold for 'related' sentences: 0.45
Justification: The most ambiguous case in this dataset is sentence 4
               "The python snake is quite long" and sentence 6
               "I love coding in Python" both sue the word Python but
               are referring to different things.

               The similarity score for those sentences is 0.40 so having
               the thresnhold a little higher than that sounds like a good
               middle ground.


Exercise Complete!
