In [1]:
"""
Exercise 01: Metric Comparison - Starter Code

Compare Euclidean (L2) and Cosine distance metrics.

Prerequisites:
- pip install chromadb sentence-transformers numpy

Hints:
- Reading 02 (distance-metrics-euclidean-cosine.md) has formulas
- Demo 01 shows collection creation with different metrics
"""

import chromadb
import numpy as np
from sentence_transformers import SentenceTransformer

# ============================================================================
# SETUP
# ============================================================================

print("=" * 60)
print("Exercise 01: Metric Comparison")
print("=" * 60)

client = chromadb.Client()
model = SentenceTransformer('all-MiniLM-L6-v2')

  if not hasattr(np, "object"):


Exercise 01: Metric Comparison


In [4]:

# ============================================================================
# PART 1: Mathematical Understanding (by hand first!)
# ============================================================================

print("\n" + "=" * 60)
print("Part 1: Mathematical Understanding")
print("=" * 60)

# Task 1.1: Calculate by hand for these vectors:
A = np.array([1, 2])
B = np.array([3, 6])  # Same direction as A, different magnitude
C = np.array([2, 1])  # Different direction from A

print("""
VECTORS:
  A = [1, 2]
  B = [3, 6] (same direction as A)
  C = [2, 1] (different direction from A)

YOUR HAND CALCULATIONS:
  Euclidean A→B: ___4.47____
  Euclidean A→C: ____1.41___
  Cosine similarity A·B: ___1.0____
  Cosine similarity A·C: ___0.8____

PREDICTIONS:
  Which is "closer" to A using Euclidean? ___C____
  Which is "closer" to A using Cosine? ___B____
""")

# TODO: Verify your hand calculations with code
def euclidean_distance(a, b):
    return np.linalg.norm(a - b)
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# format the following output with labels please
print(f"Euclidean A→B: {euclidean_distance(A, B)}")
print(f"Euclidean A→C: {euclidean_distance(A, C)}")
print(f"Cosine similarity A·B: {cosine_similarity(A, B)}")
print(f"Cosine similarity A·C: {cosine_similarity(A, C)}")




Part 1: Mathematical Understanding

VECTORS:
  A = [1, 2]
  B = [3, 6] (same direction as A)
  C = [2, 1] (different direction from A)

YOUR HAND CALCULATIONS:
  Euclidean A→B: ___4.47____
  Euclidean A→C: ____1.41___
  Cosine similarity A·B: ___1.0____
  Cosine similarity A·C: ___0.8____

PREDICTIONS:
  Which is "closer" to A using Euclidean? ___C____
  Which is "closer" to A using Cosine? ___B____

Euclidean A→B: 4.47213595499958
Euclidean A→C: 1.4142135623730951
Cosine similarity A·B: 0.9999999999999999
Cosine similarity A·C: 0.7999999999999998


In [5]:
# ============================================================================
# PART 2: Implementation
# ============================================================================

print("\n" + "=" * 60)
print("Part 2: Create Collections with Different Metrics")
print("=" * 60)

# TODO 2.1: Create two collections with different distance metrics
# Hint: metadata={"hnsw:space": "cosine"} or {"hnsw:space": "l2"}

cosine_collection = client.create_collection(
    name="text_cosine",
    metadata={"hnsw:space": "cosine"}
)

l2_collection = client.create_collection(
    name="text_l2",
    metadata={"hnsw:space": "l2"} # l2 is euclidean distance
)

print("TODO: Create collections with cosine and l2 metrics")


Part 2: Create Collections with Different Metrics
TODO: Create collections with cosine and l2 metrics


In [6]:

# TODO 2.2: Add these documents to BOTH collections
test_documents = [
    "Machine learning is transforming the world",
    "MACHINE LEARNING IS TRANSFORMING THE WORLD",  # Same, uppercase
    "Deep learning uses neural networks",
    "I love eating pizza on Friday nights",
    "The weather is sunny and warm today"
]

doc_ids = [f"doc_{i}" for i in range(len(test_documents))]

# Hint: Add same documents to both collections
cosine_collection.add(documents=test_documents, ids=doc_ids)
l2_collection.add(documents=test_documents, ids=doc_ids)

print("TODO: Add test documents to both collections")


TODO: Add test documents to both collections


In [8]:

# TODO 2.3: Query and compare results
print("\n" + "=" * 60)
print("Part 2.3: Query and Compare")
print("=" * 60)

query = "AI and machine learning applications"

# Query both collections
cosine_results = cosine_collection.query(query_texts=[query], n_results=5)
l2_results = l2_collection.query(query_texts=[query], n_results=5)

print(f"Query: '{query}'")
print("\nTODO: Compare Cosine vs L2 results")
print(f"Cosine similarity: {cosine_results}")
print(f"L2 distance: {l2_results}")




Part 2.3: Query and Compare
Query: 'AI and machine learning applications'

TODO: Compare Cosine vs L2 results
Cosine similarity: {'ids': [['doc_0', 'doc_1', 'doc_2', 'doc_4', 'doc_3']], 'embeddings': None, 'documents': [['Machine learning is transforming the world', 'MACHINE LEARNING IS TRANSFORMING THE WORLD', 'Deep learning uses neural networks', 'The weather is sunny and warm today', 'I love eating pizza on Friday nights']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None, None, None, None]], 'distances': [[0.48594027757644653, 0.48594027757644653, 0.5316562652587891, 0.9345202445983887, 1.0356534719467163]]}
L2 distance: {'ids': [['doc_0', 'doc_1', 'doc_2', 'doc_4', 'doc_3']], 'embeddings': None, 'documents': [['Machine learning is transforming the world', 'MACHINE LEARNING IS TRANSFORMING THE WORLD', 'Deep learning uses neural networks', 'The weather is sunny and warm today', 'I love eating pizza on Friday nights']], 'uri

In [None]:

# ============================================================================
# PART 3: Analysis
# ============================================================================

print("\n" + "=" * 60)
print("Part 3: Analysis")
print("=" * 60)

# TODO 3.1: Fill in your analysis
print("""
COSINE Results:
1. [distance: __0.48594027757644653__] _Machine learning is transforming the world_
2. [distance: __0.48594027757644653__] _MACHINE LEARNING IS TRANSFORMING THE WORLD_
3. [distance: __0.5316562652587891__] _Deep learning uses neural networks_

L2 Results:
1. [distance: __0.9718807339668274__] __Machine learning is transforming the world_
2. [distance: __0.9718807339668274__] _MACHINE LEARNING IS TRANSFORMING THE WORLD_
3. [distance: ___1.0633126497268677_] _Deep learning uses neural networks__
""")

In [None]:
# TODO 3.2: Compare doc_1 and doc_2 (case difference)
print("""
CASE SENSITIVITY TEST:
  Doc 1: "Machine learning is transforming the world"
  Doc 2: "MACHINE LEARNING IS TRANSFORMING THE WORLD"
  
  Cosine similarity: _0.48594027757644653_____
  L2 distance: __0.9718807339668274____
  
  Which handles case differences better? _Cosine___
  Why? _Cosine similarity is not affected by case differences, L2 distance is a little higher and showing its sensitive to case differences___
""")

In [None]:
# TODO 3.3: Fill in the decision guide
print("""
DECISION GUIDE:
| Scenario                        | Metric | Reason |
|---------------------------------|--------|--------|
| Text semantic search            | Cosine    | Because it focuses on the angle of vectors over their magnitue. so it focuses on the semantic meaning of text isntead of gettign thown off by stuff like case differences     |
| Normalized image embeddings     | Cosine    | When we normalize embeddings, the vectors are scaled to have the same length, the angle between the vectors are the way we get the information from the data.    |
| Geographic coordinates          | L2    | Geographic coordinates are points ona map, and L2 will calculate the physical disctance between the points. This is ideal for comparing it to real world distances.   |
| Vectors where magnitude matters | L2    | Because L2 takes into account both the direction and the length of the vector, where Cosine does not consider magnitude.|
""")