# Embeddings

## Generating an initial embedding

In [1]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.9.1+cpu
False


In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
print("Libraries imported successfully")

  from .autonotebook import tqdm as notebook_tqdm


Libraries imported successfully


In [3]:
# Load an embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")
print(f"Model produces {model.get_sentence_embedding_dimension()} dimensional embeddings")

Model produces 384 dimensional embeddings


In [4]:
# Generating embedding
text = "The cat sat on the mat"

embedding = model.encode(text)
print(f"Embedding shape: {embedding.shape}")
print(f"\nFirst 10 embedding values:\n{embedding[:10]}")

Embedding shape: (384,)

First 10 embedding values:
[ 0.13040186 -0.01187012 -0.02811704  0.05123863 -0.05597441  0.03019154
  0.03016129  0.02469839 -0.01837056  0.05876678]


## Similarity: The Heart of RAG

In [5]:
def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.
    
    Returns a score between -1 and 1 (higher = more similar)
    """
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product/ (norm1 * norm2)
print("Similarity function ready")

Similarity function ready


### Testing Similarity

In [6]:
# Create test sentences
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",      # Similar meaning, different words
    "Dogs are loyal animals",          # Different topic
    "Python is a programming language" # Completely unrelated
]

# Generate embedding for all sentences
embeddings = model.encode(sentences)

# Compare the first sentence to all others
print("Comparing to: 'The cat sat on the mat'\n")
for i, sentence in enumerate(sentences):
    similarity = cosine_similarity(embeddings[0], embeddings[i])
    print(f"Similarity to '{sentence}'")
    print(f"Score: {similarity:.3f}\n")


Comparing to: 'The cat sat on the mat'

Similarity to 'The cat sat on the mat'
Score: 1.000

Similarity to 'A feline rested on the rug'
Score: 0.564

Similarity to 'Dogs are loyal animals'
Score: 0.165

Similarity to 'Python is a programming language'
Score: 0.031



## Building a Simple Semantic Search

In [8]:
# Sample knowledge base
documents = [
    "Python is a high-level programming language known for simplicity",
    "Machine learning enables computers to learn from data",
    "Neural networks are inspired by biological brains",
    "Dogs are loyal and friendly pets that need exercise",
    "Cats are independent animals that make great companions",
    "JavaScript is used for web development and runs in browsers",
    "Deep learning uses multi-layered neural networks",
    "Puppies require training and socialization from an early age"
]

print(f"Knowledge base: {len(documents)} documents")

Knowledge base: 8 documents


In [9]:
# Generate embeddings for all documents
print("Generating embeddings for all documents...")
doc_embeddings = model.encode(documents)

print(f"✅ Created {len(doc_embeddings)} embeddings")
print(f"Each embedding has {doc_embeddings[0].shape[0]} dimensions")

Generating embeddings for all documents...
✅ Created 8 embeddings
Each embedding has 384 dimensions


In [10]:
# Search function
def search(query, documents, doc_embeddings, top_k=3):
    """
    Search for documents similar to the query.
    
    Args:
        query: Search query (string)
        documents: List of document texts
        doc_embeddings: Pre-computed document embeddings
        top_k: Number of results to return
    
    Returns:
        List of (document, similarity_score) tuples
    """

    # Embed the query
    query_embedding = model.encode(query)

    # Calculate similarity
    similarities=[]
    for i, doc_emb in enumerate(doc_embeddings):
        similarity = cosine_similarity(query_embedding, doc_emb)
        similarities.append((documents[i], similarity))

    # Sort by similarity (highest first)
    similarities.sort(key=lambda x:x[i], reverse=True)

    # Return top k results
    return similarities[: top_k]

In [11]:
# Test different queries
queries = [
    "What is artificial intelligence?",
    "Tell me about pet dogs",
    "How do I code in Python?"
]

for query in queries:
    print(f"\n{'='*80}")
    print(f"QUERY: {query}")
    print(f"\n{'='*80}")

    results = search(query, documents, doc_embeddings)
    for i, (doc, score) in enumerate(results, 1):
        print(f"\n{i}. (Score: {score:3f})")
        print(f" {doc}")


QUERY: What is artificial intelligence?



IndexError: tuple index out of range