In [2]:
# Complete homework
import numpy as np
from fastembed import TextEmbedding
import requests

print("=== Vector Search Homework Solutions ===\n")

# Q1: Embedding the query
# Embed a single question with the Jina small English model and report the
# smallest component of the resulting vector.
print("Q1: Embedding the query")
query = 'I just discovered the course. Can I join now?'
model = TextEmbedding('jinaai/jina-embeddings-v2-small-en')

# embed() is consumed as an iterable; exactly one text goes in, so unpack the
# single vector that comes out.
(query_embedding,) = model.embed([query])

print(
    f"Query embedding shape: {query_embedding.shape}\n"
    f"Minimal value: {np.min(query_embedding):.3f}\n"
    f"Answer: {np.min(query_embedding):.2f}\n"
)

# Q2: Cosine similarity
# Compare the query vector with the embedding of a second, related question.
# NOTE(review): a plain dot product equals cosine similarity only for
# unit-length vectors - presumably this model returns normalised embeddings;
# confirm before reusing this with another model.
print("Q2: Cosine similarity with another vector")
doc = 'Can I still join the course after the start date?'
(doc_embedding,) = model.embed([doc])
cosine_similarity = query_embedding @ doc_embedding
print(
    f"Cosine similarity: {cosine_similarity:.3f}\n"
    f"Answer: {cosine_similarity:.1f}\n"
)

# Q3 & Q4: Documents
# Five FAQ entries used as the ranking candidates. Every entry is a dict with
# the same four string keys:
#   'text'     - the answer body
#   'section'  - FAQ section name (identical for all five entries here)
#   'question' - the question the answer belongs to
#   'course'   - course identifier (identical for all five entries here)
# Q3 embeds only 'text'; Q4 embeds 'question' + ' ' + 'text'.
# List position matters: the Q3/Q4 answers are indices into this list.
documents = [
    {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
     'section': 'General course-related questions',
     'question': 'Course - Can I still join the course after the start date?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
     'section': 'General course-related questions',
     'question': 'Course - Can I follow the course after it finishes?',
     'course': 'data-engineering-zoomcamp'},
    {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \"Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon't forget to register in DataTalks.Club's Slack and join the channel.",
     'section': 'General course-related questions',
     'question': 'Course - When will the course start?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
     'section': 'General course-related questions',
     'question': 'Course - What can I do before the course starts?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
     'section': 'General course-related questions',
     'question': 'How can we contribute to the course?',
     'course': 'data-engineering-zoomcamp'}
]

# Q3: Ranking by cosine (text only)
# Score every answer text against the query and report the position of the
# best-scoring one.
# NOTE(review): the dot product stands in for cosine similarity - assumes the
# embeddings are unit-norm; confirm.
print("Q3: Ranking by cosine similarity (text only)")
texts = [entry['text'] for entry in documents]
text_embeddings = list(model.embed(texts))
V = np.asarray(text_embeddings)  # one row per embedding
similarities = V @ query_embedding
highest_idx = similarities.argmax()
print(f"Similarities: {similarities}")
print(
    f"Highest similarity document index: {highest_idx}\n"
    f"Answer: {highest_idx}\n"
)

# Q4: Ranking with full text
# Same ranking as Q3, but each candidate is the question and the answer text
# joined by a single space.
# NOTE(review): the dot product stands in for cosine similarity - assumes the
# embeddings are unit-norm; confirm.
print("Q4: Ranking with concatenated text")
full_texts = [f"{entry['question']} {entry['text']}" for entry in documents]
full_text_embeddings = list(model.embed(full_texts))
V_full = np.asarray(full_text_embeddings)  # one row per embedding
similarities_full = V_full @ query_embedding
highest_idx_full = similarities_full.argmax()
print(f"Similarities (full): {similarities_full}")
print(
    f"Highest similarity document index (full text): {highest_idx_full}\n"
    f"Answer: {highest_idx_full}\n"
)

# Q5: Small embedding model
# Load BAAI/bge-small-en and read its output dimensionality off a throwaway
# embedding of the word "test".
print("Q5: Selecting embedding model")
small_model = TextEmbedding('BAAI/bge-small-en')
(test_embedding,) = small_model.embed(["test"])
print(
    f"BAAI/bge-small-en dimension: {len(test_embedding)}\n"
    f"Answer: {len(test_embedding)}\n"
)



=== Vector Search Homework Solutions ===

Q1: Embedding the query
Query embedding shape: (512,)
Minimal value: -0.117
Answer: -0.12

Q2: Cosine similarity with another vector
Cosine similarity: 0.901
Answer: 0.9

Q3: Ranking by cosine similarity (text only)
Similarities: [0.76296847 0.81823782 0.81091955 0.7133079  0.73044992]
Highest similarity document index: 1
Answer: 1

Q4: Ranking with concatenated text
Similarities (full): [0.85145432 0.84365942 0.84177186 0.7755158  0.80860078]
Highest similarity document index (full text): 0
Answer: 0

Q5: Selecting embedding model
BAAI/bge-small-en dimension: 384
Answer: 384



In [3]:
# Q6: Memory-efficient Qdrant indexing
# Download the FAQ dump, keep the machine-learning-zoomcamp entries, embed
# "question + text" with `small_model` in batches, index the vectors in an
# in-memory Qdrant collection (cosine distance), and report the best score for
# `query`. Relies on `requests`, `small_model` and `query` from the previous cell.
print("Q6: Indexing with Qdrant (Memory Efficient)")
try:
    # Imported here so that a missing qdrant-client lands in the ImportError
    # handler below instead of breaking the whole notebook.
    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, VectorParams, PointStruct

    # Download and process documents.
    # The timeout prevents the cell from hanging on a stalled connection, and
    # raise_for_status() stops an HTTP error page from reaching .json().
    docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
    docs_response = requests.get(docs_url, timeout=30)
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    # Build new dicts rather than mutating the downloaded structure, so
    # `documents_raw` stays exactly as it was received.
    ml_documents = [
        {**faq, 'course': course['course']}
        for course in documents_raw
        if course['course'] == 'machine-learning-zoomcamp'
        for faq in course['documents']
    ]

    print(f"Found {len(ml_documents)} ML documents")

    # Embed the query up front: its length gives the collection's vector size,
    # so the size always matches the model that produces the vectors instead of
    # relying on a hard-coded 384.
    query_embedding_small = list(small_model.embed([query]))[0]

    # Set up Qdrant and index
    client = QdrantClient(":memory:")
    collection_name = "ml_zoomcamp_faq"
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=len(query_embedding_small),
            distance=Distance.COSINE,
        ),
    )

    # Process documents in smaller batches to avoid memory issues
    batch_size = 50  # Process 50 documents at a time
    total_batches = (len(ml_documents) + batch_size - 1) // batch_size

    for batch_idx, start_idx in enumerate(range(0, len(ml_documents), batch_size)):
        batch_docs = ml_documents[start_idx:start_idx + batch_size]
        end_idx = start_idx + len(batch_docs)  # exclusive; the last batch may be short

        print(f"Processing batch {batch_idx + 1}/{total_batches} (docs {start_idx}-{end_idx-1})")

        # Same "question + text" concatenation as Q4
        batch_texts = [doc['question'] + ' ' + doc['text'] for doc in batch_docs]
        batch_embeddings = list(small_model.embed(batch_texts))

        # Point ids are the documents' positions in `ml_documents`
        batch_points = [
            PointStruct(
                id=start_idx + i,
                vector=emb.tolist(),
                payload=doc
            )
            for i, (doc, emb) in enumerate(zip(batch_docs, batch_embeddings))
        ]

        client.upsert(collection_name=collection_name, points=batch_points)

        # Drop the per-batch references before the next batch is built
        del batch_embeddings, batch_points, batch_texts

    print("All documents indexed successfully!")

    # Query.
    # query_points() replaces the deprecated client.search(); the scored hits
    # are on the response's `.points` attribute.
    search_results = client.query_points(
        collection_name=collection_name,
        query=query_embedding_small.tolist(),
        limit=5,
        with_payload=True,
    ).points

    if not search_results:
        # Nothing indexed or nothing matched: say so rather than failing on [0]
        print("No search results returned; cannot report a Q6 score.")
    else:
        print(f"Top result score: {search_results[0].score:.2f}")
        print(f"Answer: {search_results[0].score:.2f}")

        # Show top results for verification
        for i, result in enumerate(search_results):
            print(f"Result {i+1}: Score {result.score:.3f}")
            print(f"Question: {result.payload['question'][:100]}...")
            print()

except ImportError:
    print("Qdrant not installed. Install with: pip install qdrant-client")
    print("Expected answer for Q6: around 0.77-0.87")
except Exception as e:
    # Deliberate best-effort: report the failure and keep the notebook alive.
    print(f"Error occurred: {e}")
    print("This might be due to memory constraints. Try restarting the kernel and running just Q6.")

Q6: Indexing with Qdrant (Memory Efficient)
Found 375 ML documents
Processing batch 1/8 (docs 0-49)
Processing batch 2/8 (docs 50-99)
Processing batch 3/8 (docs 100-149)
Processing batch 4/8 (docs 150-199)
Processing batch 5/8 (docs 200-249)
Processing batch 6/8 (docs 250-299)
Processing batch 7/8 (docs 300-349)
Processing batch 8/8 (docs 350-374)
All documents indexed successfully!
Top result score: 0.87
Answer: 0.87
Result 1: Score 0.870
Question: The course has already started. Can I still join it?...

Result 2: Score 0.869
Question: How long is the course?...

Result 3: Score 0.868
Question: I’m new to Slack and can’t find the course channel. Where is it?...

Result 4: Score 0.858
Question: How to get started with Week 10?...

Result 5: Score 0.857
Question: I just joined. What should I do next? How can I access course materials?...



  search_results = client.search(
