In [2]:
import numpy as np
from fastembed import TextEmbedding

# Initialize the embedding model
embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

# Define the query string
query = 'I just discovered the course. Can I join now?'

# Embed the query and immediately convert the generator to a list
query_embedding = list(embedding_model.embed([query]))

# Get the actual embedding vector (it's the first and only element in the list)
q = query_embedding[0]

# Check the size of the array (as mentioned in the question)
print(f"Embedding size: {q.shape[0]}")

# Find the minimal value in the array
min_value = q.min()
print(f"Minimal value: {min_value}")

Embedding size: 512
Minimal value: -0.11726373885183883


In [4]:

# --- Re-using from Q1  ---
embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")
query = 'I just discovered the course. Can I join now?'
q = list(embedding_model.embed([query]))[0]
# -------------------------

# Define the new document string
doc_text = 'Can I still join the course after the start date?'

# Embed this new document and convert to a list
#    The fix is on this line:
doc_embedding = list(embedding_model.embed([doc_text]))[0]

# Calculate the cosine similarity
cosine_similarity = q.dot(doc_embedding)

print(f"Cosine similarity: {cosine_similarity}")

Cosine similarity: 0.9008528895674548


In [5]:
# --- Re-using from Q1 ---
embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")
query = 'I just discovered the course. Can I join now?'
q = list(embedding_model.embed([query]))[0]
# -------------------------

#  The documents provided in the homework
documents = [
    {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
     'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
     'section': 'General course-related questions', 'question': 'Course - Can I follow the course after it finishes?', 'course': 'data-engineering-zoomcamp'},
    {'text': "The purpose of this document is to capture frequently asked technical questions...",
     'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'You can start by installing and setting up all the dependencies and requirements...',
     'section': 'General course-related questions', 'question': 'Course - What can I do before the course starts?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'Star the repo! Share it with friends if you find it useful ❣️...',
     'section': 'General course-related questions', 'question': 'How can we contribute to the course?', 'course': 'data-engineering-zoomcamp'}
]

# Extract only the 'text' field from each document
texts = [doc['text'] for doc in documents]

# Embed all the texts and convert the generator to a list
#    The fix is on this line:
text_embeddings = list(embedding_model.embed(texts))

# Convert the list of embeddings into a single 2D numpy array (matrix V)
V = np.array(text_embeddings)

# Compute all cosine similarities
scores = V.dot(q)

# Find the index of the document with the highest score
highest_scoring_index = np.argmax(scores)

print("Scores:", scores)
print(f"Highest scoring document index: {highest_scoring_index}")

Scores: [0.76296847 0.81823782 0.71915905 0.72130621 0.73432863]
Highest scoring document index: 1


In [6]:
# --- Re-using from Q1 ---
embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")
query = 'I just discovered the course. Can I join now?'
q = list(embedding_model.embed([query]))[0]
documents = [ # same documents as Q3
    {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
     'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
     'section': 'General course-related questions', 'question': 'Course - Can I follow the course after it finishes?', 'course': 'data-engineering-zoomcamp'},
    {'text': "The purpose of this document is to capture frequently asked technical questions...",
     'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'You can start by installing and setting up all the dependencies and requirements...',
     'section': 'General course-related questions', 'question': 'Course - What can I do before the course starts?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'Star the repo! Share it with friends if you find it useful ❣️...',
     'section': 'General course-related questions', 'question': 'How can we contribute to the course?', 'course': 'data-engineering-zoomcamp'}
]
# -----------------------------

# Create a new list of texts by concatenating 'question' and 'text'
full_texts = [f"{doc['question']} {doc['text']}" for doc in documents]

# Embed these new combined texts and convert to a list
#    The fix is on this line:
full_text_embeddings = list(embedding_model.embed(full_texts))

# Create the new matrix
V_full = np.array(full_text_embeddings)

# Compute the scores against the same query vector q
scores_full = V_full.dot(q)

# Find the new highest scoring index
highest_scoring_index_full = np.argmax(scores_full)

print("Scores (full text):", scores_full)
print(f"Highest scoring document index (full text): {highest_scoring_index_full}")


Scores (full text): [0.85145432 0.84365942 0.822636   0.802103   0.83453303]
Highest scoring document index (full text): 0


The input text for Q4 is different from the input text for Q3. Since the text is different, the embedding model creates a different vector. Because the vector is different, its cosine similarity to the query vector is also different.
Let's break that down in more detail.
The Core Concept: Embeddings Capture Meaning of the Entire Input
Think of an embedding model (like jinaai/jina-embeddings-v2-small-en) as a highly sophisticated machine that reads a piece of text and converts its total meaning into a single point in a high-dimensional space. This point is the vector.
This machine doesn't just look at individual words. It looks at the words in order, their relationships, and the overall context to produce the final vector.
Any change to the input text, no matter how small, will change the final vector.
Let's Use a Simple Analogy
Imagine you're giving instructions to a painter, and the "painting" is the final vector.
Instruction for Q3: You tell the painter, "Paint a picture of 'Yes, even if you don't register, you're still eligible to submit the homeworks...'". The painter creates Painting A, which captures the idea of "eligibility" and "deadlines."
Instruction for Q4: You tell the painter, "Paint a picture of 'Course - Can I still join the course after the start date? Yes, even if you don't register...'". The painter creates Painting B. This painting is different! It still has the concepts of "eligibility" and "deadlines," but now it's framed by the much stronger context of "joining a course late."
Painting A and Painting B will be similar, but they are not identical.
Why are the Scores Higher in Q4?
Now, let's bring in your query: 'I just discovered the course. Can I join now?'.
Think of this query as a reference painting, Painting Q. The cosine similarity score is a measure of how similar the other paintings are to this reference painting.

Question 5: Selecting the embedding model
Objective: Find the smallest embedding dimensionality available in fastembed.
Step-by-step Code:
We can use a built-in method from the TextEmbedding class to list all supported models and their properties.

In [7]:
from fastembed import TextEmbedding

# Get the list of all supported models
# This returns a list of dictionaries, each describing a model
models_info = TextEmbedding.list_supported_models()

# Extract the 'dim' (dimensionality) from each model's info
all_dims = [model['dim'] for model in models_info]

# Find the minimum value in the list of dimensions
smallest_dim = min(all_dims)

print(f"The smallest dimensionality available is: {smallest_dim}")

The smallest dimensionality available is: 384


In [None]:
import requests
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding

# Download and filter documents (no change here)
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []
for course in documents_raw:
    course_name = course['course']
    if course_name == 'machine-learning-zoomcamp':
        for doc in course['documents']:
            doc['course'] = course_name
            documents.append(doc)

print(f"Number of documents for ML Zoomcamp: {len(documents)}")

# Initialize the smaller embedding model
embedding_model = TextEmbedding(model_name="BAAI/bge-small-en")

#  Initialize the Qdrant client
client = QdrantClient(":memory:")

# Create the Qdrant collection
client.recreate_collection(
    collection_name="ml_zoomcamp_faq",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
)

#  Prepare the data for upserting
points_to_upsert = []
for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    
    # FIX #1 is on this line:
    embedding = list(embedding_model.embed([text]))[0]
    
    point = models.PointStruct(
        id=i + 1,
        vector=embedding.tolist(),
        payload=doc
    )
    points_to_upsert.append(point)

# Upsert all the points to the collection
client.upsert(
    collection_name="ml_zoomcamp_faq",
    points=points_to_upsert,
    wait=True
)

# Prepare and embed the search query
query = 'I just discovered the course. Can I join now?'
# FIX #2 is on this line:
query_vector = list(embedding_model.embed([query]))[0]

# Perform the search in Qdrant
search_results = client.search(
    collection_name="ml_zoomcamp_faq",
    query_vector=query_vector,
    limit=1
)

# Get the score of the highest-ranking result
highest_score = search_results[0].score
print(f"The highest score is: {highest_score}")


Number of documents for ML Zoomcamp: 375


  client.recreate_collection(


The highest score is: 0.8703173398971558


  search_results = client.search(
