In [1]:
from qdrant_client import QdrantClient, models
import requests
from fastembed import TextEmbedding
import json
import numpy as np

In [3]:
#connecting to local Qdrant instance
qd_client=QdrantClient(url="http://localhost:6333")

# Q1. Embedding the query

In [4]:
embedding_model = "jinaai/jina-embeddings-v2-small-en"

In [5]:
# Download and initialization the model
embedder=TextEmbedding(model_name=embedding_model)

In [6]:
query='I just discovered the course. Can I join now?'

In [7]:
#Embedding the query
embeddings_query = list(embedder.embed(query))[0]

In [8]:
# Get the size of the array
embeddings_query_size = embeddings_query.size
print("The size of the array:",embeddings_query_size)

The size of the array: 512


In [9]:
# Get the min value of the array
min_value = np.min(embeddings_query)
print("Min value:", min_value)

Min value: -0.11726373885183883


# Q2. Cosine similarity with another vector

In [10]:
doc = 'Can I still join the course after the start date?'

In [11]:
#Embedding the document
embedding_doc=list(embedder.embed(doc))[0]

In [12]:
# Get the Coosine similarity 
cos_similarity = embedding_doc.dot(embeddings_query)
print ("cosine similarity between query and document vector:", cos_similarity)

cosine similarity between query and document vector: 0.9008528895674548


In [13]:
# Another method to calculate similarity 
cos_similarity_dif = np.dot(embedding_doc, embeddings_query) / (np.linalg.norm(embedding_doc) * np.linalg.norm(embeddings_query))
print ("Another method to get cosine similarity:", cos_similarity_dif)

Another method to get cosine similarity: 0.9008528895674548


# Q3. Ranking by cosine

In [14]:
documents = [{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'}]

In [15]:
documents[2]['text']

"The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."

In [16]:
# Define a function that can be used to get cosine similarity for text and full text
def cos_similarity(text):
    # Embedding text
    embedding_text=list(embedder.embed(text))
    #Convert to numpy arrays
    embedding_text_reshape=np.array(embedding_text)
    #Get the cosine similarity
    cosine_similarities=embedding_text_reshape.dot(embeddings_query)
    # Get the index of the most similar document
    most_similar_index = np.argmax(cosine_similarities)

    return cosine_similarities, most_similar_index

In [17]:
# compute the cosine between the query vector and text field
cosine_similarities, most_similar_index = cos_similarity([doc["text"] for doc in documents])

print("Cosine similarities for text and query vector:")
for i, score in enumerate(cosine_similarities):
    print(f"  Document {i}: {score:.4f}")

print(f"\nMost similar document index: {most_similar_index}")


Cosine similarities for text and query vector:
  Document 0: 0.7630
  Document 1: 0.8182
  Document 2: 0.8085
  Document 3: 0.7133
  Document 4: 0.7304

Most similar document index: 1


# Q4. Ranking by cosine, version two

In [18]:
# Calculate a new field, which is a concatenation of question and text
cosine_similarities, most_similar_index = cos_similarity([doc['question'] + ' ' + doc['text'] for doc in documents])

print("Cosine similarities for full text and query vector:")
for i, score in enumerate(cosine_similarities):
    print(f"full_text{i}: {score:.4f}")

print(f"\nMost similar full_text index: {most_similar_index}")


Cosine similarities for full text and query vector:
full_text0: 0.8515
full_text1: 0.8437
full_text2: 0.8408
full_text3: 0.7755
full_text4: 0.8086

Most similar full_text index: 0


The output changed when the question was added to the text because it provided additional context to the embedding. This improves the model’s performance, as it is designed to capture semantic meaning. Adding more information helps the model better understand the intent, which can significantly affect the resulting embeddings and, therefore, the cosine similarity scores.

# Selecting the embedding model

In [19]:
all_models = TextEmbedding.list_supported_models()

In [20]:
all_models

[{'model': 'BAAI/bge-base-en',
  'sources': {'hf': 'Qdrant/fast-bge-base-en',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.42,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model': 'BAAI/bge-base-en-v1.5',
  'sources': {'hf': 'qdrant/bge-base-en-v1.5-onnx-q',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.21,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model':

In [21]:
dim = [model['dim'] for model in all_models]
print(dim)

[768, 768, 1024, 384, 384, 512, 1024, 384, 384, 768, 768, 1024, 768, 512, 384, 768, 512, 768, 768, 768, 768, 768, 1024, 768, 768, 768, 384, 768, 1024, 1024]


In [22]:
# Get smallest dimensionality for models  
min_dim = min(model['dim'] for model in all_models)
min_dim

384

In [23]:
# check models with 384 dim 

for model in TextEmbedding.list_supported_models():

    if model['dim'] == min_dim:
        print(json.dumps(model, indent = 2))
print("The number f models with 384 dim: ",len(model))


{
  "model": "BAAI/bge-small-en",
  "sources": {
    "hf": "Qdrant/bge-small-en",
    "url": "https://storage.googleapis.com/qdrant-fastembed/BAAI-bge-small-en.tar.gz",
    "_deprecated_tar_struct": true
  },
  "model_file": "model_optimized.onnx",
  "description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.",
  "license": "mit",
  "size_in_GB": 0.13,
  "additional_files": [],
  "dim": 384,
  "tasks": {}
}
{
  "model": "BAAI/bge-small-en-v1.5",
  "sources": {
    "hf": "qdrant/bge-small-en-v1.5-onnx-q",
    "url": null,
    "_deprecated_tar_struct": false
  },
  "model_file": "model_optimized.onnx",
  "description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.",
  "license": "mit",
  "size_in_GB": 0.067,
  "additional_files": [],
  "dim": 384,
  "tasks": {}
}
{
  "model": "snowflake/snowflake-arctic-embed-xs",
  "sou

# Q6. Indexing with qdrant


In [24]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()


documents = []

for course in documents_raw:
    course_name = course['course']
    if course_name != 'machine-learning-zoomcamp':
        continue

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [25]:
len(documents)

375

In [26]:
handling_model="BAAI/bge-small-en"

## Create a collection

In [27]:
collection_name = "zoomcamp-HW"

In [28]:
# Delete a connection 
qd_client.delete_collection(collection_name=collection_name)

True

In [29]:
#Create the collection
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=min_dim, 
        distance=models.Distance.COSINE
    )
)

True

In [30]:
qd_client.create_payload_index(
    collection_name=collection_name, 
    field_name="course",
    field_schema="keyword" )

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [31]:
points = []

for i, doc in enumerate(documents):
    text=doc['question'] + '' + doc['text']  # for the index
    vector=models.Document(text=text, model=handling_model)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)


In [32]:
qd_client.upsert(
    collection_name=collection_name, 
    points=points
)


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [33]:
def vector_search(question, limit):
    results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=handling_model
        ),
        limit=limit,
        with_payload=True
    )
    return results
    
    
    

In [34]:
results = vector_search(query, 5)
results

QueryResponse(points=[ScoredPoint(id=14, version=2, score=0.8703172, payload={'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.', 'section': 'General course-related questions', 'question': 'The course has already started. Can I still join it?', 'course': 'machine-learning-zoomcamp'}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=6, version=2, score=0.86918855, payload={'text': 'Approximately 4 months, but may take more if you want to do some extra activities (an extra project, an article, etc)', 'section': 'General course-related questions', 'question': 'How long is the course?', 'course': 'machine-learning-zoomcamp'}, vector=None, shard_key=

In [35]:
results.points[0]


ScoredPoint(id=14, version=2, score=0.8703172, payload={'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.', 'section': 'General course-related questions', 'question': 'The course has already started. Can I still join it?', 'course': 'machine-learning-zoomcamp'}, vector=None, shard_key=None, order_value=None)

In [36]:
# Highest score in the results
results.points[0].score


0.8703172