In [1]:
from qdrant_client import QdrantClient, models

In [2]:
client = QdrantClient("http://localhost:6333")

In [3]:
import requests

In [4]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

In [10]:
from fastembed import TextEmbedding

In [11]:
TextEmbedding.list_supported_models()

[{'model': 'BAAI/bge-base-en',
  'sources': {'hf': 'Qdrant/fast-bge-base-en',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.42,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model': 'BAAI/bge-base-en-v1.5',
  'sources': {'hf': 'qdrant/bge-base-en-v1.5-onnx-q',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.21,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model':

In [12]:
import json
EMBEDDING_DIMENSIONALITY = 512
for model in TextEmbedding.list_supported_models():
    if model["dim"] == EMBEDDING_DIMENSIONALITY:
        print(model)


{'model': 'BAAI/bge-small-zh-v1.5', 'sources': {'hf': 'Qdrant/bge-small-zh-v1.5', 'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz', '_deprecated_tar_struct': True}, 'model_file': 'model_optimized.onnx', 'description': 'Text embeddings, Unimodal (text), Chinese, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.', 'license': 'mit', 'size_in_GB': 0.09, 'additional_files': [], 'dim': 512, 'tasks': {}}
{'model': 'Qdrant/clip-ViT-B-32-text', 'sources': {'hf': 'Qdrant/clip-ViT-B-32-text', 'url': None, '_deprecated_tar_struct': False}, 'model_file': 'model.onnx', 'description': 'Text embeddings, Multimodal (text&image), English, 77 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year', 'license': 'mit', 'size_in_GB': 0.25, 'additional_files': [], 'dim': 512, 'tasks': {}}
{'model': 'jinaai/jina-embeddings-v2-small-en', 'sources': {'hf': 'xenova/jina-embeddings-v2-small-en', 'url': None, '_

In [7]:
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [8]:
# Define a collection for indexing the documents 

In [9]:
collection_name = "zoomcamo-rag"

In [13]:
client.create_collection(collection_name=collection_name,
                         vectors_config=models.VectorParams(
                             size=EMBEDDING_DIMENSIONALITY,
                             distance=models.Distance.COSINE
                         )
                        )

True

In [21]:
# Lets create the data points, and call them points
points = []
id = 0
for course in documents_raw:
    for doc in course['documents']:
        point = models.PointStruct(
            id=id,
            vector=models.Document(text=str(doc['text']), model=model_handle),
            payload={
                "text": doc["text"],
                "section": doc["section"],
                "course": course['course']
            }
        )
        points.append(point)
        id += 1

In [22]:
client.upsert(collection_name=collection_name,
              points=points)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

onnx/model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [23]:
def search(query, limit:int=1):
    results = client.query_points(collection_name=collection_name,
                                  query=models.Document(
                                      text=query,
                                      model=model_handle
                                  ),
                                  limit=limit,
                                  with_payload=True
                                 )
    return results
                                  

In [29]:
# Lets get a random question
import random
random.seed(42)
course = random.choice(documents_raw)
course_piece = random.choice(course['documents'])
print(json.dumps(course_piece, indent=1))

{
 "text": "I have faced a problem while reading the large parquet file. I tried some workarounds but they were NOT successful with Jupyter.\nThe error message is:\nIndexError: index 311297 is out of bounds for axis 0 with size 131743\nI solved it by performing the homework directly as a python script.\nAdded by Ibraheem Taha (ibraheemtaha91@gmail.com)\nYou can try using the Pyspark library\nAnswered by kamaldeen (kamaldeen32@gmail.com)",
 "section": "Module 1: Introduction",
 "question": "Reading large parquet files"
}


In [30]:
result = search(course_piece['question'])

In [38]:
print(result.points[0].payload['text'])

The read_parquet function supports a list of files as an argument. The list of files will be merged into a single result table.


In [39]:
def search_by_filter(query, filter_name: str='course', filter_value: str='mlops-zoomcamp', limit:int=1):
    results = client.query_points(collection_name=collection_name,
                                  query=models.Document(
                                      text=query,
                                      model=model_handle
                                  ),
                                  query_filter=models.Filter(
                                      must=[
                                          models.FieldCondition(
                                              key=filter_name,
                                              match=models.MatchValue(value=filter_value)
                                          )
                                      ]
                                  ),
                                  limit=limit,
                                  with_payload=True
                                 )
    return results


In [46]:
results=search_by_filter(query=course_piece['question'],
                        filter_value=course['course'], 
                        limit=2
                       )

In [50]:
for result in results.points:
    print(result.payload)

{'text': 'I have faced a problem while reading the large parquet file. I tried some workarounds but they were NOT successful with Jupyter.\nThe error message is:\nIndexError: index 311297 is out of bounds for axis 0 with size 131743\nI solved it by performing the homework directly as a python script.\nAdded by Ibraheem Taha (ibraheemtaha91@gmail.com)\nYou can try using the Pyspark library\nAnswered by kamaldeen (kamaldeen32@gmail.com)', 'section': 'Module 1: Introduction', 'course': 'mlops-zoomcamp'}
{'text': "Problem: While following the steps in the videos you may have problems trying to download with wget the files. Usually it is a 403 error type (Forbidden access).\nSolution: The links point to files on cloudfront.net, something like this:\nhttps://d37ci6vzurychx.cloudfront.net/tOSError: Could not open parquet input source '<Buffer>': Invalid: Parquet OSError: Could not open parquet input source '<Buffer>': Invalid: Parquet rip+data/green_tripdata_2021-01.parquet\nI’m not download 

In [41]:
print(result.points[0].payload['text'])

I have faced a problem while reading the large parquet file. I tried some workarounds but they were NOT successful with Jupyter.
The error message is:
IndexError: index 311297 is out of bounds for axis 0 with size 131743
I solved it by performing the homework directly as a python script.
Added by Ibraheem Taha (ibraheemtaha91@gmail.com)
You can try using the Pyspark library
Answered by kamaldeen (kamaldeen32@gmail.com)


In [43]:
search_by_filter(query="Can I submit homework late?").points[0].payload['text']

'In order to obtain the certificate, completion of the final capstone project is mandatory. The completion of weekly homework assignments is optional, but they can contribute to your overall progress and ranking on the top 100 leaderboard.'

In [51]:
import uuid

In [52]:
COLLECTION_NAME_2 = "zoomcamp-sparse"

In [56]:
if not client.collection_exists(COLLECTION_NAME_2):
    client.create_collection(
        collection_name=COLLECTION_NAME_2,
        sparse_vectors_config={
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF
            )
        }
    )
else:
    print(f"{COLLECTION_NAME_2}, collection already exists")

zoomcamp-sparse, collection already exists


In [57]:
# Sending the documents data as a sparse vector

In [58]:
data = [{"course": "ml", "data": [1,2,3,4]}, {"course": "de", "data": [1,2]}]

In [62]:
[(k["course"],i) for k in data for i in k["data"]]

[('ml', 1), ('ml', 2), ('ml', 3), ('ml', 4), ('de', 1), ('de', 2)]

In [63]:
client.upsert(
    collection_name=COLLECTION_NAME_2,
    points=[
        models.PointStruct(
            id=uuid.uuid4().hex,
            vector={
                "bm25": models.Document(
                    text=doc["text"],
                    model="Qdrant/bm25"
                )
            },
            payload={
                "text": doc["text"],
                "section": doc["section"],
                "course": course["course"]
            }
        )
        for course in documents_raw for doc in course["documents"]
    ]
)

Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

danish.txt:   0%|          | 0.00/424 [00:00<?, ?B/s]

french.txt:   0%|          | 0.00/813 [00:00<?, ?B/s]

dutch.txt:   0%|          | 0.00/453 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

finnish.txt: 0.00B [00:00, ?B/s]

english.txt:   0%|          | 0.00/936 [00:00<?, ?B/s]

german.txt: 0.00B [00:00, ?B/s]

arabic.txt: 0.00B [00:00, ?B/s]

greek.txt: 0.00B [00:00, ?B/s]

portuguese.txt: 0.00B [00:00, ?B/s]

norwegian.txt:   0%|          | 0.00/851 [00:00<?, ?B/s]

italian.txt: 0.00B [00:00, ?B/s]

hungarian.txt: 0.00B [00:00, ?B/s]

romanian.txt: 0.00B [00:00, ?B/s]

russian.txt: 0.00B [00:00, ?B/s]

spanish.txt: 0.00B [00:00, ?B/s]

swedish.txt:   0%|          | 0.00/559 [00:00<?, ?B/s]

turkish.txt:   0%|          | 0.00/260 [00:00<?, ?B/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [64]:
def search_sparse(query: str, limit: int=1) -> list[models.ScoredPoint]:
    results = client.query_points(
        collection_name=COLLECTION_NAME_2,
        query=models.Document(
            text=query,
            model="Qdrant/bm25"
        ),
        using="bm25",
        limit=limit,
        with_payload=True
    )
    return results.points

In [67]:
search_sparse("when does the course start?")[0].payload["text"]

"The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."

In [68]:
COLLECTION_SPARSE_AND_DENSE = "zoomcamp-sparse-and-dense"

In [70]:
if not client.collection_exists(COLLECTION_SPARSE_AND_DENSE):

    client.create_collection(
        collection_name=COLLECTION_SPARSE_AND_DENSE,
        vectors_config={
            "jina-small": models.VectorParams(
                size=512,
                distance=models.Distance.COSINE
            )
        },
        sparse_vectors_config={
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF
            )
        }
    )
else:
    print(f"Collection {COLLECTION_SPARSE_AND_DENSE} already exists")

Collection zoomcamp-sparse-and-dense already exists


In [73]:
client.upsert(
    collection_name=COLLECTION_SPARSE_AND_DENSE,
    points=[
        models.PointStruct(
            id=uuid.uuid4().hex,
            vector={
                "jina-small": models.Document(
                    text=doc["text"],
                    model="jinaai/jina-embeddings-v2-small-en"
                ),
                "bm25": models.Document(
                    text=doc["text"],
                    model="Qdrant/bm25"
                )
            },
            payload={
                "text": doc["text"],
                "section": doc["section"],
                "course": course["course"]
            }
        ) for course in documents_raw for doc in course["documents"]
    ]
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [76]:
def multi_stage_search(query, limit: int=2):
    results = client.query_points(
        collection_name=COLLECTION_SPARSE_AND_DENSE,
        prefetch=[
            models.Prefetch(
                query=models.Document(text=query,
                                      model="jinaai/jina-embeddings-v2-small-en"
                                     ),
                limit=(limit*5),
                using="jina-small"
            )
        ],
        query=models.Document(
            text=query,
            model="Qdrant/bm25"
        ),
        using="bm25",
        limit=limit,
        with_payload=True
    )
    return results.points

In [85]:
for q in multi_stage_search("pandas"):
    print(q.payload["text"] + "\n")

Numpy and Pandas packages use different equations to compute the standard deviation. Numpy uses  population standard deviation, whereas pandas uses sample standard deviation by default.
Numpy
Pandas
pandas default standard deviation is computed using one degree of freedom. You can change degree in of freedom in NumPy to change this to unbiased estimator by using ddof parameter:
import numpy as np
np.std(df.weight, ddof=1)
The result will be similar if we change the dof = 1 in numpy
(Harish Balasundaram)

If we have a list or series of data for example x = [1,2,3,4,5]. We can use pandas to find the standard deviation. We can pass our list into panda series and call standard deviation directly on the series pandas.Series(x).std().
(Quinn Avila)



In [86]:
def rrf_hybrid_search(query, limit: int=2):
    results = client.query_points(
        collection_name=COLLECTION_SPARSE_AND_DENSE,
        prefetch=[
            models.Prefetch(
                query=models.Document(text=query,
                                      model="jinaai/jina-embeddings-v2-small-en"
                                     ),
                limit=(limit*5),
                using="jina-small"
            )
            ,
            models.Prefetch(
                query=models.Document(text=query,
                                      model="Qdrant/bm25"
                                     ),
                using="bm25",
                limit=(limit*5)
            )
        ],
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        with_payload=True
    )
    return results.points

In [87]:
for q in rrf_hybrid_search("pandas"):
    print(q.payload["text"] + "\n")

Use ‘pandas.concat’ function (https://pandas.pydata.org/docs/reference/api/pandas.concat.html) to combine two dataframes. To combine two numpy arrays use numpy.concatenate (https://numpy.org/doc/stable/reference/generated/numpy.concatenate.html) function. So the code would be as follows:
df_train_combined = pd.concat([df_train, df_val])
y_train = np.concatenate((y_train, y_val), axis=0)
(George Chizhmak)

Numpy and Pandas packages use different equations to compute the standard deviation. Numpy uses  population standard deviation, whereas pandas uses sample standard deviation by default.
Numpy
Pandas
pandas default standard deviation is computed using one degree of freedom. You can change degree in of freedom in NumPy to change this to unbiased estimator by using ddof parameter:
import numpy as np
np.std(df.weight, ddof=1)
The result will be similar if we change the dof = 1 in numpy
(Harish Balasundaram)

You can use round() function or f-strings
round(number, 4)  - this will round num