In [1]:
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Endpoint of the Qdrant server used by the rest of the notebook, named so it
# is easy to spot and change.
QDRANT_URL = 'http://localhost:6333'
qd_client = QdrantClient(QDRANT_URL)

In [3]:
import json

# Read the FAQ export. UTF-8 is requested explicitly: the texts contain
# non-ASCII characters (curly quotes, typographic apostrophes) and open()
# otherwise falls back to the platform's locale encoding, which is not UTF-8
# on every system.
with open('documents.json', 'r', encoding='utf-8') as documents_file:
    documents_raw = json.load(documents_file)

# Flatten the per-course grouping into one list. Each document dict is tagged
# in place with the name of the course it was listed under.
documents = []

for course_dict in documents_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

# Display the first flattened document as the cell output.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# Sparse-only setup. No embedding dimensionality is defined here: for BM25
# the vector size does not have to be declared up front.
collection_name = 'zoomcamp-parse'
model_handle = 'Qdrant/bm25'

In [5]:
# Sparse-vector settings for the single named vector "bm25"; the IDF modifier
# is requested on the vector params.
bm25_vector_params = models.SparseVectorParams(modifier=models.Modifier.IDF)

# Create the sparse-only collection. The call is the last expression of the
# cell, so its return value is shown as the cell output.
qd_client.create_collection(
    collection_name=collection_name,
    sparse_vectors_config={"bm25": bm25_vector_params},
)

True

In [10]:
import uuid

# Build one point per FAQ document. The "bm25" vector is given as a
# models.Document (raw text plus model name) instead of precomputed numbers.
#
# Ids are derived from the document's position with uuid5 rather than drawn
# with uuid4: random ids differ on every run, so re-running the upsert would
# store a second copy of every document instead of overwriting the existing
# point. Position-based ids stay stable as long as documents.json is unchanged.
points = []
for position, doc in enumerate(documents):
    point = models.PointStruct(
        id=uuid.uuid5(uuid.NAMESPACE_URL, f'documents.json#{position}').hex,
        vector={
            'bm25': models.Document(
                text=doc['text'],
                model=model_handle
            )
        },
        # The whole document dict (text, section, question, course) is stored as payload.
        payload=doc
    )
    points.append(point)

In [11]:
# Write all prepared points to the sparse collection in one upsert call.
qd_client.upsert(
    points=points,
    collection_name=collection_name,
)

Fetching 18 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 24.07it/s]


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [5]:
def search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    """Query the sparse collection using only the "bm25" vector.

    Args:
        query: Free-text query, sent as a ``models.Document`` bound to
            ``model_handle``.
        limit: Maximum number of points to request.

    Returns:
        The ``points`` attribute of the query response, payloads included.
    """
    bm25_query = models.Document(text=query, model=model_handle)
    response = qd_client.query_points(
        collection_name=collection_name,
        query=bm25_query,
        using='bm25',
        with_payload=True,
        limit=limit,
    )
    return response.points

In [6]:
# Smoke test: request the single best match for a one-word query and print its text.
results = search('pandas', limit=1)
top_hit = results[0]
print(top_hit.payload['text'])

Fetching 18 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 24.37it/s]


You can use round() function or f-strings
round(number, 4)  - this will round number up to 4 decimal places
print(f'Average mark for the Homework is {avg:.3f}') - using F string
Also there is pandas.Series. round idf you need to round values in the whole Series
Please check the documentation
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.round.html#pandas.Series.round
Added by Olga Rudakova


In [10]:
import random

# Seed the global RNG so the pick below is reproducible between runs.
random.seed(1234)

# Sample one FAQ entry from the flattened document list.
course_piece = random.choice(documents)
# Pretty-print the sampled entry so its question and reference answer are visible.
print(json.dumps(course_piece, indent=2))

{
  "text": "Refer to https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/01-intro/06-environment.md\n(added by Rileen Sinha)",
  "section": "Miscellaneous",
  "question": "None of the videos have how to install the environment in Mac, does someone have instructions for Mac with M1 chip?",
  "course": "machine-learning-zoomcamp"
}


In [9]:
# Use the sampled entry's question as the BM25 query and print the text of the top hit.
sampled_question = course_piece['question']
results = search(query=sampled_question)
top_hit = results[0]
print(top_hit.payload['text'])

NameError: name 'search' is not defined

In [5]:
# Settings for the hybrid collection: one dense and one sparse model per document.
EMBEDDING_DIMENSIONALITY = 512
dense_model = 'jinaai/jina-embeddings-v2-small-en'
sparse_model = 'Qdrant/bm25'
hybrid_collection_name = 'zoomcamp-sparse-and-dense'

In [17]:
# The hybrid collection holds two named vectors per point.
# Dense vector for jinaai/jina-embeddings-v2-small-en, compared with cosine distance.
dense_vectors_config = {
    "jina-small": models.VectorParams(
        distance=models.Distance.COSINE,
        size=EMBEDDING_DIMENSIONALITY,
    ),
}
# Sparse BM25 vector with the IDF modifier.
hybrid_sparse_vectors_config = {
    "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF),
}

# Create the collection with both vector types; the call's return value is the cell output.
qd_client.create_collection(
    collection_name=hybrid_collection_name,
    vectors_config=dense_vectors_config,
    sparse_vectors_config=hybrid_sparse_vectors_config,
)

True

In [18]:
import uuid

# Build one point per FAQ document carrying two named vectors: a dense one
# ("jina-small") and a sparse one ("bm25"). Both are given as models.Document
# (raw text plus model name) instead of precomputed numbers.
#
# Ids are derived from the document's position with uuid5 rather than drawn
# with uuid4: random ids differ on every run, so re-running the upsert would
# store a second copy of every document instead of overwriting the existing
# point. Position-based ids stay stable as long as documents.json is unchanged.
points = []
for position, doc in enumerate(documents):
    point = models.PointStruct(
        id=uuid.uuid5(uuid.NAMESPACE_URL, f'documents.json#{position}').hex,
        vector={
            'jina-small': models.Document(
                text=doc['text'],
                model=dense_model
            ),
            'bm25': models.Document(
                text=doc['text'],
                model=sparse_model
            )
        },
        # The whole document dict (text, section, question, course) is stored as payload.
        payload=doc
    )
    points.append(point)

In [20]:
# Write all prepared points to the hybrid collection in one upsert call.
qd_client.upsert(points=points, collection_name=hybrid_collection_name)

Fetching 5 files: 100%|████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.88s/it]


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [7]:
def multi_stage_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    """Query the hybrid collection with a dense prefetch and a BM25 main query.

    The same query text is used twice: once in a prefetch on the
    "jina-small" vector (requesting ``10 * limit`` candidates) and once as
    the main query on the "bm25" vector.

    Args:
        query: Free-text query.
        limit: Maximum number of points to request from the main query.

    Returns:
        The ``points`` attribute of the query response, payloads included.
    """
    # Dense stage: over-fetch so the sparse stage has candidates to choose from.
    dense_candidates = models.Prefetch(
        query=models.Document(text=query, model=dense_model),
        using="jina-small",
        limit=10 * limit,
    )
    # Sparse stage: BM25 query over the same text.
    sparse_query = models.Document(text=query, model=sparse_model)

    response = qd_client.query_points(
        collection_name=hybrid_collection_name,
        prefetch=[dense_candidates],
        query=sparse_query,
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    return response.points

In [11]:
# Print the sampled entry again so it can be compared with the next search result.
sampled_entry_json = json.dumps(course_piece, indent=2)
print(sampled_entry_json)

{
  "text": "Refer to https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/01-intro/06-environment.md\n(added by Rileen Sinha)",
  "section": "Miscellaneous",
  "question": "None of the videos have how to install the environment in Mac, does someone have instructions for Mac with M1 chip?",
  "course": "machine-learning-zoomcamp"
}


In [12]:
# Run the dense-prefetch + BM25 search with the sampled question and print the top hit's text.
multi_stage_question = course_piece["question"]
results = multi_stage_search(multi_stage_question)
top_hit = results[0]
print(top_hit.payload["text"])

Fetching 18 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 25.87it/s]
Fetching 5 files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:10<00:00,  2.18s/it]


Refer to the page https://docs.docker.com/desktop/install/mac-install/ remember to check if you have apple chip or intel chip.


In [21]:
def rrf_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    """Query the hybrid collection with dense and sparse prefetches fused by RRF.

    Two prefetches are issued for the same query text, one on the
    "jina-small" vector and one on the "bm25" vector, each requesting
    ``5 * limit`` candidates. The main query is a ``models.FusionQuery``
    using ``models.Fusion.RRF``.

    Args:
        query: Free-text query.
        limit: Maximum number of fused points to request.

    Returns:
        The ``points`` attribute of the query response, payloads included.
    """
    candidates_per_branch = 5 * limit

    dense_branch = models.Prefetch(
        query=models.Document(text=query, model=dense_model),
        using="jina-small",
        limit=candidates_per_branch,
    )
    sparse_branch = models.Prefetch(
        query=models.Document(text=query, model=sparse_model),
        using="bm25",
        limit=candidates_per_branch,
    )

    response = qd_client.query_points(
        collection_name=hybrid_collection_name,
        prefetch=[dense_branch, sparse_branch],
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=limit,
        with_payload=True,
    )
    return response.points

In [22]:
# Run the RRF search with the sampled question, then print the sampled entry
# followed by the text of the top fused hit for a side-by-side comparison.
rrf_question = course_piece["question"]
results = rrf_search(rrf_question)
print(json.dumps(course_piece, indent=2))
top_fused_hit = results[0]
print(top_fused_hit.payload["text"])

{
  "text": "Refer to https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/01-intro/06-environment.md\n(added by Rileen Sinha)",
  "section": "Miscellaneous",
  "question": "None of the videos have how to install the environment in Mac, does someone have instructions for Mac with M1 chip?",
  "course": "machine-learning-zoomcamp"
}
Refer to the page https://docs.docker.com/desktop/install/mac-install/ remember to check if you have apple chip or intel chip.
