In [1]:
from qdrant_client import QdrantClient, models

class QdrantVectorStore:
    def __init__(self, host: str):
        self.client = QdrantClient(url=host)

    def create_collection(self, collection_name: str, embedding_dimensionality: int):
        if not self.client.collection_exists(collection_name=collection_name):
            self.client.create_collection(collection_name=collection_name,
                                        vectors_config=models.VectorParams(
                                            size=embedding_dimensionality,
                                            distance=models.Distance.COSINE
                                        )
                                        )
        else:
            print(f"Collection {collection_name} already exists.")

    def create_collection_sparse(self, collection_name: str):
        if not self.client.collection_exists(collection_name=collection_name):
            self.client.create_collection(
                collection_name=collection_name,
                sparse_vectors_config={
                    "bm25": models.SparseVectorParams(
                        modifier=models.Modifier.IDF
                    )
                }
            )
        else:
            print(f"Collection {collection_name} already exists.")
            
    def create_collection_hybrid(self, collection_name: str, embedding_dimensionality: int):
        if not self.client.collection_exists(collection_name=collection_name):
            self.client.create_collection(
                collection_name=collection_name,
                vectors_config={
                    "jina-small": models.VectorParams(
                        size=embedding_dimensionality,
                        distance=models.Distance.COSINE
                    )
                },
                sparse_vectors_config={
                    "bm25": models.SparseVectorParams(
                        modifier=models.Modifier.IDF
                    )
                }
            )
        else:
            print(f"Collection {collection_name} already exists.")

In [2]:
import uuid
from qdrant_client import models

def upsert_sparse_dense_documents(client, collection_name: str, model_handle_dense: str, model_handle_sparse: str, documents: list):
    client.upsert(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=uuid.uuid4().hex,
                vector={
                    "jina-small": models.Document(
                        text=doc["text"],
                        model=model_handle_dense
                        ),
                    "bm25": models.Document(
                        text=doc["text"],
                        model=model_handle_sparse
                        )
                    },
                payload={
                    "text": doc["text"],
                    "section": doc["section"],
                    "course": doc['course'],
                    "question": doc["question"],
                    "id": doc["document_id"]
                }
            )
        for doc in documents
        ]
    )

In [3]:
def search_multi_stage_sparse_and_dense(query: str, client, collection_name, limit: int) -> list[models.ScoredPoint]:
    
    results = client.query_points(
        collection_name=collection_name,
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en"
                ),
                using='jina-small',
                limit=(limit*5)
            )
        ],
        query=models.Document(
            text=query,
            model="Qdrant/bm25"
        ),
        limit=limit,
        using="bm25",
        with_payload=True
    )
    return results.points

def search_hybrid(query: str, client, collection_name: str, limit: int=1) -> list[models.ScoredPoint]:
    
    results = client.query_points(
        collection_name=collection_name,
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en"
                ),
                using='jina-small',
                limit=(limit*5)
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25"
                ),
                using="bm25",
                limit=(limit*5)
            )
        ],
        query=models.FusionQuery(
            fusion=models.Fusion.RRF
        ),
        with_payload=True,
    )
    return results.points

In [4]:
import json
with open('./documents-with-ids-updated.json', 'rt') as f_in:
    documents = json.load(f_in)

In [5]:
host = "http://localhost:6333"
collection_name = "nakul-zoomcamp-faq-collection"

In [6]:
qv = QdrantVectorStore(host=host)
qv.create_collection_hybrid(collection_name=collection_name, embedding_dimensionality=512)

Collection nakul-zoomcamp-faq-collection already exists.


In [7]:
# Adding documents to the vector store
upsert_sparse_dense_documents(client=qv.client,
                              collection_name=collection_name,
                              model_handle_dense="jinaai/jina-embeddings-v2-small-en",
                              model_handle_sparse="Qdrant/bm25",
                              documents=documents
                             )

In [8]:
data = search_hybrid("when does the course start", 
              client=qv.client, 
              collection_name=collection_name,
              limit=5)
for point in data:
    print(point.payload)

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'course': 'data-engineering-zoomcamp', 'question': 'Course - When will the course start?', 'id': '12ba9d1bc6ca27c5'}
{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the co

In [9]:
import pandas as pd

In [10]:
df_ground_truth = pd.read_csv('ground-truth-data-practice.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [11]:
df_ground_truth.head()

Unnamed: 0,question,course,document
0,Could you let me know the specific date and ti...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
1,What is the starting date for the first live O...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
2,Is there a way to add the course schedule to m...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
3,When exactly should I register for the course ...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
4,How do I stay updated with announcements regar...,data-engineering-zoomcamp,12ba9d1bc6ca27c5


In [12]:
from tqdm.auto import tqdm

In [13]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = search_hybrid(query=q['question'], 
              client=qv.client, 
              collection_name=collection_name,
              limit=1)
    relevance = [point.payload['id'] == doc_id for point in results]
    relevance_total.append(relevance)

  0%|          | 0/4119 [00:00<?, ?it/s]

In [14]:
def hit_rate(relevance_total):
    counter = 0
    for ls in relevance_total:
        if sum(ls) > 0:
            counter += 1
    return counter/len(relevance_total)

In [15]:
hit_rate(relevance_total=relevance_total)

0.8040786598689003

In [16]:
def mrr(relevance_total):
    score = 0.0
    for ls in relevance_total:
        for rank in range(len(ls)):
            if ls[rank]:
                score = score + (1/(rank+1))
                break
    return score/len(relevance_total)

In [17]:
mrr(relevance_total)

0.6715092081989401

In [18]:
def evaluate(ground_truth, search_function, collection_name, client):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(query=q['question'], 
                  client=qv.client, 
                  collection_name=collection_name,
                  limit=5)
        relevance = [point.payload['id'] == doc_id for point in results]
        relevance_total.append(relevance)

    return {'hit_rate': hit_rate(relevance_total=relevance_total),
            'mrr': mrr(relevance_total=relevance_total)}

In [19]:
# hybrid search
evaluate(
    ground_truth=ground_truth,
    search_function=search_hybrid,
    collection_name=collection_name,
    client=qv.client)

  0%|          | 0/4119 [00:00<?, ?it/s]

{'hit_rate': 0.822044185481913, 'mrr': 0.6888098899023883}

In [20]:
# sparse and dense search
evaluate(
    ground_truth=ground_truth,
    search_function=search_multi_stage_sparse_and_dense,
    collection_name=collection_name,
    client=qv.client)

  0%|          | 0/4119 [00:00<?, ?it/s]

{'hit_rate': 0.71546491866958, 'mrr': 0.6177267945294165}

In [21]:
# Lets calculate cosine similarity
# -> We need to create answers for each question
# -> Then we need to do cosine similar score for each answer with original question answer

In [22]:
for q in range(0, len(ground_truth)):
    search = search_hybrid(query=ground_truth[q]['question'], client=qv.client, collection_name=collection_name, limit=1)
    answer = (search[0].payload['text'])
    ground_truth[q]['answer'] = answer

In [24]:
ground_truth[3]

{'question': 'When exactly should I register for the course to ensure my spot?',
 'course': 'data-engineering-zoomcamp',
 'document': '12ba9d1bc6ca27c5',
 'answer': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date."}

In [25]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'document_id': '561fbba60a8a1668'}

In [26]:
import pickle

In [27]:
with open('./results_practice.bin', 'rb') as f_in:
    data = pickle.load(f_in)

In [28]:
from collections import defaultdict

In [29]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['document_id']
    hashes[doc_id].append(doc)

In [30]:
from sentence_transformers import SentenceTransformer

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [41]:
scores = []
for q in ground_truth:
    vector_answer = model.encode(q["answer"])
    original_answer = model.encode(hashes[q['document']][0]['text'])
    score = vector_answer.dot(original_answer)
    scores.append(score)

In [44]:
cosine_score = sum(scores)/len(scores)
print(f"cosine similarity for retrieving data from vector store is {cosine_score}")

cosine similarity for retrieving data from vector store is 0.7463263273239136


In [98]:
import time
import os
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from tqdm.auto import tqdm
from test_functions import sum_of_squares
from rag_functions import OpenAIClient, build_context

In [59]:
numbers = [100000000, 20000000, 100000000, 15000000, 20000000]

In [60]:
start = time.time()
for num in numbers:
    print(sum_of_squares(num))
end = time.time()
print(f"time taken {end - start:.2f} seconds")

333333328333333350000000
2666666466666670000000
333333328333333350000000
1124999887500002500000
2666666466666670000000
time taken 11.94 seconds


In [72]:
start = time.time()
with ProcessPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(sum_of_squares, numbers))
end = time.time()
print(results)
print(f"time taken {end - start:.2f} seconds")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[333333328333333350000000, 2666666466666670000000, 333333328333333350000000, 1124999887500002500000, 2666666466666670000000]
time taken 5.31 seconds


In [73]:
start = time.time()
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(sum_of_squares, numbers))
end = time.time()
print(results)
print(f"time taken {end - start:.2f} seconds")

[333333328333333350000000, 2666666466666670000000, 333333328333333350000000, 1124999887500002500000, 2666666466666670000000]
time taken 12.07 seconds


In [74]:
# Lets do cosine similarity based on results from llm rag with vector store

In [75]:
MODEL = "gpt-4.1-nano"
prompt_template = """
You are a course assistant, and your goal is to answer questions of students, where QUESTION is provided below and CONTEXT is provided most of the times. 
Rules:
* Answer the QUESTION based on the CONTEXT.
* Use only the facts from the CONTEXT.
* If CONTEXT is empty, please let the student know, the information about their query is not there, however you found the following information on the web, by searching the web

QUESTION: {question}

CONTEXT: {context}
""".strip()

In [121]:
def rag(query):
    results = search_hybrid(query=query, collection_name=collection_name, client=qv.client, limit=2)

    # Build the context from the search results
    context = build_context(results)

    promt = prompt_template.format(question=query, context=context)

    client = OpenAIClient(api_key=os.getenv("OPENAI_API_KEY"), model=MODEL)

    # Send the prompt to the OpenAI API
    try:
        client_response = client.chat(query=promt)
    except RuntimeError:
        pass
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return client_response

In [122]:
def process_record(rec):

    try:
        answer_llm = rag(rec['question'])
        doc_id = rec['document']
        original_answer = hashes[doc_id][0]['text']
        return {
            'answer_llm': answer_llm,
            'answer_orig': original_answer,
            'document': doc_id,
            'question': rec['question'],
            'course': rec['course']
        }
    except RuntimeError:
        pass
    

In [139]:
def make_progress(records, func):
    with tqdm(total=len(records), desc="Building context") as pbar:
        with ThreadPoolExecutor(max_workers=8) as executor:
            futures = [executor.submit(func, record) for record in records]
        
             # Iterate through completed futures using tqdm
            for future in as_completed(futures):
                pbar.update(1)
                yield future.result()

In [124]:
results = []
for result in make_progress(ground_truth[0:200], process_record):
    results.append(result)

Building context:   0%|          | 0/200 [00:00<?, ?it/s]

In [126]:
for result in results:
    if result:
        vector_answer = model.encode(result['answer_llm'])
        original_answer = model.encode(result['answer_orig'])
        score = vector_answer.dot(original_answer)
        result['cosine_score'] = score

In [131]:
results_parsed = []
for result in results:
    if result:
        results_parsed.append(result)

In [132]:
df = pd.DataFrame(results_parsed)

In [135]:
df.cosine_score.describe()

count    189.000000
mean       0.608616
std        0.190586
min        0.074563
25%        0.479401
50%        0.631104
75%        0.752846
max        1.000000
Name: cosine_score, dtype: float64

In [136]:
#llm as a judge

In [173]:
results_parsed

[{'answer_llm': 'The starting date for the first live Office Hours session of the course is not specified in the provided CONTEXT.',
  'answer_orig': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'document': '12ba9d1bc6ca27c5',
  'question': 'What is the starting date for the first live Office Hours session of the course?',
  'course': 'data-engineering-zoomcamp',
  'cosine_score': np.float32(0.61884654)},
 {'answer_llm': 'The CONTEXT provided does not include specific information about the prerequisites necessary for participation in the "data-engineering-zoomcamp" cou

In [203]:
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks. Once generated please check is json parsable in python:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks. Once generated please check is json parsable in python:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [183]:
def llm(record):
    prompt = prompt1_template.format(**record)
    client = OpenAIClient(api_key=os.getenv("OPENAI_API_KEY"), model=MODEL)

    # Send the prompt to the OpenAI API
    try:
        client_response = client.chat(query=prompt)
        return client_response
    except RuntimeError:
        pass
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [184]:
results = []
for result in make_progress(results_parsed, llm):
    results.append(result)

Building context:   0%|          | 0/189 [00:00<?, ?it/s]

In [195]:
data = []
for res in results:
    ll_parsed = json.loads(res)
    data.append(ll_parsed)

In [196]:
df = pd.DataFrame(data)

In [201]:
df.Relevance.value_counts()/df.shape[0]

Relevance
RELEVANT           0.407407
PARTLY_RELEVANT    0.365079
NON_RELEVANT       0.227513
Name: count, dtype: float64

In [144]:
def llm(record):
    prompt = prompt2_template.format(**record)
    client = OpenAIClient(api_key=os.getenv("OPENAI_API_KEY"), model=MODEL)

    # Send the prompt to the OpenAI API
    try:
        client_response = client.chat(query=prompt)
        return client_response
    except RuntimeError:
        pass
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course starts using this link.
Join the course Telegram channel with announcements.
Don’t forget to register in DataTalks.Club's Slack and join the channel.
Generated Question: What is the starting date for the first live Office Hours session of the course?
Generated Answer: The starting date for the 

In [204]:
results = []
for result in make_progress(results_parsed, llm):
    results.append(result)

Building context:   0%|          | 0/189 [00:00<?, ?it/s]

In [205]:
data = []
for res in results:
    ll_parsed = json.loads(res)
    data.append(ll_parsed)

In [206]:
df = pd.DataFrame(data)

In [207]:
df.Relevance.value_counts()/df.shape[0]

Relevance
RELEVANT           0.439153
PARTLY_RELEVANT    0.359788
NON_RELEVANT       0.201058
Name: count, dtype: float64