In [2]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

In [3]:
import numpy as np

# NumPy 2.0 removed the `np.float_` alias; make sure it exists and points at
# `np.float64`. On older NumPy the alias is already that type, so nothing changes.
# NOTE(review): presumably needed because an installed dependency still
# references `np.float_` -- confirm which one and drop this shim once it is upgraded.
if not hasattr(np, 'float_'):
    np.float_ = np.float64

In [4]:
# Sentence-transformers checkpoint used for every embedding in this notebook;
# documents and queries are both encoded with this same `model`.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [5]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [6]:
# Load the FAQ documents. The encoding is pinned to UTF-8 (the JSON interchange
# encoding) so the result does not depend on the platform's default locale.
with open('documents_with_ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [7]:
# Attach three embeddings to every document (this mutates `documents` in place):
# one for the question, one for the answer text, and one for the two concatenated.
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    qt = question + ' ' + text

    doc['question_vector'] = model.encode(question)
    doc['text_vector'] = model.encode(text)
    doc['question_text_vector'] = model.encode(qt)


  0%|          | 0/948 [00:00<?, ?it/s]

In [8]:
es_client = Elasticsearch('http://localhost:9200') 

In [9]:
# Index definition: a single shard without replicas, analysed text fields for
# keyword search, exact-match keyword fields, and one dense vector per embedding.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # The three vector fields share the same mapping; 384 matches the
            # `word_embedding_dimension` shown in the model repr.
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "question_vector",
                    "text_vector",
                    "question_text_vector",
                )
            },
        }
    },
}

In [10]:
index_name = "course-questions"

# Drop any previous copy first (a missing index is not an error) so re-running
# this cell always starts from an empty index built from `index_settings`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [11]:
# Index the documents one request at a time; each one already carries the
# three embedding vectors added by the encoding loop.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

## Hybrid search example


In [12]:
course = "data-engineering-zoomcamp"

In [13]:
query = 'I just discovered the course. Can I still join it?'

In [14]:
# Encode the query with the same model and turn the array into a plain list of
# Python floats for the request.
v_q = model.encode(query).astype(np.float64).tolist()

In [15]:
# Vector half of the hybrid request: nearest neighbours of the query embedding
# on `text_vector`, restricted to documents of the selected course.
knn_query = {
    "field": "text_vector",
    "query_vector": v_q,
    "k": 5,
    "num_candidates": 10000,
    # Same boost as the keyword clause defined in the next cell.
    "boost": 0.5,
    "filter":{
        "term":{
            "course": course
        }
    }
}

In [16]:
# Keyword half of the hybrid request: full-text match over question, text and
# section, restricted to the selected course.
# NOTE(review): this example weights `question` with `^3`, whereas the
# evaluation helpers further down search plain `question` -- confirm which one is intended.
keyword_query = {
    "bool": {
        "must": {
            "multi_match": {
                "query": query, 
                "fields": ["question^3", "text", "section"], 
                "type": "best_fields", 
                "boost": 0.5, 
            }
        },
        "filter": {
            "term":{
                "course": course
            }
        }
    }
}

In [17]:
# One request carrying both the keyword query and the kNN clause.
# No `_source` filter is given, so each hit comes back with its stored vectors
# as well (visible in the output of the next cell).
response = es_client.search(
    index=index_name,
    query=keyword_query, 
    knn=knn_query,
    size=5
)




In [18]:
response["hits"]["hits"]


[{'_index': 'course-questions',
  '_id': '6fYneZgBEJjkk6TalzL3',
  '_score': 36.424633,
  '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
   'section': 'General course-related questions',
   'question': 'Course - Can I still join the course after the start date?',
   'course': 'data-engineering-zoomcamp',
   'id': '7842b56a',
   'question_vector': [0.003035882720723748,
    -0.002387187909334898,
    0.035881657153367996,
    0.020998885855078697,
    -0.01828235760331154,
    0.06715092062950134,
    -0.10277322679758072,
    -0.11509551107883453,
    -0.06606756150722504,
    -0.004973332863301039,
    -0.0028617552015930414,
    0.10543150454759598,
    -0.0008142856531776488,
    0.08418366312980652,
    0.02704714797437191,
    -0.03135378286242485,
    -0.05154325067996979,
    -0.0494899675250053

## Hybrid search pipeline


In [19]:
# Ground-truth pairs: a question, its course, and the id of the document that
# is expected to be retrieved for it (`document` column).
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [20]:
df_ground_truth

Unnamed: 0,question,course,document
0,What is the start date and time for the course?,data-engineering-zoomcamp,c02e79ef
1,How can I subscribe to the course's public Goo...,data-engineering-zoomcamp,c02e79ef
2,Where can I register for the course before it ...,data-engineering-zoomcamp,c02e79ef
3,Where can I find updates and announcements rel...,data-engineering-zoomcamp,c02e79ef
4,How can I join the course's community on Slack...,data-engineering-zoomcamp,c02e79ef
...,...,...,...
4728,In what way can I remove infrastructure that w...,mlops-zoomcamp,886d1617
4729,How should I initialize Terraform for destroyi...,mlops-zoomcamp,886d1617
4730,What is the configuration I should use for Ter...,mlops-zoomcamp,886d1617
4731,What file should I use to provide variables fo...,mlops-zoomcamp,886d1617


In [21]:
# One dict per row, so each record can be handed directly to a search function.
ground_truth = df_ground_truth.to_dict(orient='records')


In [22]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one list of booleans per query, in ranking order;
            ``True`` marks a result that is the expected document.

    Returns:
        Number of queries with at least one ``True`` divided by the number of
        queries, or ``0.0`` when ``relevance_total`` is empty (instead of
        raising ``ZeroDivisionError``).
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [23]:
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: one list of booleans per query, in ranking order;
            ``True`` marks a result that is the expected document.

    Returns:
        Average over all queries of ``1 / rank`` of the first relevant result
        (a query with no relevant result contributes 0). Returns ``0.0`` when
        ``relevance_total`` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant == True:
                total_score += 1 / rank
                # Only the first relevant result counts; without this a query
                # with several matches would add more than one reciprocal rank
                # and the metric could exceed 1.
                break

    return total_score / total

In [24]:
def elastic_search_hybird(field, query, vector, course):
    """Run one hybrid (kNN + keyword) search and return the matching documents.

    Args:
        field: name of the dense-vector field used for the kNN clause.
        query: text for the ``multi_match`` keyword clause.
        vector: query embedding sent as ``query_vector``.
        course: value the ``course`` field must equal in both clauses.

    Returns:
        List of at most five ``_source`` dicts, limited to the ``text``,
        ``section``, ``question``, ``course`` and ``id`` fields.
    """
    # Both halves of the request are restricted to the same course.
    course_filter = {"term": {"course": course}}

    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.5,
            "filter": course_filter,
        },
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question", "text", "section"],
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                },
                "filter": course_filter,
            }
        },
        "size": 5,
        "_source": ["text", "section", "question", "course", "id"],
    }

    response = es_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in response['hits']['hits']]

In [25]:
def question_hybird(q):
    """Hybrid search against ``question_vector`` for one ground-truth record.

    Args:
        q: record with a ``question`` (used both as keyword text and as the
            text to embed) and a ``course`` to filter on.

    Returns:
        Whatever ``elastic_search_hybird`` returns for that query.
    """
    question = q['question']
    course = q['course']

    # Send a plain list of Python floats, as `text_hybird` does, instead of
    # the raw array returned by `model.encode`.
    v_q = model.encode(question).astype(np.float64).tolist()

    return elastic_search_hybird('question_vector', question, v_q, course)



In [26]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: iterable of records; each has a ``document`` id and is
            passed unchanged to ``search_function``.
        search_function: callable taking one record and returning a ranked
            list of documents that each have an ``id``.

    Returns:
        Dict with the ``hit_rate`` and ``mrr`` computed over all records.
    """
    def _relevance(record):
        # Read the expected id before searching, then flag each result.
        expected_id = record['document']
        return [found['id'] == expected_id for found in search_function(record)]

    relevance_total = [_relevance(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }
    

In [27]:
evaluate(ground_truth, question_hybird)

  0%|          | 0/4733 [00:00<?, ?it/s]

{'hit_rate': 0.9163321360659201, 'mrr': 0.8570110571166989}

In [28]:
def text_hybird(q):
    """Hybrid search against ``text_vector`` for one ground-truth record.

    Args:
        q: record with a ``question`` (used both as keyword text and as the
            text to embed) and a ``course`` to filter on.

    Returns:
        Whatever ``elastic_search_hybird`` returns for that query.
    """
    query_text, course_name = q['question'], q['course']

    # Embed the question and convert it to a plain list of Python floats.
    embedding = model.encode(query_text)
    embedding = embedding.astype(np.float64).tolist()

    return elastic_search_hybird('text_vector', query_text, embedding, course_name)



In [29]:
evaluate(ground_truth, text_hybird)

  0%|          | 0/4733 [00:00<?, ?it/s]

{'hit_rate': 0.9142193112190999, 'mrr': 0.855215155996902}

In [30]:
def question_text_hybrid(q):
    """Hybrid search against ``question_text_vector`` for one ground-truth record.

    Args:
        q: record with a ``question`` (used both as keyword text and as the
            text to embed) and a ``course`` to filter on.

    Returns:
        Whatever ``elastic_search_hybird`` returns for that query.
    """
    question = q['question']
    course = q['course']

    # Send a plain list of Python floats, as `text_hybird` does, instead of
    # the raw array returned by `model.encode`.
    v_q = model.encode(question).astype(np.float64).tolist()

    return elastic_search_hybird('question_text_vector', question, v_q, course)

evaluate(ground_truth, question_text_hybrid)

  0%|          | 0/4733 [00:00<?, ?it/s]

{'hit_rate': 0.9152757236425101, 'mrr': 0.8578491443059377}

## Reranking


In [31]:
def elastic_search_hybrid_rrf(field, query, vector, course):
    """Hybrid search that asks Elasticsearch itself to fuse results with RRF.

    The request is the same kNN + keyword pair as ``elastic_search_hybird``
    plus a ``rank: {"rrf": {}}`` section.

    NOTE: a later cell in this notebook defines another function with this
    same name (client-side RRF); once that cell runs, it replaces this one.

    Args:
        field: name of the dense-vector field used for the kNN clause.
        query: text for the ``multi_match`` keyword clause.
        vector: query embedding sent as ``query_vector``.
        course: value the ``course`` field must equal in both clauses.

    Returns:
        List of at most five ``_source`` dicts, limited to the ``text``,
        ``section``, ``question``, ``course`` and ``id`` fields.
    """
    # Both halves of the request are restricted to the same course.
    course_filter = {"term": {"course": course}}

    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.5,
            "filter": course_filter,
        },
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question", "text", "section"],
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                },
                "filter": course_filter,
            }
        },
        "size": 5,
        # Server-side reciprocal rank fusion with default settings.
        "rank": {"rrf": {}},
        "_source": ["text", "section", "question", "course", "id"],
    }

    response = es_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in response['hits']['hits']]

In [35]:
elastic_search_hybrid_rrf('question_text_vector', query, v_q, course)



By default, RRF isn't available in the free-tier subscription. To use it, you can start a 30-day trial or upgrade the subscription plan.


## RRF implementation


In [42]:
def compute_rrf(rank, k=60):
    """Reciprocal-rank-fusion contribution of a single ranked result.

    Args:
        rank: position of the result in its list (callers in this notebook
            pass 1 for the top result).
        k: constant added to the rank before taking the reciprocal.

    Returns:
        ``1 / (k + rank)``.
    """
    denominator = k + rank
    return 1 / denominator

def elastic_search_hybrid_rrf(field, query, vector, course, k=60):
    """Hybrid search fused on the client with reciprocal rank fusion (RRF).

    Runs the kNN search and the keyword search as two separate requests
    (ten hits each), sums ``compute_rrf(rank, k)`` per document over both
    result lists, and returns the five best documents by that sum.

    Args:
        field: name of the dense-vector field used for the kNN search.
        query: text for the ``multi_match`` keyword search.
        vector: query embedding sent as ``query_vector``.
        course: value the ``course`` field must equal in both searches.
        k: RRF constant passed to ``compute_rrf`` (not the kNN ``k``).

    Returns:
        List of at most five ``_source`` dicts, best fused score first.
    """
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question", "text", "section"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
            "filter": {
                "term": {
                    "course": course
                }
            }
        }
    }

    knn_results = es_client.search(
        index=index_name,
        body={
            "knn": knn_query,
            "size": 10
        }
    )['hits']['hits']

    keyword_results = es_client.search(
        index=index_name,
        body={
            "query": keyword_query,
            "size": 10
        }
    )['hits']['hits']

    rrf_scores = {}
    # `_source` of every hit seen, so the winners need no further requests.
    sources = {}

    # Vector results first, then keyword results: a document present in both
    # lists accumulates one contribution from each.
    for hits in (knn_results, keyword_results):
        for rank, hit in enumerate(hits, start=1):
            doc_id = hit['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0.0) + compute_rrf(rank, k)
            sources.setdefault(doc_id, hit['_source'])

    # Sort by fused score, highest first (ties keep first-seen order).
    reranked_docs = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)

    # The searches already returned each document's `_source`, so reuse it
    # instead of issuing one extra `get` request per result.
    return [sources[doc_id] for doc_id, _ in reranked_docs[:5]]

In [None]:
def question_text_hybrid_rrf(q):
    """RRF hybrid search against ``question_text_vector`` for one record.

    Args:
        q: record with a ``question`` (used both as keyword text and as the
            text to embed) and a ``course`` to filter on.

    Returns:
        Whatever ``elastic_search_hybrid_rrf`` returns for that query.
    """
    question = q['question']
    course = q['course']

    # Send a plain list of Python floats, as `text_hybird` does, instead of
    # the raw array returned by `model.encode`.
    v_q = model.encode(question).astype(np.float64).tolist()

    return elastic_search_hybrid_rrf('question_text_vector', question, v_q, course)

evaluate(ground_truth, question_text_hybrid_rrf)

  0%|          | 0/4733 [00:00<?, ?it/s]