Import Dependencies and read in Ground Truth Data

In [2]:
import pickle
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

with open('fixed_ground_truth.pkl','rb') as infile:
    data = pickle.load(infile)

  from tqdm.autonotebook import tqdm, trange
2024-09-16 13:15:14.956844: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-16 13:15:14.976774: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-16 13:15:14.976811: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-16 13:15:14.989814: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  retur

Load model for Vector Search and connect to Elasticsearch

In [5]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)
es = Elasticsearch("http://localhost:9200")

Create single text vector from title, the transcript text and the description

In [6]:
vector_data = []
for i in data:
    d = {}
    d['title'] = i['title']
    d['text'] = i['text']
    d['timecode_text'] = i['timecode_text']
    d['description'] = i['description']
    d['id'] = i['id']
    d['text_vector'] = model.encode(i['title']+' '+i['text']+' '+i['description'])
    vector_data.append(d)




Define the index for the single vector search

In [8]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "text": {"type": "text"},
            "timecode_text": {"type": "text"},
            "description": {"type": "keyword"},
            "id": {"type": "keyword"},
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
        }
    }


index_name = "vector-search"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vector-search'})

Insert the records

In [9]:
for doc in vector_data:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

In [10]:
def knn_query(question):
    return  {
        "field": "text_vector",
        "query_vector": model.encode(question),
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
        
    }

In [11]:
question = "When did we talk about Mage"

Search

In [12]:
def vector_search(question):
    res = es_client.search(index=index_name, knn=knn_query(question), source=["id"])
    return res["hits"]["hits"]

Return results to calculate Hit Rate and MRR

In [13]:
from tqdm.auto import tqdm
relevance_total = []
for q in tqdm(data):
    doc_id = q['id']
    results = vector_search(q['student_question'])
    relevance = [d["_source"]['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/837 [00:00<?, ?it/s]

In [21]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)
print(f"VECTOR SEARCH Hit Rate is: {hit_rate(relevance_total)}")

VECTOR SEARCH Hit Rate is: 0.4540023894862604


In [22]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)
print(f"VECTOR SEARCH  MRR is: {mrr(relevance_total)}")

VECTOR SEARCH  MRR is: 0.6554958183990423


Prepare data for hybrid search this time creating 4 vecors

In [14]:
hybrid_data = []
for i in data:
    d = {}
    d['title'] = i['title']
    d['text'] = i['text']
    d['timecode_text'] = i['timecode_text']
    d['description'] = i['description']
    d['id'] = i['id']
    d['title_vector'] = model.encode(i['title'])
    d['timecode_vector'] = model.encode(i['timecode_text'])
    d['text_vector'] = model.encode(i['text'])
    d['description_vector'] = model.encode(i['description'])
    hybrid_data.append(d)




In [15]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "text": {"type": "text"},
            "timecode_text": {"type": "text"},
            "description": {"type": "keyword"},
            "id": {"type": "keyword"},
            "title_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "timecode_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "description_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
        }
    }


index_name = "hybrid-search"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'hybrid-search'})

In [16]:
for doc in hybrid_data:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

In [17]:

def knn_query(question, vector):
    return  {
        "field": f"{vector}",
        "query_vector": model.encode(question),
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
        
    }

In [18]:
def keyword_query(question):
    return {
        "bool": {
            "must": {
                "multi_match": {
                    "query": f"{question}",
                    "fields": ["description^3", "text", "title"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
        }
    }

In [19]:
def multi_search(key_word, vector):
    response = es_client.search(
        index=index_name,
        query=keyword_query(key_word),
        knn=knn_query(key_word, vector),
        size=10
    )
    return response["hits"]["hits"]

Use Hybrid search, but also cycle through each vector and use it in the Hybrid Search to determine Hit Rate and MRR of each

In [29]:
for vector in ['title_vector','timecode_vector','text_vector','description_vector']:
    from tqdm.auto import tqdm
    relevance_total = []
    for q in tqdm(data):
        doc_id = q['id']
        results = multi_search(q['student_question'], vector)
        relevance = [d["_source"]['id'] == doc_id for d in results]
        relevance_total.append(relevance)
        cnt = 0
        for line in relevance_total:
            if True in line:
                cnt = cnt + 1

    
    print(f"Hybrid Hit rate for {vector} is: {cnt / len(relevance_total)}")

  0%|          | 0/837 [00:00<?, ?it/s]

Hybrid Hit rate for title_vector is: 0.5543608124253285


  0%|          | 0/837 [00:00<?, ?it/s]

In [30]:
for vector in ['title_vector','timecode_vector','text_vector','description_vector']:
    from tqdm.auto import tqdm
    relevance_total = []
    for q in tqdm(data):
        doc_id = q['id']
        results = multi_search(q['student_question'], vector)
        relevance = [d["_source"]['id'] == doc_id for d in results]
        relevance_total.append(relevance)
        total_score = 0.0
        for line in relevance_total:
            for rank in range(len(line)):
                if line[rank] == True:
                    total_score = total_score + 1 / (rank + 1)


    print(f"MRR is {vector}: {total_score / len(relevance_total)}")

  0%|          | 0/837 [00:00<?, ?it/s]

MRR is title_vector: 0.727193681136334


  0%|          | 0/837 [00:00<?, ?it/s]

MRR is timecode_vector: 0.7417022244979242


  0%|          | 0/837 [00:00<?, ?it/s]

MRR is text_vector: 0.737086818000797


  0%|          | 0/837 [00:00<?, ?it/s]

MRR is description_vector: 0.7285970302099337
