In [8]:
import pickle
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

with open('fixed_ground_truth.pkl','rb') as infile:
    data = pickle.load(infile)

In [9]:
data[0:4]

[{'vid_id': 'Q75JgLEXMsM',
  'title': 'LLM Zoomcamp 1.1 - Introduction to LLM and RAG',
  'timecode': '00:00',
  'text': "hi everyone Welcome to our course this is our first module for first unit so in this course the course is called llm Zoom camp in this course we will learn about practical applications of llm and in particular we will focus our attention on rack retrieval a generation I'll shortly talk about these variations what they mean um and what we exactly will do and I want to start first with explaining the problem we are going to use uh to solve throughout the course um so this will be our running problem and in our community in data do club we have multiple courses so this llm Zoom Camp is our fifth course and usually in our courses we have frequently asked questions so there are questions that uh there are no answers in the videos or answers are not uh easy to find and we have these documents I'll quickly open one of them and in these documents we have frequently asked qu

In [10]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)
es = Elasticsearch("http://localhost:9200")

In [11]:
vector_data = []
for i in data:
    d = {}
    d['title'] = i['title']
    d['text'] = i['text']
    d['timecode_text'] = i['timecode_text']
    d['description'] = i['description']
    d['id'] = i['id']
    d['text_vector'] = model.encode(i['title']+' '+i['text']+' '+i['description'])
    vector_data.append(d)




In [12]:
count = 0
for i in data:
    if i['id']:
        count += 1
print(count)

837


In [13]:
len(vector_data[0]['text_vector'])

384

In [14]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "text": {"type": "text"},
            "timecode_text": {"type": "text"},
            "description": {"type": "keyword"},
            "id": {"type": "keyword"},
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
        }
    }


index_name = "vector-search"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vector-search'})

In [15]:
for doc in vector_data:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

In [16]:
def knn_query(question):
    return  {
        "field": "text_vector",
        "query_vector": model.encode(question),
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
        
    }

In [17]:
question = "When did we talk about Mage"

In [18]:
def vector_search(question):
    res = es_client.search(index=index_name, knn=knn_query(question), source=["id"])
    return res["hits"]["hits"]

In [19]:
vector_search(question)

[{'_index': 'vector-search',
  '_id': '23ZD-JEBH2OOoKBj1asY',
  '_score': 0.28033978,
  '_source': {'id': '3cba0dbf73297ec686a5b91511d47dbe'}},
 {'_index': 'vector-search',
  '_id': '3HZD-JEBH2OOoKBj1asc',
  '_score': 0.28033978,
  '_source': {'id': '3cba0dbf73297ec686a5b91511d47dbe'}},
 {'_index': 'vector-search',
  '_id': '3XZD-JEBH2OOoKBj1asg',
  '_score': 0.28033978,
  '_source': {'id': '3cba0dbf73297ec686a5b91511d47dbe'}},
 {'_index': 'vector-search',
  '_id': 'vnZD-JEBH2OOoKBj0Kri',
  '_score': 0.2729825,
  '_source': {'id': 'db6058ddeee3cd31cb9bcc0aa8db80ea'}},
 {'_index': 'vector-search',
  '_id': 'v3ZD-JEBH2OOoKBj0Krm',
  '_score': 0.2729825,
  '_source': {'id': 'db6058ddeee3cd31cb9bcc0aa8db80ea'}}]

In [20]:
from tqdm.auto import tqdm
relevance_total = []
for q in tqdm(data):
    doc_id = q['id']
    results = vector_search(q['student_question'])
    relevance = [d["_source"]['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/837 [00:00<?, ?it/s]

In [21]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)
print(f"VECTOR SEARCH Hit Rate is: {hit_rate(relevance_total)}")

VECTOR SEARCH Hit Rate is: 0.4540023894862604


In [22]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)
print(f"VECTOR SEARCH  MRR is: {mrr(relevance_total)}")

VECTOR SEARCH  MRR is: 0.6554958183990423


In [23]:
hybrid_data = []
for i in data:
    d = {}
    d['title'] = i['title']
    d['text'] = i['text']
    d['timecode_text'] = i['timecode_text']
    d['description'] = i['description']
    d['id'] = i['id']
    d['title_vector'] = model.encode(i['title'])
    d['timecode_vector'] = model.encode(i['timecode_text'])
    d['text_vector'] = model.encode(i['text'])
    d['description_vector'] = model.encode(i['description'])
    hybrid_data.append(d)




In [24]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "text": {"type": "text"},
            "timecode_text": {"type": "text"},
            "description": {"type": "keyword"},
            "id": {"type": "keyword"},
            "title_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "timecode_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "description_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
        }
    }


index_name = "hybrid-search"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'hybrid-search'})

In [25]:
for doc in hybrid_data:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

In [26]:

def knn_query(question, vector):
    return  {
        "field": f"{vector}",
        "query_vector": model.encode(question),
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
        
    }

In [27]:
def keyword_query(question):
    return {
        "bool": {
            "must": {
                "multi_match": {
                    "query": f"{question}",
                    "fields": ["description^3", "text", "title"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
        }
    }

In [28]:
def multi_search(key_word, vector):
    response = es_client.search(
        index=index_name,
        query=keyword_query(key_word),
        knn=knn_query(key_word, vector),
        size=10
    )
    return response["hits"]["hits"]

In [29]:
for vector in ['title_vector','timecode_vector','text_vector','description_vector']:
    from tqdm.auto import tqdm
    relevance_total = []
    for q in tqdm(data):
        doc_id = q['id']
        results = multi_search(q['student_question'], vector)
        relevance = [d["_source"]['id'] == doc_id for d in results]
        relevance_total.append(relevance)
        cnt = 0
        for line in relevance_total:
            if True in line:
                cnt = cnt + 1

    
    print(f"Hybrid Hit rate for {vector} is: {cnt / len(relevance_total)}")

  0%|          | 0/837 [00:00<?, ?it/s]

Hybrid Hit rate for title_vector is: 0.5543608124253285


  0%|          | 0/837 [00:00<?, ?it/s]

Hybrid Hit rate for timecode_vector is: 0.5639187574671446


  0%|          | 0/837 [00:00<?, ?it/s]

Hybrid Hit rate for text_vector is: 0.5579450418160096


  0%|          | 0/837 [00:00<?, ?it/s]

Hybrid Hit rate for description_vector is: 0.5531660692951016


In [30]:
for vector in ['title_vector','timecode_vector','text_vector','description_vector']:
    from tqdm.auto import tqdm
    relevance_total = []
    for q in tqdm(data):
        doc_id = q['id']
        results = multi_search(q['student_question'], vector)
        relevance = [d["_source"]['id'] == doc_id for d in results]
        relevance_total.append(relevance)
        total_score = 0.0
        for line in relevance_total:
            for rank in range(len(line)):
                if line[rank] == True:
                    total_score = total_score + 1 / (rank + 1)


    print(f"MRR is {vector}: {total_score / len(relevance_total)}")

  0%|          | 0/837 [00:00<?, ?it/s]

MRR is title_vector: 0.727193681136334


  0%|          | 0/837 [00:00<?, ?it/s]

MRR is timecode_vector: 0.7417022244979242


  0%|          | 0/837 [00:00<?, ?it/s]

MRR is text_vector: 0.737086818000797


  0%|          | 0/837 [00:00<?, ?it/s]

MRR is description_vector: 0.7285970302099337


In [31]:
# def hit_rate(relevance_total):
#     cnt = 0

#     for line in relevance_total:
#         if True in line:
#             cnt = cnt + 1

#     return cnt / len(relevance_total)
# print(f"Hybrid Hit rate is: {hit_rate(relevance_total)}")

In [32]:
# def mrr(relevance_total):
#     total_score = 0.0

#     for line in relevance_total:
#         for rank in range(len(line)):
#             if line[rank] == True:
#                 total_score = total_score + 1 / (rank + 1)

#     return total_score / len(relevance_total)
# print(f"MRR is: {mrr(relevance_total)}")