# Evaluate Retrieval System

In this notebook different model embeddings and vector storages will be tested.

The ground truth questions loaded here were created using the 02_generate_ground_truth notebook.

## Import main libraries

In [1]:
import pickle
import json
import os
import pickle
from langchain_community.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings
import minsearch
from llama_index.core.schema import TextNode
from tqdm.autonotebook import tqdm
from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.postgres import PGVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv
from tqdm.autonotebook import tqdm
import sys
import os
import pandas as pd
import pickle

load_dotenv()

project_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, os.path.join(project_dir, "utils"))

  from tqdm.autonotebook import tqdm


## Load data to build retriever system

In [2]:
with open("../data/docs_processed.pickle", "rb") as f:
    documents = pickle.load(f)

In [3]:
len(documents)

4428

In [4]:
documents[10]

{'metadata': {'document_id': '301ee2b109',
  'pdf_name': 'Responsible_travel.pdf',
  'pdf_part': 1,
  'Header 2': 'Economic Impact'},
 'content': 'Cultural **impact** Environmental impact See also: Sustainable travel Climate change Overtourism\n"They took all the trees, put \'em in a tree museum, And they charged the people a dollar and a half just to see \'em. Don\'t it always seem to go That you don\'t know what you\'ve got till it\'s gone?  \nThey paved paradise and put up a parking lot. "  \n![1_image_0.png](1_image_0.png)'}

## Load ground truth documents from pickle

In [5]:
model_name = "gpt-4o"  # "phi3"
with open("../data/GT_docs_{}.bin".format(model_name), "rb") as file:
    ground_truth = pickle.load(file)

In [6]:
unsucessfull_parses = set()

for gt in ground_truth:
    try:
        questions_list = json.loads(gt["ground_truth"])
        gt["ground_truth"] = questions_list
    except:
        print(gt["metadata"]["document_id"])
        unsucessfull_parses.add(gt["metadata"]["document_id"])

774afd330e
774afd330e
774afd330e
774afd330e


In [7]:
for doc in documents:
    for key in doc["metadata"].keys():
        doc[key] = doc["metadata"][key]

In [8]:
len(ground_truth)

4428

In [9]:
# Step 1: Flatten the dictionary and remove 'metadata'
flattened_gt_list = []
for gt in ground_truth:
    # Create a new dictionary with all keys from 'metadata' moved to the top level
    flattened_dict = {**gt["metadata"], "content": gt["content"]}

    # Step 2: Create a new list of dictionaries with individual questions
    for question in gt["ground_truth"]:
        # For each question, create a new dictionary with 'question' and other data
        question_dict = {**flattened_dict, "question": question}
        flattened_gt_list.append(question_dict)

In [10]:
len(flattened_gt_list)

19408

In [11]:
flattened_gt_list[100]

{'document_id': '69d0cddd8b',
 'pdf_name': 'Responsible_travel.pdf',
 'pdf_part': 11,
 'Header 2': 'By Plane',
 'content': 'Lufthansa (https://www.lufthansa.com/us/en/offset-flight), Qantas (http://www.qantas.com.au/travel/airlines/fly-carbon-neutral/global/en), & United Airlines (http:// www.united.com/web/en-US/content/company/globalcitizenship/environment_faq.aspx).',
 'question': 'What are ways to offset my carbon footprint when flying to Southeast Asia?'}

In [37]:
with open("../data/GT_docs_parsed_{}.bin".format(model_name), "wb") as file:
    pickle.dump(flattened_gt_list, file)

## Define evaluation metrics

In [30]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)


def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

## Build Min Search

In [39]:
ground_truth[0]

{'metadata': {'document_id': 'f392923b41',
  'pdf_name': 'Responsible_travel.pdf',
  'pdf_part': 0,
  'Header 2': 'Responsible Travel'},
 'content': 'See Sustainable travel for the ecological and appropriate technology dimension of travel sustainability.',
 'ground_truth': ['What are some eco-friendly accommodations in Southeast Asia?',
  'How can I minimize my ecological footprint while traveling in Southeast Asia?',
  'What are the best practices for responsible tourism in Southeast Asia?',
  'Are there any community-based tourism projects I can support in Southeast Asia?']}

In [40]:
documents[0]

{'metadata': {'document_id': 'f392923b41',
  'pdf_name': 'Responsible_travel.pdf',
  'pdf_part': 0,
  'Header 2': 'Responsible Travel'},
 'content': 'See Sustainable travel for the ecological and appropriate technology dimension of travel sustainability.',
 'document_id': 'f392923b41',
 'pdf_name': 'Responsible_travel.pdf',
 'pdf_part': 0,
 'Header 2': 'Responsible Travel'}

In [41]:
index = minsearch.Index(
    text_fields=["content", "Header 2"],
    keyword_fields=["pdf_name", "document_id"],
)

index.fit(documents)

<minsearch.minsearch.Index at 0x1699c7bc0>

In [62]:
def minsearch_search(query, pdf_name):

    boost = {"content": 3.0, "Header 2": 2}

    results = index.search(
        query=query, filter_dict={"pdf_name": pdf_name}, boost_dict=boost, num_results=4
    )

    return results

In [63]:
relevance_total = []

for q in tqdm(flattened_gt_list):
    doc_id = q["document_id"]
    results = minsearch_search(query=q["question"], pdf_name=q["pdf_name"])
    relevance = [d["document_id"] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 19408/19408 [00:30<00:00, 642.45it/s]


In [64]:
hit_rate(relevance_total), mrr(relevance_total)

(0.4506904369332234, 0.3569189681230966)

## Build Chromadb

In [65]:
def chroma_search(db, query):
    results = db.similarity_search(query)
    return results

In [47]:
def create_langchain_document(text, metadata):
    return Document(page_content=text, metadata=metadata)


def create_langchain_document_list(document_dicts):
    documents = []
    for doc_dict in document_dicts:
        metadata = doc_dict["metadata"]
        content = doc_dict["content"]
        doc = create_langchain_document(content, metadata)
        documents.append(doc)
    return documents


lang_chain_docs = create_langchain_document_list(documents)

In [48]:
model_name = "BAAI/bge-large-en-v1.5"
encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity
embedding_model = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    # model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="为这个句子生成表示以用于检索相关文章：",
)

In [49]:
db = Chroma.from_documents(
    lang_chain_docs, embedding_model, persist_directory="../pdf_chroma_db"
)

In [52]:
q

{'metadata': {'document_id': 'f392923b41',
  'pdf_name': 'Responsible_travel.pdf',
  'pdf_part': 0,
  'Header 2': 'Responsible Travel'},
 'content': 'See Sustainable travel for the ecological and appropriate technology dimension of travel sustainability.',
 'ground_truth': ['What are some eco-friendly accommodations in Southeast Asia?',
  'How can I minimize my ecological footprint while traveling in Southeast Asia?',
  'What are the best practices for responsible tourism in Southeast Asia?',
  'Are there any community-based tourism projects I can support in Southeast Asia?']}

In [56]:
results[0].metadata

{'Header 1': 'Sustainable Travel',
 'Header 2': 'Sleep',
 'document_id': '15a52fbd26',
 'pdf_name': 'Sustainable_travel.pdf',
 'pdf_part': 1}

In [57]:
relevance_total = []

for q in tqdm(flattened_gt_list):
    doc_id = q["document_id"]
    results = chroma_search(db, query=q["question"])
    relevance = [d.metadata["document_id"] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 19408/19408 [14:35<00:00, 22.17it/s]


In [61]:
results

[Document(metadata={'Header 2': 'Responsible Travel Vs Ecotourism', 'document_id': '301ee2b109', 'pdf_name': 'Responsible_travel.pdf', 'pdf_part': 1}, page_content="that the lodge you stay at does not harm the environment, but that the businesses you interact with (tourism providers, shops, hotels, etc) care about the local people and cultures as much as their bottom line. When you shop, you're putting your money in the hands of locals in a sustainable way, not staying at chain hotels, where revenue isn't spread around."),
 Document(metadata={'Header 2': 'Responsible Travel Vs Ecotourism', 'document_id': '301ee2b109', 'pdf_name': 'Responsible_travel.pdf', 'pdf_part': 1}, page_content="that the lodge you stay at does not harm the environment, but that the businesses you interact with (tourism providers, shops, hotels, etc) care about the local people and cultures as much as their bottom line. When you shop, you're putting your money in the hands of locals in a sustainable way, not stayi

In [59]:
hit_rate(relevance_total), mrr(relevance_total)

(0.17152720527617477, 0.2094325364111078)

## Evaluation Results Chroma vs MinSearch

The minsearch method is giving a hit rate of 0.45 and MRR of 0.37. The Chroma method is giving a hit rate of 0.17 and MRR of 0.20. I would expect that chroma db would have a higher score since it using text embeddings. Let's try another embedding model:

In [6]:
model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity
embedding_model = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    # model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="为这个句子生成表示以用于检索相关文章：",
)



In [67]:
db2 = Chroma.from_documents(
    lang_chain_docs, embedding_model, persist_directory="../pdf_chroma_db2"
)

In [68]:
relevance_total = []

for q in tqdm(flattened_gt_list):
    doc_id = q["document_id"]
    results = chroma_search(db2, query=q["question"])
    relevance = [d.metadata["document_id"] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 19408/19408 [04:16<00:00, 75.57it/s]


In [69]:
hit_rate(relevance_total), mrr(relevance_total)

(0.2796784830997527, 0.19250223275625766)

This has improved the hit rate but reduced the MRR. I will now try to implement a more complex retrieval system, using hybrid search and document reranking. 

## Build PGVector

In [11]:
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

db_name = "vector_db"
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="marialoureiro",
    password="password",
)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

conn.autocommit = True

with conn.cursor() as c:
    c.execute(f"DROP DATABASE IF EXISTS {db_name}")
    c.execute(f"CREATE DATABASE {db_name}")

conn.close()

In [12]:
model_name = (
    "BAAI/bge-large-en-v1.5"  # "sentence-transformers/multi-qa-mpnet-base-dot-v1"
)
embed_dim = 1024
llama_embedding_model = HuggingFaceEmbedding(model_name=model_name)

In [13]:
vector_store = PGVectorStore.from_params(
    database=db_name,
    host="localhost",
    password="password",
    port=5432,
    user="marialoureiro",
    table_name="southeast_asia_countries",
    hybrid_search=True,
    text_search_config="english",
    embed_dim=embed_dim,
)
"""     hnsw_kwargs={
        "hnsw_m": 16,
        "hnsw_ef_construction": 64,
        "hnsw_ef_search": 40,
        "hnsw_dist_method": "cosine",
    }, """


def build_index(embedding_model):
    """
    Builds an index using the provided embedding model.

    Args:
        embedding_model: The embedding model to use for generating embeddings.
    """

    Settings.embed_model = embedding_model

    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    index = VectorStoreIndex.from_documents([], storage_context=storage_context)

    return index


def add_document(index, document: dict):
    """
    Adds a document to the index.

    Args:
        document (dict): A dictionary containing the document's content and metadata.
    """
    llama_node = TextNode(
        text=document["content"],
        metadata=document["metadata"],
        id_=document["metadata"]["document_id"],
    )

    index.insert_nodes([llama_node])

    return index

In [14]:
index = build_index(llama_embedding_model)

In [15]:
for doc in documents:
    index = add_document(index, doc)

In [20]:
query_engine = index.as_query_engine(vector_store_query_mode="hybrid", sparse_top_k=4)

In [21]:
answer = query_engine.query(
    "hello, what is the best city to visit in indonesia?",
)
print(answer.response)
print(answer.source_nodes[0].text)

Bali is one of the best cities to visit in Indonesia.
interested in pleasing the customer in the case of a problem. Java has by far the best railway network, with trains connecting the capital city, Jakarta, with other main cities such as Surabaya, Semarang, Yogyakarta and Solo. In particular, Indonesia's first high-speed railway (*Kereta Cepat*), a Chinese-built line from Jakarta to Bandung somewhat cringily named **Whoosh**, opened in 2023 and, running at up to 350 km/h, connects the two cities in as little as 30 minutes. There are plans to


In [24]:
answer.source_nodes[0].metadata

{'document_id': 'cee7e86be8',
 'pdf_name': 'Indonesia.pdf',
 'pdf_part': 2,
 'Header 2': 'By Yacht'}

In [27]:
def pgvector_search(query_engine, question):

    results = query_engine.query(question).source_nodes

    return results

In [None]:
relevance_total = []

for q in tqdm(flattened_gt_list):
    doc_id = q["document_id"]
    results = pgvector_search(query_engine, question=q["question"])
    relevance = [d.metadata["document_id"] == doc_id for d in results]
    relevance_total.append(relevance)

In [31]:
hit_rate(relevance_total), mrr(relevance_total)

(0.43738656987295826, 0.2357304900181445)

## Final Decision on Retrieval System

Although min search provided the best results, PGVector with be chosen considering it's ability of performing hybrid search, which can be useful in more complex questions, as well as scalibity in the future with it's connection to Postgres DB.