In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/ground-truth-retrieval.csv")
df

Unnamed: 0,id,chunk_id,question
0,5956,290_20,What are the exact resolutions that Gretchen R...
1,5956,290_20,What is the name of Gretchen Rubin's book that...
2,5956,290_20,What feeling of helplessness did Gretchen Rubi...
3,5956,290_20,What specific tools was Gretchen Rubin only ab...
4,5956,290_20,What is the name of the experiment that Gretch...
...,...,...,...
3361,9730,46625,What were the key findings of the book that st...
3362,9730,46625,Could Richard Koch have universally explained ...
3363,9730,46625,"Why was Malcolm Gladwell's thesis about 10,000..."
3364,9730,46625,"How many people out of 20, which Richard Koch ..."


I realized that the `id` column is wrongly labeled, and the last `chunk_id` that I manually processed is wrong. They need to be cleaned first.

In [3]:
df.loc[3361:, ["id", "chunk_id"]] = [466, "466_25"]
df[3360:]

Unnamed: 0,id,chunk_id,question
3360,12661,591_10,What was the initial reason that led to Zack q...
3361,466,466_25,What were the key findings of the book that st...
3362,466,466_25,Could Richard Koch have universally explained ...
3363,466,466_25,"Why was Malcolm Gladwell's thesis about 10,000..."
3364,466,466_25,"How many people out of 20, which Richard Koch ..."
3365,466,466_25,Did 9 reasons for achieving unreasonable succe...


In [4]:
df["id"] = df["chunk_id"].str.split("_").str[0].astype(int)
df

Unnamed: 0,id,chunk_id,question
0,290,290_20,What are the exact resolutions that Gretchen R...
1,290,290_20,What is the name of Gretchen Rubin's book that...
2,290,290_20,What feeling of helplessness did Gretchen Rubi...
3,290,290_20,What specific tools was Gretchen Rubin only ab...
4,290,290_20,What is the name of the experiment that Gretch...
...,...,...,...
3361,466,466_25,What were the key findings of the book that st...
3362,466,466_25,Could Richard Koch have universally explained ...
3363,466,466_25,"Why was Malcolm Gladwell's thesis about 10,000..."
3364,466,466_25,"How many people out of 20, which Richard Koch ..."


In [5]:
documents = df.to_dict(orient="records")

In [6]:
documents[:5]

[{'id': 290,
  'chunk_id': '290_20',
  'question': 'What are the exact resolutions that Gretchen Rubin is referring to in relation to her daily habits?'},
 {'id': 290,
  'chunk_id': '290_20',
  'question': "What is the name of Gretchen Rubin's book that 'did not find its audience'?"},
 {'id': 290,
  'chunk_id': '290_20',
  'question': 'What feeling of helplessness did Gretchen Rubin experience as a result of her book being a commercial failure?'},
 {'id': 290,
  'chunk_id': '290_20',
  'question': 'What specific tools was Gretchen Rubin only able to access after her book failed?'},
 {'id': 290,
  'chunk_id': '290_20',
  'question': 'What is the name of the experiment that Gretchen Rubin was testing in her book The Happiness Project?'}]

# Retrieval evaluation

In [7]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)


def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [8]:
from tqdm import tqdm


def evaluate(ground_truth, search_function):
    relevance_overall = []
    relevance_document = []

    for q in tqdm(ground_truth):
        doc_id, chunk_id = q["id"], q["chunk_id"]
        results = search_function(q)
        relevance_chunk = [(d["chunk_id"] == chunk_id) for d in results]
        relevance_doc = [(d["id"] == doc_id) for d in results]
        relevance_overall.append(relevance_chunk)
        relevance_document.append(relevance_doc)

    return {
        "hit_rate_overall": hit_rate(relevance_overall),
        "mrr_overall": mrr(relevance_overall),
        "hit_rate_document": hit_rate(relevance_document),
        "mrr_document": mrr(relevance_document),
    }

In [9]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.path.curdir, "..")))
from utils.rag import elastic_keyword_search, elastic_semantic_search

  from tqdm.autonotebook import tqdm, trange


In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("multi-qa-mpnet-base-cos-v1")



In [11]:
from elasticsearch import Elasticsearch


es_client = Elasticsearch("http://127.0.0.1:9200")
es_client.info()

ObjectApiResponse({'name': '75e558d7ad26', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'hz3rz7BgQ2CimtW7H8da-A', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [12]:
from utils.constants import INDEX_NAME


es_client.indices.exists(index=INDEX_NAME)

HeadApiResponse(True)

In [13]:
keyword_results = evaluate(
    documents, lambda x: elastic_keyword_search(es_client, x["question"])
)
semantic_results = evaluate(
    documents, lambda x: elastic_semantic_search(es_client, x["question"], model)
)

100%|██████████| 3366/3366 [00:31<00:00, 105.22it/s]
100%|██████████| 3366/3366 [06:39<00:00,  8.44it/s]


In [14]:
keyword_results

{'hit_rate_overall': 0.7890671420083185,
 'mrr_overall': 1.0368785898197679,
 'hit_rate_document': 0.8722519310754605,
 'mrr_document': 1.571390374331495}

In [15]:
semantic_results

{'hit_rate_overall': 0.3820558526440879,
 'mrr_overall': 0.44081501287383695,
 'hit_rate_document': 0.6316102198455139,
 'mrr_document': 0.9488710635769462}

In [17]:
import json


with open("../data/keyword_eval.json", "w") as f:
    json.dump(keyword_results, f)

with open("../data/semantic_eval.json", "w") as f:
    json.dump(semantic_results, f)