In [1]:
import json
from tqdm.auto import tqdm
import pandas as pd

In [2]:
with open('./../dataset/medical_qa_documents_with_id.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [3]:
documents = []
for docs_info in docs_raw:
    for doc in docs_info['documents']:
        documents.append(doc)

In [4]:
df_ground_truth = pd.read_csv('./../dataset/search_ground-truth-data.csv')

In [5]:
ground_truth_dict = df_ground_truth.to_dict(orient='records')

### Elastic Search Evaluation

In [5]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch(
    ["http://localhost:9200"],   # must include scheme http://
    request_timeout=60           # increase timeout in case ES is slow
)
print(es_client.info)

<bound method Elasticsearch.info of <Elasticsearch(['http://localhost:9200'])>>


In [6]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "answer": {"type": "text"},
            "question": {"type": "text"},
            "qtype": {"type": "keyword"},
            "id": {"type": "keyword"}

        }
    }
}

index_name = "medical-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [6]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/14979 [00:00<?, ?it/s]

In [7]:
def elastic_search(query, qtype):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "answer", "qtype"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "qtype": qtype
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [8]:
elastic_search(
    query = "what is malaria?",
    qtype="information"
)

[{'question': 'What is (are) Malaria?',
  'answer': 'Malaria is a serious disease caused by a parasite. You get it when an infected mosquito bites you. Malaria is a major cause of death worldwide, but it is almost wiped out in the United States. The disease is mostly a problem in developing countries with warm climates. If you travel to these countries, you are at risk. There are four different types of malaria caused by four related parasites. The most deadly type occurs in Africa south of the Sahara Desert. Malaria symptoms include chills, flu-like symptoms, fever, vomiting, diarrhea, and jaundice. A blood test can diagnose it. It can be life-threatening. However, you can treat malaria with drugs. The type of drug depends on which kind of malaria you have and where you were infected. Malaria can be prevented. When traveling to areas where malaria is found - See your doctor for medicines that protect you - Wear insect repellent with DEET - Cover up - Sleep under mosquito netting Cente

In [11]:
relevance_total = []

for q in tqdm(ground_truth_dict):
    doc_id = q['document']
    results = elastic_search(query=q['question'], qtype=q['qtype'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/200 [00:00<?, ?it/s]

In [6]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [7]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

- hit-rate (recall)
- Mean Reciprocal Rank (mrr)

In [14]:
hit_rate(relevance_total), mrr(relevance_total)

(0.785, 0.7299166666666665)

### MinSearch Evaluation

In [8]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "answer"],
    keyword_fields=["qtype", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x70763c25d6f0>

In [9]:
def minsearch_search(query, qtype):
    boost = {'question': 3.0}

    results = index.search(
        query=query,
        filter_dict={'qtype': qtype},
        boost_dict=boost,
        num_results=5
    )

    return results

In [10]:
relevance_total = []

for q in tqdm(ground_truth_dict):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], qtype=q['qtype'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/200 [00:00<?, ?it/s]

In [11]:
hit_rate(relevance_total), mrr(relevance_total)

(0.82, 0.7745000000000001)

### Qdrant Vector Search Evaluation

In [12]:
from qdrant_client import QdrantClient, models

In [13]:
qd_client = QdrantClient(
    url="http://localhost:6333",
    timeout=60  # seconds (1 minutes)
)

collection_name = "medical-faq"

In [20]:
from sentence_transformers import SentenceTransformer

In [21]:
EMBEDDING_MODEL_NAME = "multi-qa-MiniLM-L6-cos-v1"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [22]:
def vector_search(qd_client, query, qtype=None, limit=5):
    query_vector = model.encode([query])[0].tolist()

    query_filter = None
    if qtype:
        query_filter = models.Filter(
            must=[models.FieldCondition(key="qtype", match=models.MatchValue(value=qtype))]
        )

    results = qd_client.query_points(
        collection_name=collection_name,
        query=query_vector,
        query_filter=query_filter,
        limit=limit,
        with_payload=True
    )

    return [p.payload for p in results.points]

In [23]:
relevance_total = []

for q in tqdm(ground_truth_dict):
    doc_id = q['document']
    results = vector_search(qd_client, query=q['question'], qtype=q['qtype'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/200 [00:00<?, ?it/s]

In [25]:
hit_rate(relevance_total), mrr(relevance_total)

(0.845, 0.7826666666666666)

In [24]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [23]:
# Elastic Search Evaluation
evaluate(ground_truth_dict, lambda q: elastic_search(q['question'], q['qtype']))

  0%|          | 0/200 [00:00<?, ?it/s]

{'hit_rate': 0.785, 'mrr': 0.7299166666666665}

In [24]:
# MinSearch Evaluation
evaluate(ground_truth_dict, lambda q: minsearch_search(q['question'], q['qtype']))

  0%|          | 0/200 [00:00<?, ?it/s]

{'hit_rate': 0.82, 'mrr': 0.7745000000000001}

In [27]:
# Qdrant VectorSearch Evaluation
evaluate(ground_truth_dict, lambda q: vector_search(qd_client,q['question'], q['qtype']))

  0%|          | 0/200 [00:00<?, ?it/s]

{'hit_rate': 0.845, 'mrr': 0.7826666666666666}