## Main

### Imports

In [1]:
%load_ext autoreload
%autoreload 2

from decouple import config
from fastembed import (
    TextEmbedding, 
    SparseTextEmbedding, 
    LateInteractionTextEmbedding
)
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    Fusion,
    VectorParams, 
    SparseVectorParams,
    SparseIndexParams,
    MultiVectorConfig,
    MultiVectorComparator,
    Modifier,
    HnswConfigDiff
)
from ranx import Qrels, Run, evaluate

from rag.models import (
    DenseModelConfig, 
    SparseModelConfig,
    RerankingModelConfig,
    DenseSearchManager,
    SparseSearchManager,
    HybridFusionSearchManager, 
    HybridRerankingSearchManager, 
    Metadata
)
from rag.utils import load_datasets


QDRANT_URL = config('QDRANT_URL')
QDRANT_API_KEY = config('QDRANT_API_KEY')
NVIDIA_API_KEY = config('NVIDIA_API_KEY')
CUDA=config('CUDA', cast=bool)
PROVIDER = 'CUDAExecutionProvider' if CUDA else 'CPUExecutionProvider'

### Client

In [2]:
qdrant_client = QdrantClient(
    url=QDRANT_URL, 
    api_key=QDRANT_API_KEY,
)

print(qdrant_client.get_collections())

collections=[CollectionDescription(name='dense_collection'), CollectionDescription(name='fusion_collection'), CollectionDescription(name='reranking_collection'), CollectionDescription(name='sparse_collection')]


### Models

In [3]:
dense_model_name = 'BAAI/bge-small-en-v1.5'
dense_model = TextEmbedding(
    model_name=dense_model_name,
    providers=[PROVIDER]
)

sparse_model_name = 'Qdrant/bm25'
sparse_model = SparseTextEmbedding(
    model_name=sparse_model_name,
    providers=[PROVIDER],
    k1=1.5,
    b=0.75
)

reranking_model_name = 'colbert-ir/colbertv2.0'
reranking_model = LateInteractionTextEmbedding(
    model_name=reranking_model_name,
    providers=[PROVIDER]
)

dense_model_config = DenseModelConfig(
    name=dense_model_name,
    vector_params=VectorParams(
        size=384,
        distance=Distance.COSINE,
        hnsw_config=HnswConfigDiff(
            m=16,
            ef_construct=128,
            on_disk=True
        ),
        on_disk=True
    )
)

sparse_model_config = SparseModelConfig(
    name=sparse_model_name,
    sparse_vector_params=SparseVectorParams(
        index=SparseIndexParams(
            on_disk=True
        ),
        modifier=Modifier.IDF
    )
)

reranking_model_config = RerankingModelConfig(
    name=reranking_model_name,
    vector_params=VectorParams(
        size=128,
        distance=Distance.COSINE,
        hnsw_config=HnswConfigDiff(
            m=0     # disable HNSW
        ),
        on_disk=True,
        multivector_config=MultiVectorConfig(
            comparator=MultiVectorComparator.MAX_SIM,
        )
    )
)



### Documents

In [4]:
corpus_df, queries_df, qrels_df = load_datasets(5_000, 5_000)

print(len(corpus_df), len(queries_df), len(qrels_df))

104 109 111


In [6]:
corpus_df[:3]

Unnamed: 0,_id,title,text
0,307,Abraham Lincoln,"Abraham Lincoln ( ; February 12, 1809 – April ..."
1,628,Aldous Huxley,Aldous Leonard Huxley ( ; 26 July 1894 – 22 No...
2,844,Amsterdam,Amsterdam ( ; ] ) is the capital and most popu...


In [11]:
queries_df[:3]

Unnamed: 0,_id,title,text
0,5ac3b95755429939154138e6,,What language family is the language of the tr...
1,5abee3d95542994516f4546c,,"Which of the following is acclaimed for his ""l..."
2,5a8c4c8e554299585d9e3652,,Filipino sitcom Iskul Bukol had a theme song t...


In [12]:
qrels_df[:3]

Unnamed: 0,query-id,corpus-id,score
101,5ac3b95755429939154138e6,7222,1
137,5abee3d95542994516f4546c,2310,1
203,5a8c4c8e554299585d9e3652,9288,1


In [7]:
corpus_texts: list[str] = corpus_df['text'].values.tolist()
metadatas = [
    Metadata(
        id=row['_id'],
        text=row['text']
    )
    for _, row in corpus_df.iterrows()
]

dense_embeddings = list(dense_model.embed(corpus_texts))
sparse_embeddings = list(sparse_model.embed(corpus_texts))
reranking_embeddings = list(reranking_model.embed(corpus_texts))

### Indexing

In [8]:
DENSE_COLLECTION_NAME = 'HotpotQA_dense'
SPARSE_COLLECTION_NAME = 'HotpotQA_sparse'
FUSION_COLLECTION_NAME = 'HotpotQA_fusion'
RERANKING_COLLECTION_NAME = 'HotpotQA_reranking'

#### Dense

In [9]:
dense_search = DenseSearchManager(
    qdrant_client=qdrant_client, 
    dense_model_config=dense_model_config
)

dense_search.create_collection(DENSE_COLLECTION_NAME)
dense_search.upload_points(
    DENSE_COLLECTION_NAME,
    dense_embeddings,
    metadatas
)

#### Sparse

In [None]:
sparse_search = SparseSearchManager(
    qdrant_client=qdrant_client, 
    sparse_model_config=sparse_model_config
)

sparse_search.create_collection(SPARSE_COLLECTION_NAME)
sparse_search.upload_points(
    SPARSE_COLLECTION_NAME,
    sparse_embeddings,
    metadatas
)

#### Hybrid - Fusion

In [None]:
fusion_search = HybridFusionSearchManager(
    qdrant_client=qdrant_client, 
    dense_model_config=dense_model_config,
    sparse_model_config=sparse_model_config
)

fusion_search.create_collection(FUSION_COLLECTION_NAME)
fusion_search.upload_points(
    FUSION_COLLECTION_NAME, 
    dense_embeddings, 
    sparse_embeddings,
    metadatas
)

#### Hybrid - Reranking

In [None]:
reranking_search = HybridRerankingSearchManager(
    qdrant_client=qdrant_client, 
    dense_model_config=dense_model_config,
    sparse_model_config=sparse_model_config,
    reranking_model_config=reranking_model_config
)

reranking_search.create_collection(RERANKING_COLLECTION_NAME)
reranking_search.upload_points(
    RERANKING_COLLECTION_NAME, 
    dense_embeddings, 
    sparse_embeddings,
    reranking_embeddings,
    metadatas
)

### Query

In [15]:
query_texts: list[str] = queries_df['text'].values.tolist()

query_dense_embeddings = list(dense_model.embed(query_texts))
query_sparse_embeddings = list(sparse_model.embed(query_texts))
query_reranking_embeddings = list(reranking_model.embed(query_texts))

### Search

In [16]:
top_k = 5

#### Dense

In [None]:
dense_scored_points_list = [
    dense_search.search(
        DENSE_COLLECTION_NAME,
        query_dense_embedding,
        top_k
    )
    for query_dense_embedding in query_dense_embeddings
]

print(*dense_scored_points_list[0][:3], sep='\n')

id='b47dae5f-f645-4d3e-9a0a-de834009129e' version=1 score=0.6729106 payload={'id': 7222, 'text': 'The Choctaw (In the Choctaw language, Chahta) are a Native American people originally occupying what is now the Southeastern United States (modern-day Alabama, Florida, Mississippi, and Louisiana). Their Choctaw language belongs to the Muskogean language family group.'} vector=None shard_key=None order_value=None
id='01d4ace3-e2b1-4fb3-92d1-51243470b30a' version=0 score=0.63985586 payload={'id': 2303, 'text': 'Aramaic (אַרָמָיָא "Arāmāyā", Syriac: ܐܪܡܝܐ\u200e , Arabic: آرامية\u200e \u200e ) is a language or group of languages belonging to the Semitic subfamily of the Afroasiatic language family. More specifically, it is part of the Northwest Semitic group, which also includes the Canaanite languages such as Hebrew and Phoenician. The Aramaic alphabet was widely adopted for other languages and is ancestral to the Hebrew, Syriac and Arabic alphabets.'} vector=None shard_key=None order_value=

#### Sparse

In [18]:
sparse_scored_points_list = [
    sparse_search.search(
        SPARSE_COLLECTION_NAME,
        query_sparse_embedding,
        top_k
    )
    for query_sparse_embedding in query_sparse_embeddings
]

print(*sparse_scored_points_list[0][:3], sep='\n')

id='87341816-6e96-4e1e-9b23-e7872b15d00f' version=0 score=17.041908 payload={'id': 2303, 'text': 'Aramaic (אַרָמָיָא "Arāmāyā", Syriac: ܐܪܡܝܐ\u200e , Arabic: آرامية\u200e \u200e ) is a language or group of languages belonging to the Semitic subfamily of the Afroasiatic language family. More specifically, it is part of the Northwest Semitic group, which also includes the Canaanite languages such as Hebrew and Phoenician. The Aramaic alphabet was widely adopted for other languages and is ancestral to the Hebrew, Syriac and Arabic alphabets.'} vector=None shard_key=None order_value=None
id='78df5b09-8390-41ee-a5dd-ae8a7fb593ae' version=1 score=16.999527 payload={'id': 7222, 'text': 'The Choctaw (In the Choctaw language, Chahta) are a Native American people originally occupying what is now the Southeastern United States (modern-day Alabama, Florida, Mississippi, and Louisiana). Their Choctaw language belongs to the Muskogean language family group.'} vector=None shard_key=None order_value=N

#### Hybrid - Fusion

In [None]:
fusion_scored_points_list = [
    fusion_search.search(
        FUSION_COLLECTION_NAME,
        query_dense_embedding,
        query_sparse_embedding,
        Fusion.RRF,
        top_k
    )
    for query_dense_embedding, query_sparse_embedding in zip(query_dense_embeddings, query_sparse_embeddings)
]

print(*fusion_scored_points_list[0][:3], sep='\n')

id='06f53800-d0f3-40f5-8e3f-8f03e2facc45' version=1 score=0.8333334 payload={'id': 7222, 'text': 'The Choctaw (In the Choctaw language, Chahta) are a Native American people originally occupying what is now the Southeastern United States (modern-day Alabama, Florida, Mississippi, and Louisiana). Their Choctaw language belongs to the Muskogean language family group.'} vector=None shard_key=None order_value=None
id='9bbb2b0f-8247-49e8-8520-bddcffd7d74b' version=0 score=0.8333334 payload={'id': 2303, 'text': 'Aramaic (אַרָמָיָא "Arāmāyā", Syriac: ܐܪܡܝܐ\u200e , Arabic: آرامية\u200e \u200e ) is a language or group of languages belonging to the Semitic subfamily of the Afroasiatic language family. More specifically, it is part of the Northwest Semitic group, which also includes the Canaanite languages such as Hebrew and Phoenician. The Aramaic alphabet was widely adopted for other languages and is ancestral to the Hebrew, Syriac and Arabic alphabets.'} vector=None shard_key=None order_value=N

#### Hybrid - Reranking

In [None]:
reranking_scored_points_list = [
    reranking_search.search(
        RERANKING_COLLECTION_NAME,
        query_dense_embedding,
        query_sparse_embedding,
        query_reranking_embedding,
        5 * top_k,
        top_k
    )
    for query_dense_embedding, query_sparse_embedding, query_reranking_embedding in zip(
        query_dense_embeddings, query_sparse_embeddings, query_reranking_embeddings
    )
]

print(*reranking_scored_points_list[0][:3], sep='\n')

id='7d7b49d4-8ebb-4c05-8d1a-0f8cf0903b5c' version=4 score=9.195317 payload={'id': 7222, 'text': 'The Choctaw (In the Choctaw language, Chahta) are a Native American people originally occupying what is now the Southeastern United States (modern-day Alabama, Florida, Mississippi, and Louisiana). Their Choctaw language belongs to the Muskogean language family group.'} vector=None shard_key=None order_value=None
id='9f3b4686-d66c-49b6-a396-588686a4ee40' version=5 score=8.606562 payload={'id': 9751, 'text': 'Elwyn Brooks "E. B." White (July 11, 1899 – October 1, 1985) was an American writer. He was a contributor to The New Yorker magazine and a co-author of the English language style guide "The Elements of Style", which is commonly known as "Strunk & White". He also wrote books for children, including "Stuart Little" in 1945, "Charlotte\'s Web" in 1952, and "The Trumpet of the Swan" in 1970. "Charlotte\'s Web" was voted the top children\'s novel in a 2012 survey of "School Library Journal" 

### Evaluate

#### Qrels (query relevances)

In [23]:
qrels_dict = {}

for _, row in qrels_df.iterrows():
    query_id = row['query-id']
    corpus_id = str(row['corpus-id'])
    relevance = int(row['score'])
    
    if query_id not in qrels_dict:
        qrels_dict[query_id] = {}

    qrels_dict[query_id][corpus_id] = relevance

#### Runs

In [28]:
runs_dict = {}

for i, query_id in enumerate(queries_df['_id'].values):
    runs_dict[query_id] = {}
    
    for scored_point in dense_scored_points_list[i]:
        doc_id = str(scored_point.payload['id'])
        runs_dict[query_id][doc_id] = float(scored_point.score)

#### Metrics

In [27]:
metrics = [
    'mrr',
    'map',
    f'precision@{top_k}',
    f'recall@{top_k}',
    f'ndcg@{top_k}'
]

#### Results

In [29]:
qrels_ranx = Qrels(qrels_dict)
run_ranx = Run(runs_dict)

results = evaluate(qrels_ranx, run_ranx, metrics=metrics)

print(results)

  scores[i] = _reciprocal_rank(qrels[i], run[i], k, rel_lvl)


{'mrr': np.float64(0.9365443425076454), 'map': np.float64(0.9365443425076454), 'precision@5': np.float64(0.19999999999999998), 'recall@5': np.float64(0.981651376146789), 'ndcg@5': np.float64(0.9481393103951706)}
