In [1]:
import pandas as pd 
import numpy as np

from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.schema import TextNode 

import qdrant_client
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [2]:
client = qdrant_client.QdrantClient(
    host="localhost",
    port=6333
)
try:
    response = client.get_collections()
    print("Connected to Qdrant successfully!")
    print("Available collections:", response)
except Exception as e:
    print("Failed to connect to Qdrant:", e)

Connected to Qdrant successfully!
Available collections: collections=[CollectionDescription(name='uit_halong'), CollectionDescription(name='uit_improve_halong'), CollectionDescription(name='uit_bkai'), CollectionDescription(name='uit_improve_bkai')]


In [3]:
test = pd.read_csv("data/test.csv")
test["metadata"] = test.apply(lambda row: {"document": row["document"], "article": row["article"]}, axis=1)
test = test[["question","metadata"]]
test.head(2)

Unnamed: 0,question,metadata
0,Học liệu điện tử sau khi được thông qua bởi ĐV...,{'document': 'QUY ĐỊNH DẠY VÀ HỌC THEO PHƯƠNG ...
1,Có thể nộp bản sao giấy khai sinh cho hồ sơ đề...,"{'document': 'QUY CHẾ Văn bằng, chứng chỉ của ..."


In [4]:
def evaluate(retriver, test):
    hit = 0 
    mrr_score = 0
    ndcg_score = 0
    for _, row in test.iterrows():
        results = retriver.retrieve(row["question"])
        meta = [r.metadata for r in results]
        if row["metadata"] in meta:
            hit += 1
            index = meta.index(row["metadata"])
            mrr_score += 1/(index+1)
            ndcg_score += 1/(np.log2(index+2))
    print("Hit rate:", hit/len(test))
    print("MRR score:", mrr_score/len(test))
    print("NDCG score:", ndcg_score/len(test))

## halong embedding

### halong

In [5]:
model_halong = HuggingFaceEmbedding(model_name="hiieu/halong_embedding")
vector_halong = QdrantVectorStore(
    "uit_halong",
    client=client,
    enable_hybrid=True,
    embed_model = model_halong
)
index_halong = VectorStoreIndex.from_vector_store(vector_halong,embed_model = model_halong)

In [6]:
retriver_halong = index_halong.as_retriever(vector_store_query_mode="hybrid",alpha = 0,similarity_top_k = 10)
evaluate(retriver_halong, test)

Hit rate: 0.8739754098360656
MRR score: 0.6367970173041891
NDCG score: 0.6941268037714591


In [7]:
retriver_halong = index_halong.as_retriever(vector_store_query_mode="hybrid",alpha = 0.5,similarity_top_k = 10)
evaluate(retriver_halong, test)

Hit rate: 0.8739754098360656
MRR score: 0.6367970173041891
NDCG score: 0.6941268037714591


In [8]:
retriver_halong = index_halong.as_retriever(vector_store_query_mode="hybrid",alpha = 0.75,similarity_top_k = 10)
evaluate(retriver_halong, test)

Hit rate: 0.8924180327868853
MRR score: 0.6956442720530839
NDCG score: 0.7437476553854001


In [9]:
retriver_halong = index_halong.as_retriever(vector_store_query_mode="hybrid",alpha = 1,similarity_top_k = 10)
evaluate(retriver_halong, test)

Hit rate: 0.8995901639344263
MRR score: 0.6949502341920376
NDCG score: 0.7449288013764916


### Improved Halong


In [10]:
model_improve_halong = HuggingFaceEmbedding(model_name="johnweak132/improve_halong")
vector_improve_halong = QdrantVectorStore(
    "uit_improve_halong",
    client=client,
    enable_hybrid=True,
    embed_model = model_improve_halong
)
index_improve_halong = VectorStoreIndex.from_vector_store(vector_improve_halong,embed_model = model_improve_halong)

In [11]:
retriver_improve_halong = index_improve_halong.as_retriever(vector_store_query_mode="hybrid",alpha = 0,similarity_top_k = 10)
evaluate(retriver_improve_halong, test)

Hit rate: 0.9620901639344263
MRR score: 0.7468831316679684
NDCG score: 0.7995522335515066


In [12]:
retriver_improve_halong = index_improve_halong.as_retriever(vector_store_query_mode="hybrid",alpha = 0.5,similarity_top_k = 10)
evaluate(retriver_improve_halong, test)

Hit rate: 0.9620901639344263
MRR score: 0.7468831316679684
NDCG score: 0.7995522335515066


In [13]:
retriver_improve_halong = index_improve_halong.as_retriever(vector_store_query_mode="hybrid",alpha = 0.75,similarity_top_k = 10)
evaluate(retriver_improve_halong, test)

Hit rate: 0.9682377049180327
MRR score: 0.8242083821233419
NDCG score: 0.8599873762936563


In [14]:
retriver_improve_halong = index_improve_halong.as_retriever(vector_store_query_mode="hybrid",alpha = 1,similarity_top_k = 10)
evaluate(retriver_improve_halong, test)

Hit rate: 0.9733606557377049
MRR score: 0.8263279013791319
NDCG score: 0.8626988271645647


## Bkai vibi 

In [15]:
model_bkai = HuggingFaceEmbedding(model_name="bkai-foundation-models/vietnamese-bi-encoder")
vector_bkai = QdrantVectorStore(
    "uit_bkai",
    client=client,
    enable_hybrid=True,
    embed_model = model_bkai
)
index_bkai = VectorStoreIndex.from_vector_store(vector_bkai,embed_model = model_bkai)

In [16]:
retriver_bkai = index_bkai.as_retriever(vector_store_query_mode="hybrid",alpha = 0,similarity_top_k = 10)
evaluate(retriver_bkai, test)

Hit rate: 0.7284836065573771
MRR score: 0.49902338667707474
NDCG score: 0.5540688899163141


In [17]:
retriver_bkai = index_bkai.as_retriever(vector_store_query_mode="hybrid",alpha = 0.5,similarity_top_k = 10)
evaluate(retriver_bkai, test)

Hit rate: 0.7284836065573771
MRR score: 0.49902338667707474
NDCG score: 0.5540688899163141


In [18]:
retriver_bkai = index_bkai.as_retriever(vector_store_query_mode="hybrid",alpha = 0.75,similarity_top_k = 10)
evaluate(retriver_bkai, test)

Hit rate: 0.742827868852459
MRR score: 0.5097588147280769
NDCG score: 0.5658561840769677


In [19]:
retriver_bkai = index_bkai.as_retriever(vector_store_query_mode="hybrid",alpha = 1,similarity_top_k = 10)
evaluate(retriver_bkai, test)

Hit rate: 0.7469262295081968
MRR score: 0.493544268800416
NDCG score: 0.5538255349486937


## Improved bkai

In [20]:
model_improve_bkai = HuggingFaceEmbedding(model_name="johnweak132/improve_vibi")
vector_improve_bkai = QdrantVectorStore(
    "uit_improve_bkai",
    client=client,
    enable_hybrid=True,
    embed_model = model_improve_bkai
)
index_improve_bkai = VectorStoreIndex.from_vector_store(vector_improve_bkai,embed_model = model_improve_bkai)

In [21]:
retriver_improve_bkai = index_improve_bkai.as_retriever(vector_store_query_mode="hybrid",alpha = 0,similarity_top_k = 10)
evaluate(retriver_improve_bkai, test)

Hit rate: 0.9395491803278688
MRR score: 0.7059694574551134
NDCG score: 0.7629156833486298


In [22]:
retreiver_improve_bkai = index_improve_bkai.as_retriever(vector_store_query_mode="hybrid",alpha = 0.5,similarity_top_k = 10)
evaluate(retriver_improve_bkai, test)

Hit rate: 0.9395491803278688
MRR score: 0.7059694574551134
NDCG score: 0.7629156833486298


In [23]:
retriver_improve_bkai = index_improve_bkai.as_retriever(vector_store_query_mode="hybrid",alpha = 0.75,similarity_top_k = 10)
evaluate(retriver_improve_bkai, test)

Hit rate: 0.9559426229508197
MRR score: 0.7694017531876142
NDCG score: 0.8155590812403827


In [24]:
retriver_improve_bkai = index_improve_bkai.as_retriever(vector_store_query_mode="hybrid",alpha = 1,similarity_top_k = 10)
evaluate(retriver_improve_bkai, test)

Hit rate: 0.9620901639344263
MRR score: 0.779317915690867
NDCG score: 0.8245393448548088
