In [1]:
from dotenv import load_dotenv
from datasets import load_dataset
from src.embedder.sparse import SparseEmbedder
from src.embedder.embedder import Embedder
from src.datasource.hybrid import HybridDatasource
from src.utils import evaluate_model, load_test_data

# Load variables from a .env file into the process environment.
load_dotenv()

# CoSQA (CoIR-Retrieval): fetch the "queries" and "corpus" configurations and
# select the split of the same name from each; the "default" configuration is
# kept as returned by load_dataset.
# NOTE(review): default_dataset is not referenced by any cell visible here.
queries_dataset = load_dataset("CoIR-Retrieval/cosqa", "queries")["queries"]
corpus_dataset = load_dataset("CoIR-Retrieval/cosqa", "corpus")["corpus"]
default_dataset = load_dataset("CoIR-Retrieval/cosqa", "default")

# Keep only the "text" values whose "partition" value is "test", in dataset order.
test_corpus = [
    text
    for split, text in zip(corpus_dataset["partition"], corpus_dataset["text"])
    if split == "test"
]
test_queries = [
    text
    for split, text in zip(queries_dataset["partition"], queries_dataset["text"])
    if split == "test"
]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hybrid setup: sparse embedder "Qdrant/bm25" paired with dense embedder
# "Qwen/Qwen3-Embedding-0.6B".
sparse = SparseEmbedder("Qdrant/bm25")
# Second argument (1024) is presumably the embedding dimension — confirm against Embedder.
dense = Embedder("Qwen/Qwen3-Embedding-0.6B", 1024)
db = HybridDatasource(sparse, dense)
# Load the test corpus through `db`; "code-test-hybrid" is presumably the
# collection name.
# NOTE(review): the meaning of the trailing positional True is not visible here
# (possibly a reset/recreate flag) — confirm in src.utils.load_test_data.
await load_test_data(db, "code-test-hybrid", test_corpus, True)


In [3]:
# Print a label, then run evaluate_model against whatever `db` is currently
# bound to, using the test queries and test corpus.
# NOTE(review): the recorded output is three unlabeled numbers; which metric
# each one represents is defined in src.utils.evaluate_model — confirm there.
print("bm25 + Qwen3: ")
await evaluate_model(db, "code-test-hybrid", test_queries, test_corpus)

bm25 + Qwen3: 
1.0
0.7911825396825396
0.8400819418770783


In [5]:
# Hybrid setup: sparse embedder "Qdrant/bm25" paired with dense embedder
# "sentence-transformers/all-MiniLM-L6-v2".
# NOTE(review): this rebinds `sparse`, `dense` and `db`, so any later cell that
# reads `db` depends on which setup cell ran most recently.
sparse = SparseEmbedder("Qdrant/bm25")
# Second argument (384) is presumably the embedding dimension — confirm against Embedder.
dense = Embedder("sentence-transformers/all-MiniLM-L6-v2", 384)
db = HybridDatasource(sparse, dense)
# Load the test corpus through `db`, reusing the name "code-test-hybrid" from
# the earlier Qwen3 cell even though the dense model differs.
# NOTE(review): the trailing positional True may be what makes this reuse safe
# (e.g. a reset/recreate flag) — confirm in src.utils.load_test_data.
await load_test_data(db, "code-test-hybrid", test_corpus, True)

In [6]:
# Print a label, then run evaluate_model against whatever `db` is currently
# bound to, using the test queries and test corpus.
# NOTE(review): the recorded output is three unlabeled numbers; which metric
# each one represents is defined in src.utils.evaluate_model — confirm there.
print("bm25 + MiniLM: ")
await evaluate_model(db, "code-test-hybrid", test_queries, test_corpus)

bm25 + MiniLM: 
1.0
0.7457484126984127
0.8000049321349421
