In [124]:
import sys
from pathlib import Path

project_root = Path().absolute().parent
sys.path.append(str(project_root))

In [125]:
from better_search.lib.vectorstore.hybrid_search import HybridSearch

In [126]:
from qdrant_client import QdrantClient, models
from fastembed.embedding import TextEmbedding
from fastembed.sparse.bm25 import Bm25

from pydantic import BaseModel


class HybridSearchResult(BaseModel):
    podcast_id: int
    episode_id: int
    episode_title: str
    podcast_title: str
    podcast_author: str
    podcast_categoires: list
    sim_score: float

In [127]:
client = QdrantClient(url="http://localhost:6333")
DENSE_MODEL = TextEmbedding(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
SPARSE_MODEL = Bm25("Qdrant/bm25")

In [128]:
collection_name = "episodes_enhanced"

In [129]:
query = "تطوير المنتجعات في الشرقية"

In [145]:
query = "حياة في الإدارة"

In [146]:
query_dense_vector = next(DENSE_MODEL.query_embed(query))
query_sparse_vector = next(SPARSE_MODEL.query_embed(query))

In [147]:
prefecth = [
    # models.Prefetch(
    #     query=query_dense_vector,
    #     using="fast-paraphrase-multilingual-minilm-l12-v2",
    #     limit=15,
    #     params=models.SearchParams(
    #         hnsw_ef=256,
    #         exact=True,
    #     ),
    # ),
    models.Prefetch(
        query=models.SparseVector(**query_sparse_vector.as_object()),
        using="fast-sparse-bm25",
        limit=40,
        params=models.SearchParams(hnsw_ef=256, exact=True),
    ),
    models.Prefetch(
        query=models.SparseVector(**query_sparse_vector.as_object()),
        using="fast-sparse-bm25",
        limit=40,
        params=models.SearchParams(hnsw_ef=256, exact=True),
        filter=models.Filter(
            should=models.FieldCondition(
                key="documents", match=models.MatchAny(any=query.split())
            )
        ),
    ),
]

result = client.query_points(
    collection_name=collection_name,
    prefetch=prefecth,
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    limit=10,
    with_payload=True,
    search_params=models.SearchParams(hnsw_ef=256, exact=True),
)

results = [
    HybridSearchResult(
        podcast_id=r.payload["podcast_id"],
        episode_id=r.payload["episode_id"],
        episode_title=r.payload["document"].split("\n")[1],
        podcast_title=r.payload["podcast_name"],
        podcast_author=r.payload["podcast_author"],
        podcast_categoires=r.payload["podcast_categories"],
        sim_score=r.score,
    )
    for r in result.points
]

In [148]:
for result in results:
    # print(f"{result.podcast_title}: {result.episode_title}")
    print(result)

podcast_id=55 episode_id=514 episode_title='توصية: كيف أصبحنا مدمني إنترنت' podcast_title='سوالف بزنس مع مشهور الدبيان' podcast_author='ثمانية/ thmanyah' podcast_categoires=['Business'] sim_score=0.5
podcast_id=57 episode_id=619 episode_title='حياة في الإدارة: مطلق المريشد' podcast_title='فنجان مع عبدالرحمن أبومالح' podcast_author='ثمانية/ thmanyah' podcast_categoires=['Society', 'Culture'] sim_score=0.33333334
podcast_id=57 episode_id=1542 episode_title='265: كيف أصبحنا مدمني إنترنت' podcast_title='فنجان مع عبدالرحمن أبومالح' podcast_author='ثمانية/ thmanyah' podcast_categoires=['Society', 'Culture'] sim_score=0.25
podcast_id=57 episode_id=1599 episode_title='218: حياة في الإدارة مع مدحت عامر' podcast_title='فنجان مع عبدالرحمن أبومالح' podcast_author='ثمانية/ thmanyah' podcast_categoires=['Society', 'Culture'] sim_score=0.2
podcast_id=53 episode_id=1288 episode_title='ما الذي سيحدث إن لم تسدد أمريكا ديونها' podcast_title='الفجر' podcast_author='ثمانية/thmanyah' podcast_categoires=['Ne

In [None]:
# TODO: clean all text from arabic shit before embedding