In [1]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path so that
# project packages imported in later cells can be found.
# NOTE(review): assumes the kernel's working directory sits one level below
# the project root — confirm when launching the notebook from elsewhere.
project_root = Path.cwd().parent

# Only append once: re-running this cell in a live kernel would otherwise add
# a duplicate entry to sys.path every time.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [2]:
from better_search.lib.vectorstore.hybrid_search import HybridSearch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from qdrant_client import QdrantClient, models
from fastembed.embedding import TextEmbedding
from fastembed.sparse.bm25 import Bm25

from pydantic import BaseModel


class HybridSearchResult(BaseModel):
    """One episode hit from the hybrid search, flattened for display.

    Instances are built from the payload and score of each point returned by
    the Qdrant query later in this notebook.
    """

    podcast_id: int
    episode_id: int
    # Populated from the third line of the payload's "document" text.
    episode_title: str
    # Populated from the payload's "podcast_name" key.
    podcast_title: str
    podcast_author: str
    # NOTE(review): the field name is misspelled ("categoires"). Renaming it
    # requires updating every place that constructs or reads this model.
    # The element type is unconstrained; the sample output shows strings.
    podcast_categoires: list
    # Score attached to the returned point. The query uses RRF fusion, so this
    # is presumably a fused rank score rather than a raw similarity — TODO confirm.
    sim_score: float

In [4]:
# Connection target and embedding-model identifiers, grouped for easy tuning.
QDRANT_URL = "http://localhost:6333"
DENSE_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
SPARSE_MODEL_NAME = "Qdrant/bm25"

# Client for the Qdrant instance at QDRANT_URL.
client = QdrantClient(url=QDRANT_URL)

# FastEmbed dense text-embedding model.
# NOTE(review): in the visible cells this is only referenced from a
# commented-out line — confirm it still needs to be loaded.
DENSE_MODEL = TextEmbedding(DENSE_MODEL_NAME)

# FastEmbed BM25 model; it produces the sparse query vector used further down.
SPARSE_MODEL = Bm25(SPARSE_MODEL_NAME)

In [5]:
# Name of the Qdrant collection that the hybrid query below is run against.
collection_name = "episodes_normalized_openai"

In [6]:
# Free-text Arabic search query; later cells normalize it and embed it.
query = "فنجان استشاري أسري والصحة النفسية وطريقة تفكير العقل وطريقة حل المشاكل النفسية والتعامل معها"

In [7]:
import unicodedata
import re


def normalize_arabic(text: str) -> str:
    """Fold common Arabic spelling variants into a single canonical form.

    Steps, in order:
      1. Apply Unicode NFKC normalization.
      2. Remove diacritics in U+0617-U+061A and U+064B-U+0652.
      3. Remove tatweel (U+0640).
      4. Map alef variants (hamza below, hamza above, wasla, madda) to bare
         alef, alef maksura to yeh, and teh marbuta to heh.

    Args:
        text: Input string; may be empty.

    Returns:
        The normalized string.
    """
    # NFKC has to run before the letter rules: it can itself produce letters
    # those rules target. For example U+FEF7 expands to lam + alef-with-hamza,
    # and alef followed by U+0653 composes to alef-with-madda. Running NFKC
    # last would leave such letters un-normalized in the output.
    text = unicodedata.normalize("NFKC", text)

    tashkeel = re.compile(r"[\u0617-\u061A\u064B-\u0652]")
    text = tashkeel.sub("", text)

    text = re.sub("\u0640", "", text)

    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ة", "ه", text)
    return text


# Replace the raw query with its normalized form; the cells below use this value.
query = normalize_arabic(query)

In [8]:
from openai import OpenAI
from better_search.core.config import settings

# OpenAI client; the API key is read from the project settings object rather
# than being hard-coded in the notebook.
oai_client = OpenAI(api_key=settings.OPENAI_API_KEY)

In [9]:
# Embed the query with OpenAI; `res` holds the raw API response.
EMBEDDING_MODEL = "text-embedding-3-small"
# NOTE(review): presumably matches the size of the collection's dense vector — confirm.
EMBEDDING_DIMENSIONS = 1536

res = oai_client.embeddings.create(
    model=EMBEDDING_MODEL,
    input=query,
    dimensions=EMBEDDING_DIMENSIONS,
)

In [10]:
# Earlier approach, kept for reference: embed the query locally with FastEmbed.
# query_dense_vector = next(DENSE_MODEL.query_embed(query))
# Dense vector: the first embedding in the OpenAI response.
query_dense_vector = res.data[0].embedding
# Sparse vector: the first item yielded by the BM25 model for this query.
query_sparse_vector = next(SPARSE_MODEL.query_embed(query))

In [11]:
# Whitespace-token count of the query; the next cell branches on this value.
len(query.split())

14

In [12]:
# The number of whitespace-separated tokens decides the retrieval strategy.
query_terms = query.split()
query_length = len(query_terms)


def _sparse_prefetch(vector_name, restrict_to_query_terms=False):
    """Build one sparse-vector prefetch (limit 40) over `vector_name`.

    With `restrict_to_query_terms` set, a `should` filter is attached that
    applies MatchAny over the query tokens to the "documents" payload key.
    """
    prefetch_kwargs = {
        "query": models.SparseVector(**query_sparse_vector.as_object()),
        "using": vector_name,
        "limit": 40,
        "params": models.SearchParams(hnsw_ef=256, exact=True),
    }
    if restrict_to_query_terms:
        # NOTE(review): the filter key is "documents", while results are read
        # from a "document" payload key elsewhere in this notebook — confirm
        # that both keys exist in the payload.
        prefetch_kwargs["filter"] = models.Filter(
            should=models.FieldCondition(
                key="documents", match=models.MatchAny(any=list(query_terms))
            )
        )
    return models.Prefetch(**prefetch_kwargs)


if query_length < 5:
    # Short queries: sparse candidates only.
    # NOTE(review): this branch targets "fast-sparse-bm25" while the other one
    # uses "text-sparse" — confirm both named vectors exist in the collection.
    sparse_vector_name = "fast-sparse-bm25"
    dense_prefetches = []
else:
    # Longer queries: a dense candidate list goes ahead of the sparse ones.
    sparse_vector_name = "text-sparse"
    dense_prefetches = [
        models.Prefetch(
            query=query_dense_vector,
            using="text-dense",
            limit=15,
            params=models.SearchParams(hnsw_ef=256, exact=True),
        )
    ]

# The variable keeps its original spelling because the query cell reads it.
prefecth = dense_prefetches + [
    _sparse_prefetch(sparse_vector_name),
    _sparse_prefetch(sparse_vector_name, restrict_to_query_terms=True),
]

In [13]:
# prefecth = [
#     models.Prefetch(
#         query=query_dense_vector,
#         using="fast-paraphrase-multilingual-minilm-l12-v2",
#         limit=15,
#         params=models.SearchParams(
#             hnsw_ef=256,
#             exact=True,
#         ),
#     ),
#     models.Prefetch(
#         query=models.SparseVector(**query_sparse_vector.as_object()),
#         using="fast-sparse-bm25",
#         limit=40,
#         params=models.SearchParams(hnsw_ef=256, exact=True),
#     ),
#     models.Prefetch(
#         query=models.SparseVector(**query_sparse_vector.as_object()),
#         using="fast-sparse-bm25",
#         limit=40,
#         params=models.SearchParams(hnsw_ef=256, exact=True),
#         filter=models.Filter(
#             should=models.FieldCondition(
#                 key="documents", match=models.MatchAny(any=query.split())
#             )
#         ),
#     ),
# ]

# prefetch = [
#     models.Prefetch(
#         query=query_dense_vector,
#         using="fast-paraphrase-multilingual-minilm-l12-v2",
#         limit=40,
#     ),
#     models.Prefetch(
#         query=models.SparseVector(**query_sparse_vector.as_object()),
#         using="fast-sparse-bm25",
#         limit=40,
#     ),
# ]


# Fuse the prefetched candidate lists with RRF and keep the top 10 points.
top_level_params = models.SearchParams(hnsw_ef=256, exact=True)
rrf_query = models.FusionQuery(fusion=models.Fusion.RRF)

result = client.query_points(
    collection_name=collection_name,
    prefetch=prefecth,
    query=rrf_query,
    limit=10,
    with_payload=True,
    search_params=top_level_params,
)


def _to_search_result(point):
    """Map one returned point onto a HybridSearchResult."""
    payload = point.payload
    return HybridSearchResult(
        podcast_id=payload["podcast_id"],
        episode_id=payload["episode_id"],
        # The episode title is taken from the third line of the stored document text.
        episode_title=payload["document"].split("\n")[2],
        podcast_title=payload["podcast_name"],
        podcast_author=payload["podcast_author"],
        podcast_categoires=payload["podcast_categories"],
        sim_score=point.score,
    )


results = list(map(_to_search_result, result.points))

In [14]:
# Print one search hit per line. The loop variable is deliberately not named
# `result`, so the query response bound to that name in the previous cell is
# not overwritten by the last hit.
for hit in results:
    # Compact alternative: print(f"{hit.podcast_title}: {hit.episode_title}")
    print(hit)

podcast_id=57 episode_id=1640 episode_title='182: اهم ما قاله ضيوف فنجان عن الصحه النفسيه' podcast_title='فنجان مع عبدالرحمن أبومالح' podcast_author='ثمانية/ thmanyah' podcast_categoires=['Society', 'Culture'] sim_score=0.8333334
podcast_id=57 episode_id=1657 episode_title='167: اهميه الصحه النفسيه' podcast_title='فنجان مع عبدالرحمن أبومالح' podcast_author='ثمانية/ thmanyah' podcast_categoires=['Society', 'Culture'] sim_score=0.625
podcast_id=57 episode_id=1636 episode_title='184: كيف نتجاوز جائحه كورونا نفسيا؟' podcast_title='فنجان مع عبدالرحمن أبومالح' podcast_author='ثمانية/ thmanyah' podcast_categoires=['Society', 'Culture'] sim_score=0.44444445
podcast_id=57 episode_id=606 episode_title='كيف تفهم نفسك' podcast_title='فنجان مع عبدالرحمن أبومالح' podcast_author='ثمانية/ thmanyah' podcast_categoires=['Society', 'Culture'] sim_score=0.4166667
podcast_id=57 episode_id=645 episode_title='324: هل انت مكتئب فعلا' podcast_title='فنجان مع عبدالرحمن أبومالح' podcast_author='ثمانية/ thmanyah'