In [8]:
import os
import json
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
from fastembed import SparseTextEmbedding
from PIL import Image
import requests
from io import BytesIO
from dotenv import load_dotenv

In [9]:
load_dotenv()
qdrant_api_key = os.getenv("QDRANT_API_KEY")
cluster_endpoint = os.getenv("QDRANT_CLUSTER_ENDPOINT")

In [10]:
# --- CONFIGURATION ---
COLLECTION_NAME = "Hybrid_Collection_CONVOLVE"

client = QdrantClient(
    url=cluster_endpoint,
    api_key=qdrant_api_key,
)

# --- 1. LOAD MODELS (For generating query vectors) ---
print("‚è≥ Loading Search Models...")
dense_text_model = SentenceTransformer("intfloat/multilingual-e5-base")
dense_image_model = SentenceTransformer("clip-ViT-B-32")
sparse_text_model = SparseTextEmbedding(model_name="Qdrant/bm25")

print("‚úÖ Models Ready.")

‚è≥ Loading Search Models...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


‚úÖ Models Ready.


In [11]:
# --- 2. HELPER: DYNAMIC FILTER BUILDER ---
def build_filter(filter_dict):
    """
    Converts a simple dictionary {'category': 'News', 'trust_score': 0.9}
    into a complex Qdrant Filter object.
    """
    if not filter_dict: return None
    
    conditions = []
    for key, value in filter_dict.items():
        # Handle simple equality (Category = "News")
        if isinstance(value, str):
            conditions.append(
                models.FieldCondition(key=key, match=models.MatchValue(value=value))
            )
        # Handle ranges (Score >= 0.9)
        elif isinstance(value, (int, float)):
             conditions.append(
                models.FieldCondition(key=key, range=models.Range(gte=value))
            )
        # Handle lists (Category IN ["News", "Reports"])
        elif isinstance(value, list):
             conditions.append(
                models.FieldCondition(key=key, match=models.MatchAny(any=value))
            )

    return models.Filter(must=conditions)

In [None]:
def search_sparse(query_text, filters=None, limit=5):
    print(f"\nüîç [SPARSE] Searching for: '{query_text}'")
    
    query_vector = list(sparse_text_model.embed([query_text]))[0]

    hits = client.query_points(
        collection_name=COLLECTION_NAME,
        query=models.SparseVector(
            indices=query_vector.indices.tolist(),
            values=query_vector.values.tolist()
        ),
        using="sparse_text",    # Specify the vector name here
        query_filter=build_filter(filters),
        limit=limit
    ).points
    
    return hits

In [34]:
# --- 5. RETRIEVAL FUNCTION 3: IMAGE SEARCH (Visual) ---
def search_image(image_source, filters=None, limit=5):
    print(f"\nüîç [IMAGE] Searching with image input...")
    
    if not image_source: return None
    
    try:
        img = None
        
        # FIX 1: Check for both http and https
        if image_source.startswith(("http://", "https://")):
            response = requests.get(image_source, stream=True, timeout=10)
            
            # FIX 2: Raise error if status is 404/500
            response.raise_for_status() 
            
            img = Image.open(BytesIO(response.content))
        
        elif os.path.exists(image_source):
            img = Image.open(image_source)

        if img:
            # FIX 3: Force conversion to RGB (Fixes PNG/RGBA errors)
            img = img.convert("RGB")
            
            
            
    except Exception as e:
        print(f"‚ö†Ô∏è Image Error processing '{image_source}': {e}")

    # 2. Vectorize Image (CLIP)
    image_vector = dense_image_model.encode(img, normalize_embeddings=True).tolist()

    # 3. Search "dense_image" vector space
    hits = client.query_points(
        collection_name=COLLECTION_NAME,
        query=image_vector,
        using="dense_image",    # Specify the vector name here
        query_filter=build_filter(filters),
        limit=limit
    ).points

    return hits

In [16]:
# --- 6. RETRIEVAL FUNCTION 4: HYBRID SEARCH (RRF Fusion) ---
def search_hybrid(query_text, filters=None, limit=5):
    print(f"\nüîç [HYBRID] Searching for: '{query_text}'")
    
    # RRF (Reciprocal Rank Fusion) is the industry standard for 
    # combining Dense (Semantic) + Sparse (Keyword) results.
    
    # 1. Get Results from both worlds
    dense_hits = search_dense(query_text, filters, limit=limit*2)
    sparse_hits = search_sparse(query_text, filters, limit=limit*2)
    
    # 2. Fuse Scores (RRF Algorithm)
    # Score = 1 / (rank + k)
    rank_k = 60
    fused_scores = {}
    
    # Process Dense Ranks
    for rank, hit in enumerate(dense_hits):
        if hit.id not in fused_scores: fused_scores[hit.id] = {"hit": hit, "score": 0}
        fused_scores[hit.id]["score"] += 1 / (rank + rank_k)
        
    # Process Sparse Ranks
    for rank, hit in enumerate(sparse_hits):
        if hit.id not in fused_scores: fused_scores[hit.id] = {"hit": hit, "score": 0}
        fused_scores[hit.id]["score"] += 1 / (rank + rank_k)
    
    # 3. Sort by new fused score
    sorted_results = sorted(
        fused_scores.values(), 
        key=lambda x: x["score"], 
        reverse=True
    )
    
    # Return top N original hit objects
    return [item["hit"] for item in sorted_results[:limit]]

In [20]:
# --- 3. RETRIEVAL FUNCTION 1: DENSE SEARCH (Semantic) ---
def search_dense(query_text, filters=None, limit=5):
    print(f"\nüîç [DENSE] Searching for: '{query_text}'")
    
    # 1. Vectorize Query (E5 needs "query: " prefix)
    query_vector = dense_text_model.encode(
        f"query: {query_text}", 
        normalize_embeddings=True
    ).tolist()

    # 2. Search "dense_text" vector space
    hits = client.query_points(
        collection_name=COLLECTION_NAME,
        query=query_vector,     # Pass the vector list directly
        using="dense_text",     # Specify the vector name here
        query_filter=build_filter(filters),
        limit=limit
    ).points
    return hits

In [24]:
# A. Test Dense (Semantic Question)
results = search_dense(
    "The EVM and VVPAT are free in price", 
    filters={"topic_tags": [
        "EVM Price",
        "VVPAT Price"
      ]} 
)
for hit in results:
    print(f"   Score: {hit.score:.3f} | {hit.payload.get('text_content')[:]}...")


üîç [DENSE] Searching for: 'The EVM and VVPAT are free in price'
   Score: 0.868 | The cost of CU, BU and VVPAT is finalised by the Price Negotiation Committee constituted by the Government of India. The present cost of a BU is Rs.7,991/-, CU is Rs.9,812/- and VVPAT is Rs.16,132/-. The economic life of EVMs is approximately 15 years....


In [32]:



# B. Test Sparse (Specific Keywords)
results = search_sparse(
    "EVM can be hacked", 
    filters={"topic_tags": [
        "Election scams",
        "Lies around elections"
      ], "trust_score": 0.9}  
)
for hit in results:
    print(f"   Score: {hit.score:.3f} | {hit.payload.get('text_content', 'No Title')}")

# C. Test Image (Visual Match)
# Use a dummy image or a real path if you have one
# results = search_image("assets/test_backpack.jpg") 

# D. Test Hybrid (Best of both worlds)



üîç [SPARSE] Searching for: 'EVM can be hacked'
   Score: 9.825 | EVM can be Hacked.
   Score: 9.669 | A person in a video posted on a YT Channel was claiming that EVM can be hacked.
   Score: 9.395 | A  tweet with a media clipping was shared claiming that former Chief Election Commissioner of India Sh. T.S. Krishnamurthy had opined that a particular party won assembly elections by hacking EVMs.
   Score: 2.505 | A false claim was made in a video regarding mismatch of electors & votes polled in #EVM in Varanasi Parliamentary Constituency during General Elections 2019. Another false claim was made quoting ECI letter that mismatch between total electors & votes polled in EVM were found in 373 PCs in General¬†Elections¬†2019
   Score: 2.309 | 15 lakh EVMs are missing.


In [31]:
results = search_hybrid(
    "Mandatory verification of VVPAT slips of randomly selected 05 polling stations per Assembly Constituency",
    filters={"trust_score": 1.0}
)
print("\nüèÜ HYBRID WINNERS:")
for hit in results:
    # Note: Hybrid returns the original Qdrant object, but score is RRF score now
    print(f"   Payload: {hit.payload.get('title') or hit.payload.get('text_content')[:40]}")


üîç [HYBRID] Searching for: 'Mandatory verification of VVPAT slips of randomly selected 05 polling stations per Assembly Constituency'

üîç [DENSE] Searching for: 'Mandatory verification of VVPAT slips of randomly selected 05 polling stations per Assembly Constituency'

üîç [SPARSE] Searching for: 'Mandatory verification of VVPAT slips of randomly selected 05 polling stations per Assembly Constituency'

üèÜ HYBRID WINNERS:
   Payload: Mandatory verification of VVPAT slips of
   Payload: 1200 votes in 1% EVMs, 1000 votes in 2% 
   Payload: Polling agents are allowed to sign the s
   Payload: There is no scientific basis to such fal
   Payload: The representatives of National and Stat


In [35]:
# Use a direct image link (ending in .jpg or .png)
query_image_url = "https://github.com/Keshav-CUJ/Qdrant-convole/raw/main/images/EVMbackpack.png"

results = search_image(
    image_source=query_image_url, 
    limit=3
)

print(f"\nüåê Visual Search Results for URL:")
for hit in results:
    print(f"   Score: {hit.score:.3f} | Found: {hit.payload.get('title')}")


üîç [IMAGE] Searching with image input...

üåê Visual Search Results for URL:
   Score: 1.000 | Found: BACKPACKS TO CARRY EVMS - INDICATIVE SAMPLES
   Score: 0.561 | Found: SET UP OF POLLING STATION FOR SINGLE ELECTION
   Score: 0.493 | Found: Arrangements for barricading for Counting Tables
