In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from langchain_google_genai import embeddings
import uuid
import os
from dotenv import load_dotenv
from qdrant_client.models import (
    VectorParams, SparseVectorParams, Distance
)
from sentence_transformers import SentenceTransformer
import re
from qdrant_client.models import PointStruct

In [2]:
dataset = pd.read_csv("walmart-products.csv")

In [3]:
df = dataset.copy()

In [4]:
df["initial_price"] = df["initial_price"].fillna(df["final_price"])

In [5]:
df = df.dropna(subset=["description"])

In [6]:
df["details"] = (
    "product : " + df['product_name'].astype(str) +
    " category : " + df['category_name'].astype(str) +
    " details : " + df['description'].astype(str)
)

In [7]:
from qdrant_client import models


In [8]:
client = QdrantClient(url="http://localhost:6333")
from qdrant_client.models import (
    VectorParams, SparseVectorParams, Distance, PointStruct
)
documents = df["details"].tolist()
from fastembed import SparseTextEmbedding, TextEmbedding, LateInteractionTextEmbedding
dense_embedding_model=SentenceTransformer("all-MiniLM-L6-v2")
sparse_model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")
bm25_embedding_model = SparseTextEmbedding(model_name="Qdrant/bm25")
late_interaction_embedding_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")



Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [9]:
dense_embedding_model2=TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
dense_embeddings = list(dense_embedding_model2.embed(doc for doc in documents))
sparse_embeddings = list(bm25_embedding_model.embed(doc for doc in documents))
late_interaction_embeddings = list(late_interaction_embedding_model.embed(doc for doc in documents))


In [10]:
client.recreate_collection(
    collection_name="products",
    vectors_config={
        "text-dense": VectorParams(
            size=len(dense_embeddings[0]),
            distance=Distance.COSINE
        ),
        "text-late-interaction": models.VectorParams(
            size=len(late_interaction_embeddings[0][0]),
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM,
            ),
            hnsw_config=models.HnswConfigDiff(m=0)  #  Disable HNSW for reranking
        ),
    },
    sparse_vectors_config={
        "text-sparse": SparseVectorParams(
            modifier=models.Modifier.IDF,
            index=models.SparseIndexParams(
                on_disk=False,
            )
        )
    }
)

def clean_price(price_str):
    if price_str is None:
        return None
    cleaned = re.sub(r"[^\d.,]", "", str(price_str))
    cleaned = cleaned.replace(",", ".")
    try:
        return float(cleaned)
    except:
        return None

  client.recreate_collection(


In [11]:
payloads = [
    {
        "actual_price": clean_price(row.initial_price),
        "discounted_price": clean_price(row.final_price),
        "category": row.category_name,
        "rating": row.rating,
        "image_url": row.main_image,
        "product_url" : row.url,
    }
    for row in df.itertuples()
]

In [12]:
points = []
texts = df["details"].tolist()
avg_doc_length = sum(len(text.split()) for text in texts) / len(texts)
for i, row in enumerate(df.itertuples()):
    points.append(
        PointStruct(
            id=i,
            vector={
                "text-dense": dense_embeddings[i],
                "text-sparse": sparse_embeddings[i].as_object(),
                "text-late-interaction": late_interaction_embeddings[i],
            },
            payload=payloads[i]
        )
    )


In [13]:
client.upload_points(
    collection_name="products",
    points=points,
    batch_size=32,
    parallel=1,
    wait=True
)

In [14]:
# Get collection info
collection_info = client.get_collection("products")
print(collection_info)

# Look for vector config
print("\nVector configs:")
print(collection_info.config.params.vectors)


Vector configs:
{'text-dense': VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), 'text-late-interaction': VectorParams(size=128, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=HnswConfigDiff(m=0, ef_construct=None, full_scan_threshold=None, max_indexing_threads=None, on_disk=None, payload_m=None, inline_storage=None), quantization_config=None, on_disk=None, datatype=None, multivector_config=MultiVectorConfig(comparator=<MultiVectorComparator.MAX_SIM: 'max_sim'>))}


In [15]:
query_text = "leather jacket"
dense_vectors = next(dense_embedding_model2.query_embed(query_text))
sparse_vectors = next(bm25_embedding_model.query_embed(query_text))
late_vectors = next(late_interaction_embedding_model.query_embed(query_text))

In [16]:
prefetch = [
        models.Prefetch(
            query=dense_vectors,
            using="text-dense",
            limit=20,
        ),
        models.Prefetch(
            query=models.SparseVector(**sparse_vectors.as_object()),
            using="text-sparse",
            limit=20,
        ),
    ]

In [17]:
results = client.query_points(
         "products",
        prefetch=prefetch,
        query=late_vectors,
        using="text-late-interaction",
        with_payload=True,
        limit=5,
)

In [18]:
for i, point in enumerate(results.points, 1):
    print(f"\n--- Result {i} ---")
    print(f"Score: {point.score}")
    print(f"ID: {point.id}")
    print(f"Payload: {point.payload}")


--- Result 1 ---
Score: 19.603956
ID: 102
Payload: {'actual_price': 35.19, 'discounted_price': 29.99, 'category': "Men's Character Shop", 'rating': 5.0, 'image_url': '"https://i5.walmartimages.com/seo/Men-Sherpa-Lined-Hooded-Jacket-Men-s-Flannel-Plaid-Shirts-Fleece-Fuzzy-Button-Down-Long-Sleeve-Winter-Thermal-Hoodies-Jackets-with-Pockets_dc9e9fe7-4994-4665-8ef5-705f11685c36.e015b9e817f57c0cc85c2bb14dc7746b.jpeg"', 'product_url': 'https://www.walmart.com/ip/Men-Sherpa-Lined-Hooded-Jacket-Men-s-Flannel-Plaid-Shirts-Fleece-Fuzzy-Button-Down-Long-Sleeve-Winter-Thermal-Hoodies-Jackets-with-Pockets/5209344136'}

--- Result 2 ---
Score: 17.307285
ID: 555
Payload: {'actual_price': 36.95, 'discounted_price': 29.39, 'category': "Men's Character Shop", 'rating': 5.0, 'image_url': '"https://i5.walmartimages.com/seo/Men-Casual-Sherpa-Fleece-Lined-Plaid-Flannel-Shirts-Jackets-Men-s-Long-Sleeve-Plush-Thicken-Hooded-Jacket-Winter-Thermal-Coats-with-Pockets_7711d647-ab8f-4ba4-a59e-c79c3677871d.e0a0ec3