In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from langchain_google_genai import embeddings
import uuid
import os
from dotenv import load_dotenv
from qdrant_client.models import (
    VectorParams, SparseVectorParams, Distance
)
from sentence_transformers import SentenceTransformer
import re
from qdrant_client.models import PointStruct

In [None]:
# Load walmart-products.csv into a DataFrame. The path is relative, so the
# file must sit in the kernel's working directory.
dataset = pd.read_csv("walmart-products.csv")

In [None]:
# Preview the first 10 rows of the raw data.
dataset.head(10)

In [None]:
# Count missing values per column in the raw data.
dataset.isna().sum()

In [None]:
# Print the first row's value for each price-related column, one per line,
# to see how the raw values are formatted.
for price_column in ("unit_price", "initial_price", "final_price", "discount"):
    print(dataset[price_column][0])

In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
dataset.info()

In [None]:
# Number of rows in the raw data.
len(dataset)

In [None]:
# Work on a copy so that `dataset` keeps the data exactly as loaded.
df = dataset.copy()

In [None]:
# Where initial_price is missing, fall back to the same row's final_price.
df["initial_price"] = df["initial_price"].fillna(df["final_price"])

In [None]:
# Count missing values per column in the working copy.
df.isna().sum()

In [None]:
# Discard rows that have no description; every other column may still
# contain missing values after this step.
df = df.dropna(subset=["description"])

In [None]:
# Build the text that is embedded for each product by concatenating the
# product name, category and description.
#
# Missing names/categories are replaced by an empty string first: a bare
# `astype(str)` would turn NaN into the literal word "nan" and leak it into
# the embedded text. `assign` returns a new frame, which keeps the cell
# idempotent and avoids assigning a column on the result of `dropna`.
df = df.assign(
    details=(
        "product : " + df["product_name"].fillna("").astype(str) +
        " category : " + df["category_name"].fillna("").astype(str) +
        " details : " + df["description"].astype(str)
    )
)

In [None]:
# Connect to the Qdrant instance expected on localhost:6333.
client = QdrantClient(url="http://localhost:6333")
from qdrant_client.models import (
    VectorParams, SparseVectorParams, Distance
)

# Start from an empty "products" collection on every run.
# `recreate_collection` is deprecated in qdrant-client; the supported
# equivalent is an explicit exists-check + delete followed by create.
if client.collection_exists(collection_name="products"):
    client.delete_collection(collection_name="products")

client.create_collection(
    collection_name="products",
    vectors_config={
        "dense": VectorParams(
            # Must equal the embedding width of the encoder loaded below
            # (all-MiniLM-L6-v2 produces 384-dimensional vectors).
            size=384,
            distance=Distance.COSINE
        )
    },
    # A named sparse vector slot is declared here; nothing in this notebook
    # writes sparse vectors into it.
    sparse_vectors_config={
        "sparse": SparseVectorParams()
    }
)


# Sentence encoder used both for the product texts and for search queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every row's `details` text, in DataFrame row order; row i of
# `dense_vectors` corresponds to the i-th row of `df`.
dense_vectors = model.encode(
    df["details"].tolist(),
    show_progress_bar=True,
    normalize_embeddings=True
)
def clean_price(price_str):
    """Convert a raw price cell into a float.

    Accepts numbers as well as strings such as "$1,299.99", "12,99" or
    "1.299,99 €". Every character other than digits, "." and "," is
    discarded (so a leading minus sign in a string is dropped as well).

    Separator handling:
      * both "," and "." present -> the one that appears last is the decimal
        mark, the other one is a thousands separator;
      * only commas, more than one -> thousands separators;
      * a single comma followed by exactly three digits (and preceded by a
        non-zero group, e.g. "1,299") -> thousands separator;
      * any other single comma (e.g. "12,99") -> decimal mark.

    Args:
        price_str: Raw value; a string, a number, or None.

    Returns:
        The price as a float, or None when the value is None, NaN/infinite,
        or contains no parseable number.
    """
    import math  # local import: only this helper needs it

    if price_str is None:
        return None

    # Numeric cells are returned directly. Going through str() would corrupt
    # values rendered in scientific notation ("1e+16" -> "116").
    if isinstance(price_str, (int, float)) and not isinstance(price_str, bool):
        number = float(price_str)
        return number if math.isfinite(number) else None

    # Keep digits and separators only; drop a trailing "." or "," such as
    # the full stop of a sentence.
    cleaned = re.sub(r"[^\d.,]", "", str(price_str)).rstrip(".,")
    if not cleaned:
        return None

    last_comma = cleaned.rfind(",")
    last_dot = cleaned.rfind(".")

    if last_comma != -1 and last_dot != -1:
        if last_dot > last_comma:
            # "1,299.99": commas group thousands.
            cleaned = cleaned.replace(",", "")
        else:
            # "1.299,99": dots group thousands, comma is the decimal mark.
            cleaned = cleaned.replace(".", "").replace(",", ".")
    elif last_comma != -1:
        groups = cleaned.split(",")
        looks_like_thousands = len(groups) > 2 or (
            len(groups[1]) == 3 and groups[0].lstrip("0") != ""
        )
        if looks_like_thousands:
            # "1,299" / "1,234,567"
            cleaned = "".join(groups)
        else:
            # "12,99"
            cleaned = cleaned.replace(",", ".")

    try:
        return float(cleaned)
    except ValueError:
        # e.g. "1.2.3" - not a single number
        return None

In [None]:
def _none_if_missing(value):
    """Return None for a pandas missing value (NaN/NA/NaT), else `value` unchanged."""
    return None if pd.isna(value) else value


# One payload dict per row of `df`, in row order.
# Missing cells are stored as None rather than NaN: NaN is not a valid JSON
# value, whereas None is serialised as null.
payloads = [
    {
        "actual_price": clean_price(row.initial_price),
        "discounted_price": clean_price(row.final_price),
        "category": _none_if_missing(row.category_name),
        "rating": _none_if_missing(row.rating),
        "image_url": _none_if_missing(row.main_image),
        "product_url" : _none_if_missing(row.url),
    }
    for row in df.itertuples()
]

In [None]:
from qdrant_client.models import PointStruct

# Number of points sent per upsert request. Sending the whole dataset in a
# single request can exceed request-size or timeout limits on larger files.
UPSERT_BATCH_SIZE = 256

# Vectors and payloads are matched by position, so they must line up with
# the rows of `df` one-to-one.
assert len(dense_vectors) == len(payloads) == len(df), (
    "dense_vectors, payloads and df are out of sync; re-run the cells above"
)

# Point id = row position. Each embedding row is converted to a plain list
# of Python floats, the same form the query cell sends.
points = [
    PointStruct(
        id=point_id,
        vector={
            "dense": dense_vector.tolist(),
        },
        payload=payload
    )
    for point_id, (dense_vector, payload) in enumerate(zip(dense_vectors, payloads))
]

for batch_start in range(0, len(points), UPSERT_BATCH_SIZE):
    client.upsert(
        collection_name="products",
        points=points[batch_start:batch_start + UPSERT_BATCH_SIZE]
    )

In [None]:
# Fetch and print the server-side description of the "products" collection.
collection_info = client.get_collection("products")
print(collection_info)

# Print the vector configuration on its own under a heading.
print("\nVector configs:")
vector_params = collection_info.config.params.vectors
print(vector_params)

In [None]:
# Embed a free-text query with the same encoder and fetch the 5 nearest
# points from the "dense" named vector of the "products" collection.
query_text = "a leather jacket"
query_embedding = model.encode(query_text)
query_vector = query_embedding.tolist()

query_response = client.query_points(
    collection_name="products",
    query=query_vector,
    using="dense",
    limit=5,
)
results = query_response.points

In [None]:
# Print score, id and payload of every hit, numbered from 1.
for rank, hit in enumerate(results, start=1):
    report = (
        f"\n--- Result {rank} ---\n"
        f"Score: {hit.score}\n"
        f"ID: {hit.id}\n"
        f"Payload: {hit.payload}"
    )
    print(report)

In [None]:
# Show the first 100 characters of the embedded text for up to 10 rows,
# labelled by row position.
for position, details_text in enumerate(df["details"].head(10)):
    print(f"{position}: {details_text[:100]}...")