In [1]:
from langchain_community.document_loaders import TextLoader
from langchain.schema import Document
from merge_meaning import SemanticChunker
from langchain.schema import Document
from qdrant_client import QdrantClient, models
from qdrant_client.models import Distance, VectorParams
from qdrant_client.models import PointStruct
from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
import os
import pandas as pd
import mysql.connector
from typing import List, Dict, Any, Optional
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# data = pd.read_csv("product_neh.csv")
# data

In [3]:
# Shared Qdrant handle for the notebook, opened with a filesystem path.
QDRANT_STORAGE_PATH = "./langchain_qdrant"
client = QdrantClient(path=QDRANT_STORAGE_PATH)

In [None]:
class OptimizedProductSearch:
    """Vector search over products: Qdrant holds embeddings, MySQL holds the full rows.

    Indexing embeds a short text description of every online product and stores it
    in Qdrant with a minimal payload (id, name, price, discount_percent). Searching
    embeds the query, asks Qdrant for the nearest points (optionally constrained by
    payload filters) and then loads the complete product rows from MySQL.
    """

    # Column order shared by both SELECT statements below; rows are unpacked positionally.
    _PRODUCT_COLUMNS = ('id', 'name', 'description', 'image_url', 'price', 'discount_percent')

    def __init__(self, qdrant_client=None, db_config: Optional[Dict] = None):
        """Set up the Qdrant client, the embedding model and the MySQL settings.

        Args:
            qdrant_client: Qdrant client to use. When omitted, the notebook-level
                ``client`` is used, so ``OptimizedProductSearch()`` keeps working.
            db_config: keyword arguments for ``mysql.connector.connect``. When
                omitted, the local development defaults below are used.
        """
        self.client = client if qdrant_client is None else qdrant_client
        self.embeddings = HuggingFaceEmbeddings(model_name = './vietnamese-bi-encoder')
        self.collection_name = "product_search"
        if db_config is not None:
            self.db_config = db_config
        else:
            # NOTE(review): development credentials are hard-coded here; pass
            # `db_config` (e.g. built from environment variables) anywhere shared.
            self.db_config = {
                'host': 'localhost', #'genimagine-server-vn',
                'user': 'root',
                'password': '123',
                'database': 'myshop'
            }

    @staticmethod
    def _as_float(value):
        """Convert a DB numeric to ``float``, passing SQL NULL (``None``) through."""
        return None if value is None else float(value)

    def _fetch_rows(self, query: str, params: Optional[List] = None) -> List[tuple]:
        """Run one SELECT and return all rows, always closing cursor and connection."""
        conn = mysql.connector.connect(**self.db_config)
        try:
            cursor = conn.cursor()
            try:
                if params is None:
                    cursor.execute(query)
                else:
                    cursor.execute(query, params)
                return cursor.fetchall()
            finally:
                cursor.close()
        finally:
            conn.close()

    def create_text_description(self, product_data: Dict) -> str:
        """Build the text that gets embedded for one product.

        Joins name, description, image URL, price and discount with ". ",
        skipping empty/None values. The price and discount parts are always
        present because they are wrapped in a non-empty label.
        """
        important_fields = [
            product_data.get('name', ''),
            product_data.get('description', ''),
            product_data.get('image_url', ''),
            f"giá {product_data.get('price', 0)}",
            f"chiết khấu {product_data.get('discount_percent', 0)}"
        ]
        return ". ".join([field for field in important_fields if field])

    def index_products_to_qdrant(self):
        """Embed every online product and upsert it into the Qdrant collection.

        Creates the collection on first use. Only a minimal payload is stored;
        full rows are fetched from MySQL at query time.
        """
        if not self.client.collection_exists(self.collection_name):
            self.client.create_collection(
                collection_name=self.collection_name,
                # size must equal the embedding dimension — presumably 768 for
                # ./vietnamese-bi-encoder; TODO confirm.
                vectors_config=VectorParams(size=768, distance=Distance.COSINE)
            )

        query = """
        SELECT id, name, description, image_url, price, discount_percent
        FROM products 
        WHERE is_available_online = 1
        """
        rows = self._fetch_rows(query)

        points = []
        for i, product_row in enumerate(rows):
            product_data = dict(zip(self._PRODUCT_COLUMNS, product_row))

            text_description = self.create_text_description(product_data)
            vector = self.embeddings.embed_query(text_description)

            # Minimal payload: only what is needed for previews and filtering.
            payload = {
                "id": product_data["id"],
                "name": product_data["name"],  # quick preview
                "price": self._as_float(product_data["price"]),  # range filter
                "discount_percent": self._as_float(product_data["discount_percent"])  # range filter
            }

            # NOTE(review): the point id is the row's position in this result set,
            # not the product id, so points left over from an earlier, larger run
            # are not removed — confirm whether the collection should be rebuilt.
            points.append(models.PointStruct(
                id=i,
                vector=vector,
                payload=payload
            ))

        # Skip the request entirely when there is nothing to write.
        if points:
            self.client.upsert(
                collection_name=self.collection_name,
                wait=True,
                points=points
            )

        print(f"Indexed {len(points)} products to Qdrant")

    def hybrid_search(self, query_text: str, 
                     top_k: int = 10,
                     filters: Optional[Dict] = None) -> List[Dict]:
        """Vector search with optional payload filters, enriched from MySQL.

        Args:
            query_text: free-text query to embed.
            top_k: maximum number of Qdrant hits to request.
            filters: optional dict understood by ``_build_qdrant_filter``.

        Returns:
            Full product dicts in Qdrant relevance order, each with an added
            ``relevance_score``. Hits whose product is no longer returned by
            MySQL are dropped, so fewer than ``top_k`` items may come back.
        """
        query_vector = self.embeddings.embed_query(query_text)

        search_params = {
            "collection_name": self.collection_name,
            "query": query_vector,
            "limit": top_k,
            "with_payload": True,
            # Stored vectors are never read below, so don't transfer them.
            "with_vectors": False
        }

        if filters:
            search_params["query_filter"] = self._build_qdrant_filter(filters)

        hits = self.client.query_points(**search_params).points
        ids = [hit.payload["id"] for hit in hits]

        if not ids:
            return []

        # Full details come from SQL; index them by id to restore relevance order.
        product_dict = {p["id"]: p for p in self.get_full_product_details(ids)}

        ordered_results = []
        for hit in hits:
            product_id = hit.payload["id"]
            if product_id in product_dict:
                # Build a fresh dict per hit so results never alias each other.
                ordered_results.append(
                    dict(product_dict[product_id], relevance_score=hit.score)
                )

        return ordered_results

    def _build_qdrant_filter(self, filters: Dict) -> "models.Filter":
        """Translate a plain dict into a Qdrant ``Filter`` (all conditions ANDed).

        ``{"price": {"gte": 100, "lte": 1000}}`` becomes one range condition;
        the bounds ``gt``, ``gte``, ``lt`` and ``lte`` are recognised and bounds
        that are missing or ``None`` are ignored. Any non-dict value, e.g.
        ``{"category": "electronics"}``, becomes an exact-match condition.
        """
        range_keys = ("gt", "gte", "lt", "lte")
        conditions = []

        for field, value in filters.items():
            if isinstance(value, dict):
                bounds = {k: value[k] for k in range_keys if value.get(k) is not None}
                if bounds:
                    conditions.append(
                        models.FieldCondition(key=field, range=models.Range(**bounds))
                    )
            else:
                conditions.append(
                    models.FieldCondition(
                        key=field,
                        match=models.MatchValue(value=value)
                    )
                )

        return models.Filter(must=conditions)

    def get_full_product_details(self, product_ids: List[Any]) -> List[Dict]:
        """Load the full rows for the given product ids from MySQL.

        Returns an empty list without touching the database when ``product_ids``
        is empty. Only products with ``is_available_online = 1`` are returned,
        in whatever order MySQL yields them.
        """
        if not product_ids:
            return []

        id_params = list(product_ids)
        placeholders = ','.join(['%s'] * len(id_params))
        # Only the placeholder list is interpolated; the ids are bound as parameters.
        query = f"""
        SELECT id, name, description, image_url, price, discount_percent
        FROM products 
        WHERE id IN ({placeholders}) and is_available_online = 1
        """

        products = []
        for row in self._fetch_rows(query, id_params):
            product = dict(zip(self._PRODUCT_COLUMNS, row))
            product['price'] = self._as_float(product['price'])
            product['discount_percent'] = self._as_float(product['discount_percent'])
            products.append(product)

        return products





In [7]:
# Build the engine and (re)index the catalogue before querying.
search_engine = OptimizedProductSearch()
search_engine.index_products_to_qdrant()

# Filtered search: only products whose price lies in [200, 15000].
results = search_engine.hybrid_search(
    query_text="tôi nên chọn kem chống nắng nào",
    top_k=5,
    filters=dict(price=dict(gte=200, lte=15000)),
)

# One line per hit: name, price and similarity score to three decimals.
for product in results:
    print("{} - ${} (Score: {:.3f})".format(
        product['name'], product['price'], product['relevance_score']
    ))


Indexed 4 products to Qdrant


In [None]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import ChatGoogleGenerativeAI
from typing import Dict, Optional

# Take the Gemini API key from the environment instead of embedding it in the
# notebook. A bare, unquoted token here is a NameError at run time, and a
# literal key is a leaked credential: any key previously committed in this
# cell should be revoked and replaced.
api_key = os.environ.get("GOOGLE_API_KEY")

class RangeFilter(BaseModel):
    # Inclusive numeric range; either bound may be omitted.
    # Both fields default to None: with `Field(...)` they were *required* even
    # though typed Optional, so a one-sided request such as "price under 500"
    # could not be represented without supplying both bounds.
    gte: Optional[float] = Field(None, description="Minimum value of the field")
    lte: Optional[float] = Field(None, description="Maximum value of the field")

class FilterSchema(BaseModel):
    """Represents a filter with a range for a specific product field"""
    # NOTE(review): the docstring above is presumably forwarded to the model as
    # the schema description by `with_structured_output`, so it is left
    # unchanged here — confirm before rewording it.
    # The field names match the payload keys written to Qdrant during indexing
    # ("price", "discount_percent"), so the dumped dict can be passed as `filters`.
    # Optional range on the product price; None means "no price constraint".
    price: Optional[RangeFilter] = None
    # Optional range on the discount percentage; None means "no discount constraint".
    discount_percent: Optional[RangeFilter] = None

# Gemini chat model; temperature 0 keeps sampling randomness to a minimum.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
)
# Wrapper whose replies are parsed into FilterSchema objects.
structured_llm = llm.with_structured_output(FilterSchema)

# 3. Take a user message and return the structured filter as a plain dict
def extract_filter_from_message(message: str) -> dict:
    """Ask the structured LLM for the filters expressed in ``message``.

    Args:
        message: free-text user request.

    Returns:
        The parsed schema as a dict with ``None`` entries removed, or an empty
        dict when the model returned no structured result.
    """
    result = structured_llm.invoke(message)
    # The structured-output wrapper can yield None when the model produces
    # nothing parseable; treat that as "no filters" rather than crashing.
    if result is None:
        return {}
    return result.dict(exclude_none=True)

# 4. Usage example
if __name__ == "__main__":
    # Sample request mentioning both a price range and a discount range.
    msg = (
        "Tôi muốn tìm sản phẩm có giá từ 200 đến 15000 "
        "và chiết khấu từ 10 đến 50 phần trăm"
    )
    structured_json = extract_filter_from_message(message=msg)
    print(structured_json)
