In [1]:
!pip install langchain sentence-transformers torch transformers pymongo

Collecting pymongo
  Downloading pymongo-4.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014

In [2]:
!pip install langchain_huggingface langchain_mongodb

Collecting langchain_huggingface
  Downloading langchain_huggingface-0.2.0-py3-none-any.whl.metadata (941 bytes)
Collecting langchain_mongodb
  Downloading langchain_mongodb-0.6.2-py3-none-any.whl.metadata (1.7 kB)
Collecting lark<2.0.0,>=1.1.9 (from langchain_mongodb)
  Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)
Downloading langchain_huggingface-0.2.0-py3-none-any.whl (27 kB)
Downloading langchain_mongodb-0.6.2-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lark-1.2.2-py3-none-any.whl (111 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lark, langchain_huggingface, langchain_mongodb
Successfully installed langchain_huggingface-0.2.0 langchain_mongodb-0.6.2 lark-1.2.2


In [30]:
from pymongo import MongoClient
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain.schema import Document
from google.colab import userdata
import torch
import logging
from typing import List, Dict, Any
from LLM_Routing import LLMResponse

# Setup logging
# basicConfig configures the root logger; the module-level `logger` below is
# what every function in this cell writes to.
# NOTE(review): DEBUG on the root logger also surfaces debug records from
# imported libraries that propagate to it — confirm this verbosity is wanted.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Connect to MongoDB
# The connection string is read from the Colab `userdata` store under the key
# "mongodb", so it is never written into the notebook itself.
mongo_uri = userdata.get("mongodb")
client = MongoClient(mongo_uri)
# All queries in this cell run against database "HCMIU_Data", collection "Data".
collection = client["HCMIU_Data"]["Data"]

# Load embedding model
# Runs on the GPU when torch reports CUDA is available, otherwise on the CPU.
# `normalize_embeddings` is forwarded to the encoder via `encode_kwargs`.
embeddings = HuggingFaceEmbeddings(
    model_name="keepitreal/vietnamese-sbert",
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

def get_available_document_types() -> List[str]:
    """Return the distinct values stored in the ``document_type`` field of the collection."""
    document_types = collection.distinct("document_type")
    return document_types

def normalize_scores(scores: List[float]) -> List[float]:
    """Rescale ``scores`` linearly onto [0, 1] (min-max normalization).

    An empty input yields an empty list. When every score is identical the
    range is zero, so each entry is mapped to 1.0 rather than dividing by zero.
    """
    if not scores:
        return []

    lowest, highest = min(scores), max(scores)
    if highest == lowest:
        return [1.0 for _ in scores]

    spread = highest - lowest
    rescaled = []
    for value in scores:
        rescaled.append((value - lowest) / spread)
    return rescaled

def get_filtered_document_ids(document_type: str) -> List[Any]:
    """
    Collect the ``_id`` of every document whose ``document_type`` equals the given value.

    Args:
        document_type (str): Value of the ``document_type`` field to match.

    Returns:
        List[Any]: The matching ``_id`` values. If the query raises, the error
        is logged and an empty list is returned instead of propagating.
    """
    # Two-stage aggregation: keep only the requested type, then drop every field but _id.
    match_stage = {"$match": {"document_type": document_type}}
    id_only_stage = {"$project": {"_id": 1}}

    try:
        doc_ids = []
        for record in collection.aggregate([match_stage, id_only_stage]):
            doc_ids.append(record["_id"])
    except Exception as e:
        logger.error(f"❌ Failed to filter documents by type: {e}")
        return []

    logger.info(f"Found {len(doc_ids)} documents of type '{document_type}'")
    return doc_ids

def _text_search_stages(search_spec: Dict[str, Any], document_type: str) -> List[Dict[str, Any]]:
    """Wrap a ``$search`` spec with the type filter, projection and limit shared by both text searches."""
    return [
        {"$search": search_spec},
        {"$match": {"document_type": document_type}},
        {
            "$project": {
                "title": 1,
                "content": 1,
                "document_type": 1,
                "textScore": {"$meta": "searchScore"},
            }
        },
        {"$limit": 20},
    ]


def _vector_search_stages(query_embedding: List[float], document_type: str) -> List[Dict[str, Any]]:
    """Build the ``$vectorSearch`` pipeline followed by the type filter, projection and limit."""
    return [
        {
            "$vectorSearch": {
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": 100,
                "limit": 50,
                "index": "default",
            }
        },
        {"$match": {"document_type": document_type}},
        {
            "$project": {
                "title": 1,
                "content": 1,
                "document_type": 1,
                "vectorScore": {"$meta": "vectorSearchScore"},
            }
        },
        {"$limit": 20},
    ]


def _search_course_structure(query: str, document_type: str) -> List[Document]:
    """Text-only search: the title must match (boosted x2), content matches only add to the score."""
    search_spec = {
        "index": "text",
        "compound": {
            "must": [
                {
                    "text": {
                        "query": query,
                        "path": "title",
                        "score": {"boost": {"value": 2}},  # Boost title matches
                    }
                }
            ],
            "should": [{"text": {"query": query, "path": "content"}}],
        },
    }

    hits = list(collection.aggregate(_text_search_stages(search_spec, document_type)))
    logger.info(f"Text search found {len(hits)} results for course structure")

    if not hits:
        logger.warning("⚠️ No relevant content found in course structure documents")
        return []

    docs = [
        Document(
            # `or ""` also covers a stored null, which Document would reject.
            page_content=hit.get("content") or "",
            metadata={
                "title": hit.get("title"),
                "document_type": hit.get("document_type"),
                "score": hit.get("textScore", 0),
                "text_score": hit.get("textScore", 0),
                "vector_score": 0.0,  # No vector score for course structure
            },
        )
        for hit in hits
    ]
    docs.sort(key=lambda d: d.metadata.get("score", 0), reverse=True)
    return docs


def _fuse_hybrid_results(
    bm25_results: List[Dict[str, Any]],
    vector_results: List[Dict[str, Any]],
    alpha: float,
) -> List[Document]:
    """Min-max normalize each result list, merge by ``_id`` and rank by the alpha-weighted sum."""
    fused: Dict[Any, Dict[str, Any]] = {}

    def entry_for(hit: Dict[str, Any]) -> Dict[str, Any]:
        # A document missing from one list keeps that list's score at 0.0.
        return fused.setdefault(
            hit.get("_id"),
            {
                "title": hit.get("title"),
                "content": hit.get("content") or "",
                "document_type": hit.get("document_type"),
                "textScore": 0.0,
                "vectorScore": 0.0,
            },
        )

    text_scores = normalize_scores([hit.get("textScore", 0) for hit in bm25_results])
    for hit, score in zip(bm25_results, text_scores):
        entry_for(hit)["textScore"] = score

    vector_scores = normalize_scores([hit.get("vectorScore", 0) for hit in vector_results])
    for hit, score in zip(vector_results, vector_scores):
        entry_for(hit)["vectorScore"] = score

    docs = []
    for merged in fused.values():
        vector_score = merged["vectorScore"]
        text_score = merged["textScore"]
        combined_score = (alpha * vector_score) + ((1 - alpha) * text_score)
        docs.append(
            Document(
                page_content=merged["content"],
                metadata={
                    "title": merged["title"],
                    "document_type": merged["document_type"],
                    "score": combined_score,
                    "vector_score": vector_score,
                    "text_score": text_score,
                },
            )
        )

    docs.sort(key=lambda d: d.metadata.get("score", 0), reverse=True)
    return docs


def _search_hybrid(query: str, document_type: str, alpha: float) -> List[Document]:
    """Run the BM25 and vector searches for ``query`` and fuse their results."""
    query_embedding = embeddings.embed_query(query)

    text_spec = {
        "index": "text",
        "text": {"query": query, "path": ["title", "content"]},
    }
    bm25_results = list(collection.aggregate(_text_search_stages(text_spec, document_type)))
    logger.info(f"BM25 search found {len(bm25_results)} results")

    # NOTE(review): the $vectorSearch stage carries no `filter`, so the type
    # restriction is applied by $match to the 50 neighbours already returned;
    # fewer than 20 may survive. Consider the stage's `filter` option if the
    # vector index declares `document_type` as a filter field — TODO confirm.
    vector_results = list(collection.aggregate(_vector_search_stages(query_embedding, document_type)))
    logger.info(f"Vector search found {len(vector_results)} results")

    if not bm25_results and not vector_results:
        logger.warning(f"⚠️ No relevant content found in documents of type: {document_type}")
        return []

    return _fuse_hybrid_results(bm25_results, vector_results, alpha)


def _print_search_results(query: str, document_type: str, doc_count: int, docs: List[Document]) -> None:
    """Print a summary header and a preview of the five best-ranked documents."""
    print(f"\nSearching for '{query}' in document type: {document_type}")
    print(f"Total documents of this type: {doc_count}")
    print(f"Found {len(docs)} relevant documents")
    print("=" * 60)

    for rank, doc in enumerate(docs[:5], 1):
        preview = doc.page_content
        if len(preview) > 200:
            preview = preview[:200] + "..."

        print(f"\n📄 Result {rank}")
        print("Title:", doc.metadata.get("title"))
        print("Document Type:", doc.metadata.get("document_type"))
        print("Combined Score:", f"{doc.metadata.get('score'):.4f}")
        print("Vector Score:", f"{doc.metadata.get('vector_score'):.4f}")
        print("Text Score:", f"{doc.metadata.get('text_score'):.4f}")
        print("Content Preview:", preview)
        print("-" * 50)


def hybrid_search_on_filtered_docs(query: str, document_type: str, alpha: float = 0.5) -> List[Document]:
    """
    Search documents of one ``document_type`` and return them ranked by relevance.

    ``"course_structure"`` uses a text-only search in which the title must match
    the query. Every other type combines a BM25 text search with a vector
    search: each score list is min-max normalized and the final score is
    ``alpha * vector + (1 - alpha) * text``. A summary of the top five results
    is printed as a side effect.

    Args:
        query (str): The search query.
        document_type (str): Value of the ``document_type`` field to search within.
        alpha (float): Weight of the vector score in the combined score.
            Values outside [0, 1] are clamped into that range with a warning.

    Returns:
        List[Document]: At most ten documents, best first. Each carries
        ``title``, ``document_type``, ``score``, ``vector_score`` and
        ``text_score`` in its metadata. An empty list is returned when the type
        has no documents, nothing matches, or any step raises (the exception is
        logged with its traceback rather than propagated).
    """
    try:
        # The distinct() round-trip only feeds a log line, so skip it when
        # INFO records would be discarded anyway.
        if logger.isEnabledFor(logging.INFO):
            available_types = get_available_document_types()
            logger.info(f"Available document types in database: {available_types}")
        logger.info(f"Searching for document type: {document_type}")

        # Check if document type exists
        doc_count = collection.count_documents({"document_type": document_type})
        if doc_count == 0:
            logger.warning(f"⚠️ No documents found for type: {document_type}")
            return []

        logger.info(f"Found {doc_count} documents of type '{document_type}'")

        # An alpha outside [0, 1] would give one of the two scores a negative weight.
        if not 0.0 <= alpha <= 1.0:
            clamped_alpha = min(1.0, max(0.0, alpha))
            logger.warning(f"⚠️ alpha={alpha} is outside [0, 1]; using {clamped_alpha}")
            alpha = clamped_alpha

        if document_type == "course_structure":
            docs = _search_course_structure(query, document_type)
        else:
            docs = _search_hybrid(query, document_type, alpha)

        # Both helpers have already logged the "nothing relevant" warning.
        if not docs:
            return []

        _print_search_results(query, document_type, doc_count, docs)
        return docs[:10]

    except Exception as e:
        # exc_info keeps the traceback; the message alone rarely identifies the failing stage.
        logger.error(f"❌ Hybrid search failed: {e}", exc_info=True)
        return []

def search_documents(query: str, alpha: float = 0.5) -> List[Document]:
    """
    Ask the LLM router which document type the query belongs to, then run the
    hybrid search restricted to that type.

    Args:
        query (str): The search query.
        alpha (float): Weight for vector search (0-1, where 1 = only vector, 0 = only BM25).

    Returns:
        List[Document]: Ranked search results. Empty when no document type
        could be determined or when any step raises (the error is logged).
    """
    try:
        # Routing step: the router is built with the "OpenAI" secret from Colab userdata.
        router = LLMResponse(userdata.get("OpenAI"))
        routed_type = router.get_function_call(query)

        if routed_type:
            logger.info(f"LLM determined document type: {routed_type}")
            # Search step: hybrid search limited to the routed type.
            return hybrid_search_on_filtered_docs(query, routed_type, alpha)

        # No usable type came back: list what the database offers to aid debugging.
        logger.warning("⚠️ Could not determine document type from query")
        logger.info("Available document types:")
        for known_type in get_available_document_types():
            logger.info(f"  - {known_type}")
        return []

    except Exception as e:
        logger.error(f"❌ Search process failed: {e}")
        return []

def search_documents_with_type(query: str, document_type: str, alpha: float = 0.5) -> List[Document]:
    """
    Run the hybrid search inside a caller-supplied document type, skipping LLM routing.

    Args:
        query (str): The search query.
        document_type (str): Document type to search within.
        alpha (float): Weight for vector search (0-1).

    Returns:
        List[Document]: Ranked search results, exactly as produced by
        ``hybrid_search_on_filtered_docs``.
    """
    logger.info(f"Searching with explicit document type: {document_type}")
    ranked_docs = hybrid_search_on_filtered_docs(query, document_type, alpha)
    return ranked_docs

# Example usage
# Runs one end-to-end search and reports how many documents came back.
if __name__ == "__main__":
    # Sample Vietnamese query; roughly "How much tuition reduction does a
    # student get when a sibling is also enrolled?"
    query = "Sinh viên có anh chị em ruột đang theo học được giảm bao nhiêu học phí ?"

    # Method 1: Let LLM determine document type
    # Equal weighting (alpha=0.5) of the vector and text scores.
    results = search_documents(query, alpha=0.5)

    # Method 2: Specify document type explicitly (if you know it)
    # NOTE(review): the recorded run of this cell routed to "quy_dinh"; check
    # that "academic_regulations" exists as a document_type before using it.
    # results = search_documents_with_type(query, "academic_regulations", alpha=0.5)

    print(f"\nFinal Results: {len(results)} documents found")

Function to call: document_routing
Arguments: {
  "document_type": "quy_dinh"
}

Searching for 'Sinh viên có anh chị em ruột đang theo học được giảm bao nhiêu học phí ?' in document type: quy_dinh
Total documents of this type: 363
Found 29 relevant documents

📄 Result 1
Title: Mục 8
Document Type: quy_dinh
Combined Score: 1.0000
Vector Score: 1.0000
Text Score: 1.0000
Content Preview: Mục 8: Sinh viên có anh chị em ruột hiện đang học tại trường Đại học Quốc tế (áp dụng kể từ người thứ hai).
Mức miễn giảm: 10%
Yêu cầu: Bản sao công chứng hộ khẩu thường trú hoặc bản sao công chứng gi...
--------------------------------------------------

📄 Result 2
Title: Mục 7
Document Type: quy_dinh
Combined Score: 0.5792
Vector Score: 0.2349
Text Score: 0.9236
Content Preview: Mục 7: Sinh viên có anh chị em ruột hiện đang học tại trường Đại học Quốc tế (áp dụng kể từ người thứ hai), (Chính sách riêng của trường ĐHOT). >
Mức miễn giảm: 10% học phí thực. > Yêu cầu về hồ sơ cầ...
------------------------