In [1]:
import os
import sys
import time
import logging
import warnings
import shutil
from typing import List, Dict, Any, Optional
from datetime import datetime
import pandas as pd
import numpy as np
from pydantic import BaseModel
import torch

# LangChain Core
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_text_splitters import RecursiveCharacterTextSplitter

# LangChain Retrievers
from langchain_community.retrievers import BM25Retriever

# Vector Stores
from langchain_chroma import Chroma
from langchain_community.vectorstores import FAISS
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# Transformers for PhoBERT
from transformers import AutoModel, AutoTokenizer

# Document Processing
from markitdown import MarkItDown
from langchain_docling.loader import DoclingLoader, ExportType
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

# Evaluation
from ragas import evaluate
from ragas.metrics import context_recall, context_precision
from datasets import Dataset

# Suppress warnings
logging.getLogger("docling").setLevel(logging.WARNING)
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# CONFIGURATION CLASS
class PipelineConfig(BaseModel):
    chunk_size: int = 500
    chunk_overlap: int = 50
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    bm25_weight: float = 0.5
    dense_weight: float = 0.5
    top_k: int = 5

config = PipelineConfig()
print(f"üöÄ Pipeline Config: Device={config.device}, Chunk Size={config.chunk_size}")


üöÄ Pipeline Config: Device=cpu, Chunk Size=500


In [None]:

# CUSTOM ENSEMBLE RETRIEVER (REPLACEMENT FOR DEPRECATED ONE)

class CustomEnsembleRetriever(BaseRetriever):
    """
    Custom implementation of Ensemble Retriever since it was removed from LangChain.
    Combines multiple retrievers using weighted reciprocal rank fusion.
    """
    retrievers: List[BaseRetriever]
    weights: List[float]
    k: int = 5
    
    class Config:
        arbitrary_types_allowed = True
    
    def _get_relevant_documents(
        self, query: str, *, run_manager: Optional[CallbackManagerForRetrieverRun] = None
    ) -> List[Document]:
        """Retrieve documents from all retrievers and merge using weighted RRF."""
        
        # Get results from all retrievers
        all_results = []
        for retriever in self.retrievers:
            try:
                docs = retriever.invoke(query)
                all_results.append(docs)
            except Exception as e:
                print(f"‚ö†Ô∏è Retriever failed: {e}")
                all_results.append([])
        
        # Reciprocal Rank Fusion with weights
        doc_scores = {}
        for docs, weight in zip(all_results, self.weights):
            for rank, doc in enumerate(docs):
                doc_id = doc.page_content[:100]  # Use content snippet as ID
                if doc_id not in doc_scores:
                    doc_scores[doc_id] = {"doc": doc, "score": 0}
                # RRF formula: weight / (rank + 60)
                doc_scores[doc_id]["score"] += weight / (rank + 60)
        
        # Sort by score and return top k
        sorted_docs = sorted(doc_scores.values(), key=lambda x: x["score"], reverse=True)
        return [item["doc"] for item in sorted_docs[:self.k]]

In [4]:

# METADATA FLATTENING UTILITY

def flatten_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Flatten nested metadata to make it compatible with ChromaDB and Pinecone.
    Only keep str, int, float, bool values.
    """
    flat = {}
    for key, value in metadata.items():
        # Skip complex nested structures
        if isinstance(value, (dict, list)):
            # Convert to JSON string representation
            import json
            try:
                flat[key] = json.dumps(value)[:500]  # Limit length
            except:
                flat[key] = str(value)[:500]
        elif isinstance(value, (str, int, float, bool)):
            flat[key] = value
        elif value is None:
            flat[key] = "none"
        else:
            flat[key] = str(value)[:500]
    
    return flat


def clean_documents(docs: List[Document]) -> List[Document]:
    """Clean documents by flattening their metadata."""
    cleaned = []
    for doc in docs:
        cleaned_doc = Document(
            page_content=doc.page_content,
            metadata=flatten_metadata(doc.metadata)
        )
        cleaned.append(cleaned_doc)
    return cleaned



In [5]:

# EMBEDDING MODELS

class PhoBertLangChainWrapper(Embeddings):
    """PhoBERT Embedding Model for Vietnamese text (768 dimensions)"""
    
    def __init__(self, model_name="vinai/phobert-base", device=None):
        self.device = device or config.device
        print(f"üì¶ Loading {model_name} on {self.device}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.model.eval()

    def _mean_pooling(self, model_output, attention_mask):
        """Apply mean pooling over token embeddings"""
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

    def _embed(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of texts"""
        encoded_input = self.tokenizer(
            texts, 
            padding=True, 
            truncation=True, 
            max_length=256, 
            return_tensors='pt'
        ).to(self.device)
        
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        
        # Mean pooling + normalization
        sentence_embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
        
        return sentence_embeddings.cpu().numpy().tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self._embed(texts)

    def embed_query(self, text: str) -> List[float]:
        return self._embed([text])[0]


# Initialize embedding models
phobert_embedding = PhoBertLangChainWrapper(device=config.device)

print("‚úÖ Embedding models initialized")
print("‚ö†Ô∏è  Note: Pinecone requires OpenAI embeddings - skipping if quota exceeded")



üì¶ Loading vinai/phobert-base on cpu...
‚úÖ Embedding models initialized
‚ö†Ô∏è  Note: Pinecone requires OpenAI embeddings - skipping if quota exceeded


In [6]:
# CHUNKING STRATEGIES

def get_chunks_hybrid(file_path: str) -> List[Document]:
    """Hybrid Chunking using Docling (Structure-aware)"""
    print(f"üìÑ Running Hybrid Chunking on {file_path}...")
    try:
        pipeline_options = PdfPipelineOptions(
            do_ocr=True, 
            ocr_options=EasyOcrOptions(lang=['vi'])
        )
        loader = DoclingLoader(
            file_path=file_path,
            export_type=ExportType.DOC_CHUNKS,
            chunker=HybridChunker(
                chunk_size=config.chunk_size, 
                chunk_overlap=config.chunk_overlap
            )
        )
        chunks = list(loader.load())
        
        # CRITICAL: Clean metadata for ChromaDB compatibility
        chunks = clean_documents(chunks)
        
        print(f"   ‚úÖ Generated {len(chunks)} chunks (metadata cleaned)")
        return chunks
    except Exception as e:
        print(f"   ‚ùå Hybrid chunking failed: {e}")
        print(f"   üîÑ Falling back to Recursive chunking...")
        return get_chunks_recursive(file_path)


def get_chunks_recursive(file_path: str) -> List[Document]:
    """Recursive Character Text Splitter (General purpose)"""
    print(f"üìÑ Running Recursive Chunking on {file_path}...")
    mk = MarkItDown()
    markdown_text = mk.convert(file_path).markdown
    
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=config.chunk_size, 
        chunk_overlap=config.chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    doc_obj = Document(page_content=markdown_text, metadata={'source': file_path})
    chunks = splitter.split_documents([doc_obj])
    print(f"   ‚úÖ Generated {len(chunks)} chunks")
    return chunks


def get_chunks_custom(file_path: str, words_per_chunk: int = 200) -> List[Document]:
    """Custom Word-based Chunking (Simple word windows)"""
    print(f"üìÑ Running Custom Chunking on {file_path}...")
    mk = MarkItDown()
    text = mk.convert(file_path).markdown
    words = text.split()
    chunks = []
    
    for i in range(0, len(words), words_per_chunk):
        chunk_text = " ".join(words[i:i+words_per_chunk])
        chunks.append(Document(
            page_content=chunk_text, 
            metadata={'source': file_path, 'chunk_id': str(i)}  # Ensure string type
        ))
    
    print(f"   ‚úÖ Generated {len(chunks)} chunks")
    return chunks


# Strategy mapping
chunking_strategies = {
    "Hybrid": get_chunks_hybrid,
    "Recursive": get_chunks_recursive,
    "Custom": get_chunks_custom
}



In [7]:

# VECTOR DATABASE CREATION

def create_chroma_retriever(docs: List[Document], embeddings: Embeddings, collection_name: str):
    """Create Chroma vector store retriever (Local, persistent) - WITH FILE LOCK FIX"""
    print(f"   üîß Creating Chroma retriever for {collection_name}...")
    persist_dir = f"./chroma_db_{collection_name}"
    
    # Force remove old collection to avoid file locks
    try:
        if os.path.exists(persist_dir):
            shutil.rmtree(persist_dir)
            time.sleep(0.5)  # Wait for file system
    except Exception as e:
        print(f"   ‚ö†Ô∏è Warning during cleanup: {e}")
    
    try:
        vectorstore = Chroma.from_documents(
            documents=docs,
            embedding=embeddings,
            collection_name=collection_name,
            persist_directory=persist_dir
        )
        return vectorstore.as_retriever(search_kwargs={"k": config.top_k})
    except Exception as e:
        print(f"   ‚ùå Chroma creation failed: {e}")
        return None


def create_faiss_retriever(docs: List[Document], embeddings: Embeddings, collection_name: str):
    """Create FAISS vector store retriever (Local, in-memory)"""
    print(f"   üîß Creating FAISS retriever for {collection_name}...")
    try:
        vectorstore = FAISS.from_documents(docs, embeddings)
        return vectorstore.as_retriever(search_kwargs={"k": config.top_k})
    except Exception as e:
        print(f"   ‚ùå FAISS creation failed: {e}")
        return None


def create_pinecone_retriever(docs: List[Document], embeddings: Embeddings, collection_name: str):
    """Create Pinecone vector store retriever (Cloud-based) - SKIP IF NO QUOTA"""
    print(f"   üîß Creating Pinecone retriever for {collection_name}...")
    print(f"   ‚ö†Ô∏è  Skipping Pinecone (OpenAI quota exceeded - use PhoBERT for local testing)")
    return None  # Comment this out if you have valid OpenAI API key
    
    # Uncomment below if you have OpenAI credits:
    """
    index_name = "raglegal"
    namespace = f"exp_{collection_name}_{int(time.time())}"
    
    try:
        openai_embedding = OpenAIEmbeddings(model="text-embedding-3-small")
        vectorstore = PineconeVectorStore.from_documents(
            documents=docs,
            embedding=openai_embedding,
            index_name=index_name,
            namespace=namespace
        )
        return vectorstore.as_retriever(search_kwargs={"k": config.top_k})
    except Exception as e:
        print(f"   ‚ö†Ô∏è Pinecone Error: {e}")
        return None
    """


# Database strategy mapping (all use PhoBERT now)
db_strategies = {
    "Chroma": (create_chroma_retriever, phobert_embedding),
    "FAISS": (create_faiss_retriever, phobert_embedding),
    # "Pinecone": (create_pinecone_retriever, phobert_embedding)  # Commented out
}



In [9]:
# PIPELINE EXECUTION (6 Combinations: 3 Chunking √ó 2 VectorDB)

FILE_PATH = 'C:/Users/ADMIN/Documents/PROJECT/GroupProject/Taxelith/document/luat_thue_ttdb_2025.pdf'

results_store = {}  # Store all retrievers for evaluation

print("\n" + "="*80)
print("üöÄ STARTING PIPELINE: 3 Chunking √ó 2 VectorDB = 6 Experiments")
print("="*80)

for chunk_name, chunk_func in chunking_strategies.items():
    print(f"\n{'‚îÄ'*80}")
    print(f"üìã CHUNKING STRATEGY: {chunk_name}")
    print(f"{'‚îÄ'*80}")
    
    # Step 1: Generate chunks
    docs = chunk_func(FILE_PATH)
    
    if not docs:
        print(f"   ‚ùå No chunks generated, skipping {chunk_name}")
        continue
    
    # Step 2: Create BM25 retriever (sparse, keyword-based)
    try:
        bm25_retriever = BM25Retriever.from_documents(docs)
        bm25_retriever.k = config.top_k
        print(f"   ‚úÖ BM25 retriever created")
    except Exception as e:
        print(f"   ‚ùå BM25 creation failed: {e}")
        continue
    
    # Step 3: Loop through vector databases
    for db_name, (db_func, embedding_model) in db_strategies.items():
        experiment_id = f"{chunk_name}_{db_name}"
        print(f"\n   üî¨ Experiment: {experiment_id}")
        
        try:
            # Create dense retriever (semantic, embedding-based)
            dense_retriever = db_func(docs, embedding_model, collection_name=experiment_id)
            
            if dense_retriever is None:
                print(f"   ‚ùå Skipped {experiment_id} (Database creation failed)")
                continue
            
            # Create hybrid retriever using CUSTOM implementation
            ensemble_retriever = CustomEnsembleRetriever(
                retrievers=[bm25_retriever, dense_retriever],
                weights=[config.bm25_weight, config.dense_weight],
                k=config.top_k
            )
            
            results_store[experiment_id] = {
                "retriever": ensemble_retriever,
                "docs": docs,
                "chunk_strategy": chunk_name,
                "db_strategy": db_name
            }
            print(f"   ‚úÖ Hybrid retriever created (BM25: {config.bm25_weight}, Dense: {config.dense_weight})")
            
        except Exception as e:
            print(f"   ‚ùå Failed {experiment_id}: {str(e)}")
            import traceback
            traceback.print_exc()

print(f"\n{'='*80}")
print(f"‚úÖ Pipeline Complete: {len(results_store)} retrievers created")
print(f"{'='*80}")




üöÄ STARTING PIPELINE: 3 Chunking √ó 2 VectorDB = 6 Experiments

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìã CHUNKING STRATEGY: Hybrid
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìÑ Running Hybrid Chunking on C:/Users/ADMIN/Documents/PROJECT/GroupProject/Taxelith/document/luat_thue_ttdb_2025.pdf...


[32m[INFO] 2025-11-26 16:40:31,397 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-26 16:40:31,430 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\ADMIN\Documents\PROJECT\GroupProject\Taxelith\.venv_rag\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-26 16:40:31,431 [RapidOCR] main.py:53: Using C:\Users\ADMIN\Documents\PROJECT\GroupProject\Taxelith\.venv_rag\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-26 16:40:31,611 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-26 16:40:31,616 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\ADMIN\Documents\PROJECT\GroupProject\Taxelith\.venv_rag\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-11-26 16:40:31,617 [RapidOCR] main.py:53: Using C:\Users\ADMIN\Documents\PROJECT\GroupProject\Taxelith\.venv_rag\Lib\site-packages\rapidocr\mod

   ‚úÖ Generated 59 chunks (metadata cleaned)
   ‚úÖ BM25 retriever created

   üî¨ Experiment: Hybrid_Chroma
   üîß Creating Chroma retriever for Hybrid_Chroma...
   ‚úÖ Hybrid retriever created (BM25: 0.5, Dense: 0.5)

   üî¨ Experiment: Hybrid_FAISS
   üîß Creating FAISS retriever for Hybrid_FAISS...


2025-11-26 16:41:49,125 - INFO - Loading faiss with AVX2 support.
2025-11-26 16:41:49,568 - INFO - Successfully loaded faiss with AVX2 support.


   ‚úÖ Hybrid retriever created (BM25: 0.5, Dense: 0.5)

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìã CHUNKING STRATEGY: Recursive
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìÑ Running Recursive Chunking on C:/Users/ADMIN/Documents/PROJECT/GroupProject/Taxelith/document/luat_thue_ttdb_2025.pdf...
   ‚úÖ Generated 55 chunks
   ‚úÖ BM25 retriever created

   üî¨ Experiment: Recursive_Chroma
   üîß Creating Chroma retriever for Recursive_Chroma...


2025-11-26 16:41:50,730 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


   ‚úÖ Hybrid retriever created (BM25: 0.5, Dense: 0.5)

   üî¨ Experiment: Recursive_FAISS
   üîß Creating FAISS retriever for Recursive_FAISS...
   ‚úÖ Hybrid retriever created (BM25: 0.5, Dense: 0.5)

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìã CHUNKING STRATEGY: Custom
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìÑ Running Custom Chunking on C:/Users/ADMIN/Documents/PROJECT/GroupProject/Taxelith/document/luat_thue_ttdb_2025.pdf...
   ‚úÖ Generated 23 chunks
   ‚úÖ BM25 retriever created

   üî¨ Experiment: Custom_Chroma
   üîß Creating Chroma retriever for Custom_Chroma...


2025-11-26 16:42:22,041 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


   ‚úÖ Hybrid retriever created (BM25: 0.5, Dense: 0.5)

   üî¨ Experiment: Custom_FAISS
   üîß Creating FAISS retriever for Custom_FAISS...
   ‚úÖ Hybrid retriever created (BM25: 0.5, Dense: 0.5)

‚úÖ Pipeline Complete: 6 retrievers created


In [8]:
# EVALUATION FUNCTIONS
def calculate_simple_recall(retriever, test_questions: List[str], ground_truths: List[str]) -> float:
    """Simple recall: Check if ground truth text appears in retrieved docs"""
    hits = 0
    for question, truth in zip(test_questions, ground_truths):
        try:
            retrieved_docs = retriever.invoke(question)
            found = any(truth.lower() in doc.page_content.lower() for doc in retrieved_docs)
            if found:
                hits += 1
        except Exception as e:
            print(f"   ‚ö†Ô∏è Retrieval failed for question: {e}")
    return hits / len(test_questions) if test_questions else 0.0


def calculate_precision_at_k(retriever, test_questions: List[str], ground_truths: List[str], k: int = 5) -> float:
    """Calculate precision: ratio of relevant docs in top-k"""
    precisions = []
    for question, truth in zip(test_questions, ground_truths):
        try:
            retrieved_docs = retriever.invoke(question)[:k]
            relevant_count = sum(1 for doc in retrieved_docs if truth.lower() in doc.page_content.lower())
            precisions.append(relevant_count / k if k > 0 else 0)
        except Exception as e:
            print(f"   ‚ö†Ô∏è Retrieval failed: {e}")
            precisions.append(0)
    return sum(precisions) / len(precisions) if precisions else 0.0




In [None]:


# Test data - REPLACE WITH YOUR REAL TEST SET
test_questions = [
    "Thu·∫ø ti√™u th·ª• ƒë·∫∑c bi·ªát l√† g√¨?",
    "Thu·∫ø su·∫•t v·ªõi bia l√† bao nhi√™u?",
    "ƒê·ªëi t∆∞·ª£ng n√†o ch·ªãu thu·∫ø ti√™u th·ª• ƒë·∫∑c bi·ªát?"
]

test_ground_truths = [
    "thu·∫ø gi√°n thu",  # Key phrase to search for
    "65%",
    "h√†ng h√≥a"
]

print("\n" + "="*80)
print("üìä STARTING EVALUATION")
print("="*80)

evaluation_results = []

for exp_id, data in results_store.items():
    print(f"\nüß™ Evaluating: {exp_id}")
    retriever = data['retriever']
    
    try:
        # Calculate metrics
        recall = calculate_simple_recall(retriever, test_questions, test_ground_truths)
        precision = calculate_precision_at_k(retriever, test_questions, test_ground_truths, k=config.top_k)
        
        print(f"   ‚úÖ Recall@{config.top_k}: {recall:.4f}")
        print(f"   ‚úÖ Precision@{config.top_k}: {precision:.4f}")
        
        evaluation_results.append({
            "Experiment": exp_id,
            "Chunking": data['chunk_strategy'],
            "VectorDB": data['db_strategy'],
            "Recall": recall,
            "Precision": precision,
            "F1_Score": 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        })
        
    except Exception as e:
        print(f"   ‚ùå Evaluation failed: {str(e)}")



In [None]:

# RESULTS SUMMARY & EXPORT

if evaluation_results:
    df_results = pd.DataFrame(evaluation_results)
    
    # Sort by F1 score
    df_results = df_results.sort_values('F1_Score', ascending=False)
    
    print("\n" + "="*80)
    print("üìà EVALUATION RESULTS (Sorted by F1 Score)")
    print("="*80)
    print(df_results.to_string(index=False))
    
    # Save to CSV
    output_file = f"retrieval_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    df_results.to_csv(output_file, index=False)
    print(f"\nüíæ Results saved to: {output_file}")
    
    # Find best configuration
    if len(df_results) > 0:
        best_config = df_results.iloc[0]
        print("\nüèÜ BEST CONFIGURATION:")
        print(f"   Experiment: {best_config['Experiment']}")
        print(f"   Chunking: {best_config['Chunking']}")
        print(f"   VectorDB: {best_config['VectorDB']}")
        print(f"   Recall: {best_config['Recall']:.4f}")
        print(f"   Precision: {best_config['Precision']:.4f}")
        print(f"   F1 Score: {best_config['F1_Score']:.4f}")
    
else:
    print("\n‚ö†Ô∏è No evaluation results generated")
    print("   Please check if any retrievers were successfully created")

print("\n" + "="*80)
print("‚úÖ PIPELINE COMPLETE")
print("="*80)