**Requirements Needed for the Assignment**

In [1]:
%%writefile requirements.txt
langchain>=0.2.0
langchain-community>=0.2.0
langchain-text-splitters
pypdf
pymupdf
sentence-transformers
faiss-cpu
chromadb
langchain-groq
python-dotenv
typesense
langchain-openai
langgraph
PyPDF2
rank_bm25


Writing requirements.txt


In [None]:
!pip install --upgrade pip




In [2]:
!pip install -r requirements.txt


Collecting langchain-community>=0.2.0 (from -r requirements.txt (line 2))
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting pypdf (from -r requirements.txt (line 4))
  Downloading pypdf-6.2.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pymupdf (from -r requirements.txt (line 5))
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu (from -r requirements.txt (line 7))
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting chromadb (from -r requirements.txt (line 8))
  Downloading chromadb-1.3.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-groq (from -r requirements.txt (line 9))
  Downloading langchain_groq-1.0.1-py3-none-any.whl.metadata (2.4 kB)
Collecting typesense (from -r requirements.txt (line 11))
  Downloading typesense-1.3.0-py3-none-any.whl.metadata (1.9 kB)
Collecting langch

Necessary Imports

In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
from PyPDF2 import PdfReader
from langchain.schema import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import nltk
from nltk.tokenize import word_tokenize
from transformers import pipeline
import re



In [4]:
import nltk

# Download the required tokenizer explicitly
# Ensure required NLTK resources are downloaded
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # Fix for LookupError


True

Functions

Data Ingestion

In [5]:
# ------------------------
# Load all PDFs from a directory
# ------------------------
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory and return LangChain Document objects"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process.")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source info to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            print(f"  ✗ Error loading {pdf_file.name}: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# ------------------------
# Different chunking strategies
# ------------------------
def chunk_documents(documents, strategy="pagewise"):
    all_chunks = []

    for doc in documents:
        text = doc.page_content
        metadata = doc.metadata.copy()

        if strategy == "pagewise":
            # Already loaded pagewise via PyPDFLoader, keep as is
            all_chunks.append(Document(page_content=text, metadata=metadata))

        elif strategy == "paragraph":
            paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
            for p in paragraphs:
                all_chunks.append(Document(page_content=p, metadata=metadata))

        elif strategy == "sentence":
            sentence_endings = re.compile(r'(?<=[.!?])\s+')
            sentences = [s.strip() for s in sentence_endings.split(text) if s.strip()]

            i = 0
            min_sent, max_sent = 3, 8
            while i < len(sentences):
                chunk_size = min(max_sent, len(sentences) - i)
                if chunk_size < min_sent:
                    chunk_size = len(sentences) - i
                chunk_text = " ".join(sentences[i:i + chunk_size])
                all_chunks.append(Document(page_content=chunk_text, metadata=metadata))
                i += chunk_size

        elif strategy == "title":
            pattern = r'(?:^|\n)([A-Z][A-Z\s\d]+)\n'
            splits = [m.start() for m in re.finditer(pattern, text)]
            splits.append(len(text))
            for i in range(len(splits)-1):
                chunk_text = text[splits[i]:splits[i+1]].strip()
                if chunk_text:
                    all_chunks.append(Document(page_content=chunk_text, metadata=metadata))

        elif strategy == "character":
            chunk_size = 500
            for i in range(0, len(text), chunk_size):
                all_chunks.append(Document(page_content=text[i:i+chunk_size], metadata=metadata))

        elif strategy == "token":
            token_size = 100
            tokens = text.split()
            for i in range(0, len(tokens), token_size):
                chunk_text = " ".join(tokens[i:i+token_size])
                all_chunks.append(Document(page_content=chunk_text, metadata=metadata))

        elif strategy == "overlapping":
            chunk_size = 500
            overlap = 100
            start = 0
            while start < len(text):
                end = start + chunk_size
                all_chunks.append(Document(page_content=text[start:end], metadata=metadata))
                start += chunk_size - overlap

        else:
            raise ValueError(f"Unknown chunking strategy: {strategy}")

    print(f"📄 Total chunks created with '{strategy}' strategy: {len(all_chunks)}")
    return all_chunks


Creating Embedding Manager

We created the EmbeddingManager class to simplify the process of turning text into numerical representations called embeddings using the SentenceTransformer model. When we create an instance of this class, it automatically loads the chosen model, which by default is "all-MiniLM-L6-v2", and prepares it for use. The class has a method to generate embeddings for a list of texts, converting each text into a numerical vector while showing a progress indicator and confirming the size of the generated embeddings. It also includes a method to check the size of each embedding vector so we know the dimensions of the output. Overall, the EmbeddingManager provides a simple and organized way to handle embeddings for text data, making it ready for tasks like semantic search, similarity comparisons, or other natural language processing applications.

In [6]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager.

        Args:
            model_name: HuggingFace model name for sentence embeddings.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model."""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(
                f"Model loaded successfully. "
                f"Embedding dimension: {self.model.get_sentence_embedding_dimension()}"
            )
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        Args:
            texts: List of text strings to embed.

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

    def get_embedding_dimension(self) -> int:
        """Get the embedding dimension of the model."""
        if not self.model:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()


# ✅ Initialize the embedding manager
embedding_manager = EmbeddingManager()
embedding_manager


Loading embedding model: all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x786afd154740>

Creating Vector Store Class

We created the VectorStore class to manage document embeddings using a ChromaDB vector store. When an instance of this class is created, it automatically initializes the vector store by creating a persistent storage directory and either loading or creating a collection for storing document embeddings. The class provides a method to add documents along with their embeddings to the store. Each document is assigned a unique ID, and relevant metadata such as its index and content length is stored. This ensures that all documents and their embeddings are organized and easily retrievable. Overall, the VectorStore class simplifies the process of storing, managing, and tracking document embeddings, making it ready for tasks like retrieval-augmented generation (RAG) or semantic search.




In [7]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection."""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )

            print(f"✅ Vector store initialized. Collection: {self.collection_name}")
            print(f"📄 Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"❌ Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        print(f"🧩 Adding {len(documents)} documents to vector store...")

        ids, metadatas, documents_text, embeddings_list = [], [], [], []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)
            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"✅ Successfully added {len(documents)} documents to vector store.")
            print(f"📄 Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"❌ Error adding documents to vector store: {e}")
            raise


Retriever Pipeline From Vector Store

We created the RAGRetriever class to handle searching and retrieving documents from the vector store based on a user query. When we create an instance of this class, we provide it with a vector store containing the document embeddings and an embedding manager to generate query embeddings.

When a query is submitted, the retriever first converts it into a numerical embedding using the embedding manager. It then searches the vector store for the most similar documents, up to a specified number of top results. The retriever also calculates similarity scores from the distances provided by the vector store and filters out any results below a set threshold. Each retrieved document includes its content, metadata, similarity score, distance, and rank.

Overall, RAGRetriever makes it easy to find relevant documents efficiently, providing structured results ready for tasks like retrieval-augmented generation (RAG) or semantic search.

In [8]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever

        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query

        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold

        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            # Process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")

            return retrieved_docs

        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []



Generator Pipeline

We set up a simple RAG (Retrieval-Augmented Generation) pipeline using the llama-3.1-8b-instant model from Groq. First, we initialize the Groq LLM (ChatGroq) with the API key, model name, and parameters such as temperature and maximum tokens.

Next, we created the rag_simple function to handle the full RAG process. When a user submits a query, the function uses the retriever to fetch the most relevant documents from the vector store. It combines the content of these documents into a single context. If no relevant documents are found, it returns a message indicating that there is no context to answer the question.

The function then constructs a prompt for the LLM, instructing llama-3.1-8b-instant to answer the query concisely using the retrieved context. The model processes this prompt and generates a response, which the function returns as the answer.

In short, this pipeline connects document retrieval with language model generation. The retriever provides the knowledge from your stored documents, and llama-3.1-8b-instant produces accurate and concise answers based on that context, making the system capable of answering questions using the information in your documents.

In [None]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

load_dotenv()

### Initialize the Groq LLM (set your GROQ_API_KEY in environment)
groq_api_key = "************************"

llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama-3.1-8b-instant",
    temperature=0.1,
    max_tokens=1024
)


In [10]:
def faithfulness_score(context: str, answer: str, embedding_manager) -> float:
    """
    Compute a faithfulness score (0 to 1) for an answer with respect to the context.
    Higher score means more faithful.
    """
    if not context or not answer:
        return 0.0

    # Compute embeddings using the embedding_manager
    context_emb = embedding_manager.generate_embeddings([context])
    answer_emb = embedding_manager.generate_embeddings([answer])

    # Compute cosine similarity
    score = cosine_similarity(answer_emb, context_emb)[0][0]

    # Ensure the score is between 0 and 1
    score = max(0.0, min(1.0, score))
    return score


def rag_simple_with_faithfulness(query: str, retriever, llm, embedding_manager, top_k=3):
    # Retrieve relevant context
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""

    if not context:
        return "No relevant context found to answer the question.", 0.0

    prompt = f"""Use the following context to answer the question concisely.

Context:
{context}

Question: {query}

Answer:"""

    # Generate the answer
    response = llm.invoke([prompt])
    answer = response.content

    # Compute faithfulness
    score = faithfulness_score(context, answer, embedding_manager)

    return answer, score

BM25 Retreiver technique

In [11]:
# Simple tokenizer (no NLTK required)
def simple_tokenize(text: str) -> List[str]:
    """Lowercase and split text into words, removing punctuation."""
    text = text.lower()
    tokens = re.findall(r'\b\w+\b', text)
    return tokens


In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Any

# ------------------------
# BM25 Retriever
# ------------------------
class BM25Retriever:
    """Term-based retrieval using BM25 (no NLTK dependency)."""

    def __init__(self, documents: List[Any]):
        """
        Args:
            documents: List of Document objects with .page_content and .metadata
        """
        self.documents = documents
        self.texts = [doc.page_content for doc in documents]
        self.tokenized_texts = [simple_tokenize(t) for t in self.texts]
        self.bm25 = BM25Okapi(self.tokenized_texts)

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        print(f"Retrieving documents for query: '{query}' using BM25")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        query_tokens = simple_tokenize(query)
        scores = self.bm25.get_scores(query_tokens)
        ranked_idx = np.argsort(scores)[::-1]

        retrieved_docs = []
        for i in ranked_idx:
            if scores[i] < score_threshold:
                continue
            retrieved_docs.append({
                'id': f"doc_{i}",
                'content': self.texts[i],
                'metadata': self.documents[i].metadata,
                'similarity_score': float(scores[i]),
                'rank': len(retrieved_docs) + 1
            })
            if len(retrieved_docs) >= top_k:
                break

        print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
        return retrieved_docs


# ------------------------
# MMR Retriever
# ------------------------
class MMRRetriever:
    """Dense retrieval with Maximal Marginal Relevance (MMR)."""

    def __init__(self, documents: List[Any], embedding_manager: Any):
        self.documents = documents
        self.embedding_manager = embedding_manager
        self.texts = [doc.page_content for doc in documents]
        self.embeddings = embedding_manager.generate_embeddings(self.texts)

    def retrieve(self, query: str, top_k: int = 5, lambda_param: float = 0.5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        print(f"Retrieving documents for query: '{query}' using MMR")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        query_emb = self.embedding_manager.generate_embeddings([query])[0]
        doc_similarities = cosine_similarity([query_emb], self.embeddings)[0]

        selected = []
        candidate_indices = list(range(len(self.documents)))

        while len(selected) < top_k and candidate_indices:
            mmr_scores = []
            for idx in candidate_indices:
                sim_to_query = doc_similarities[idx]
                sim_to_selected = max([cosine_similarity([self.embeddings[idx]], [self.embeddings[s]])[0][0] for s in selected], default=0)
                score = lambda_param * sim_to_query - (1 - lambda_param) * sim_to_selected
                mmr_scores.append(score)
            best_idx = candidate_indices[np.argmax(mmr_scores)]
            if doc_similarities[best_idx] < score_threshold:
                break
            selected.append(best_idx)
            candidate_indices.remove(best_idx)

        retrieved_docs = []
        for rank, idx in enumerate(selected):
            retrieved_docs.append({
                'id': f"mmr_{idx}",
                'content': self.texts[idx],
                'metadata': self.documents[idx].metadata,
                'similarity_score': float(doc_similarities[idx]),
                'rank': rank + 1
            })

        print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
        return retrieved_docs


# ------------------------
# Hybrid Retriever
# ------------------------
class HybridRetriever:
    """Hybrid retrieval combining BM25 + dense embeddings."""

    def __init__(self, documents: List[Any], embedding_manager: Any, alpha: float = 0.5):
        self.documents = documents
        self.texts = [doc.page_content for doc in documents]
        self.embedding_manager = embedding_manager
        self.alpha = alpha

        # BM25 setup
        self.tokenized_texts = [simple_tokenize(t) for t in self.texts]
        self.bm25 = BM25Okapi(self.tokenized_texts)

        # Dense embeddings
        self.embeddings = self.embedding_manager.generate_embeddings(self.texts)

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        print(f"Retrieving documents for query: '{query}' using Hybrid Retriever")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        query_tokens = simple_tokenize(query)
        bm25_scores = self.bm25.get_scores(query_tokens)
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        dense_scores = cosine_similarity([query_embedding], self.embeddings)[0]

        final_scores = self.alpha * bm25_scores + (1 - self.alpha) * dense_scores
        ranked_idx = np.argsort(final_scores)[::-1]

        retrieved_docs = []
        for i in ranked_idx:
            if final_scores[i] < score_threshold:
                continue
            retrieved_docs.append({
                'id': f"doc_{i}",
                'content': self.texts[i],
                'metadata': self.documents[i].metadata,
                'bm25_score': float(bm25_scores[i]),
                'dense_score': float(dense_scores[i]),
                'final_score': float(final_scores[i]),
                'rank': len(retrieved_docs) + 1
            })
            if len(retrieved_docs) >= top_k:
                break

        print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
        return retrieved_docs


Post Processing functions

In [13]:
# Initialize summarizer (can be done once)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_passages(retrieved_docs: list, max_length=150, min_length=50) -> list:
    """
    Summarize long passages to extract salient points.

    Args:
        retrieved_docs: List of dictionaries containing 'content' and metadata
        max_length: Max tokens for summary
        min_length: Min tokens for summary

    Returns:
        List of updated retrieved_docs with 'summary' field
    """
    for doc in retrieved_docs:
        text = doc['content']
        if len(text.split()) > 100:  # Only summarize if text is lengthy
            summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
            doc['summary'] = summary[0]['summary_text']
        else:
            doc['summary'] = text
    return retrieved_docs

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [14]:
def filter_relevant_context(retrieved_docs: list, keywords: list) -> list:
    """
    Keep only documents that contain at least one keyword.

    Args:
        retrieved_docs: List of dictionaries with 'content'
        keywords: List of keywords related to the query

    Returns:
        Filtered list of documents
    """
    filtered_docs = []
    for doc in retrieved_docs:
        content_lower = doc['content'].lower()
        if any(kw.lower() in content_lower for kw in keywords):
            filtered_docs.append(doc)
    return filtered_docs

In [15]:
from collections import defaultdict
import numpy as np

def generate_queries(original_query: str) -> list:
    """
    Generate multiple rephrased queries.
    """
    # Placeholder: ideally use LLM or paraphraser
    return [
        original_query,
        f"What are the working hours for {original_query}?",
        f"When is the operation schedule for {original_query}?"
    ]

def reciprocal_rank_fusion(list_of_results: list, k=60) -> list:
    """
    Combine multiple ranked results using Reciprocal Rank Fusion.

    Args:
        list_of_results: List of lists of retrieved docs for each query
        k: Constant for RRF (default 60)

    Returns:
        Fused and reranked list of documents
    """
    score_dict = defaultdict(float)

    for results in list_of_results:
        for rank, doc in enumerate(results):
            doc_id = doc['id']
            score_dict[doc_id] += 1 / (k + rank + 1)

    # Merge docs by id
    merged_docs = {doc['id']: doc for results in list_of_results for doc in results}

    # Sort by RRF score
    fused_docs = sorted(merged_docs.values(), key=lambda d: score_dict[d['id']], reverse=True)

    return fused_docs

Base line Model

In [16]:
# ------------------------
# Process all PDFs in the data directory
# ------------------------
all_pdf_documents = process_all_pdfs("/content/drive/MyDrive/Rag_data_pdf/pdf")

# ------------------------
# Generate embeddings for the documents
# ------------------------
texts = [doc.page_content for doc in all_pdf_documents]
embeddings = embedding_manager.generate_embeddings(texts)

# ------------------------
# Initialize vector store and add documents
# ------------------------
vectorstore = VectorStore()
vectorstore.add_documents(all_pdf_documents, embeddings)

# ------------------------
# Initialize RAG retriever
# ------------------------
rag_retriever = RAGRetriever(vectorstore, embedding_manager)

# ------------------------
# Define your query
# ------------------------
query = "What are the Regular Hours of Operation?"

# ------------------------
# Call RAG pipeline with faithfulness scoring
# ------------------------
answer, score = rag_simple_with_faithfulness(
    query=query,
    retriever=rag_retriever,
    llm=llm,
    embedding_manager=embedding_manager,  # Important
    top_k=3
)

# ------------------------
# Print results
# ------------------------
print("Answer:", answer)
print(f"Faithfulness score: {score:.2f}")


Found 6 PDF files to process.

Processing: 6-Recruitment-policy-and-procedure.pdf
  ✓ Loaded 13 pages

Processing: 5-Compensation & Benefit Polic.pdf
  ✓ Loaded 65 pages

Processing: 7-Employee Handbook.pdf
  ✓ Loaded 35 pages

Processing: 4-Employee Code of Conduct.pdf
  ✓ Loaded 26 pages

Processing: 3-Anti-Discrimination and Anti-Harassment Policy and Procedures.pdf
  ✓ Loaded 11 pages

Processing: 2-Leave & Attendence Policy.pdf
  ✓ Loaded 57 pages

Total documents loaded: 207
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 0
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 207
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Answer: The University's regular hours of operation are 8:00 a.m. to 5:00 p.m., Monday through Friday.
Faithfulness score: 0.43


In [None]:
pdf_directory = "/content/drive/MyDrive/Rag_data_pdf/pdf"

# Load all PDFs
all_pdf_documents = process_all_pdfs(pdf_directory)

# Choose chunking strategy
chunk_type = "paragraph"  # pagewise, paragraph, sentence, title, character, token, overlapping
all_chunks = chunk_documents(all_pdf_documents, strategy=chunk_type)

# Generate embeddings
texts = [doc.page_content for doc in all_chunks]
embeddings = embedding_manager.generate_embeddings(texts)

# Initialize vector store
vectorstore = VectorStore()
vectorstore.add_documents(all_chunks, embeddings)

# Choose retrieval method
# retriever = RAGRetriever(vectorstore, embedding_manager)  # Dense
retriever = BM25Retriever(all_chunks)                        # BM25
# retriever = MMRRetriever(all_chunks, embedding_manager)    # MMR
# retriever = HybridRetriever(all_chunks, embedding_manager) # Hybrid

# Define your query
query = "What are the Regular Hours of Operation?"

# Retrieve answer + faithfulness
answer, score = rag_simple_with_faithfulness(
    query=query,
    retriever=retriever,
    llm=llm,
    embedding_manager=embedding_manager,
    top_k=3
)

print("Answer:", answer)
print(f"Faithfulness score: {score:.2f}")

Found 6 PDF files to process.

Processing: 6-Recruitment-policy-and-procedure.pdf
  ✓ Loaded 13 pages

Processing: 5-Compensation & Benefit Polic.pdf
  ✓ Loaded 65 pages

Processing: 7-Employee Handbook.pdf
  ✓ Loaded 35 pages

Processing: 4-Employee Code of Conduct.pdf
  ✓ Loaded 26 pages

Processing: 3-Anti-Discrimination and Anti-Harassment Policy and Procedures.pdf
  ✓ Loaded 11 pages

Processing: 2-Leave & Attendence Policy.pdf
  ✓ Loaded 57 pages

Total documents loaded: 207
📄 Total chunks created with 'paragraph' strategy: 207
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 19243
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 19450
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Answer: The University's regular hours of operation are 8:00 a.m. to 5:00 p.m., Monday through Friday.
Faithfulness score: 0.43


In [None]:
import pandas as pd
from pathlib import Path

# ------------------------
# Define your combinations
# ------------------------
chunking_methods = ["pagewise", "paragraph", "sentence", "title", "character", "token", "overlapping"]
retrieval_methods = ["Dense", "BM25", "MMR", "Hybrid"]
post_processing_methods = ["None", "Summarization", "Keyword Filtering", "RRF"]

query = "What are the Regular Hours of Operation?"
keywords = ["hours", "operation", "schedule"]  # for keyword filtering

results = []
exp_id = 1

# ------------------------
# Loop through all combinations
# ------------------------
for chunk_type in chunking_methods:
    print(f"\n--- Processing Chunking Strategy: {chunk_type} ---")

    # 1️⃣ Load PDFs
    all_pdf_documents = process_all_pdfs(pdf_directory)

    # 2️⃣ Chunk PDFs using the chosen strategy
    all_chunks = chunk_documents(all_pdf_documents, strategy=chunk_type)

    # 3️⃣ Generate embeddings (for vector store)
    texts = [doc.page_content for doc in all_chunks]
    embeddings = embedding_manager.generate_embeddings(texts)

    for retr_method in retrieval_methods:
        for post_proc in post_processing_methods:

            # 4️⃣ Initialize vector store per combination
            vectorstore = VectorStore()
            vectorstore.add_documents(all_chunks, embeddings)

            # 5️⃣ Initialize retriever
            if retr_method == "Dense":
                retriever = RAGRetriever(vectorstore, embedding_manager)
            elif retr_method == "BM25":
                retriever = BM25Retriever(all_chunks)
            elif retr_method == "MMR":
                retriever = MMRRetriever(all_chunks, embedding_manager)
            elif retr_method == "Hybrid":
                retriever = HybridRetriever(all_chunks, embedding_manager)
            else:
                raise ValueError(f"Unknown retrieval method: {retr_method}")

            # 6️⃣ Retrieve documents + post-processing
            try:
                retrieved_docs = retriever.retrieve(query, top_k=3)

                if post_proc == "Summarization":
                    retrieved_docs = summarize_passages(retrieved_docs)
                elif post_proc == "Keyword Filtering":
                    retrieved_docs = filter_relevant_context(retrieved_docs, keywords)
                elif post_proc == "RRF":
                    queries = generate_queries(query)
                    list_of_results = [retriever.retrieve(q) for q in queries]
                    retrieved_docs = reciprocal_rank_fusion(list_of_results)

                # Combine context for RAG answer
                context = "\n\n".join([doc.get('summary', doc['content']) for doc in retrieved_docs]) if retrieved_docs else ""
                if context:
                    prompt = f"Use the following context to answer the question concisely.\n\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"
                    response = llm.invoke([prompt])
                    answer = response.content
                    score = faithfulness_score(context, answer, embedding_manager)
                else:
                    answer = "No relevant context found"
                    score = 0.0

            except Exception as e:
                answer = f"Error: {str(e)}"
                score = None

            # 7️⃣ Save results
            results.append({
                "Exp_id": exp_id,
                "Chunking Technique": chunk_type,
                "Retrieval Technique": retr_method,
                "Post-processing": post_proc,
                "Faithfulness Score": score,
                "Query": query,
                "Answer": answer
            })

            print(f"[{exp_id}] {chunk_type} | {retr_method} | {post_proc} -> {score}")
            exp_id += 1

# ------------------------
# Save all results to Excel
# ------------------------
df = pd.DataFrame(results)
df.to_excel("RAG_pipeline_results_with_postprocessing.xlsx", index=False)
print("✅ Results saved to RAG_pipeline_results_with_postprocessing.xlsx")



--- Processing Chunking Strategy: pagewise ---
Found 6 PDF files to process.

Processing: 6-Recruitment-policy-and-procedure.pdf
  ✓ Loaded 13 pages

Processing: 5-Compensation & Benefit Polic.pdf
  ✓ Loaded 65 pages

Processing: 7-Employee Handbook.pdf
  ✓ Loaded 35 pages

Processing: 4-Employee Code of Conduct.pdf
  ✓ Loaded 26 pages

Processing: 3-Anti-Discrimination and Anti-Harassment Policy and Procedures.pdf
  ✓ Loaded 11 pages

Processing: 2-Leave & Attendence Policy.pdf
  ✓ Loaded 57 pages

Total documents loaded: 207
📄 Total chunks created with 'pagewise' strategy: 207
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 19450
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 19657
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[1] pagewise | Dense | None -> 0.7098316550254822
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 19657
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 19864
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[2] pagewise | Dense | Summarization -> 0.7347233295440674
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 19864
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 20071
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[3] pagewise | Dense | Keyword Filtering -> 0.7098316550254822
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 20071
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 20278
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[4] pagewise | Dense | RRF -> 0.7098316550254822
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 20278
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 20485
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[5] pagewise | BM25 | None -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 20485
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 20692
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[6] pagewise | BM25 | Summarization -> 0.5397011041641235
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 20692
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 20899
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[7] pagewise | BM25 | Keyword Filtering -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 20899
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 21106
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[8] pagewise | BM25 | RRF -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 21106
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 21313
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[9] pagewise | MMR | None -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 21313
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 21520
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[10] pagewise | MMR | Summarization -> 0.49829310178756714
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 21520
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 21727
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[11] pagewise | MMR | Keyword Filtering -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 21727
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 21934
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[12] pagewise | MMR | RRF -> 0.03773488849401474
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 21934
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 22141
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[13] pagewise | Hybrid | None -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 22141
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 22348
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[14] pagewise | Hybrid | Summarization -> 0.5397011041641235
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 22348
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 22555
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[15] pagewise | Hybrid | Keyword Filtering -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 22555
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 22762
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[16] pagewise | Hybrid | RRF -> 0.42540597915649414

--- Processing Chunking Strategy: paragraph ---
Found 6 PDF files to process.

Processing: 6-Recruitment-policy-and-procedure.pdf
  ✓ Loaded 13 pages

Processing: 5-Compensation & Benefit Polic.pdf
  ✓ Loaded 65 pages

Processing: 7-Employee Handbook.pdf
  ✓ Loaded 35 pages

Processing: 4-Employee Code of Conduct.pdf
  ✓ Loaded 26 pages

Processing: 3-Anti-Discrimination and Anti-Harassment Policy and Procedures.pdf
  ✓ Loaded 11 pages

Processing: 2-Leave & Attendence Policy.pdf
  ✓ Loaded 57 pages

Total documents loaded: 207
📄 Total chunks created with 'paragraph' strategy: 207
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 22762
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 22969
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[17] paragraph | Dense | None -> 0.7098316550254822
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 22969
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 23176
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[18] paragraph | Dense | Summarization -> 0.7347233295440674
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 23176
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 23383
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[19] paragraph | Dense | Keyword Filtering -> 0.7098316550254822
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 23383
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 23590
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[20] paragraph | Dense | RRF -> 0.7098316550254822
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 23590
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 23797
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[21] paragraph | BM25 | None -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 23797
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 24004
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[22] paragraph | BM25 | Summarization -> 0.5397011041641235
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 24004
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 24211
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[23] paragraph | BM25 | Keyword Filtering -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 24211
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 24418
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieve

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[24] paragraph | BM25 | RRF -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 24418
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 24625
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[25] paragraph | MMR | None -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 24625
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 24832
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[26] paragraph | MMR | Summarization -> 0.49829310178756714
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 24832
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 25039
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[27] paragraph | MMR | Keyword Filtering -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 25039
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 25246
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[28] paragraph | MMR | RRF -> 0.03773488849401474
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 25246
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 25453
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[29] paragraph | Hybrid | None -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 25453
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 25660
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[30] paragraph | Hybrid | Summarization -> 0.5397011041641235
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 25660
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 25867
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[31] paragraph | Hybrid | Keyword Filtering -> 0.42540597915649414
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 25867
🧩 Adding 207 documents to vector store...
✅ Successfully added 207 documents to vector store.
📄 Total documents in collection: 26074
Generating embeddings for 207 texts...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings with shape: (207, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[32] paragraph | Hybrid | RRF -> 0.42540597915649414

--- Processing Chunking Strategy: sentence ---
Found 6 PDF files to process.

Processing: 6-Recruitment-policy-and-procedure.pdf
  ✓ Loaded 13 pages

Processing: 5-Compensation & Benefit Polic.pdf
  ✓ Loaded 65 pages

Processing: 7-Employee Handbook.pdf
  ✓ Loaded 35 pages

Processing: 4-Employee Code of Conduct.pdf
  ✓ Loaded 26 pages

Processing: 3-Anti-Discrimination and Anti-Harassment Policy and Procedures.pdf
  ✓ Loaded 11 pages

Processing: 2-Leave & Attendence Policy.pdf
  ✓ Loaded 57 pages

Total documents loaded: 207
📄 Total chunks created with 'sentence' strategy: 562
Generating embeddings for 562 texts...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Generated embeddings with shape: (562, 384)
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 26074
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 26636
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[33] sentence | Dense | None -> 0.791718065738678
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 26636
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 27198
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[34] sentence | Dense | Summarization -> 0.8330061435699463
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 27198
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 27760
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[35] sentence | Dense | Keyword Filtering -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 27760
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 28322
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[36] sentence | Dense | RRF -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 28322
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 28884
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[37] sentence | BM25 | None -> 0.6670128703117371
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 28884
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 29446
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[38] sentence | BM25 | Summarization -> 0.5160555839538574
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 29446
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 30008
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[39] sentence | BM25 | Keyword Filtering -> 0.6670128703117371
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 30008
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 30570
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[40] sentence | BM25 | RRF -> 0.6739540100097656
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 30570
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 31132
Generating embeddings for 562 texts...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Generated embeddings with shape: (562, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[41] sentence | MMR | None -> 0.7757186889648438
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 31132
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 31694
Generating embeddings for 562 texts...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Generated embeddings with shape: (562, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[42] sentence | MMR | Summarization -> 0.5748120546340942
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 31694
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 32256
Generating embeddings for 562 texts...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Generated embeddings with shape: (562, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[43] sentence | MMR | Keyword Filtering -> 0.7689907550811768
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 32256
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 32818
Generating embeddings for 562 texts...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Generated embeddings with shape: (562, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[44] sentence | MMR | RRF -> 0.7757186889648438
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 32818
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 33380
Generating embeddings for 562 texts...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Generated embeddings with shape: (562, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[45] sentence | Hybrid | None -> 0.6670128703117371
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 33380
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 33942
Generating embeddings for 562 texts...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Generated embeddings with shape: (562, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[46] sentence | Hybrid | Summarization -> 0.5160555839538574
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 33942
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 34504
Generating embeddings for 562 texts...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Generated embeddings with shape: (562, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[47] sentence | Hybrid | Keyword Filtering -> 0.6670128703117371
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 34504
🧩 Adding 562 documents to vector store...
✅ Successfully added 562 documents to vector store.
📄 Total documents in collection: 35066
Generating embeddings for 562 texts...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Generated embeddings with shape: (562, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[48] sentence | Hybrid | RRF -> 0.6739540100097656

--- Processing Chunking Strategy: title ---
Found 6 PDF files to process.

Processing: 6-Recruitment-policy-and-procedure.pdf
  ✓ Loaded 13 pages

Processing: 5-Compensation & Benefit Polic.pdf
  ✓ Loaded 65 pages

Processing: 7-Employee Handbook.pdf
  ✓ Loaded 35 pages

Processing: 4-Employee Code of Conduct.pdf
  ✓ Loaded 26 pages

Processing: 3-Anti-Discrimination and Anti-Harassment Policy and Procedures.pdf
  ✓ Loaded 11 pages

Processing: 2-Leave & Attendence Policy.pdf
  ✓ Loaded 57 pages

Total documents loaded: 207
📄 Total chunks created with 'title' strategy: 61
Generating embeddings for 61 texts...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings with shape: (61, 384)
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35066
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35127
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[49] title | Dense | None -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35127
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35188
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[50] title | Dense | Summarization -> 0.8186597228050232
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35188
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35249
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[51] title | Dense | Keyword Filtering -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35249
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35310
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[52] title | Dense | RRF -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35310
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35371
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[53] title | BM25 | None -> 0.6992045044898987
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35371
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35432
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[54] title | BM25 | Summarization -> 0.5620495676994324
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35432
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35493
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[55] title | BM25 | Keyword Filtering -> 0.6992045044898987
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35493
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35554
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 doc

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[56] title | BM25 | RRF -> 0.7176084518432617
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35554
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35615
Generating embeddings for 61 texts...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings with shape: (61, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[57] title | MMR | None -> 0.776654839515686
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35615
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35676
Generating embeddings for 61 texts...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings with shape: (61, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[58] title | MMR | Summarization -> 0.6349905729293823
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35676
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35737
Generating embeddings for 61 texts...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings with shape: (61, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[59] title | MMR | Keyword Filtering -> 0.7222326397895813
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35737
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35798
Generating embeddings for 61 texts...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings with shape: (61, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[60] title | MMR | RRF -> 0.776654839515686
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35798
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35859
Generating embeddings for 61 texts...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings with shape: (61, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[61] title | Hybrid | None -> 0.6992045044898987
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35859
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35920
Generating embeddings for 61 texts...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings with shape: (61, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[62] title | Hybrid | Summarization -> 0.5620495676994324
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35920
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 35981
Generating embeddings for 61 texts...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings with shape: (61, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[63] title | Hybrid | Keyword Filtering -> 0.6992045044898987
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 35981
🧩 Adding 61 documents to vector store...
✅ Successfully added 61 documents to vector store.
📄 Total documents in collection: 36042
Generating embeddings for 61 texts...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings with shape: (61, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[64] title | Hybrid | RRF -> 0.6992045044898987

--- Processing Chunking Strategy: character ---
Found 6 PDF files to process.

Processing: 6-Recruitment-policy-and-procedure.pdf
  ✓ Loaded 13 pages

Processing: 5-Compensation & Benefit Polic.pdf
  ✓ Loaded 65 pages

Processing: 7-Employee Handbook.pdf
  ✓ Loaded 35 pages

Processing: 4-Employee Code of Conduct.pdf
  ✓ Loaded 26 pages

Processing: 3-Anti-Discrimination and Anti-Harassment Policy and Procedures.pdf
  ✓ Loaded 11 pages

Processing: 2-Leave & Attendence Policy.pdf
  ✓ Loaded 57 pages

Total documents loaded: 207
📄 Total chunks created with 'character' strategy: 1058
Generating embeddings for 1058 texts...


Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Generated embeddings with shape: (1058, 384)
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 36042
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 37100
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[65] character | Dense | None -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 37100
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 38158
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[66] character | Dense | Summarization -> 0.8186597228050232
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 38158
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 39216
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[67] character | Dense | Keyword Filtering -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 39216
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 40274
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[68] character | Dense | RRF -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 40274
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 41332
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[69] character | BM25 | None -> 0.6532644033432007
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 41332
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 42390
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[70] character | BM25 | Summarization -> 0.6532644033432007
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 42390
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 43448
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[71] character | BM25 | Keyword Filtering -> 0.6532644033432007
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 43448
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 44506
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retriev

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[72] character | BM25 | RRF -> 0.6441465616226196
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 44506
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 45564
Generating embeddings for 1058 texts...


Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Generated embeddings with shape: (1058, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[73] character | MMR | None -> 0.5930248498916626
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 45564
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 46622
Generating embeddings for 1058 texts...


Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Generated embeddings with shape: (1058, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[74] character | MMR | Summarization -> 0.5930248498916626
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 46622
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 47680
Generating embeddings for 1058 texts...


Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Generated embeddings with shape: (1058, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[75] character | MMR | Keyword Filtering -> 0.5871707201004028
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 47680
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 48738
Generating embeddings for 1058 texts...


Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Generated embeddings with shape: (1058, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[76] character | MMR | RRF -> 0.5672956705093384
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 48738
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 49796
Generating embeddings for 1058 texts...


Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Generated embeddings with shape: (1058, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[77] character | Hybrid | None -> 0.6532644033432007
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 49796
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 50854
Generating embeddings for 1058 texts...


Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Generated embeddings with shape: (1058, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[78] character | Hybrid | Summarization -> 0.6532644033432007
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 50854
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 51912
Generating embeddings for 1058 texts...


Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Generated embeddings with shape: (1058, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[79] character | Hybrid | Keyword Filtering -> 0.6532644033432007
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 51912
🧩 Adding 1058 documents to vector store...
✅ Successfully added 1058 documents to vector store.
📄 Total documents in collection: 52970
Generating embeddings for 1058 texts...


Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Generated embeddings with shape: (1058, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[80] character | Hybrid | RRF -> 0.6441465616226196

--- Processing Chunking Strategy: token ---
Found 6 PDF files to process.

Processing: 6-Recruitment-policy-and-procedure.pdf
  ✓ Loaded 13 pages

Processing: 5-Compensation & Benefit Polic.pdf
  ✓ Loaded 65 pages

Processing: 7-Employee Handbook.pdf
  ✓ Loaded 35 pages

Processing: 4-Employee Code of Conduct.pdf
  ✓ Loaded 26 pages

Processing: 3-Anti-Discrimination and Anti-Harassment Policy and Procedures.pdf
  ✓ Loaded 11 pages

Processing: 2-Leave & Attendence Policy.pdf
  ✓ Loaded 57 pages

Total documents loaded: 207
📄 Total chunks created with 'token' strategy: 786
Generating embeddings for 786 texts...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Generated embeddings with shape: (786, 384)
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 52970
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 53756
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[81] token | Dense | None -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 53756
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 54542
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[82] token | Dense | Summarization -> 0.8186597228050232
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 54542
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 55328
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[83] token | Dense | Keyword Filtering -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 55328
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 56114
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[84] token | Dense | RRF -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 56114
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 56900
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[85] token | BM25 | None -> 0.7423429489135742
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 56900
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 57686
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[86] token | BM25 | Summarization -> 0.7423429489135742
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 57686
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 58472
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[87] token | BM25 | Keyword Filtering -> 0.7423429489135742
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 58472
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 59258
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 d

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[88] token | BM25 | RRF -> 0.7533729076385498
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 59258
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 60044
Generating embeddings for 786 texts...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Generated embeddings with shape: (786, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[89] token | MMR | None -> 0.8234590888023376
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 60044
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 60830
Generating embeddings for 786 texts...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Generated embeddings with shape: (786, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[90] token | MMR | Summarization -> 0.8234590888023376
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 60830
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 61616
Generating embeddings for 786 texts...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Generated embeddings with shape: (786, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[91] token | MMR | Keyword Filtering -> 0.8234590888023376
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 61616
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 62402
Generating embeddings for 786 texts...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Generated embeddings with shape: (786, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[92] token | MMR | RRF -> 0.8234590888023376
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 62402
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 63188
Generating embeddings for 786 texts...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Generated embeddings with shape: (786, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[93] token | Hybrid | None -> 0.7423429489135742
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 63188
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 63974
Generating embeddings for 786 texts...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Generated embeddings with shape: (786, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[94] token | Hybrid | Summarization -> 0.7423429489135742
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 63974
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 64760
Generating embeddings for 786 texts...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Generated embeddings with shape: (786, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[95] token | Hybrid | Keyword Filtering -> 0.7423429489135742
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 64760
🧩 Adding 786 documents to vector store...
✅ Successfully added 786 documents to vector store.
📄 Total documents in collection: 65546
Generating embeddings for 786 texts...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Generated embeddings with shape: (786, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[96] token | Hybrid | RRF -> 0.7533729076385498

--- Processing Chunking Strategy: overlapping ---
Found 6 PDF files to process.

Processing: 6-Recruitment-policy-and-procedure.pdf
  ✓ Loaded 13 pages

Processing: 5-Compensation & Benefit Polic.pdf
  ✓ Loaded 65 pages

Processing: 7-Employee Handbook.pdf
  ✓ Loaded 35 pages

Processing: 4-Employee Code of Conduct.pdf
  ✓ Loaded 26 pages

Processing: 3-Anti-Discrimination and Anti-Harassment Policy and Procedures.pdf
  ✓ Loaded 11 pages

Processing: 2-Leave & Attendence Policy.pdf
  ✓ Loaded 57 pages

Total documents loaded: 207
📄 Total chunks created with 'overlapping' strategy: 1297
Generating embeddings for 1297 texts...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Generated embeddings with shape: (1297, 384)
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 65546
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 66843
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[97] overlapping | Dense | None -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 66843
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 68140
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[98] overlapping | Dense | Summarization -> 0.8186597228050232
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 68140
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 69437
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[99] overlapping | Dense | Keyword Filtering -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 69437
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 70734
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[100] overlapping | Dense | RRF -> 0.810580849647522
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 70734
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 72031
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[101] overlapping | BM25 | None -> 0.6304553747177124
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 72031
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 73328
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[102] overlapping | BM25 | Summarization -> 0.6304553747177124
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 73328
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 74625
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[103] overlapping | BM25 | Keyword Filtering -> 0.6304553747177124
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 74625
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 75922
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 3, Score threshold: 0.0
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using BM25
Top K: 5, Score threshold: 0.0
Retr

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[104] overlapping | BM25 | RRF -> 0.722102165222168
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 75922
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 77219
Generating embeddings for 1297 texts...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Generated embeddings with shape: (1297, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[105] overlapping | MMR | None -> 0.6232724189758301
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 77219
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 78516
Generating embeddings for 1297 texts...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Generated embeddings with shape: (1297, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[106] overlapping | MMR | Summarization -> 0.6232724189758301
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 78516
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 79813
Generating embeddings for 1297 texts...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Generated embeddings with shape: (1297, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[107] overlapping | MMR | Keyword Filtering -> 0.657151997089386
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 79813
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 81110
Generating embeddings for 1297 texts...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Generated embeddings with shape: (1297, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using MMR
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[108] overlapping | MMR | RRF -> 0.7318090796470642
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 81110
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 82407
Generating embeddings for 1297 texts...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Generated embeddings with shape: (1297, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[109] overlapping | Hybrid | None -> 0.6304553747177124
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 82407
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 83704
Generating embeddings for 1297 texts...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Generated embeddings with shape: (1297, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[110] overlapping | Hybrid | Summarization -> 0.6304553747177124
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 83704
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 85001
Generating embeddings for 1297 texts...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Generated embeddings with shape: (1297, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[111] overlapping | Hybrid | Keyword Filtering -> 0.6304553747177124
✅ Vector store initialized. Collection: pdf_documents
📄 Existing documents in collection: 85001
🧩 Adding 1297 documents to vector store...
✅ Successfully added 1297 documents to vector store.
📄 Total documents in collection: 86298
Generating embeddings for 1297 texts...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Generated embeddings with shape: (1297, 384)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Retrieving documents for query: 'What are the Regular Hours of Operation?' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'What are the working hours for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Retrieving documents for query: 'When is the operation schedule for What are the Regular Hours of Operation??' using Hybrid Retriever
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
[112] overlapping | Hybrid | RRF -> 0.722102165222168
✅ Results saved to RAG_pipeline_results_with_postprocessing.xlsx
