In [36]:
# ========== SETUP ==========
import re
import uuid
from unstructured.partition.auto import partition
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer  # Changed import
import ollama
from langchain_ollama import ChatOllama
from datetime import datetime, timezone

# Configuration
CHUNK_METHODS = ["sliding", "sentence"]
SLIDING_SIZE = 200
SLIDING_OVERLAP = 50
SENTENCE_MAX = 300
MIN_CHUNK = 25

EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"  # Official model name
LLM_MODEL = "llama3"

# Initialize clients
qdrant = QdrantClient(host="localhost", port=6333)
embed_model = SentenceTransformer(EMBEDDING_MODEL)
llm = ChatOllama(model=LLM_MODEL)


In [37]:
# # Cleanup old collections
# for coll in qdrant.get_collections().collections:
#     if coll.name.startswith(("rag_data_", "raw_data_")):
#         qdrant.delete_collection(coll.name)

In [38]:
print("\nCollections ready:")
print([coll.name for coll in qdrant.get_collections().collections])


Collections ready:
['rag_data_sliding', 'rag_data_sentence']


In [39]:
# ========== TEXT PROCESSING ==========
def clean_text(text):
    if not text: return ""
    text = re.sub(r'[^\w\s.,;:!?\'-]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

def chunk_sentences(text):
    text = clean_text(text)
    if len(text) < MIN_CHUNK or not any(c.isalpha() for c in text):
        return []
    
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks, current = [], ""
    
    for s in sentences:
        if len(current) + len(s) <= SENTENCE_MAX:
            current += s + " "
        else:
            if current.strip(): chunks.append(current.strip())
            current = s + " "
    
    if current.strip(): chunks.append(current.strip())
    return chunks

def chunk_sliding(text):
    text = clean_text(text)
    if len(text) < MIN_CHUNK or not any(c.isalpha() for c in text):
        return []
    
    chunks = []
    start = 0
    while start < len(text):
        end = start + SLIDING_SIZE
        chunk = text[start:end].strip()
        if chunk: chunks.append(chunk)
        start += SLIDING_SIZE - SLIDING_OVERLAP
    return chunks

In [40]:
# ========== DOCUMENT PROCESSING ==========
def create_rag_collection(method):
    """Create fresh Qdrant collection for a chunking method"""
    coll_name = f"rag_data_{method}"
    
    # Delete if collection exists
    if qdrant.collection_exists(coll_name):
        qdrant.delete_collection(coll_name)
    
    # Create collection
    qdrant.create_collection(
        collection_name=coll_name,
        vectors_config=models.VectorParams(
            size=embed_model.get_sentence_embedding_dimension(),
            distance=models.Distance.COSINE
        )
    )
    return coll_name

def process_pdf_to_chunks(filepath, method):
    elements = partition(filename=filepath, languages=["eng"])
    for el in elements:
        text = el.text.strip()
        
        # Metadata extraction — update with real attributes if available in elements
        page_num = getattr(el, "page_number", "unknown")
        title = getattr(el, "heading", "unknown")
        section = getattr(el, "section", "unknown")
        author = getattr(el, "author", "unknown")
        processed_at = datetime.now(timezone.utc).isoformat()
        
        if text:
            for chunk in (chunk_sliding(text) if method == "sliding" else chunk_sentences(text)):
                yield {
                    "chunk": chunk,
                    "page": page_num,
                    "title": title,
                    "section": section,
                    "author": author,
                    "processed_at": processed_at
                }

def store_chunks(chunks, filepath, method):
    """Store processed chunks in Qdrant"""
    coll_name = create_rag_collection(method)

    # Extract chunk texts to embed
    chunk_texts = [c["chunk"] for c in chunks]
    embeddings = embed_model.encode(chunk_texts, normalize_embeddings=True)
    
    points = []
    for chunk_data, embedding in zip(chunks, embeddings):
        payload = {
            "text": chunk_data["chunk"],
            "source": filepath,
            "method": method,
            "page": chunk_data.get("page", "unknown"),
            "title": chunk_data.get("title", "unknown"),
            "section": chunk_data.get("section", "unknown"),
            "author": chunk_data.get("author", "unknown"),
            "processed_at": chunk_data.get("processed_at", datetime.now(timezone.utc).isoformat())
        }
        points.append(
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding.tolist(),
                payload=payload
            )
        )
    
    qdrant.upsert(coll_name, points)
    return len(points)


In [41]:
# # ========== PROCESS DOCUMENT ==========
# file_path = "../../../data/files/rag.pdf"

# for method in CHUNK_METHODS:
#     chunks = list(process_pdf_to_chunks(file_path, method))  # Use process_pdf_to_chunks
#     stored = store_chunks(chunks, file_path, method)  # Use store_chunks
#     print(f"Stored {stored} {method} chunks")

# print("\nCollections ready:")
# print([coll.name for coll in qdrant.get_collections().collections])

In [None]:
# ========== QUERY FUNCTIONS ==========
def cosine_similarity(a, b):
    dot_product = sum([x * y for x, y in zip(a, b)])
    norm_a = sum([x ** 2 for x in a]) ** 0.5
    norm_b = sum([x ** 2 for x in b]) ** 0.5
    return dot_product / (norm_a * norm_b)


def similarity_search(query_embedding, method="sentence", top_k=3):
    points = qdrant.scroll(
        collection_name=f"rag_data_{method}",
        with_vectors=True,
        with_payload=True
    )[0]

    scored_points = []
    for point in points:
        similarity = cosine_similarity(query_embedding, point.vector)
        scored_points.append({
            "text": point.payload['text'],
            "similarity": similarity,
            "source": point.payload.get('source', ''),
            "method": point.payload.get('method', ''),
            "page": point.payload.get('page', 'unknown'),
            "title": point.payload.get('title', 'unknown'),
            "section": point.payload.get('section', 'unknown'),
            "author": point.payload.get('author', 'unknown'),
            "processed_at": point.payload.get('processed_at', 'unknown')
        })

    # Sort by similarity descending
    return sorted(scored_points, key=lambda x: x['similarity'], reverse=True)[:top_k]


def search(query, method="sentence", top_k=3):
    """Modified to use custom similarity"""
    query_embedding = embed_model.encode(query, normalize_embeddings=True).tolist()
    return similarity_search(query_embedding, method, top_k)


def ask(query, method="sentence"):
    results = search(query, method)
    if not results:
        return "No relevant information found."

    # Compose context without scores for LLM
    context = "\n".join(f"- {res['text']}" for res in results)

    # prompt = (
    #     f"Based ONLY on the following context, answer the question:\n{context}\n\n"
    #     f"Question: {query}\nAnswer:"
    # )

    prompt = (
        f"Based ONLY on the following retrieved context, provide the exact information without any modification or added explanation:\n{context}\n\n"
        f"Question: {query}\nAnswer:\n"
        "(Do not generate or infer answers, only present the retrieved text exactly as it appears.)"
    )

    response = llm.invoke(prompt)

    # Print metadata for traceability
    print("\n--- Retrieved Chunks Metadata ---")
    for res in results:
        print(
            f"Page: {res['page']}, Title: {res['title']}, Section: {res.get('section', 'unknown')}, "
            f"Author: {res.get('author', 'unknown')}, Source: {res['source']}, "
            f"Similarity: {res['similarity']:.3f}, Processed At: {res.get('processed_at', 'unknown')}"
        )

    return response.content

In [54]:
def retrieve_only(query, method="sentence"):
    results = search(query, method)
    if not results:
        return "No relevant information found."

    # Compose context without scores or prompt
    context = "\n".join(f"- {res['text']}" for res in results)

    # Print metadata for traceability
    print("\n--- Retrieved Chunks Metadata ---")
    for res in results:
        print(
            f"Page: {res['page']}, Title: {res['title']}, Section: {res.get('section', 'unknown')}, "
            f"Author: {res.get('author', 'unknown')}, Source: {res['source']}, "
            f"Similarity: {res['similarity']:.3f}, Processed At: {res.get('processed_at', 'unknown')}"
        )

    # Return just the retrieved text as one string (or could return list of texts)
    return context


In [56]:
# ========== ASK QUESTIONS ==========
question = "What is Retrieval-Augmented Generation (RAG)?"
print("Q:", question)
print("A:", retrieve_only(question, method="sliding"))

Q: What is Retrieval-Augmented Generation (RAG)?

--- Retrieved Chunks Metadata ---
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.717, Processed At: 2025-08-12T12:54:06.656266+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.689, Processed At: 2025-08-12T12:54:06.650899+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.658, Processed At: 2025-08-12T12:54:06.655967+00:00
A: - 163 Q. Leng, K. Uhlenhuth, and A. Polyzotis, Best practices for llm evaluation of rag applications, https: www.databricks.com blog LLM-auto-eval-best-practices-RAG, 2023.
- o data corruption during retrieval. Secondly, incorporating tables into the data can complicate semantic similarity searches. When dealing with semi-structured data, one approach involves lever- aging
- sentence argument linking, a

In [57]:
# ========== ASK QUESTIONS ==========
question = "What is Retrieval-Augmented Generation (RAG)?"
print("Q:", question)
print("A:", ask(question, method="sliding"))

Q: What is Retrieval-Augmented Generation (RAG)?

--- Retrieved Chunks Metadata ---
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.717, Processed At: 2025-08-12T12:54:06.656266+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.689, Processed At: 2025-08-12T12:54:06.650899+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.658, Processed At: 2025-08-12T12:54:06.655967+00:00
A: Based on the provided context, there is no direct mention of what Retrieval-Augmented Generation (RAG) is. The relevant texts only discuss best practices for evaluating LLMs and semi-structured data, but do not define or explain RAG. Therefore, I can provide no information about RAG.


In [53]:
# ========== ASK QUESTIONS ==========
question = "What are the key limitations of Large Language Models (LLMs) that RAG aims to address?"
print("Q:", question)
print("A:", ask(question, method="sentence"))

Q: What are the key limitations of Large Language Models (LLMs) that RAG aims to address?

--- Retrieved Chunks Metadata ---
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.737, Processed At: 2025-08-12T12:54:55.866841+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.710, Processed At: 2025-08-12T12:54:55.868575+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.667, Processed At: 2025-08-12T12:54:55.869381+00:00
A: There is no answer provided in the given context regarding the key limitations of Large Language Models (LLMs) that RAG aims to address. The provided text only mentions a "notable paucity of research" and discusses the evolution of RAG technologies and their application on different tasks, but does not mention LLMs or their limitations.


In [44]:
# ========== ASK QUESTIONS ==========
question = "What is Retrieval-Augmented Generation (RAG)?"
print("Q:", question)
print("A:", ask(question, method="sliding"))

Q: What is Retrieval-Augmented Generation (RAG)?

--- Retrieved Chunks Metadata ---
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.717, Processed At: 2025-08-12T12:54:06.656266+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.689, Processed At: 2025-08-12T12:54:06.650899+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.658, Processed At: 2025-08-12T12:54:06.655967+00:00
A: Based on the provided context, Retrieval-Augmented Generation (RAG) is not explicitly defined in the given text. However, it can be inferred that RAG applications are being evaluated using Large Language Models (LLMs), which suggests that RAG might refer to a type of language processing application or model that uses LLMs.


In [45]:
# ========== ASK QUESTIONS ==========
question = "What are the three core components of a RAG framework?"
print("\nQ:", question)
print("A:", ask(question, method="sentence"))


Q: What are the three core components of a RAG framework?

--- Retrieved Chunks Metadata ---
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.815, Processed At: 2025-08-12T12:54:55.868575+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.709, Processed At: 2025-08-12T12:54:55.866841+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.592, Processed At: 2025-08-12T12:54:55.865799+00:00
A: Based on the provided context, there is no direct information about the three core components of a RAG (RAG) framework. The text only mentions three developmental paradigms within the RAG framework: Naive, Advanced, and Modular RAG, which represent progressive enhancements over their predecessors. However, it does not specify what these components are or how they function.

Therefore, I can't 

In [46]:
# ========== ASK QUESTIONS ==========
question = "What are the three core components of a RAG framework?"
print("\nQ:", question)
print("A:", ask(question, method="sliding"))


Q: What are the three core components of a RAG framework?

--- Retrieved Chunks Metadata ---
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.710, Processed At: 2025-08-12T12:54:06.656266+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.626, Processed At: 2025-08-12T12:54:06.653362+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.560, Processed At: 2025-08-12T12:54:06.655967+00:00
A: According to TABLE IV SUMMARY OF EVALUATION FRAMEWORKS from the provided context, the three core components of a RAG (Rationale and Argument Generation) framework are:

1. **Rationale Generation**: This involves generating explanations for the model's predictions or decisions.
2. **Argument Linking**: This involves linking the rationale to the original input or question, demonstrating how the 

In [47]:
# ========== ASK QUESTIONS ==========
question = "Explain the indexing process in Naive RAG."
print("\nQ:", question)
print("A:", ask(question, method="sentence"))


Q: Explain the indexing process in Naive RAG.

--- Retrieved Chunks Metadata ---
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.694, Processed At: 2025-08-12T12:54:55.868575+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.663, Processed At: 2025-08-12T12:54:55.865799+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.646, Processed At: 2025-08-12T12:54:55.866841+00:00
A: Based on the provided context, I can attempt to answer your question.

According to the text, it does not explicitly describe the indexing process in Naive RAG. However, since it is a part of the RAG system, we can infer that the indexing process might be similar to other retrieval-based systems. In general, indexing refers to the process of creating an inverted index or a data structure that allows for e

In [48]:
# ========== ASK QUESTIONS ==========
question = "Explain the indexing process in Naive RAG."
print("\nQ:", question)
print("A:", ask(question, method="sliding"))


Q: Explain the indexing process in Naive RAG.

--- Retrieved Chunks Metadata ---
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.685, Processed At: 2025-08-12T12:54:06.650899+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.681, Processed At: 2025-08-12T12:54:06.656266+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.647, Processed At: 2025-08-12T12:54:06.650875+00:00
A: Based on the provided context, there is no mention of an "indexing process" or "Naive RAG". The text appears to be discussing data corruption, semantic similarity searches, and structured vs. semi-structured data, but does not mention indexing or RAG (possibly a typo). Therefore, I cannot provide an answer to this question based on the given context.


In [49]:
# ========== ASK QUESTIONS ==========
question = "What are the key challenges in the retrieval phase of Naive RAG?"
print("\nQ:", question)
print("A:", ask(question, method="sentence"))


Q: What are the key challenges in the retrieval phase of Naive RAG?

--- Retrieved Chunks Metadata ---
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.745, Processed At: 2025-08-12T12:54:55.868575+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.698, Processed At: 2025-08-12T12:54:55.866841+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.684, Processed At: 2025-08-12T12:54:55.865799+00:00
A: Based on the provided context, there is no mention of specific challenges related to the retrieval phase of Naive RAG. The passage only provides an overview of the three developmental paradigms within the RAG framework (Naive, Advanced, and Modular) but does not discuss the key challenges in the retrieval phase.


In [50]:
# ========== ASK QUESTIONS ==========
question = "What are the key challenges in the retrieval phase of Naive RAG?"
print("\nQ:", question)
print("A:", ask(question, method="sliding"))


Q: What are the key challenges in the retrieval phase of Naive RAG?

--- Retrieved Chunks Metadata ---
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.704, Processed At: 2025-08-12T12:54:06.656266+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.693, Processed At: 2025-08-12T12:54:06.650899+00:00
Page: unknown, Title: unknown, Section: unknown, Author: unknown, Source: ../../../data/files/rag.pdf, Similarity: 0.680, Processed At: 2025-08-12T12:54:06.650875+00:00
A: Based on the context, the answer is:

data corruption during retrieval
