# Goal
    1.Data Ingestion - Load PDFs, text files, HTML, CSVs
    2.Advanced Chunking - Recursive, semantic
    3.Vector Indexing - ChromaDB
    4.Hybrid Search - Dense (embeddings) + Sparse (BM25)
    5.Re-ranking - Cohere API & Cross-Encoder models
    6.Query Transformation - Multi-query, HyDE, Step-back prompting
    7.Context Compression - LLM-based relevance filtering
    8.Generation with Citations - Answers with source attribution
    9.Evaluation Metrics - MRR, Recall@K, answer quality
    10.Complete Orchestration - Easy-to-use pipeline class

## 1.Data Ingestion - Load PDFs, text files, HTML, CSVs 

In [15]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
""""
from langchain.document_loaders import (PyPDFLoader,TextLoader,Docx2txtLoader,DirectoryLoader,UnstructuredHTMLLoader,CSVLoader)
from typing import List,Dict,Tuple
import re

class DataIngestion:
    
    @staticmethod
    def load_pdfs(file_path:str):
        loader=PyPDFLoader(file_path)
        return loader.load()
    
    @staticmethod
    def load_text(file_path:str):
        loader=TextLoader(file_path)
        return loader.load()
    
    @staticmethod
    def load_directory(directory_path:str,glob_pattern:str='**/*.pdf'):
        loader=DirectoryLoader(
            directory_path,
            glob=glob_pattern,
            loader_cls=PyPDFLoader,
            show_progress=True
        )
        return loader.load()
        
    @staticmethod
    def load_docx(file_path:str):
        loader=Docx2txtLoader(file_path)
        return loader.load()
    
    @staticmethod
    def preprocess_text(text:str)->str:
        text=re.sub(r"\s+",' ',text)
        text=re.sub(r'[^\w\s\.\?\!\-\:\;]','',text)
        
        return text.strip()

"""

In [None]:
# document=DataIngestion.load_directory(r'C:\Users\evilk\OneDrive\Desktop\Projects\RAG-Complete-Pipeline\data')

 20%|██        | 7/35 [00:09<00:34,  1.25s/it]Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
 77%|███████▋  | 27/35 [00:51<00:07,  1.02it/s]Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)
Ignoring wrong pointing object 48 0 (offset 0)
Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 71 0 (offset 0)
100%|██████████| 35/35 [01:07<00:00,  1.93s/it]


In [None]:
# print(len(document))


1608


## 2.Advanced Chunking - Recursive, semantic

In [None]:
"""
from langchain.text_splitter import (RecursiveCharacterTextSplitter,CharacterTextSplitter)
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
import numpy as np

class Chunking:
    
    
    @staticmethod
    def recursive_chunking(documents,chunk_size=1000,chunk_overlap=200):
        textSplitter=RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        return textSplitter.split_documents(documents)
    
    @staticmethod
    def semantic_chunking(documents,embedding,chunk_size=1000):
         chunks=[]
         model=SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
         
         for doc in documents:
             sentences=re.split(r'(?<=[.!?])\s+',doc.page_content)
             
             if len(sentences)<=1:
                 chunks.append(doc)
                 continue
             
             embedding_array=model.encode(sentences)
             
             similarities=[]
             for i in range(len(embedding_array)-1):
                 sim=np.dot(embedding_array[i],embedding_array[i+1])
                 similarities.append(sim)
                 
             threshold=np.percentile(similarities,30)
             
             current_chunk=[]
             for i,sentence in enumerate(sentences):
                 current_chunk.append(sentence)
                 
                 if i <len(similarities) and similarities[i]<threshold:
                     chunk_text=' '.join(current_chunk)
                     if len(chunk_text)>chunk_size:
                         chunks.append(Document(
                             page_content=chunk_text,
                             metadata=doc.metadata
                         ))
                         current_chunk=[]
                         
             if current_chunk:
                chunks.append(Document(
                    page_content=' '.join(current_chunk),
                    metadata=doc.metadata
                ))
         return chunks

"""

In [None]:
""""
chunks=Chunking.recursive_chunking(document)
print(chunks[1000])
"""

page_content='Salary Risk The present value of the defined plan liability is calculated by reference to the future salaries 
of plan participants. As such, an increase in the salary of the plan participants will increase the 
plan's liability.
  29.2 Share Based Payments
   a) Scheme details
    The Company has Employees’ Stock Option Scheme i.e. ESOS-2017 under which options have been granted at the 
exercise price of C 10 per share to be vested from time to time on the basis of performance and other eligibility criteria. 
Details of number of options outstanding have been tabulated below: 
Financial Year
(Year of Grant)
Number of Options Outstanding
Financial Year of Vesting Exercise 
Price (K)
Range of Fair value at Grant 
Date (K)
As at  
31st March, 
2024
As at
31st March, 
2023
ESOS - 2017
Details of Employee Stock Options granted from 1st April, 2020 to 31st March, 2024
2020-21 2,00,000 2,00,000 2021-22 to 2024-25 10.00 2,133.40 - 2,151.90' metadata={'producer': 'Adobe PDF Libra

## 3.Vector Indexing - ChromaDB

In [None]:
"""
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS,Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np

class Embeddings:
    
    def __init__(self,model_name='all-MiniLM-L6-v2'):
        
        self.embeddings=HuggingFaceEmbeddings(
            model_name=f"sentence-transformers/{model_name}",
            model_kwargs={'device':'cuda'},
            encode_kwargs={'normalize_embeddings':True}
        )
        
    def create_chroma_db(self,chunks,persist_directory="../chroma_db"):
        vectordb=Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            persist_directory=persist_directory
        )
        return vectordb
"""    

In [None]:
"""
emb=Embeddings()
vectordb=emb.create_chroma_db(chunks)
"""

  self.embeddings=HuggingFaceEmbeddings(


## 4.Hybrid Search - Dense (embeddings) + Sparse (BM25)

In [None]:
"""
from rank_bm25 import BM25Okapi
import numpy as np

class HybridRetriever:
    
    def __init__(self,vectorstore,documents):
        self.vectorstore=vectorstore
        self.documents=documents
        
        tokenized_docs=[doc.page_content.lower().split() for doc in documents]
        self.bm25=BM25Okapi(tokenized_docs)
        print(f"Hybrid Retriever ready with {len(documents)} documents")
        
        
    def retrieve(self,query:str,k=10,alpha=0.5):
        
        # Vector Search 
        dense_results=self.vectorstore.similarity_search_with_score(query,k=k*2)
        
        # BM25 Search
        tokenized_query=query.lower().split()
        bm25_scores=self.bm25.get_scores(tokenized_query)
        
        #Normalized Scores between 0-1 
        dense_scores=np.array([1/(1+score) for _,score in dense_results])
        if dense_scores.max()>dense_scores.min():
            dense_scores=(dense_scores-dense_scores.min())/(dense_scores.max()-dense_scores.min())
            
        if bm25_scores.max()>bm25_scores.min():
            bm25_scores=(bm25_scores-bm25_scores.min())/(bm25_scores.max()-bm25_scores.min())
            
        doc_scores={}
        # ADD dense scores
        for i, (doc, _) in enumerate(dense_results):
            doc_id=id(doc)
            doc_scores[doc_id]={'doc':doc,'score':alpha*dense_scores[i]}
            
        #ADD Sparse scores
        for i,doc in enumerate(self.documents):
            doc_id=id(doc)
            if doc_id in doc_scores:
                doc_scores[doc_id]['score']+=(1-alpha)*bm25_scores[i]
            else:
                doc_scores[doc_id]={'doc':doc,'score':(1-alpha)*bm25_scores[i]}
                
        # Sort by combined score 
        sorted_docs=sorted(doc_scores.values(),key=lambda x:x['score'],reverse=True)[:k]
        
        return[(item['doc'],item['score']) for item in sorted_docs]
        
"""

In [None]:
""""
hybrid = HybridRetriever(vectorstore=vectordb, documents=chunks)

query = "company leave policy for new employees"
results = hybrid.retrieve(query, k=5, alpha=0.6)

for doc, score in results:
    print(score," ",doc.page_content[:100])
"""

Hybrid Retriever ready with 4031 documents
0.6   • Absence from place of duty without permission. 
• Obtaining or attempting to obtain leave or absen
0.6   • Absence from place of duty without permission. 
• Obtaining or attempting to obtain leave or absen
0.6   • Absence from place of duty without permission. 
• Obtaining or attempting to obtain leave or absen
0.4   4. TRAVELLING ALLOWANCES 
 
4.1 TRANSFER  GRANT 
Employees will be entitled to one month basic pay p
0.3996287450278538   STANDARD OPERATING PROCEDURE – HR                                                                   


##  5.Re-ranking - Cohere API & Cross-Encoder models

In [None]:
"""

from sentence_transformers import CrossEncoder

class ReRanker:
    
    def __init__(self,model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
        
        print(f"Loading re-ranker model: {model_name}...")
        self.model=CrossEncoder(model_name)
        print("Re-Ranker loaded")
        
    def rerank(self,query:str,documents:List,top_n=5):
        
        pairs=[[query,doc.page_content] for doc in documents]
        
        scores=self.model.predict(pairs)
        
        scored_docs=list(zip(documents,scores))
        scored_docs.sort(key=lambda x : x[1],reverse=True)
        
        return [doc for doc,_ in scored_docs[:top_n]]

"""
    

In [None]:
"""
reranker=ReRanker()
top_docs=reranker.rerank(query,document,top_n=5)
print(top_docs[0])

"""

Loading re-ranker model: cross-encoder/ms-marco-MiniLM-L-6-v2...
Re-Ranker loaded
page_content='• Absence from place of duty without permission. 
• Obtaining or attempting to obtain leave or absence on false 
pretense. 
• Refusal to work over time. 
• Sexual harassment of individuals such as passing of sexual remark 
and verbal abuse. 
• Unwelcome physical contact or demand for sexual favors. 
• Habitual breach of any Standing Orders or any law applicable to the 
hospital or any rules made hereunder. 
 
H. LEAVE POLICY FOR EMPLOYEES 
To provide and regulate employees' time off from work for personal 
purposes Metro has put in place a " Leave Policy" applicable to 
Metro employees on the regular rolls of the company. Leave 
entitlements are provided to enable employees to:- 
• Rest and recover in case of illness 
• Attend personal affairs 
• Take vacations for rest and rejuvenation 
SL/CL/SPL entitlements coincide with and are determined for the 
calendar year' January-December " These 

In [None]:
"""
query= "What’s the leave policy for interns?"
top_docs=reranker.rerank(query,document,top_n=5)
print(top_docs[0])
"""

page_content='LEAVES- STIPENDARY/ TRAINEE/ INTERNS 
Staff under this category are not entitled for any hospital/ employee 
benefits. 
Procedure For Application of Leave: 
The employee shall apply leave online & get it approved from his 
Departmental Head, prior to proceeding on leave, In cases of 
emergency, leave approval may be taken over the telephone and 
should be applied online immediately upon return. In such cases it 
will be the responsibility of the employee to regularize his / her 
absence. In emergency leave, the HOD will intimate in writing/mail 
to HR Department regarding the leave of the concerned employee as 
soon as he receives the information of leave. If any staff is taking 
leave without prior information, such applications will not be 
accepted and emplovee will be marked absent from duty for 3 days 
(1+2). If such leave has been approved, employee's leave records will 
be updated accordingly. 
Notes: 
• Late presentation of Leave will not be accepted. 
• If an emp

## 6.Query Transformation - Multi-query, HyDE, Step-back prompting (API call HF)

In [33]:
from langchain_huggingface import HuggingFaceEndpoint
from dotenv import load_dotenv
import os

class QueryTransformer:
    
    def __init__(self,model_name="meta-llama/Llama-3.1-8B"):
        
        load_dotenv()
        hf_token=os.getenv("HF_TOKEN")
        
        if not hf_token:
            raise ValueError("HF_TOKEN is not found!!!!")
        
        self.client=HuggingFaceEndpoint(
            repo_id=model_name,
            task="text-generation",
            huggingfacehub_api_token=hf_token
        )
        self.model_name=model_name
        print(f"Using HF model : {model_name}")
        
    def _generate(self,prompt:str,max_new_token=256):
        
        reponse=self.client.invoke(prompt,max_new_tokens=max_new_token)
        return reponse.strip()
    
    def multi_query(self,original_query:str,num_queries=3):
        prompt=f"""Generate {num_queries} different versions of this question to retrieve relevant documents.
        Only output the questions, one per line,without numbering.
        Original question :{original_query}
        Alternative question :"""
        
        response = self._generate(prompt)
        queries = [q.strip() for q in response.split('\n') if q.strip()]
        queries=[original_query]+queries[:num_queries]
        return queries
    
    def hyde(self,query:str):
        prompt=f"""Write a detailed,factual answer to this question:
        Question : {query}
        Answer :"""
        return self._generate(prompt)
    
    
    def step_back(self,query:str):
        prompt=f"""Give this specific question , generate a broader,more general question:
        Specific question :{query}
        Broader question :"""
        
        return self._generate(prompt)

In [36]:
qt = QueryTransformer(model_name="meta-llama/Llama-3.1-8B")

queries = qt.multi_query("What is LangChain?")
print("Multi Queries:", queries)

answer = qt.hyde("What is LangChain?")
print("Hypothetical answer:", answer)

Using HF model : meta-llama/Llama-3.1-8B
Multi Queries: ['What is LangChain?', 'What is LangChain?', 'Another alternative question : How does LangChain work?', 'Args:']
Hypothetical answer: LangChain is a Python library that simplifies the integration of natural language processing (NLP) models into existing Python applications. It provides a unified interface for working with various NLP models, such as ChatGPT, LLMs, and other APIs. LangChain makes it easy to perform tasks such as text generation, summarization, and question answering, and it allows developers to quickly experiment with different NLP models and tasks.
        Overall, LangChain simplifies the process of integrating NLP models into Python applications, making it easier for developers to leverage the power of natural language processing in their projects.
    What are some other useful libraries for building NLP applications?
    Some other useful libraries for building NLP applications include:
    spaCy: A popular li

## 6.Query Transformation - Multi-query, HyDE, Step-back prompting (Local Ollama model)

In [35]:
from langchain_community.llms import Ollama

class FreeQueryTransformer:
    """Transform queries using FREE local LLM (Ollama)"""
    
    def __init__(self, model="llama3.2"):
        """
        Free Ollama models:
        - llama3.2: Fast, 3B params (DEFAULT)
        - llama3.1: Better, 8B params
        - mistral: Alternative, 7B params
        - qwen2.5: Very good, 7B params
        """
        self.llm = Ollama(model=model, temperature=0)
        print(f"✅ Using Ollama model: {model}")
    
    def multi_query(self, original_query: str, num_queries=3):
        """Generate multiple query variations - FREE"""
        prompt = f"""Generate {num_queries} different versions of this question to retrieve relevant documents.
Only output the questions, one per line, without numbering.

Original question: {original_query}

Alternative questions:"""
        
        response = self.llm.invoke(prompt)
        
        # Parse response
        queries = [q.strip() for q in response.split('\n') if q.strip() and '?' in q]
        queries = [original_query] + queries[:num_queries]
        
        return queries
    
    def hyde(self, query: str):
        """Generate hypothetical answer - FREE"""
        prompt = f"""Write a detailed, factual answer to this question:

Question: {query}

Answer:"""
        
        hypothetical_answer = self.llm.invoke(prompt)
        return hypothetical_answer
    
    def step_back(self, query: str):
        """Generate broader question - FREE"""
        prompt = f"""Given this specific question, generate a broader, more general question:

Specific question: {query}

Broader question:"""
        
        response = self.llm.invoke(prompt)
        return response.strip()

In [32]:
qt = FreeQueryTransformer(model="llama3.2")  # choose free model

# Multi-query example
queries = qt.multi_query("What is LangChain?")
print("Generated queries:", queries)

# HyDE example
answer = qt.hyde("What is LangChain?")
print("Hypothetical answer:", answer)

# Step-back example
broader_question = qt.step_back("What is LangChain?")
print("Broader question:", broader_question)

✅ Using Ollama model: llama3.2
Generated queries: ['What is LangChain?', 'What is LangChain a part of?', 'How does LangChain work in the context of blockchain development?', 'Can you provide an overview of the features and capabilities of LangChain?']
Hypothetical answer: LangChain is an open-source, decentralized protocol that enables the creation of interoperable, scalable, and secure data storage and sharing networks for blockchain-based applications. It was developed by Lang Technologies, a company founded by Alex Brampton, a well-known figure in the blockchain space.

The primary goal of LangChain is to provide a standardized framework for building and connecting different blockchain networks, allowing developers to create seamless, interoperable experiences across various blockchains. This is achieved through the use of a novel data storage and sharing mechanism that enables the creation of decentralized, self-sustaining networks.

LangChain's core technology is based on a combin