# Goal
    1.Data Ingestion - Load PDFs, text files, HTML, CSVs
    2.Advanced Chunking - Recursive, semantic
    3.Vector Indexing - ChromaDB
    4.Hybrid Search - Dense (embeddings) + Sparse (BM25)
    5.Re-ranking -Cross-Encoder models
    6.Query Transformation - Multi-query, HyDE, Step-back prompting
    7.Context Compression - LLM-based relevance filtering
    8.Generation with Citations - Answers with source attribution
    9.Evaluation Metrics - MRR, Recall@K, answer quality
    10.Complete Orchestration - Easy-to-use pipeline class

## 1.Data Ingestion - Load PDFs, text files, HTML, CSVs 

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
""""
from langchain.document_loaders import (PyPDFLoader,TextLoader,Docx2txtLoader,DirectoryLoader,UnstructuredHTMLLoader,CSVLoader)
from typing import List,Dict,Tuple
import re

class DataIngestion:
    
    @staticmethod
    def load_pdfs(file_path:str):
        loader=PyPDFLoader(file_path)
        return loader.load()
    
    @staticmethod
    def load_text(file_path:str):
        loader=TextLoader(file_path)
        return loader.load()
    
    @staticmethod
    def load_directory(directory_path:str,glob_pattern:str='**/*.pdf'):
        loader=DirectoryLoader(
            directory_path,
            glob=glob_pattern,
            loader_cls=PyPDFLoader,
            show_progress=True
        )
        return loader.load()
        
    @staticmethod
    def load_docx(file_path:str):
        loader=Docx2txtLoader(file_path)
        return loader.load()
    
    @staticmethod
    def preprocess_text(text:str)->str:
        text=re.sub(r"\s+",' ',text)
        text=re.sub(r'[^\w\s\.\?\!\-\:\;]','',text)
        
        return text.strip()

"""

'"\nfrom langchain.document_loaders import (PyPDFLoader,TextLoader,Docx2txtLoader,DirectoryLoader,UnstructuredHTMLLoader,CSVLoader)\nfrom typing import List,Dict,Tuple\nimport re\n\nclass DataIngestion:\n\n    @staticmethod\n    def load_pdfs(file_path:str):\n        loader=PyPDFLoader(file_path)\n        return loader.load()\n\n    @staticmethod\n    def load_text(file_path:str):\n        loader=TextLoader(file_path)\n        return loader.load()\n\n    @staticmethod\n    def load_directory(directory_path:str,glob_pattern:str=\'**/*.pdf\'):\n        loader=DirectoryLoader(\n            directory_path,\n            glob=glob_pattern,\n            loader_cls=PyPDFLoader,\n            show_progress=True\n        )\n        return loader.load()\n\n    @staticmethod\n    def load_docx(file_path:str):\n        loader=Docx2txtLoader(file_path)\n        return loader.load()\n\n    @staticmethod\n    def preprocess_text(text:str)->str:\n        text=re.sub(r"\\s+",\' \',text)\n        text=re.

In [3]:
# document=DataIngestion.load_directory(r'C:\Users\evilk\OneDrive\Desktop\Projects\RAG-Complete-Pipeline\data')

In [4]:
# print(len(document))


## 2.Advanced Chunking - Recursive, semantic

In [5]:
"""
from langchain.text_splitter import (RecursiveCharacterTextSplitter,CharacterTextSplitter)
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
import numpy as np

class Chunking:
    
    
    @staticmethod
    def recursive_chunking(documents,chunk_size=1000,chunk_overlap=200):
        textSplitter=RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        return textSplitter.split_documents(documents)
    
    @staticmethod
    def semantic_chunking(documents,embedding,chunk_size=1000):
         chunks=[]
         model=SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
         
         for doc in documents:
             sentences=re.split(r'(?<=[.!?])\s+',doc.page_content)
             
             if len(sentences)<=1:
                 chunks.append(doc)
                 continue
             
             embedding_array=model.encode(sentences)
             
             similarities=[]
             for i in range(len(embedding_array)-1):
                 sim=np.dot(embedding_array[i],embedding_array[i+1])
                 similarities.append(sim)
                 
             threshold=np.percentile(similarities,30)
             
             current_chunk=[]
             for i,sentence in enumerate(sentences):
                 current_chunk.append(sentence)
                 
                 if i <len(similarities) and similarities[i]<threshold:
                     chunk_text=' '.join(current_chunk)
                     if len(chunk_text)>chunk_size:
                         chunks.append(Document(
                             page_content=chunk_text,
                             metadata=doc.metadata
                         ))
                         current_chunk=[]
                         
             if current_chunk:
                chunks.append(Document(
                    page_content=' '.join(current_chunk),
                    metadata=doc.metadata
                ))
         return chunks

"""

'\nfrom langchain.text_splitter import (RecursiveCharacterTextSplitter,CharacterTextSplitter)\nfrom sentence_transformers import SentenceTransformer\nfrom langchain.schema import Document\nimport numpy as np\n\nclass Chunking:\n\n\n    @staticmethod\n    def recursive_chunking(documents,chunk_size=1000,chunk_overlap=200):\n        textSplitter=RecursiveCharacterTextSplitter(\n            chunk_size=chunk_size,\n            chunk_overlap=chunk_overlap,\n            length_function=len,\n            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]\n        )\n        return textSplitter.split_documents(documents)\n\n    @staticmethod\n    def semantic_chunking(documents,embedding,chunk_size=1000):\n         chunks=[]\n         model=SentenceTransformer(\'sentence-transformers/all-MiniLM-L6-v2\')\n\n         for doc in documents:\n             sentences=re.split(r\'(?<=[.!?])\\s+\',doc.page_content)\n\n             if len(sentences)<=1:\n                 chunks.append(doc)\n        

In [6]:
""""
chunks=Chunking.recursive_chunking(document)
print(chunks[1000])
"""

'"\nchunks=Chunking.recursive_chunking(document)\nprint(chunks[1000])\n'

## 3.Vector Indexing - ChromaDB

In [7]:
"""
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS,Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np

class Embeddings:
    
    def __init__(self,model_name='all-MiniLM-L6-v2'):
        
        self.embeddings=HuggingFaceEmbeddings(
            model_name=f"sentence-transformers/{model_name}",
            model_kwargs={'device':'cuda'},
            encode_kwargs={'normalize_embeddings':True}
        )
        
    def create_chroma_db(self,chunks,persist_directory="../chroma_db"):
        vectordb=Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            persist_directory=persist_directory
        )
        return vectordb
"""    

'\nfrom sentence_transformers import SentenceTransformer\nfrom langchain.vectorstores import FAISS,Chroma\nfrom langchain_community.embeddings import HuggingFaceEmbeddings\nimport numpy as np\n\nclass Embeddings:\n\n    def __init__(self,model_name=\'all-MiniLM-L6-v2\'):\n\n        self.embeddings=HuggingFaceEmbeddings(\n            model_name=f"sentence-transformers/{model_name}",\n            model_kwargs={\'device\':\'cuda\'},\n            encode_kwargs={\'normalize_embeddings\':True}\n        )\n\n    def create_chroma_db(self,chunks,persist_directory="../chroma_db"):\n        vectordb=Chroma.from_documents(\n            documents=chunks,\n            embedding=self.embeddings,\n            persist_directory=persist_directory\n        )\n        return vectordb\n'

In [8]:
"""
emb=Embeddings()
vectordb=emb.create_chroma_db(chunks)
"""

'\nemb=Embeddings()\nvectordb=emb.create_chroma_db(chunks)\n'

## 4.Hybrid Search - Dense (embeddings) + Sparse (BM25)

In [9]:
"""
from rank_bm25 import BM25Okapi
import numpy as np

class HybridRetriever:
    
    def __init__(self,vectorstore,documents):
        self.vectorstore=vectorstore
        self.documents=documents
        
        tokenized_docs=[doc.page_content.lower().split() for doc in documents]
        self.bm25=BM25Okapi(tokenized_docs)
        print(f"Hybrid Retriever ready with {len(documents)} documents")
        
        
    def retrieve(self,query:str,k=10,alpha=0.5):
        
        # Vector Search 
        dense_results=self.vectorstore.similarity_search_with_score(query,k=k*2)
        
        # BM25 Search
        tokenized_query=query.lower().split()
        bm25_scores=self.bm25.get_scores(tokenized_query)
        
        #Normalized Scores between 0-1 
        dense_scores=np.array([1/(1+score) for _,score in dense_results])
        if dense_scores.max()>dense_scores.min():
            dense_scores=(dense_scores-dense_scores.min())/(dense_scores.max()-dense_scores.min())
            
        if bm25_scores.max()>bm25_scores.min():
            bm25_scores=(bm25_scores-bm25_scores.min())/(bm25_scores.max()-bm25_scores.min())
            
        doc_scores={}
        # ADD dense scores
        for i, (doc, _) in enumerate(dense_results):
            doc_id=id(doc)
            doc_scores[doc_id]={'doc':doc,'score':alpha*dense_scores[i]}
            
        #ADD Sparse scores
        for i,doc in enumerate(self.documents):
            doc_id=id(doc)
            if doc_id in doc_scores:
                doc_scores[doc_id]['score']+=(1-alpha)*bm25_scores[i]
            else:
                doc_scores[doc_id]={'doc':doc,'score':(1-alpha)*bm25_scores[i]}
                
        # Sort by combined score 
        sorted_docs=sorted(doc_scores.values(),key=lambda x:x['score'],reverse=True)[:k]
        
        return[(item['doc'],item['score']) for item in sorted_docs]
        
"""

'\nfrom rank_bm25 import BM25Okapi\nimport numpy as np\n\nclass HybridRetriever:\n\n    def __init__(self,vectorstore,documents):\n        self.vectorstore=vectorstore\n        self.documents=documents\n\n        tokenized_docs=[doc.page_content.lower().split() for doc in documents]\n        self.bm25=BM25Okapi(tokenized_docs)\n        print(f"Hybrid Retriever ready with {len(documents)} documents")\n\n\n    def retrieve(self,query:str,k=10,alpha=0.5):\n\n        # Vector Search \n        dense_results=self.vectorstore.similarity_search_with_score(query,k=k*2)\n\n        # BM25 Search\n        tokenized_query=query.lower().split()\n        bm25_scores=self.bm25.get_scores(tokenized_query)\n\n        #Normalized Scores between 0-1 \n        dense_scores=np.array([1/(1+score) for _,score in dense_results])\n        if dense_scores.max()>dense_scores.min():\n            dense_scores=(dense_scores-dense_scores.min())/(dense_scores.max()-dense_scores.min())\n\n        if bm25_scores.max()>b

In [None]:
"""
hybrid = HybridRetriever(vectorstore=vectordb, documents=chunks)

query = "company leave policy for new employees"
results = hybrid.retrieve(query, k=5, alpha=0.6)

for doc, score in results:
    print(score," ",doc.page_content[:100])
"""

'"\nhybrid = HybridRetriever(vectorstore=vectordb, documents=chunks)\n\nquery = "company leave policy for new employees"\nresults = hybrid.retrieve(query, k=5, alpha=0.6)\n\nfor doc, score in results:\n    print(score," ",doc.page_content[:100])\n'

##  5.Re-ranking -Cross-Encoder models

In [None]:

"""
from sentence_transformers import CrossEncoder
from typing import List
class ReRanker:
    
    def __init__(self,model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
        
        print(f"Loading re-ranker model: {model_name}...")
        self.model=CrossEncoder(model_name)
        print("Re-Ranker loaded")
        
    def rerank(self,query:str,documents:List,top_n=5):
        
        pairs=[[query,doc.page_content] for doc in documents]
        
        scores=self.model.predict(pairs)
        
        scored_docs=list(zip(documents,scores))
        scored_docs.sort(key=lambda x : x[1],reverse=True)
        
        return [doc for doc,_ in scored_docs[:top_n]]


"""

W1025 00:20:30.908000 6136 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [39]:
"""
query = "company leave policy for new employees"
reranker=ReRanker()
top_docs=reranker.rerank(query,document,top_n=5)
print(top_docs[0])
"""


'\nquery = "company leave policy for new employees"\nreranker=ReRanker()\ntop_docs=reranker.rerank(query,document,top_n=5)\nprint(top_docs[0])\n'

## 6.Query Transformation - Multi-query, HyDE, Step-back prompting (API call HF)

In [14]:

from langchain_huggingface import HuggingFaceEndpoint
from dotenv import load_dotenv
import os

class QueryTransformer:
    
    def __init__(self,model_name="meta-llama/Llama-3.1-8B"):
        
        load_dotenv()
        hf_token=os.getenv("HF_TOKEN")
        
        if not hf_token:
            raise ValueError("HF_TOKEN is not found!!!!")
        
        self.client=HuggingFaceEndpoint(
            repo_id=model_name,
            task="text-generation",
            huggingfacehub_api_token=hf_token
        )
        self.model_name=model_name
        print(f"Using HF model : {model_name}")
        
    def _generate(self,prompt:str,max_new_token=256):
        
        reponse=self.client.invoke(prompt,max_new_tokens=max_new_token)
        return reponse.strip()
    
    def multi_query(self,original_query:str,num_queries=3):
        prompt=f"""Generate {num_queries} different versions of this question to retrieve relevant documents.
        Only output the questions, one per line,without numbering.
        Original question :{original_query}
        Alternative question :"""
        
        response = self._generate(prompt)
        queries = [q.strip() for q in response.split('\n') if q.strip()]
        queries=[original_query]+queries[:num_queries]
        return queries
    
    def hyde(self,query:str):
        prompt=f"""Write a detailed,factual answer to this question:
        Question : {query}
        Answer :"""
        return self._generate(prompt)
    
    
    def step_back(self,query:str):
        prompt=f"""Give this specific question , generate a broader,more general question:
        Specific question :{query}
        Broader question :"""
        
        return self._generate(prompt)
        


In [15]:
qt = QueryTransformer(model_name="meta-llama/Llama-3.1-8B")

queries = qt.multi_query("What is LangChain?")
print("Multi Queries:", queries)

answer = qt.hyde("What is LangChain?")
print("Hypothetical answer:", answer)

Using HF model : meta-llama/Llama-3.1-8B
Multi Queries: ['What is LangChain?', 'What is LNC?', 'Alternative question :What is LNC?', 'Alternative question :What is LNC?']
Hypothetical answer: LangChain is a Python-based framework that allows developers to build complex language models and AI-powered applications. It provides a set of APIs and modules that enable developers to create models that can generate and manipulate text, answer questions, and perform other natural language processing (NLP) tasks.
        The framework was developed by a team of researchers and engineers at OpenAI, and it is built on top of the OpenAI API, a powerful tool for building AI-powered applications. LangChain also includes a set of pretrained models and datasets, which can be used to train and fine-tune the models.
        Some of the key features of LangChain include:
        Language Modeling: LangChain includes a set of pretrained language models, such as BERT, GPT-3, and XLNet, which can be used to 

## 6.Query Transformation - Multi-query, HyDE, Step-back prompting (Local Ollama model)

In [None]:
from langchain_community.llms import Ollama

class LocalQueryTransformer:
    
    
    def __init__(self, model="llama3.2"):

        self.llm = Ollama(model=model, temperature=0)
        print(f"Using Ollama model: {model}")
    
    def multi_query(self, original_query: str, num_queries=3):
        
        prompt = f"""Generate {num_queries} different versions of this question to retrieve relevant documents.
                Only output the questions, one per line, without numbering.

                Original question: {original_query}

                Alternative questions:"""
        
        response = self.llm.invoke(prompt)
        
     
        queries = [q.strip() for q in response.split('\n') if q.strip() and '?' in q]
        queries = [original_query] + queries[:num_queries]
        
        return queries
    
    def hyde(self, query: str):
        
        prompt = f"""Write a detailed, factual answer to this question:

                Question: {query}

                Answer:"""
        
        hypothetical_answer = self.llm.invoke(prompt)
        return hypothetical_answer
    
    def step_back(self, query: str):
        
        prompt = f"""Given this specific question, generate a broader, more general question:

                Specific question: {query}

                Broader question:"""
        
        response = self.llm.invoke(prompt)
        return response.strip()

In [None]:
qt = QueryTransformer(model="llama3.2")  


queries = qt.multi_query("What is LangChain?")
print("Generated queries:", queries)


answer = qt.hyde("What is LangChain?")
print("Hypothetical answer:", answer)


broader_question = qt.step_back("What is LangChain?")
print("Broader question:", broader_question)

âœ… Using Ollama model: llama3.2
Generated queries: ['What is LangChain?', 'What is LangChain a part of?', 'How does LangChain work in the context of blockchain development?', 'Can you provide an overview of the features and capabilities of LangChain?']
Hypothetical answer: LangChain is an open-source, decentralized protocol that enables the creation of interoperable, scalable, and secure data storage and sharing networks for blockchain-based applications. It was developed by Lang Technologies, a company founded by Alex Brampton, a well-known figure in the blockchain space.

The primary goal of LangChain is to provide a standardized framework for building and connecting different blockchain networks, allowing developers to create seamless, interoperable experiences across various blockchains. This is achieved through the use of a novel data storage and sharing mechanism that enables the creation of decentralized, self-sustaining networks.

LangChain's core technology is based on a comb

## 7.Context Compression - LLM-based relevance filtering

In [26]:
from langchain.schema import Document
from typing import List

class ContextCompression:
    
    def __init__(self,llm):
        self.llm=llm
        
    def compress_documents(self,query:str,documents:List, max_docs=5):
        
        compressed=[]
        
        for doc in documents[:max_docs]:
            prompt = f"""Extract only the sentences that directly answer or are relevant to the question below. 
                    If no sentence is relevant, output "None". 
                    Do not include unrelated information.

                    Example:
                    Question: What is AI?
                    Document: Artificial intelligence (AI) enables machines to learn. Cars have engines.
                    Relevant sentences: Artificial intelligence (AI) enables machines to learn.

                    Now do the same.

                    Question: {query}

                    Document: {doc.page_content[:1500]}

                    Relevant sentences:"""
            try:
                relevant_text=self.llm.invoke(prompt)
                if relevant_text.strip():
                    compressed.append(Document(
                        page_content=relevant_text.strip(),
                        metadata=doc.metadata
                    ))
            except:
                compressed.append(doc)
        return compressed if compressed else documents[:max_docs]
    

In [27]:
llm = Ollama(model="llama3")

docs = [
    Document(page_content="RAG stands for Retrieval-Augmented Generation. It combines retrieval and generation. CNN is used for image tasks."),
    Document(page_content="Transformers are used in NLP. RAG helps improve LLM performance by grounding answers.")
]

compressor=ContextCompression(llm)
result=compressor.compress_documents("What is RAG?",docs)

In [28]:
for r in result:
    print("\n Compressed Documents \n ",r.page_content)


 Compressed Documents 
  RAG stands for Retrieval-Augmented Generation. It combines retrieval and generation. CNN is used for image tasks.

 Compressed Documents 
  Transformers are used in NLP. RAG helps improve LLM performance by grounding answers.


## 8.Generation with Citations - Answers with source attribution

In [29]:
from typing import List


class RAGGenerator:
    
    def __init__(self,model="llama3.2"):
        self.llm=Ollama(model=model,temperature=0)
        
    def generate_with_citations(self,query:str,context_docs:List):
        
        context_text=""
        for i,doc in enumerate(context_docs):
            source=doc.metadata.get('source','Unknown')
            context_text +=f"\n [Source {i+1}]: {source}\n{doc.page_content}\n"
            
        prompt=f"""Answer the question based Only on the context provided.
        Include citations like [Source X] after each claim.
        If the context doesn't contain the answer ,say "I cannot find this information in the provided source."
                                                
        Context:{context_text}
        Question:{query}
        Answer with citations: 
        """
        answer=self.llm.invoke(prompt)
        return answer 
        

In [None]:
from langchain.schema import Document

docs = [
    Document(
        page_content="RAG stands for Retrieval-Augmented Generation. It combines retrieval and generation models.",
        metadata={"source": "ai_notes.txt"}
    ),
    Document(
        page_content="Transformers are used in NLP and LLMs.",
        metadata={"source": "ml_intro.txt"}
    )
]

rag = RAGGenerator(model='llama3.2')

query ="What is RAG?"
answer=rag.generate_with_citations(query,docs)

print(answer)

According to the provided context, RAG (Retrieval-Augmented Generation) stands for Retrieval-Augmented Generation. It combines retrieval and generation models [Source 1].


##  9.Evaluation Metrics - MRR, Recall@K, answer quality

In [34]:
class RAGEvaluator:
    
    @staticmethod
    def calculate_mrr(retrieved_docs,relevant_doc_ids):
        for i,doc in enumerate(retrieved_docs):
            if doc.metadata.get('doc_id') in relevant_doc_ids:
                return 1/(i+1)
        return 0
    
    @staticmethod
    def calculate_recall_at_k(retrieved_docs,relevant_doc_ids,k=5):
        retrieved_ids=[doc.metadata.get('doc_id') for doc in retrieved_docs[:k]]
        relevant_retrieved=set(retrieved_ids)& set(relevant_doc_ids)
        return len(relevant_retrieved)/len(relevant_doc_ids) if relevant_doc_ids else 0
    
    @staticmethod
    def calculate_precision_at_k(retrieved_docs,relevant_doc_ids,k=5):
        retrieved_ids=[doc.metadata.get("doc_id") for doc in retrieved_docs[:k]]
        relevant_retrieved=set(retrieved_ids)&set(relevant_doc_ids)
        return len(relevant_retrieved)/k if k >0 else 0        

In [36]:
from langchain.schema import Document

retrieved_docs = [
    Document(page_content="About the Eiffel Tower.", metadata={'doc_id': 'D3'}),
    Document(page_content="French cuisine and recipes.", metadata={'doc_id': 'D2'}),
    Document(page_content="Paris is the capital of France.", metadata={'doc_id': 'D1'}),
    Document(page_content="Population of Germany.", metadata={'doc_id': 'D4'}),
    Document(page_content="Mountains of Italy.", metadata={'doc_id': 'D5'}),
]

relevant_doc_ids = ['D1', 'D3', 'D6']

mrr = RAGEvaluator.calculate_mrr(retrieved_docs, relevant_doc_ids)
recall = RAGEvaluator.calculate_recall_at_k(retrieved_docs, relevant_doc_ids, k=5)
precision =RAGEvaluator.calculate_precision_at_k(retrieved_docs, relevant_doc_ids, k=5)

print("Mean reciprocal rank : ",mrr)
print("Recall@k : ",recall)
print("Precision@k : ",precision)

Mean reciprocal rank :  1.0
Recall@k :  0.6666666666666666
Precision@k :  0.4


##    10.Complete Orchestration - Easy-to-use pipeline class