In [2]:
### RAG pipeline- Data Ingestion to Vector DB Pipeline

In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
## read all the pdfs inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents=[]
    pdf_dir=Path(pdf_directory)

    pdf_files=list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessig: {pdf_file.name}")
        try:
            loader=PyPDFLoader(str(pdf_file))
            documents=loader.load()

            #ADD SOURCE INFO TO METADATA{optional}
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = "pdf"

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")
        
        except Exception as e:
            print(f"Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents=process_all_pdfs("../data")

Found 4 PDF files to process

Processig: adhd.pdf
Loaded 7 pages

Processig: embedding.pdf
Loaded 16 pages

Processig: hypothyrodism.pdf
Loaded 8 pages

Processig: rag.pdf
Loaded 9 pages

Total documents loaded: 40


In [5]:
all_pdf_documents

[Document(metadata={'producer': 'National University of Ireland, Galway', 'creator': 'Arbortext Advanced Print Publisher 10.0.1465/W Unicode', 'creationdate': '2017-06-09T17:10:28+05:30', 'title': 'untitled', 'moddate': '2017-09-14T13:56:47+00:00', 'source': '..\\data\\pdfs\\adhd.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1', 'source_file': 'adhd.pdf', 'file_type': 'pdf'}, page_content='The impact of attention de ﬁcit hyperactivity\ndisorder (ADHD) in adulthood: a qualitative study\nC. Watters1, D. Adamis 2,*, F. McNicholas 3 and B. Gavin 3\n1 Psychology Services, Markievicz House, HSE, Sligo, Ireland\n2 Mental Health Services, HSE, Sligo, Ireland\n3 Child and Adolescent Psychiatry, School of Medicine, UCD, Dublin, Ireland\nObjectives. There is limited evidence of the unmet needs and experiences of adults with attention de ﬁcit hyperactivity\ndisorder (ADHD). Previous research in this area is predominantly quantitative by nature, few studies employing\nqualitative approaches. Th

In [6]:
#Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG  performance"""
    textt_splitter=RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len, #tells it to measure text length using Python len.
        separators=["\n\n","\n"," ",""]
    )
    split_docs=textt_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks") #how many documents into how many chunks

    #show example of a chunk
    if split_docs:
        print(f"\nExample chunks:")
        print(f"Content: {split_docs[0].page_content[:200]}...") #first 200 character of the chunks
        print(f"MetaData: {split_docs[0].metadata}") #meta data attached to the chunk

    return split_docs

In [7]:
chunks=split_documents(all_pdf_documents)
chunks

Split 40 documents into 204 chunks

Example chunks:
Content: The impact of attention de ﬁcit hyperactivity
disorder (ADHD) in adulthood: a qualitative study
C. Watters1, D. Adamis 2,*, F. McNicholas 3 and B. Gavin 3
1 Psychology Services, Markievicz House, HSE,...
MetaData: {'producer': 'National University of Ireland, Galway', 'creator': 'Arbortext Advanced Print Publisher 10.0.1465/W Unicode', 'creationdate': '2017-06-09T17:10:28+05:30', 'title': 'untitled', 'moddate': '2017-09-14T13:56:47+00:00', 'source': '..\\data\\pdfs\\adhd.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1', 'source_file': 'adhd.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'National University of Ireland, Galway', 'creator': 'Arbortext Advanced Print Publisher 10.0.1465/W Unicode', 'creationdate': '2017-06-09T17:10:28+05:30', 'title': 'untitled', 'moddate': '2017-09-14T13:56:47+00:00', 'source': '..\\data\\pdfs\\adhd.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1', 'source_file': 'adhd.pdf', 'file_type': 'pdf'}, page_content='The impact of attention de ﬁcit hyperactivity\ndisorder (ADHD) in adulthood: a qualitative study\nC. Watters1, D. Adamis 2,*, F. McNicholas 3 and B. Gavin 3\n1 Psychology Services, Markievicz House, HSE, Sligo, Ireland\n2 Mental Health Services, HSE, Sligo, Ireland\n3 Child and Adolescent Psychiatry, School of Medicine, UCD, Dublin, Ireland\nObjectives. There is limited evidence of the unmet needs and experiences of adults with attention de ﬁcit hyperactivity\ndisorder (ADHD). Previous research in this area is predominantly quantitative by nature, few studies employing\nqualitative approaches. Th

In [8]:
## Embedding and vectostore db

In [9]:
import os
import sys

# Add PyTorch DLL directory to PATH
torch_lib_path = os.path.join(sys.prefix, 'Lib', 'site-packages', 'torch', 'lib')
if os.path.exists(torch_lib_path):
    os.add_dll_directory(torch_lib_path)
    # Also add to PATH for older Python versions
    os.environ['PATH'] = torch_lib_path + os.pathsep + os.environ['PATH']

# Now import
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

In [10]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
class EmbeddingManager:
    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager

        Args:
        model_name:HuggingFace model name for sentence embedding
        """
        self.model_name=model_name
        self.model=None
        self._load_model()

    def _load_model(self): #_load_model is a protect function menation only accesible inside the class
        """Load the sentencetansformer model"""
        try:
            print(f"Loading Embedding model: {self.model_name}")
            self.model=SentenceTransformer(self.model_name)
            print(f"Model loaded succesfully. Mebedding DImension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading the model {self.model_name}: {e}")
            raise

    def generate_embedding(self,texts:List[str]) -> np.ndarray:
        """
        generate embedding for a list of texts

        args:
            texsts: list of text strings to embed

        returns:
            numpy array of embedding with shape (len(texts), embedding_dim) 
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embedding for {len(texts)} texts....")
        embeddings=self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
#Initialize the embeddings manager
embedding_manager=EmbeddingManager()
embedding_manager

Loading Embedding model: all-MiniLM-L6-v2




Model loaded succesfully. Mebedding DImension: 384


<__main__.EmbeddingManager at 0x1a32eedcdd0>

In [12]:
## VectorStore

In [13]:
class VectorStore:
    """Manages document embeddings in a chromaDB vector Store"""

    def __init__(self,collection_name:str="pdf_documents" ,persist_directory:str="../data/vector_store"): 
        #foldeer path
        #name for your storage box
        self.collection_name = collection_name
        self.persist_directory=persist_directory
        self.client=None
        self.collection=None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            #Create Persistent ChromaDB client
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client=chromadb.PersistentClient(path=self.persist_directory)#creating a clent which will have a refers to chromadb vector store 
            
            #Get or create collection
            self.collection=self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"Description": "PDF document embeddings for RAG"}
            )
            """
            1. self.client.get_or_create_collection(...)

            Asks ChromaDB: "Do you have a collection with this name?"
            If YES → Use the existing one
            If NO → Create a new one

            2. name=self.collection_name

            The name of the collection (e.g., "pdf_documents")

            3. metadata={"description": "PDF document embeddings for RAG"}

            Extra info about what this collection stores
            Like a label: "This box contains PDF embeddings for RAG"

            4. self.collection = ...

            Store the collection so you can use it later
            """
            print(f"Vector store initialized . Collection: {self.collection_name}")
            print(f"Exisiting documents in collection: {self.collection.count()}")
        
        except Exception as e:
            print(f"Existing initializing vector store: {e}")
            raise

    def add_documents(self,documents:List[Any],embeddings:np.ndarray):
        """
        Add documents to their embeddings to the vecotr sotre

        args:
            documents:kist of langchain documents
            embeddings:corresponding embeddings fpr thr documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match nunmber of embeddings")
        #Becoz documents or chunks should be equal to number of embeddings same as chunks
        print(f"Adding {len(documents)} to the vector store")

        #Prepare data for chromaDB
        ids=[]
        metadatas=[]
        documents_text=[]
        embeddings_list=[]

        for i,(doc,embedding) in enumerate(zip(documents,embeddings)):
            # enumerate counts while you loop and zip make the embedding numer and chunk together in a zip file and 
            # i is the count and (doc,embedding) doc- one documents and embedding- the documents number

            """Generate Uniquie uiversal ID (uuid)"""
            doc_id=f"doc_{uuid.uuid4().hex[:8]}_{i}" #looks like doc_a3f5b2c1_0 ,,hex will converts(uuid) it to letters and numbers
            ids.append(doc_id)

            """Prepare metadata"""
            metadata=dict(doc.metadata) #the doc has laready some info so we copy it first
            metadata['doc_index'] = i # doc number like 0,1,2
            metadata['content_length']=len(doc.page_content)#len of text
            metadatas.append(metadata)

            """Document content"""
            documents_text.append(doc.page_content)

            """Embedding"""
            embeddings_list.append(embedding.tolist()) #chromadb wants regular list


        """ Add to collection"""
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")  # ← Indented!
            print(f"Total documents in collection: {self.collection.count()}")        # ← Indented!

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized . Collection: pdf_documents
Exisiting documents in collection: 297


<__main__.VectorStore at 0x1a33066dc10>

In [14]:
chunks

[Document(metadata={'producer': 'National University of Ireland, Galway', 'creator': 'Arbortext Advanced Print Publisher 10.0.1465/W Unicode', 'creationdate': '2017-06-09T17:10:28+05:30', 'title': 'untitled', 'moddate': '2017-09-14T13:56:47+00:00', 'source': '..\\data\\pdfs\\adhd.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1', 'source_file': 'adhd.pdf', 'file_type': 'pdf'}, page_content='The impact of attention de ﬁcit hyperactivity\ndisorder (ADHD) in adulthood: a qualitative study\nC. Watters1, D. Adamis 2,*, F. McNicholas 3 and B. Gavin 3\n1 Psychology Services, Markievicz House, HSE, Sligo, Ireland\n2 Mental Health Services, HSE, Sligo, Ireland\n3 Child and Adolescent Psychiatry, School of Medicine, UCD, Dublin, Ireland\nObjectives. There is limited evidence of the unmet needs and experiences of adults with attention de ﬁcit hyperactivity\ndisorder (ADHD). Previous research in this area is predominantly quantitative by nature, few studies employing\nqualitative approaches. Th

In [15]:
#Converts text to embedding
texts=[doc.page_content for doc in chunks]
texts

#generate the embeddings
embeddings=embedding_manager.generate_embedding(texts)

#store into the vectorDB
vectorstore.add_documents(chunks,embeddings)

Generating embedding for 204 texts....


Batches: 100%|██████████| 7/7 [00:09<00:00,  1.30s/it]


Generated embeddings with shape: (204, 384)
Adding 204 to the vector store
Successfully added 204 documents to vector store
Total documents in collection: 501


### Retriever Pipeline from VectorStore

In [16]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            
            top_k: Number of top results to return
            Too few (top_k=1): Might miss important info
            Too many (top_k=50): Too much irrelevant stuff, slows down the AI
            Just right (top_k=3-5): Good balance!

            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embedding([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0] # the actual text
                metadatas = results['metadatas'][0] #info about each document
                distances = results['distances'][0] #How far apart the number are
                ids = results['ids'][0] #the doc ids
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance #Distance = how far apart (lower is better)Similarity = how similar (higher is better)
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({ #ifthe doc i sismilar enough meaning distance small and similarity higher
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        }) #put all info aboiut doc in a list
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs #all the lsit of documents
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

#Takes the query ,finds similar doc,and gives them back to us

In [17]:
rag_retriever.retrieve("Tell me about thyroid")

Retrieving documents for query: 'Tell me about thyroid'
Top K: 5, Score threshold: 0.0
Generating embedding for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 30.77it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_655c5f00_116',
  'content': 'substantially burdening individuals and society ( 2, 3). Furthermore,\nRA is associated with a signi ﬁcantly higher mortality rate than the\ngeneral population, with approximately 40% of patients with RA\nsuccumbing to cardiovascular disease ( 4, 5). Thyroid dysfunction,\nencompassing hyperthyroidism, hypothyroidism, subclinical\nhyperthyroidism, and subclinical hypothyroidism, is a common\nendocrine disorder diagnosed pr imarily through biochemical\nindicators such as thyroid s timulating hormone (TSH),\ntriiodothyronine (T3), thyroxine (T4), free triiodothyronine (FT3),\nand free thyroxine (FT4). The prevalence of hyperthyroidism ranges\nfrom 0.2% to 1.3% ( 6, 7), while that of hypothyroidism varies from\n0.2 to 5.3% ( 6, 8). Hyperthyroidism and hypothyroidism could\nimpact various bodily systems, including the integumentary,\nmuscular, skeletal, cardiovascular, nervous, digestive, endocrine,\nand circulatory systems. While most studies have 

### Integration of VectorDbcontext pipeline with LLM output

In [18]:
#Simple TAGpipline with groq llm
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

# ChatGroq automatically reads GROQ_API_KEY from environment
llm = ChatGroq(model_name="llama-3.1-8b-instant",temperature=0.1,max_tokens=1024) #How creative/random the AI is

## 2 Simple RAG function:retrieve context + generate response (RAG+LLM)
def rag_simple(query,retriever,llm,top_k=3):
    # retrieve the content
    results=retriever.retrieve(query,top_k=top_k) #ask the retriever to find the doc matching query

    # Combine all document text
    context="\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to the question."
    
    #generate the answer using GroqLLM
    prompt=f"""Use the following contexgt to answer the question concisely.
    Context:
    {context}
    
    Question:{query}

    Answer:
    """
    response=llm.invoke(prompt)
    return response.content

In [19]:
answer=rag_simple("Tell me about hypothyroidism",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'Tell me about hypothyroidism'
Top K: 3, Score threshold: 0.0
Generating embedding for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 23.28it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Hypothyroidism is a condition where the thyroid gland does not produce enough thyroid hormones, which are essential for various bodily systems, including the integumentary, muscular, skeletal, cardiovascular, nervous, digestive, endocrine, and circulatory systems.


## Enchanced the RAG pipeline

In [None]:
## Enchanced RAG pipeline features--------
def rag_features(query,retriever,llm,top_k=5,min_score=0.2,return_context=False):
    """
    RAG pipeline with extra features:
    -Return answer,sources,confidence score,and optionally full context.

    score_threshold is a cutoff point that decides which search results are good enough to use.
    When your RAG system retrieves documents, it gives each one a relevance score (like a match percentage).
    The score_threshold filters out any results below that score.
    """
    results=retriever.retrieve(query,top_k=top_k,score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.','source':[],'confidence':0.0,'context':''}
    
    #Prepare context and sources
    context="\n\n".join(doc['content'] for doc in results)
    sources=[{
        'source':doc['metadata'].get('source_file',doc['metadata'].get('soource','unknown')),
        'page':doc['metadata'].get('page','unknown'),
        'score':doc['similarity_score'],
        'preview':doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score']for doc in results])

    #Generate answer
    prompt=f"""Use the following context to answer the question concisely.

    Context:
    {context}

    Question: {query}

    Answer:"""
    response=llm.invoke([prompt.format(context=context,query=query)])
    """
    invoke() is a method that runs/executes something and returns a result.
    llm.invoke() specifically:
    It sends a prompt to a language model (groq.ai) and gets back the generated response.
    
    Then llm.invoke(prompt) sends the complete prompt with the actual context and question to the LLM.
    So response.content would be the LLM's actual answer based on the real context and query.
    """

    output={
        'answer':response.content,
        'sources':sources,
        'confidence':confidence
    }
    if return_context:
        output['context'] = context
    return output

#Example usage:
result=rag_features("What is adhd?",rag_retriever,llm,top_k=3,min_score=0.1,return_context=True)
print('Answer',result['answer'])
print('Sources',result['sources'])
print('Confidence',result['confidence'])
print('Context Preview',result['context'][:300])

Retrieving documents for query: 'What is adhd?'
Top K: 3, Score threshold: 0.1
Generating embedding for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 33.57it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Answer Attention Deficit Hyperactivity Disorder (ADHD) is a neurodevelopmental disorder.
Sources [{'source': 'adhd.pdf', 'page': 0, 'score': 0.3984838128089905, 'preview': 'becoming aware of the stigma associated with ADHD can help clinicians improve upon individual treatment plans to\nmeet their patient’s needs. It is important to note that this sample experienced co-morbid diagnoses and so this may limit\nthe interpretation of the ﬁndings. Further research in this area...'}, {'source': 'adhd.pdf', 'page': 0, 'score': 0.3984838128089905, 'preview': 'becoming aware of the stigma associated with ADHD can help clinicians improve upon individual treatment plans to\nmeet their patient’s needs. It is important to note that this sample experienced co-morbid diagnoses and so this may limit\nthe interpretation of the ﬁndings. Further research in this area...'}, {'source': 'adhd.pdf', 'page': 4, 'score': 0.38451898097991943, 'preview': 'treatment. ‘Well I would say I had a huge resistance to\nP