# 🤖RAG ⚕️Healthbot Pipeline 
## Building a Retrieval-Augmented Generation (RAG) pipeline for a healthbot using LangChain, Pinecone, and Gemini.




## 1. Setup & Imports
Get current working directory and import document loaders and text splitter from LangChain.

In [1]:
%pwd

'c:\\Users\\LakshmanReddy\\OneDrive\\Documents\\AI-ML\\projects\\RAG PROJECT\\RAG PROJECT\\LLM-RAG-HEALTHBOT\\research'

In [2]:
from langchain.document_loaders import PyPDFLoader , DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


## 2. PDF Extraction
Define a function to extract documents from PDF files in a directory.

In [3]:
# Extract documents from all PDF files in a directory

def extract_from_pdf(file_path):
    """
    Load the PDF files found in the directory ``file_path``.

    A DirectoryLoader is configured with the ``*.pdf`` glob and with
    PyPDFLoader as the per-file loader class; the result of its ``load()``
    call is returned unchanged.
    """
    pdf_loader = DirectoryLoader(
        file_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    )
    return pdf_loader.load()
     

In [4]:
# Raw string: "\m" is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning. The raw form yields the same path text.
extracted_docs = extract_from_pdf(r"..\medical data")

  extracted_docs = extract_from_pdf("..\medical data")
Error loading file ..\medical data\Medical_book.pdf
  extracted_docs = extract_from_pdf("..\medical data")


ImportError: `pypdf` package not found, please install it with `pip install pypdf`


Display extracted documents and show number of extracted documents.

In [None]:
extracted_docs

In [None]:
len(extracted_docs)

## 3. Document Preprocessing
Import Document schema and define filter_to_minimal_docs to keep only source and page content.

In [None]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs):
    """
    Build a stripped-down copy of every document in ``docs``.

    Each returned Document carries the original ``page_content`` and a
    metadata dict with a single ``"source"`` key, whose value is ``None``
    when the input metadata has no such key. A new list is always returned.
    """
    # The metadata keyword comes first so the source lookup still happens
    # before page_content is read, as in the loop-based version.
    return [
        Document(
            metadata={"source": item.metadata.get("source")},
            page_content=item.page_content,
        )
        for item in docs
    ]

In [None]:
minimal_docs = filter_to_minimal_docs(extracted_docs)
minimal_docs[0]

## 4. Text Chunking
Define chunker function, chunk minimal documents, print number of chunks, and display first chunk’s content.

In [None]:
# Chunking the documents into smaller pieces

def chunker(docs, chunk_size=1200, chunk_oerlap=100):
    """
    Split ``docs`` into smaller chunks with RecursiveCharacterTextSplitter.

    ``chunk_size`` is forwarded as the splitter's ``chunk_size`` and
    ``chunk_oerlap`` as its ``chunk_overlap``. The parameter keeps its
    existing (misspelled) name so that keyword callers continue to work.

    Returns whatever ``split_documents(docs)`` produces.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_oerlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

In [None]:
text_chunks = chunker(minimal_docs)
print(f"Number of chunks: {len(text_chunks)}")

In [None]:
text_chunks[0].page_content

## 5. Embedding Model Setup
Import and initialize HuggingFace embeddings. Test embedding with a sample sentence. Display embedding vector size and sample values.

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
def download_embeddings():
    """
    Create and return the HuggingFace embeddings object used in this notebook.

    The model name is fixed to ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

embedding = download_embeddings()

In [None]:
embedding

In [None]:

vector = embedding.embed_query("This is an example sentence to be embedded.")
vector

In [None]:
print(len(vector))                         # Vector length; expected to be 384 for this model (the Pinecone index below is created with dimension=384)
print(vector[:5])                          # Print the first 5 dimensions of the vector

## 6. Environment Variables
Load .env file and read Pinecone and Gemini API keys.

In [None]:
import os
from dotenv import load_dotenv
load_dotenv() 

# Read both API keys from the process environment; each is None when the
# corresponding variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

## 7. Pinecone Index Setup
Initialize the Pinecone client and create the Pinecone index if it does not already exist.

In [None]:
from pinecone import Pinecone
# initialize pinecone client (pc)
pc = Pinecone(api_key=PINECONE_API_KEY)           


In [None]:
from pinecone import ServerlessSpec

index_name="medical"

# Create the index only when the client reports that it does not exist yet.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384,        # dimensions of the sentence transformer model
        metric="cosine",      # cosine similarity
        spec=ServerlessSpec(cloud="aws",region="us-east-1")
        
    )
# Handle to the index named above.
my_index=pc.Index(index_name)

## 8. Vector Store Creation
Import and create PineconeVectorStore from document chunks and embeddings. Explain docsearch usage.

In [None]:
from langchain_pinecone import PineconeVectorStore

# Build the vector store from the document chunks, using the embedding model
# and the Pinecone index name defined earlier in the notebook.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embedding
)
# docsearch is an instance of PineconeVectorStore, a LangChain wrapper for
# storing and searching document embeddings in a Pinecone index.



- Add more documents to the vector store if necessary.

In [None]:
new_doc = Document(
    page_content="This is the page content of the new document .",
    metadata={"source": "social media"}
)

In [None]:
docsearch.add_documents([new_doc])

## 9. Retrieval
- Create retriever from vector store. Retrieve data for a sample query.

In [None]:
# Retriever view of the vector store using similarity search; "k": 3 is
# presumably the number of documents returned per query (confirm against the LangChain docs).
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
retrieved_data = retriever.invoke("what is acne ?")
retrieved_data


## 10. LLM Setup
- Import and initialize Gemini chat model.

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
import os

# Gemini chat model; the API key is read from the GEMINI_API_KEY environment
# variable when this cell runs.
chat_model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    api_key=os.getenv("GEMINI_API_KEY")
)


## 11. RAG Chain Construction
- Import chain and prompt utilities. Define system and human prompts. Create document combination and retrieval chains.

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# System message template. The adjacent string literals are concatenated into
# a single string. "{context}" is a template placeholder, presumably filled
# with the retrieved documents by the stuff-documents chain (confirm).
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. keep the answer concise ."
    
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions above, followed by a
# human message whose text comes from the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
# Chain that combines the chat model with the prompt template.
question_answer_chain = create_stuff_documents_chain(chat_model, prompt)
# Full RAG chain: the retriever is paired with the question-answer chain.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

## 12. End-to-End QA
- Invoke RAG chain with a sample medical question. Print the generated answer.

In [None]:
# Run the RAG chain once; the question is passed under the "input" key.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})

# Print the value stored under the "answer" key of the chain's result.
print(response["answer"])