# **Retrieval-Augmented Generation (RAG) Pipeline**

## **Data Preprocessing**

#### **Load and Extract the Data**

In [37]:
from langchain.document_loaders import PyPDFLoader

def read_doc(pdf_name):
    """Load a PDF file and return its extracted contents.

    Args:
        pdf_name: Path of the PDF file to read.

    Returns:
        The result of ``PyPDFLoader(pdf_name).load()``. In this notebook
        that is a list of LangChain ``Document`` objects, one per PDF page,
        each carrying the page text and the file metadata.
    """
    # Build the loader for the given file, then extract every page.
    pdf_loader = PyPDFLoader(pdf_name)
    return pdf_loader.load()


In [38]:
documents = read_doc("INTRODUCTION.pdf")
documents

[Document(metadata={'producer': 'Acrobat Distiller 6.0 (Windows)', 'creator': 'Acrobat PDFMaker 6.0 for Word', 'creationdate': '2008-01-18T11:47:46-08:00', 'author': 'user', 'company': 'Central Texas College', 'moddate': '2015-03-06T16:10:22-08:00', 'title': 'Urdu Introduction – information needed:', 'source': 'INTRODUCTION.pdf', 'total_pages': 29, 'page': 0, 'page_label': '1'}, page_content='1\nSRI LANKAN FAMILIARIZATION COURSE \nIntroduction To  \nSRI LANKA \n \nGEOGRAPHY  \n \nSri Lanka is a 25,000 sq mi (65, 610 sq. km) tropical island paradise and a major tourist destination.  \ne. It is just off the southeastern coast of India between five and 10 degrees north of the equator. It is \nstrategically located near Indian Ocean sea lanes. It is slightly larger than West Virginia.  It has a \nbeautiful coast line of 1,340 kilometers which attracts a lot of tourists all year around. Sinharaja, \nwhich is one of world’s protected tropical rain forests, is located in Sri Lanka.  \n \nThe 

In [39]:
len(documents)

29

#### **Chunking the Extracted Data**

In [40]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunking(extracted_documents, chunk_size=135, chunk_overlap=40):
    """Split loaded documents into small, overlapping chunks for embedding.

    Args:
        extracted_documents: Documents to split (for example the output of
            ``read_doc``).
        chunk_size: Maximum length of each chunk, as measured by the
            splitter's length function (characters by default). Defaults to
            135, the value this notebook was originally run with.
        chunk_overlap: Amount of content shared between consecutive chunks so
            that context is not lost at chunk boundaries. Defaults to 40.

    Returns:
        The chunked documents returned by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_documents)


### Chunk size refers to the number of characters or tokens in each piece (chunk) of text that you split your data into.
###    Think of it like cutting a long article into small paragraphs.
###    Example: If chunk size = 800 → each chunk contains 800 characters.



### Chunk overlap means how much content is shared between consecutive chunks.
###    It's used to maintain context between chunks.
###    Example:
###        Chunk 1 → characters 0 to 800
###        Chunk 2 → characters 750 to 1550 (overlap of 50)

### This overlap helps ensure:
###    You don’t cut off a sentence or thought between chunks.
###    The model has some context from the previous chunk.



In [41]:
chunked_documents = chunking(documents)
chunked_documents

[Document(metadata={'producer': 'Acrobat Distiller 6.0 (Windows)', 'creator': 'Acrobat PDFMaker 6.0 for Word', 'creationdate': '2008-01-18T11:47:46-08:00', 'author': 'user', 'company': 'Central Texas College', 'moddate': '2015-03-06T16:10:22-08:00', 'title': 'Urdu Introduction – information needed:', 'source': 'INTRODUCTION.pdf', 'total_pages': 29, 'page': 0, 'page_label': '1'}, page_content='1\nSRI LANKAN FAMILIARIZATION COURSE \nIntroduction To  \nSRI LANKA \n \nGEOGRAPHY'),
 Document(metadata={'producer': 'Acrobat Distiller 6.0 (Windows)', 'creator': 'Acrobat PDFMaker 6.0 for Word', 'creationdate': '2008-01-18T11:47:46-08:00', 'author': 'user', 'company': 'Central Texas College', 'moddate': '2015-03-06T16:10:22-08:00', 'title': 'Urdu Introduction – information needed:', 'source': 'INTRODUCTION.pdf', 'total_pages': 29, 'page': 0, 'page_label': '1'}, page_content='SRI LANKA \n \nGEOGRAPHY  \n \nSri Lanka is a 25,000 sq mi (65, 610 sq. km) tropical island paradise and a major tourist d

In [42]:
len(chunked_documents)             #### number of chunkes

376

## **Indexing**

#### **Selecting the Embedding Model**

#### Select and Download an Embedding Model from Hugging Face 

In [43]:
"""
from langchain_community.embeddings import SentenceTransformerEmbeddings

embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
"""

'\nfrom langchain_community.embeddings import SentenceTransformerEmbeddings\n\nembedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")\n'

#### Save the Selected Embedding Model Locally

In [44]:
"""
embedding_model_path = "./local_embedding_model"
sentence_transformer_model = embedding_model.client   # Access the underlying SentenceTransformer model inside LangChain's wrapper
sentence_transformer_model.save(embedding_model_path)
"""


'\nembedding_model_path = "./local_embedding_model"\nsentence_transformer_model = embedding_model.client   # Access the underlying SentenceTransformer model inside LangChain\'s wrapper\nsentence_transformer_model.save(embedding_model_path)\n'

#### Load the Saved Embedding Model

In [45]:
from langchain_huggingface import HuggingFaceEmbeddings

# `SentenceTransformerEmbeddings` imported from `langchain.embeddings` is a
# deprecated alias; `HuggingFaceEmbeddings` from `langchain_huggingface` is the
# maintained replacement and wraps the same sentence-transformers model.
embedding_model_path = "./local_embedding_model"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_path)  # Load from local path


### Hugging Face is an open-source platform and company that provides tools, models, datasets, and infrastructure for building machine 
### learning (especially natural language processing) applications — with a strong focus on Large Language Models (LLMs).


#### Check the Embedding Model 

In [46]:
example_embedding = embedding_model.embed_query("Hello world")
example_embedding

[-0.03447731211781502,
 0.03102317824959755,
 0.006734973285347223,
 0.026108987629413605,
 -0.03936202451586723,
 -0.16030248999595642,
 0.06692396104335785,
 -0.006441520527005196,
 -0.047450557351112366,
 0.014758833684027195,
 0.0708753690123558,
 0.05552764981985092,
 0.019193340092897415,
 -0.026251304894685745,
 -0.010109521448612213,
 -0.026940487325191498,
 0.022307438775897026,
 -0.022226588800549507,
 -0.1496925950050354,
 -0.017493003979325294,
 0.007676261477172375,
 0.054352227598428726,
 0.003254449460655451,
 0.031725913286209106,
 -0.0846213772892952,
 -0.029406029731035233,
 0.05159557983279228,
 0.0481240414083004,
 -0.0033148375805467367,
 -0.05827924236655235,
 0.041969284415245056,
 0.02221062034368515,
 0.1281888782978058,
 -0.0223389845341444,
 -0.01165628433227539,
 0.06292835623025894,
 -0.03287626802921295,
 -0.09122604876756668,
 -0.031175415962934494,
 0.052699532359838486,
 0.047034814953804016,
 -0.08420310914516449,
 -0.030056163668632507,
 -0.0207448359

In [47]:
vector_dimension = len(example_embedding)    ### This length should be same as pinecone vector DB dimension
vector_dimension

384

#### **Create the Pinecone Vector DB**

#### Load the Environment Variables

In [48]:
from dotenv import load_dotenv

load_dotenv()

True

#### Create a Connection with the Pinecone Account

In [49]:
import os

from pinecone import Pinecone

# The key is expected to be in the environment (e.g. from the .env file loaded earlier).
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    # Stop here with a clear message instead of handing None to the client.
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Add it to the .env file (or the environment) before running this cell."
    )

# Create the connection under this Pinecone API key.
pinecone_client = Pinecone(api_key=pinecone_api_key)

### Pinecone() also falls back to the PINECONE_API_KEY environment variable when api_key is omitted;
### the key is passed explicitly here so that a missing key is detected and reported in this cell.

#### Create the Index If not Created Under Selected Pinecone connection 

In [50]:
from pinecone import ServerlessSpec

index_name = "sri-lanka-informations"

# Only create the index on the first run; later runs reuse the existing one.
index_already_exists = pinecone_client.has_index(index_name)
if not index_already_exists:
    # The dimension must equal the embedding length measured above.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pinecone_client.create_index(
        name=index_name,
        dimension=vector_dimension,
        metric="cosine",
        spec=serverless_spec,
    )


### index = pinecone_client.Index(index_name)  This creates a connection to your existing or created Pinecone index
### it is necessary if you want to interact with the index after creating it below as:
###   Insert (upsert) vectors
###   Search (query) vectors
###   Delete vectors
###   Update vectors

### Example:
### index.upsert(vectors=your_data)        

#### **Embed the Chunked Documents and Upload Them into the Pinecone Vector DB**

In [51]:
"""
from langchain_pinecone import PineconeVectorStore

vectorstore = PineconeVectorStore.from_documents(
    documents=chunked_documents,
    embedding=embedding_model,
    index_name=index_name
)
"""
### * We don't need write pinecone_api_key = pinecone_api_key in PineconeVectorStore.from_documents() because when we use langchain_pinecone and load_dotenv()
### then it automatically select the pinecone api key from .env file.

### More we don't need to use index = pinecone_client.Index(index_name) to take the access for changing the index. Because when we use PineconeVectorStore in
### langchain_pinecone then it automatically does the following under the hood:
###   1. Initializes the Pinecone client using the API key from the environment (described in * above)
###   2. Connects to the index with the name you provided (index_name)
###   3. Upserts (uploads) your document embeddings
###   4. And finally, returns a LangChain-compatible vector store (call .as_retriever() on it to get a retriever)

### So you don’t need to manually call:
### pinecone_client = Pinecone(...)
### index = pinecone_client.Index(index_name)

 

'\nfrom langchain_pinecone import PineconeVectorStore\n\nvectorstore = PineconeVectorStore.from_documents(\n    documents=chunked_documents,\n    embedding=embedding_model,\n    index_name=index_name\n)\n'

## **Defining the Retrieval Request**

#### Import the Created Index in Pinecone Vector DB

In [52]:
from langchain_pinecone import PineconeVectorStore

# Reuse `index_name` from the index-creation cell instead of repeating the
# literal, so the index that is created and the index that is queried cannot
# drift apart.
import_index = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding_model,
)

### We don't need write pinecone_api_key = pinecone_api_key in PineconeVectorStore.from_existing_index() because when we use langchain_pinecone and load_dotenv()
### then it automatically select the pinecone api key from .env file.

### So you don’t need to manually call:
### pinecone_client = Pinecone(...)
### index = pinecone_client.Index(index_name)

#### Create a Retriever Connection 

In [53]:
retriever = import_index.as_retriever(search_type = "similarity", search_kwargs = {"k":3})

### that line is where you turn your Pinecone index connection into a Retriever object that LangChain can use in your RAG 
### pipeline.

### search_type="similarity"
### Means the retriever will find chunks whose vector embeddings are most similar to the user query embedding.

### search_kwargs={"k": 3}
### Means "return the top 3 most similar chunks".
### If your chunk size is ~400 tokens and k=3 → your LLM will get about 1,200 tokens as your retrieved context.

### 3️⃣ Why This Step is Important
### The retriever is the bridge between:
###    Your stored document embeddings in Pinecone.
###    Your question-answering chain (LLM + prompt).
### When the user asks a question:
###    Retriever turns the question into an embedding vector.
###    Looks up the closest k vectors in Pinecone.
###    Returns those chunks as {context} for the LLM.

#### Check the Created Retriever

In [54]:
retriever.invoke("How climate of Sri Lanka?")

[Document(id='212f01c6-4bca-469d-be31-10828898cbf0', metadata={'author': 'user', 'company': 'Central Texas College', 'creationdate': '2008-01-18T11:47:46-08:00', 'creator': 'Acrobat PDFMaker 6.0 for Word', 'moddate': '2015-03-06T16:10:22-08:00', 'page': 1.0, 'page_label': '2', 'producer': 'Acrobat Distiller 6.0 (Windows)', 'source': 'INTRODUCTION.pdf', 'title': 'Urdu Introduction – information needed:', 'total_pages': 29.0}, page_content='Description \n \nCLIMATE  \nThe climate of Sri Lanka is tropical with rains from the northeast monsoon from December to'),
 Document(id='dce9d650-0879-43a1-8fd7-379bd0cfe680', metadata={'author': 'user', 'company': 'Central Texas College', 'creationdate': '2008-01-18T11:47:46-08:00', 'creator': 'Acrobat PDFMaker 6.0 for Word', 'moddate': '2015-03-06T16:10:22-08:00', 'page': 1.0, 'page_label': '2', 'producer': 'Acrobat Distiller 6.0 (Windows)', 'source': 'INTRODUCTION.pdf', 'title': 'Urdu Introduction – information needed:', 'total_pages': 29.0}, page_

## **Preparing the LLM**

#### **Selecting the LLM**

#### Login to the Hugging Face

In [55]:
from huggingface_hub import login

login(os.getenv("HUGGINGFACE_HUB_TOKEN"))

### The login() function requires you to pass the token as a string, and it doesn’t automatically read from environment variables

#### Selecting and Download the LLM from Hugging Face

In [56]:
"""
from transformers import pipeline

LLM_name = "google/flan-t5-base"
chatmodel = pipeline(task = "text2text-generation", model = LLM_name)
"""


### "text2text-generation" is a task type in Hugging Face’s transformers library. It tells the pipeline() that the model takes text 
### input and produces text output.

### These models are trained to treat everything as a text-in → text-out problem. That includes:
###    Translation
###    Question answering
###    Summarization
###    Explanation
###    Instruction 

### Example:
### response = chatmodel("Translate English to French: I love you.")
### print(response[0]["generated_text"])
### Output: "Je t'aime"

### Text task
#| Task                         | Description                                                    | Example Models            |
#| ---------------------------- | -------------------------------------------------------------- | ------------------------- |
#| `"text-classification"`      | Classifies text into categories (e.g., sentiment analysis)     | BERT, RoBERTa, DistilBERT |
#| `"zero-shot-classification"` | Classify text into user-defined labels without retraining      | BART, RoBERTa             |
#| `"text-generation"`          | Generate text from a prompt (like ChatGPT-style completion)    | GPT-2, GPT-J, Mistral     |
#| `"text2text-generation"`     | Text in → Text out (used for T5-like models)                   | T5, FLAN-T5, BART         |
#| `"translation"`              | Translate text between languages                               | MarianMT, Helsinki-NLP    |
#| `"summarization"`            | Summarize long text into short form                            | BART, T5                  |
#| `"question-answering"`       | Extract answer from context given a question                   | BERT, DistilBERT          |
#| `"conversational"`           | Multi-turn dialogue (e.g., for chatbots)                       | Blenderbot, DialoGPT      |
#| `"fill-mask"`                | Predict missing words in a sentence (masked language modeling) | BERT, RoBERTa             |

### Image task
#| Task                     | Description                    | Example Models      |
#| ------------------------ | ------------------------------ | ------------------- |
#| `"image-classification"` | Classifies image content       | ViT, ConvNeXt       |
#| `"object-detection"`     | Detects objects in an image    | DETR, YOLO, OWL-ViT |
#| `"image-segmentation"`   | Labels each pixel of an image  | SegFormer, DPT      |
#| `"image-to-text"`        | Generates captions from images | BLIP, GIT           |

### Audio task
#| Task                             | Description                                 | Example Models    |
#| -------------------------------- | ------------------------------------------- | ----------------- |
#| `"automatic-speech-recognition"` | Converts spoken audio to text               | Whisper, Wav2Vec2 |
#| `"audio-classification"`         | Classifies sound type (e.g., music, speech) | Wav2Vec2, Hubert  |

### Multimodal task
#| Task                            | Description                             | Example Models  |
#| ------------------------------- | --------------------------------------- | --------------- |
#| `"document-question-answering"` | Answer questions from scanned documents | Donut, LayoutLM |
#| `"visual-question-answering"`   | Answer questions based on an image      | OFA, BLIP       |


'\nfrom transformers import pipeline\n\nLLM_name = "google/flan-t5-base"\nchatmodel = pipeline(task = "text2text-generation", model = LLM_name)\n'

#### Saving the Downloaded LLM 

In [57]:
"""
### Saving path
LLM_path = "./flan-t5-base-local"

### Save the model
model = chatmodel.model
model.save_pretrained(LLM_path)

### Save the tokenizer
tokenizer = chatmodel.tokenizer
tokenizer.save_pretrained(LLM_path)

"""

### 🔤 What Is a Tokenizer?
###    A tokenizer is a tool that converts text into tokens — which are smaller chunks (like words or subwords) — so that language models
###    like ChatGPT or Flan-T5 can understand and process them.

### 🧠 Why Do We Need a Tokenizer?
###    Machine learning models don’t understand text directly — they only understand numbers.
###    So, a tokenizer does two key jobs:
###    Text → Tokens → Numbers (for input to the model)
###    Numbers → Tokens → Text (for decoding the model’s output)

### Saving a tokenizer = saving the text-to-token logic so your model can use it again and again without relying on downloading from 
###  Hugging Face.

'\n### Saving path\nLLM_path = "./flan-t5-base-local"\n\n### Save the model\nmodel = chatmodel.model\nmodel.save_pretrained(LLM_path)\n\n### Save the tokenizer\ntokenizer = chatmodel.tokenizer\ntokenizer.save_pretrained(LLM_path)\n\n'

#### Loading the Saved LLM

In [58]:

from transformers import pipeline

LLM_path = "./flan-t5-base-local"

loaded_LLM = pipeline(task="text2text-generation", model= LLM_path, max_new_tokens = 100)


Device set to use cpu


#### Convert Hugging Face Model into Langchain-Compatible Object

In [59]:
from langchain_huggingface import HuggingFacePipeline

# Wrap the transformers pipeline so LangChain chains can call it as an LLM.
chatmodel = HuggingFacePipeline(pipeline=loaded_LLM)

### A raw Hugging Face pipeline (such as one created with transformers.pipeline) cannot be used directly
### in a RAG workflow inside LangChain (a framework for building applications with LLMs).
 

### The HuggingFacePipeline class from the langchain_huggingface package wraps your loaded_LLM
### pipeline so that it works as a LangChain-compatible object. You can then use it in:
###   RAG chains
###   Prompt templates
###   Document chains
###   Retrieval chains


#### Defining the Input Structure of the LLM

In [60]:
from langchain.prompts import PromptTemplate

# Instruction template sent to the LLM. {context} receives the retrieved
# chunks and {input} receives the user's question.
# Instruction 4 previously read "EXACTLY complete and clear sentences" (the
# count was missing); it now says "three", matching the worked example shown
# later in this cell.
system_prompt = """
You are an assistant that answers questions about Sri Lanka.

Instructions:
1. Use ONLY the information from the document 'Introduction to Sri Lanka'.
2. The document covers: geography, history, culture, economy, religion, transportation, health care, education, and society.
3. Do NOT use outside knowledge or make up information.
4. Answer in EXACTLY three complete and clear sentences.
5. If the answer is not found in the document, reply:
   "The document does not provide this information."

Context:
{context}

Question:
{input}

Answer:
"""

# Strip the leading/trailing newlines added by the triple-quoted literal.
prompt = PromptTemplate(
    input_variables=["context", "input"],
    template=system_prompt.strip()
)



### This is the template used to build the input for the LLM. It contains both the user's question
### and the instructions the LLM should follow when answering. These instructions let us steer the model's behaviour.

### When a user asks a question, the question is inserted through {input}, and the corresponding chunks retrieved
### from the Pinecone vector DB are joined together into one block of text (not separate {chunk1}, {chunk2}, {chunk3}
### variables) and inserted through {context} into system_prompt. The input to the LLM then looks like this:

"""
You are an assistant that answers questions about Sri Lanka.

Instructions:
1. Use ONLY the information from the document 'Introduction to Sri Lanka'.
2. The document covers: geography, history, culture, economy, religion, transportation, health care, education, and society.
3. Do NOT use outside knowledge or make up information.
4. Answer in EXACTLY three complete and clear sentences.
5. If the answer is not found in the document, reply:
   "The document does not provide this information."

Context:
[retrieved chunk 1 text]
[retrieved chunk 2 text]
[retrieved chunk 3 text]

Question: 
What is XYZ?
"""
 

'\nYou are an assistant that answers questions about Sri Lanka.\n\nInstructions:\n1. Use ONLY the information from the document \'Introduction to Sri Lanka\'.\n2. The document covers: geography, history, culture, economy, religion, transportation, health care, education, and society.\n3. Do NOT use outside knowledge or make up information.\n4. Answer in EXACTLY three complete and clear sentences.\n5. If the answer is not found in the document, reply:\n   "The document does not provide this information."\n\nContext:\n[retrieved chunk 1 text]\n[retrieved chunk 2 text]\n[retrieved chunk 3 text]\n\nQuestion: \nWhat is XYZ?\n'

#### Combining Retrieved Documents, User Question, and Instructions and pass to the LLM

In [61]:
from langchain.chains.combine_documents import create_stuff_documents_chain


question_answer_chain = create_stuff_documents_chain(chatmodel, prompt)

### What create_stuff_documents_chain does:
###    Think of it as a glue function between your retrieved documents and your LLM prompt.

### This glue function work as below:
###   1. When retriever gives you chunks from Pinecone:
###      Those chunks are LangChain Document objects with .page_content (text) and optional metadata.
###      You can't just "throw" them at the LLM — they need to be merged into the {context} placeholder in your prompt.

###   2. create_stuff_documents_chain does this merging automatically:
###      Joins all chunk texts together into one block ({context}).
###      Inserts that block into the prompt.
###      Sends the final, filled-in prompt to your chatmodel (LLM).
###      Collects the LLM's output as your answer

## **Connecting the Prepared LLM with Pinecone Vector DB**

In [62]:
from langchain.chains import create_retrieval_chain

rag_chain = create_retrieval_chain(retriever, question_answer_chain)


### 2️⃣ Purpose of create_retrieval_chain()
### It combines:
###    The retrieval step (search Pinecone for relevant chunks).
###    The generation step (pass chunks + question + prompt to LLM via question_answer_chain).
### You get a single callable object (rag_chain) that:
###    Takes a user’s question.
###    Automatically retrieves top-k chunks.
###    Passes them + question + prompt into your LLM.
###    Returns the final grounded answer.

#### Check the Connection from a User Input

In [63]:
response = rag_chain.invoke({"input":"How climate of Sri Lanka?"})
print(response["answer"])

tropical


#### Errors

In [64]:
### TypeError: messages=[SystemMessage(...), HumanMessage(...)] have the wrong format...

### ✅ Why This Happens
### The error is occurring because chatmodel = pipeline(...) from Hugging Face Transformers is not compatible with LangChain's chat 
### interface, especially when used with ChatPromptTemplate, which generates chat-style messages (SystemMessage, HumanMessage, etc).
### However, you're using create_stuff_documents_chain() which expects the model to be a LangChain-compatible LLM — not a Hugging Face pipeline.

### ✅ Solution: Use HuggingFacePipeline with LangChain
### You need to wrap your Hugging Face pipeline into a LangChain-compatible model using HuggingFacePipeline.

### from langchain_huggingface import HuggingFacePipeline

# Load HF model locally
#local_dir = "./flan-t5-base-local"
#hf_pipeline = pipeline(task="text2text-generation", model=local_dir)

# Wrap the HF pipeline to be LangChain-compatible
#chatmodel = HuggingFacePipeline(pipeline=hf_pipeline)


In [65]:
### Token indices sequence length is longer than the specified maximum sequence length for this model (629 > 512)

### ✅ Why This Happens
### flan-t5-base only supports up to 512 tokens. But LangChain's RAG pipeline combines retrieved documents and your question into one 
### input. If that combined input exceeds 512 tokens, you get the warning shown above.

### ✅ How to Fix It
### You need to limit the total number of tokens or limit the number of retrieved documents passed to the model. Here are 3 effective
### solutions:
###  Solution 1: Limit number of retrieved documents
###  Solution 2: Truncate context before sending to model
###  Solution 3: Use a larger model (if needed)

#### Notes

In [66]:
### Retrieval-Augmented Generation (RAG) is a method that improves the accuracy and relevance of LLM responses by combining 
### an external knowledge source (like a vector database) with the model’s generation ability. Instead of relying only on the
### LLM’s internal knowledge, RAG retrieves relevant facts from your dataset and feeds them into the model at query time.

### 🎯 Core Purpose of RAG:
###    Enhance accuracy → Avoids hallucinations by grounding answers in real documents.
###    Keep answers up-to-date → You can update your database without retraining the LLM.
###    Make answers domain-specific → Tailors the LLM to a particular topic or dataset.
###    Improve trust → Sources can be shown to the user for transparency.

### ⚙️ Core Steps of RAG
###     1. Data Preprocessing
###          Load data, data cleaning, data extracting (eg. PDFs, websites)
###     2. Indexing
###          Selecting embedding model, Create Pinecone vector DB, embedding chunked data and store in pinecone vector DB.
###     3. Defining the retrieval request
###     4. Preparing the LLM
###     5. Connecting prepared LLM with Pinecone vector DB
###     6. Check the connection  

In [67]:
### When you're developing an LLM-based chatbot, especially one focused on domains like medicine, the reliability of the chatbot’s 
### output depends on several important factors. Here's a breakdown of the key elements:

### ✅ 1. Quality and Relevance of the Data (Documents / Knowledge Base)
###    Garbage in, garbage out applies here.
###    If your chatbot is connected to a vector database (like Pinecone), the quality of the documents you upload directly affects the accuracy.
###    For example: Uploading a clear, verified Diagnostic Tests Guide PDF will result in better answers than using random or outdated internet content.

### ✅ 2. Prompt Engineering
###    The system prompt or instructions you give the LLM heavily guide how it responds.
###    For example:
###       "You are a medical assistant. Use the retrieved context to answer the user question concisely. If unsure, say you don't know."
###    Good prompts improve relevance, safety, and structure of output.

### ✅ 3. LLM Capability (Model Power)
###    Model size, training data, and architecture matter.
###    Example:
###        google/flan-t5-base is small (~250M params), so it's faster and cheaper but less accurate and less expressive.
###        GPT-4o or Mistral 7B are much stronger at understanding nuanced queries and giving richer answers.

### ✅ 4. Context Window and Chunk Size
###    If your LLM has a context limit of 512 tokens (like flan-t5-base), it may not understand or process long questions or multiple retrieved chunks.
###    Larger models like GPT-4o support 128k tokens, so they handle long context better.

### ✅ 5. Retrieval Quality (if using RAG)
###    If you're using Retrieval-Augmented Generation (RAG) with vector stores like Pinecone, the chatbot’s answer depends on the retrieved chunks.
###    Influenced by:
###        Embedding model quality (e.g., all-MiniLM-L6-v2)
###        Vector dimension match
###        Chunking strategy (chunk size, overlap, etc.)
###        Query rewriting / rephrasing to improve retrieval

In [68]:
### ❓ If your LLM supports 512 tokens, and your chunk size is 400 tokens, and you use 3 retrievers, does the LLM receive all 3 at once
### (1200 tokens)? Or one at a time?

### ✅ Answer:
### All 3 chunks are typically passed together in a single prompt — as part of the {context} in your system prompt — unless you 
### manually loop over them or limit the context size.

###🧠 Here's what's happening under the hood:
###    1.Retriever returns the top k relevant documents (in your case, 3 chunks, each 400 tokens).
###    2.These 3 documents are concatenated into one string and inserted into {context} in your prompt.
###    3.Your prompt becomes something like:
###       "You are an assistant... only use the context below...
###       Context:
###       [chunk 1, ~400 tokens]
###       [chunk 2, ~400 tokens]
###       [chunk 3, ~400 tokens]
###       User: What is XYZ?"

###    4.Total token count:
###    System + Prompt Template ≈ ~50 tokens
###    Context: 3 × 400 = 1200 tokens
###    Question: ~10–30 tokens
###    → Total: ~1300 tokens, which exceeds your LLM limit (512 tokens).

### ⚠️ What happens when you exceed the token limit?
###     If you're using a strict model like Flan-T5, it will truncate or fail.
###     You might see incomplete answers or errors.
###     Your LLM may only process the last part of the context (e.g., chunk 3 only), which leads to irrelevant or short answers.

### ✅ Solutions:
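###    Reduce chunk size
###    → Try chunk_size=150 and chunk_overlap=30. Then 3 chunks = ~450 tokens (if chunk_size is counted in tokens; with a character-based splitter it is ~450 characters, which is even fewer tokens).
###    Reduce number of retrieved documents
###    → Use retriever = vectorstore.as_retriever(search_kwargs={"k": 1}) (or "k": 2).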
###    Summarize or rank chunks before passing to LLM
###    → (Advanced) Run a pre-step that merges or filters chunks into fewer tokens.
###    Switch to an LLM with larger context window
###    → Use a model like Mistral 7B (context ~8k tokens) if resources allow.

In [69]:
### The dimension of an embedding model — like 384, 768, 1024, etc. — plays a key role in how text is represented numerically. Let’s
### break down what happens when you use a low vs. high dimensional embedding model, especially in a chatbot or vector search setting
### (like Pinecone + LLM).

### 🔢 What Is Embedding Dimension?
### The embedding dimension is the size of the numeric vector that represents a piece of text (sentence, paragraph, etc.).
### For example:
###    all-MiniLM-L6-v2 → 384-dimensional
###    all-mpnet-base-v2 → 768-dimensional
###    BAAI/bge-large-en → 1024-dimensional

### 📉 Low-Dimensional Embeddings (e.g., 384)
###    ✅ Pros:
###       Smaller storage: Each vector is lighter (good for Pinecone costs).
###       Faster computation: Quicker similarity searches, distance calculations.
###       Lower memory usage: Useful for low-resource environments.
###    ❌ Cons:
###       Lower expressiveness: Might miss nuances in meaning.
###       More overlap between vectors: Especially in large datasets, different meanings might seem “close”.

### 📈 High-Dimensional Embeddings (e.g., 768–1024+)
###   ✅ Pros:
###    More expressive: Captures richer semantics, word context, syntax.
###    Better accuracy: Especially for dense documents or longer context.
###    Less collision: Similar-looking sentences are less likely to be misinterpreted.
###  ❌ Cons:
###    Slower & heavier: More memory, compute, and bandwidth.
###    Costly in vector DBs: Indexing and querying is more expensive (e.g., in Pinecone).
###    May overfit: On small datasets, large dimensions can lead to noisy similarity scores.

### ⚖️ Conclusion
###    Low-dimensional (384) → ✅ Good for quick, simple, low-resource tasks.
###    High-dimensional (768–1024) → ✅ Better for precision, dense queries, technical content (like your medical chatbot).

### 👉 But always match Pinecone’s index dimension to the model’s output. If the dimensions mismatch, you’ll get an error or misleading 
### results.


In [70]:
### chunk overlap plays a very important role in improving the context awareness and answer accuracy of your LLM-based chatbot, 
### especially when using retrieval-based systems (RAG) like yours.

### 🧩 What is chunk overlap?
### When splitting a long document into smaller chunks (e.g., 400 tokens each), chunk overlap means repeating a portion of the previous chunk in the next one.
### Example:
###    Chunk size = 400 tokens
###    Overlap = 50 tokens
### So, the chunks look like:
###    Chunk 1: tokens 0–399
###    Chunk 2: tokens 350–749 ← overlaps 50 tokens from chunk 1
###    Chunk 3: tokens 700–1099 ← overlaps 50 tokens from chunk 2

### ✅ Why is chunk overlap important?
###    1. Preserves context across boundaries
###        Sometimes, important information (definitions, names, conclusions) falls between two chunks. Without overlap, that 
###        information might get split and lost in both chunks.

###    2. Improves search recall (retrieval quality)
###         When embedding chunks for vector search (Pinecone), overlapping helps capture more meaningful embeddings — especially when:
###         Sentences are long
###         Sections span multiple paragraphs
###       This makes similarity search more accurate.

###    3.Reduces hallucinations in LLM answers
###     Because the retriever returns complete, overlapping context, the LLM can answer based on fully intact facts, not partial or 
###     clipped information.

### ⚠️ What if you don’t use chunk overlap?
###    You risk cutting off important sentences, especially around the boundaries.
###    Retrieval might miss critical context.
###    Answers may feel short, incomplete, or even incorrect.

In [71]:
### response time in your chatbot depends on several parts of the RAG pipeline, so you can reduce latency by optimizing both 
### your retrieval and your LLM inference.

### 1️⃣ Speed Up Retrieval from Pinecone
###     Reduce Retriever k
###   *     You’re currently using k=3 (3 document chunks). If your LLM can work well with less, try k=2 to reduce retrieval and prompt length.
###     Optimize Embedding Search
###         Make sure your Pinecone index metric matches your embedding type (cosine, dot product, or Euclidean).
###         Use smaller dimension embeddings (e.g., 384 instead of 768) if accuracy remains good — smaller vectors = faster search.
###   * Warm Up the Connection
###         Call the retriever once at startup with a dummy query to keep the Pinecone connection alive. This avoids first-query delays.

### 2️⃣ Optimize the LLM Inference
###     Choose a Faster Model
###         google/flan-t5-base is ~250M params — not huge, but you could try flan-t5-small (fewer parameters, faster) or other small instruction-tuned models.
###         Or use quantized models from Hugging Face (e.g., 8-bit or 4-bit weights).
###   * Lower max_new_tokens
###         If your responses don’t need to be long, set max_new_tokens=100 or lower to reduce generation time.
###     Load Model Locally
###        Avoid downloading from Hugging Face on every run — save locally and load from disk.

### 3️⃣ Preprocessing and Chunking Improvements
###     Reduce Chunk Size
###         Smaller chunks (e.g., 256 tokens instead of 400) mean faster LLM processing — but keep enough context for answers.
###     Smarter Prompt Templates
###        Keep your system prompt short but clear — large system prompts eat into token budget and slow down inference.

### 4️⃣ Parallel or Cached Operations
###     Use Retrieval Cache
###         Cache embeddings and retrieval results for repeated queries.
###     Parallelize Requests
###        If you have multiple user queries, send them in batches instead of sequentially.


In [72]:
                                           ############################### Python 3.10.0 ################################



################## required for app ###############
#python-dotenv==1.1.1    
### from dotenv import load_dotenv (load the environment variables in .env file)

#langchain==0.3.27        
### from langchain.prompts import PromptTemplate   (to create LLM system prompt) 
### from langchain.chains.combine_documents import create_stuff_documents_chain (combining retriever, user question, and instructions and pass to the LLM)
### from langchain.chains import create_retrieval_chain  (to connect the LLM with pinecone vector DB)
### from langchain.text_splitter import RecursiveCharacterTextSplitter ( to chunk extracted data) 

#transformers==4.55.0     
### from transformers import pipeline  (download a LLM from Hugging Face and load it locally)

#pinecone-client==6.0.0  
### from langchain_pinecone import PineconeVectorStore (to embed chunked data and upload it to pinecone simultaneously; also to import an existing index from pinecone)
### from pinecone import Pinecone (to create a connection with pinecone)
### from pinecone import ServerlessSpec (to create an index)

#langchain-huggingface==0.3.1 
### from langchain_huggingface import HuggingFaceEmbeddings  (to load the embedding model from locally which downloaded by huggingface) 
### from langchain_huggingface import HuggingFacePipeline   ( to convert huggingface model into langchain-compatible object)

#langchain-pinecone==0.2.11    
### langchain_pinecone

#sentence-transformers==5.1.0
### from sentence_transformers import SentenceTransformer (to download a embedding model from huggingface)  
### HuggingFaceEmbeddings    ( for embedding sentences, chunks)

#streamlit==1.48.0
### Model deploy



##### required only for the .ipynb file ##################
#huggingface-hub==0.34.3
### For login to the Hugging Face account

#ipykernel==6.30.1
### To add a kernel for Jupyter notebook files

#langchain-community==0.3.27
#pypdf==5.9.0
### from langchain.document_loaders import PyPDFLoader  ( pdf load and extract)

#ipywidgets==8.1.7
### to work with Jupyter notebook files


