## Setup: create a `.env` file containing the LangChain API key, the LangSmith URL, and any other required settings.

In [1]:
import os
import warnings
from dotenv import load_dotenv

# NOTE(review): KMP_DUPLICATE_LIB_OK is presumably set to avoid the OpenMP
# "duplicate runtime" abort when faiss is loaded next to another OpenMP-linked
# library — confirm it is still needed; it hides the conflict rather than fixing it.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Suppresses every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings("ignore")

# Load the settings from the .env file into the process environment.
# As the last expression, the call's return value is what the cell displays.
load_dotenv()

True

### Loading the Documents
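os.walk yields a (root, dirs, files) tuple for every directory it visits, so we unpack it into `root`, `dirs` and `files`.
`root` is the directory currently being visited (first the given path, i.e. `rag-dataset`, then each of its subdirectories), and `files` holds the names of the files inside that directory.
Because `files` contains bare file names, each name has to be joined with `root` to get a usable path.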

In [2]:
def find_pdfs(root_dir='rag-dataset'):
    """Return the paths of all PDF files found under ``root_dir``.

    The directory tree is walked recursively. The extension check is
    case-insensitive so files named ``*.PDF`` are not silently skipped, and
    the result is sorted so its order does not depend on the order in which
    the file system happens to list directory entries.

    Args:
        root_dir: Directory to search.

    Returns:
        Sorted list of paths, each one joined with the directory it was
        found in. Empty when ``root_dir`` does not exist or holds no PDFs.
    """
    return sorted(
        # `files` holds bare names, so prefix each with the directory being visited.
        os.path.join(root, file)
        for root, _dirs, files in os.walk(root_dir)
        for file in files
        if file.lower().endswith('.pdf')
    )


pdfs = find_pdfs('rag-dataset')

In [3]:
# Display the collected PDF paths.
pdfs

['rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf',
 'rag-dataset\\gym supplements\\2. High Prevalence of Supplement Intake.pdf',
 'rag-dataset\\health supplements\\1. dietary supplements - for whom.pdf',
 'rag-dataset\\health supplements\\2. Nutraceuticals research.pdf',
 'rag-dataset\\health supplements\\3.health_supplements_side_effects.pdf']

In [4]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load every PDF with PyMuPDFLoader and flatten the per-file results into a
# single list, keeping the order of `pdfs`.
docs = [
    page
    for pdf_path in pdfs
    for page in PyMuPDFLoader(pdf_path).load()
]

In [5]:
# Total number of documents loaded across all PDFs.
len(docs)

64

### Chunking the extracted Documents

Chunk overlap repeats a small piece of text from the end of one chunk at the start of the next chunk, which helps preserve context across chunk boundaries.

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings, named so they are easy to find and tune.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 80

# Break the loaded documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(docs)


In [7]:
# Number of chunks produced by the splitter.
len(chunks)

301

In [8]:
import tiktoken

# Compare the token count of the first loaded document with that of the first chunk.
# NOTE(review): this uses the gpt-4o-mini tokenizer; counts under a different
# model's tokenizer will differ, so treat the numbers as an estimate.
encoding = tiktoken.encoding_for_model('gpt-4o-mini')

first_doc_tokens = encoding.encode(docs[0].page_content)
first_chunk_tokens = encoding.encode(chunks[0].page_content)
len(first_doc_tokens), len(first_chunk_tokens)

(969, 294)

## Vector Embeddings for Documents

In [9]:
from langchain_ollama import OllamaEmbeddings
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [10]:
# Embedding client for the `nomic-embed-text` model served at the given
# base_url (a server must be reachable at that address for the calls to work).
embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url="http://localhost:11434")

# Embed one throwaway sentence; its length is used afterwards to size the faiss index.
test_vector = embeddings.embed_query("I believe this query will also generate a fixed number of vectors which will define the dimensions")

In [11]:
# Length of the probe embedding, i.e. the embedding dimensionality.
len(test_vector)

768

In [12]:
# Create a flat L2 faiss index whose dimensionality matches the probe embedding.
index = faiss.IndexFlatL2(len(test_vector))
# Number of vectors currently stored in the index.
index.ntotal


0

In [13]:
# Assemble the vector store from its parts: the embedding client, the faiss
# index created earlier, a fresh in-memory docstore, and an empty
# index-to-docstore-id mapping.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

In [14]:
# Sanity check: number of loaded documents vs. number of chunks about to be indexed.
len(docs), len(chunks)

(64, 301)

In [15]:
# Add all chunks to the vector store and keep the returned ids.
# NOTE(review): re-running this cell presumably adds the same chunks a second
# time (duplicates) — confirm, and rebuild `index`/`vector_store` before re-running.
ids = vector_store.add_documents(documents=chunks)

In [16]:
# Number of ids returned by add_documents.
print(len(ids))

301


## Similar Document Retrieval Process

In [17]:
question = "What is used to gain muscle mass?"
# Plain similarity search; as the last expression, the raw result list
# (documents with their metadata) is displayed as the cell output.
vector_store.search(query=question, search_type='similarity')

[Document(metadata={'source': 'rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf', 'file_path': 'rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf', 'page': 0, 'total_pages': 15, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'iLovePDF', 'creationDate': '', 'modDate': 'D:20241021113850Z', 'trapped': ''}, page_content='caffeine, which is found in many sports and food supplements. Caffeine reduces perceived\neffort, minimizes fatigue and pain, and proves to be effective for endurance and high-\nintensity activities, which is the choice of consumers [4].\nCreatine monohydrate is another well-known supplement used to gain muscle mass\nand support performance and recovery. It is known not to increase fat mass and remains\neffective even when taken in recommended doses [5]. Despite its popularity in the fitness\nFoods 2024, 13, 1424. https://doi.org/10.3390/foods13091424\nhttps://www.

In [18]:
# Run the similarity search for the current question and print only the text
# of each hit.
similar_chunks = vector_store.search(query=question, search_type='similarity')

for chunk in similar_chunks:
    # One call emits the text, a newline, then the "\n\n" spacer line.
    print(chunk.page_content, "\n\n", sep="\n")

caffeine, which is found in many sports and food supplements. Caffeine reduces perceived
effort, minimizes fatigue and pain, and proves to be effective for endurance and high-
intensity activities, which is the choice of consumers [4].
Creatine monohydrate is another well-known supplement used to gain muscle mass
and support performance and recovery. It is known not to increase fat mass and remains
effective even when taken in recommended doses [5]. Despite its popularity in the fitness
Foods 2024, 13, 1424. https://doi.org/10.3390/foods13091424
https://www.mdpi.com/journal/foods



market for weight-loss products, including dietary supplements. Among military service 
members, athletes and bodybuilders it is also common to ingest dietary sports supplements 
intended to burn fat and increase performance, muscle mass or strength. As examples, 53% 
of active-duty US Army soldiers report using at least one dietary supplement per week (60), 
and 64% of college students participating in ath

In [19]:
# Wrap the vector store as an MMR retriever configured with k=3 and fetch_k=100.
# NOTE(review): lambda_mult=1 is documented as the "minimum diversity" end of
# the MMR range, so this presumably ranks like plain similarity search — lower
# it (the documented default is 0.5) if diverse results are wanted; confirm.
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs = {'k': 3, 'fetch_k': 100, 'lambda_mult': 1})

In [20]:
# Query the retriever with the current question and print only the text of
# each returned chunk.
similar_chunks = retriever.invoke(question)

for chunk in similar_chunks:
    # One call emits the text, a newline, then the "\n\n" spacer line.
    print(chunk.page_content, "\n\n", sep="\n")

caffeine, which is found in many sports and food supplements. Caffeine reduces perceived
effort, minimizes fatigue and pain, and proves to be effective for endurance and high-
intensity activities, which is the choice of consumers [4].
Creatine monohydrate is another well-known supplement used to gain muscle mass
and support performance and recovery. It is known not to increase fat mass and remains
effective even when taken in recommended doses [5]. Despite its popularity in the fitness
Foods 2024, 13, 1424. https://doi.org/10.3390/foods13091424
https://www.mdpi.com/journal/foods



market for weight-loss products, including dietary supplements. Among military service 
members, athletes and bodybuilders it is also common to ingest dietary sports supplements 
intended to burn fat and increase performance, muscle mass or strength. As examples, 53% 
of active-duty US Army soldiers report using at least one dietary supplement per week (60), 
and 64% of college students participating in ath

In [21]:
question = "what is used to reduce weight?"
# Alternative questions to try:
# question = "what are side effects of supplements?"
# question = "what are the benefits of supplements?"
# question = "what are the benefits of BCAA supplements?"
# Retrieve chunks for the selected question.
# NOTE(review): `try_data` is not read anywhere in the visible notebook.
try_data = retriever.invoke(question)

## Better Answer Generation using retrieved documents and prompt

In [22]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

from langchain_ollama import ChatOllama

In [23]:
# Chat model client for `llama3.2:3b` served at the given base_url.
model = ChatOllama(model="llama3.2:3b", base_url="http://localhost:11434")

# Smoke test: send a trivial message and display the returned message object.
model.invoke('hi')

AIMessage(content='How can I help you today?', additional_kwargs={}, response_metadata={'model': 'llama3.2:3b', 'created_at': '2025-01-16T09:17:35.848536Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 13946148100, 'load_duration': 11379824300, 'prompt_eval_count': 26, 'prompt_eval_duration': 1732000000, 'eval_count': 8, 'eval_duration': 826000000}, id='run-8a6fa8f9-d211-46f0-ba6a-54729b6b1807-0', usage_metadata={'input_tokens': 26, 'output_tokens': 8, 'total_tokens': 34})

In [24]:
# Fetch the "rlm/rag-prompt" template from the hub.
# NOTE(review): no commit is pinned here, so the template text could presumably
# change between runs — consider pinning the commit hash shown in the displayed prompt metadata.
prompt = hub.pull("rlm/rag-prompt")

In [25]:
# Display the pulled prompt to see its input variables and template text.
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

### Creating the RAG chain

In [26]:
def format_docs(similar_chunks):
    """Concatenate the text of the given chunks into a single string.

    Args:
        similar_chunks: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined with a blank line between
        consecutive chunks; an empty string when there are no chunks.
    """
    texts = []
    for chunk in similar_chunks:
        texts.append(chunk.page_content)
    return "\n\n".join(texts)

# print(format_docs(similar_chunks))

In [27]:
# Build the two prompt variables from the incoming question: "context" runs
# the retriever and formats its results, "question" passes the input through.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Pipeline: inputs -> prompt -> chat model -> plain string output.
rag_chain = rag_inputs | prompt | model | StrOutputParser()

In [30]:
question = "what is used to gain muscle mass?"
# Alternative questions to try:
# question = "what is used to reduce weight?"
# question = "what are side effects of supplements?"
# question = "what are the benefits of supplements?"
# question = "what are the benefits of BCAA supplements?"

# A question unrelated to the supplement PDFs:
# question = "what is used to increase mass of the Earth?"

# Run the chain on the selected question and print the generated answer.
output = rag_chain.invoke(question)
print(output)

Creatine monohydrate is used to gain muscle mass, along with other supplements that support performance and recovery. The most common reasons people take supplements are to build muscle, improve health, and enhance sport-specific performance. While caffeine can provide benefits for endurance and high-intensity activities, it is not primarily used to gain muscle mass.
