In [None]:
#Step-1: Install the packages
!pip install langchain langchain-groq langchain-community

In [81]:
from transformers import pipeline

# Load a small instruction-tuned model instead of GPT-2
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")

# Query + Retrieved doc
query = "Who is the CEO of Tesla?"
retrieved_doc = "Tesla's CEO is Elon Musk since 2008."

# Step 2: Construct prompt
prompt = f"""
Answer the question using the context below.

Context: {retrieved_doc}
Question: {query}
"""

# Step 3: Generate answer
result = qa_pipeline(prompt, max_length=50)

print("=== Prompt Sent to Model ===")
print(prompt)
print("\n=== Model Answer ===")


print(result)
print(result[0]["generated_text"])


Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


=== Prompt Sent to Model ===

Answer the question using the context below.

Context: Tesla's CEO is Elon Musk since 2008.
Question: Who is the CEO of Tesla?


=== Model Answer ===
[{'generated_text': 'Elon Musk'}]
Elon Musk


In [None]:
!pip install gradio python-dotenv

In [1]:
import os
from dotenv import load_dotenv

# Load from .env file (adjust path if needed)
load_dotenv(dotenv_path="/content/.env.txt")

# Fetch the key
groq_api_key = os.getenv("GROQ_API_KEY")

# Check if it exists
if groq_api_key:
    print("GROQ_API_KEY is loaded successfully!")
    #print("Length of key:", len(groq_api_key))   # safer than printing full key
    #print("First 6 chars:", groq_api_key[:6])    # preview beginning only
else:
    print("GROQ_API_KEY not found! Please check your .env file or environment settings.")

GROQ_API_KEY not found! Please check your .env file or environment settings.


In [84]:
#Step-2: Retrive LLM model using groq

from langchain_groq import ChatGroq

#import os
#groq_api = os.getenv("GROQ_API_KEY")  # get from env


llm = ChatGroq(
    api_key=groq_api_key,
    model="llama-3.3-70b-versatile",
    temperature=0.7,
    max_tokens=512
)

llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7b972eda2930>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7b9717cbd550>, model_name='llama-3.3-70b-versatile', model_kwargs={}, groq_api_key=SecretStr('**********'), max_tokens=512)

In [112]:
!pip install pypdf



In [85]:
!pip install python-docx



In [86]:
#Step-3: Load the pdf

from langchain.document_loaders import PyPDFLoader
from docx import Document
pdf_path = "/content/introduction-to-nutrition.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [87]:
#Step-4: Split into Text

from langchain.text_splitter import RecursiveCharacterTextSplitter
# ------------------ SPLIT ------------------
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200
)
docs = splitter.split_documents(documents)
print(docs)

[Document(metadata={'producer': 'Microsoft® PowerPoint® 2016', 'creator': 'Microsoft® PowerPoint® 2016', 'creationdate': '2017-07-18T14:34:22+05:30', 'title': 'Aucun titre de diapositive', 'author': 'apave sud', 'moddate': '2017-07-18T14:34:22+05:30', 'source': '/content/introduction-to-nutrition.pdf', 'total_pages': 77, 'page': 0, 'page_label': '1'}, page_content='EU-India Capacity Building Initiative for Trade Development (CITD) – Lot 1 SPS/TBT/Customs\nEU-India Capacity Building Initiative \nfor Trade Development (CITD)\nIntroduction to Nutrition\nTrain the Trainers \nin Food Safety and Nutrition'), Document(metadata={'producer': 'Microsoft® PowerPoint® 2016', 'creator': 'Microsoft® PowerPoint® 2016', 'creationdate': '2017-07-18T14:34:22+05:30', 'title': 'Aucun titre de diapositive', 'author': 'apave sud', 'moddate': '2017-07-18T14:34:22+05:30', 'source': '/content/introduction-to-nutrition.pdf', 'total_pages': 77, 'page': 1, 'page_label': '2'}, page_content='EU-India Capacity Build

In [88]:
# !pip install langchain langchain-community chromadb pypdf tiktoken faiss-cpu

In [89]:
!pip install chromadb



In [90]:
#Step-5: Divide text into Embeddings and store in vector database

# ------------------ EMBEDDINGS + VECTORSTORE ------------------
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
)
vectordb.persist()

retriever = vectordb.as_retriever(search_kwargs={"k": 10})  # 4 top chunks#

In [91]:
''' #Step-6: Develop a Prompt-------------static instructions

# ------------------ CUSTOM PROMPT ------------------
system_prompt = """
You are Jolly, a highly skilled and experienced **Data Scientist**.
Your role is to **clearly, accurately, and professionally** explain data science concepts, tools, workflows, and project-related details
to students, colleagues, or recruiters.

## Core Guidelines:
1. **Context-First Approach**
   - Use **only** the information from the provided CONTEXT chunks from the retriever.
   - If the CONTEXT does not contain relevant details, **politely state that the information is unavailable** instead of making assumptions.

2. **Clarity & Structure**
   - Provide explanations in a **clear, step-by-step manner**.
   - Use concise **bullet points**, **tables**, or **code snippets** where appropriate.
   - Summarize key takeaways at the end when needed.

3. **Technical Depth**
   - Explain concepts like **data preprocessing, feature engineering, EDA, model building, model evaluation, feature selection, dimensionality reduction (PCA, LDA), model deployment, and performance optimization** with technical depth.
   - Include **relevant Python libraries, commands, and coding approaches** where helpful.
   - If multiple techniques exist, **compare them politely** and guide on the **best approach**.

4. **Real-World Relevance**
   - When projects, datasets, or business problems are mentioned, explain them in a **real-world context**.
   - Use practical examples and relate them to **industry best practices**.

5. **Accuracy & Referencing**
   - Always verify responses using the **retrieved CONTEXT**.
   - When citing specific details, politely mention the **chunk/page metadata**.

6. **Tone & Professionalism**
   - Maintain a **helpful, polite, and professional** tone.
   - Avoid unnecessary storytelling unless explicitly requested.
   - Never fabricate information beyond the available CONTEXT.

## Golden Rule:
> “If it’s **not in the provided CONTEXT**, do **not** assume or hallucinate.
> Instead, respond with:
> *‘The requested information is not available in the provided context.’*”
"""
     '''




' #Step-6: Develop a Prompt-------------static instructions\n\n# ------------------ CUSTOM PROMPT ------------------\nsystem_prompt = """\nYou are Jolly, a highly skilled and experienced **Data Scientist**.\nYour role is to **clearly, accurately, and professionally** explain data science concepts, tools, workflows, and project-related details\nto students, colleagues, or recruiters.\n\n## Core Guidelines:\n1. **Context-First Approach**\n   - Use **only** the information from the provided CONTEXT chunks from the retriever.\n   - If the CONTEXT does not contain relevant details, **politely state that the information is unavailable** instead of making assumptions.\n\n2. **Clarity & Structure**\n   - Provide explanations in a **clear, step-by-step manner**.\n   - Use concise **bullet points**, **tables**, or **code snippets** where appropriate.\n   - Summarize key takeaways at the end when needed.\n\n3. **Technical Depth**\n   - Explain concepts like **data preprocessing, feature engineeri

In [92]:
system_prompt = """
You are Jolly, a professional assistant.
- Always use ONLY the information from the provided context.
- Adapt your role depending on the document type:
  * If the context is about data science → explain as a Data Scientist.
  * If the context is about nutrition → explain as a Nutrition Expert.
- If context is missing, say: "The requested information is not available in the provided context."
"""

In [93]:
#Step-7: Create a PromptTemplate

from langchain.prompts import ChatPromptTemplate

prompt_template = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "Context:\n{context}\n\nQuestion:\n{question}")
])

""""A PromptTemplate is a dynamic prompt where we inject variables like {context} and {question}.
It uses the instructions from the system prompt but also includes real-time data. It’s used when combining retrieved chunks (from a retriever or database) with the user’s query
{context} = retrieved document chunks
{question} = user’s input
"""

#User asks a question: "What is photosynthesis?" → goes into {question}.
#Your retriever or search system finds relevant documents or text chunks about photosynthesis → goes into {context}.
#The model sees both and generates an answer using the retrieved contex'''

'"A PromptTemplate is a dynamic prompt where we inject variables like {context} and {question}.\nIt uses the instructions from the system prompt but also includes real-time data. It’s used when combining retrieved chunks (from a retriever or database) with the user’s query\n{context} = retrieved document chunks\n{question} = user’s input\n'

In [94]:
system_prompt = """
You are Jolly, a professional assistant.
- Always use ONLY the information from the provided context.
- Adapt your role depending on the document type:
  * If the context is about data science → explain as a Data Scientist.
  * If the context is about nutrition → explain as a Nutrition Expert.
- If context is missing, say: "The requested information is not available in the provided context."
"""

In [95]:
'''from langchain.prompts import PromptTemplate

prompt_template = PromptTemplate(
    template= system_prompt + """

CONTEXT:
{context}

QUESTION:
{question}

Answer as Jolly, the Data Scientist.
""",
    input_variables=["context", "question"]
)
'''

'from langchain.prompts import PromptTemplate\n\nprompt_template = PromptTemplate(\n    template= system_prompt + """\n\nCONTEXT:\n{context}\n\nQUESTION:\n{question}\n\nAnswer as Jolly, the Data Scientist.\n""",\n    input_variables=["context", "question"]\n)\n'

In [96]:
#Step-8: Build Retrieval QA

from langchain.chains import RetrievalQA
retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template}
)

In [97]:
#Step-9: Ask the Question

query = "Provide me Health Data to loose weight "
response = retrieval_qa.run(query)

print("\n--- Model Answer ---\n")
print(response)


--- Model Answer ---

As a Nutrition Expert, I can provide you with general health data to help with weight loss based on the provided context. 

To lose weight, it's essential to maintain a balanced diet with the right amount of nutrients. Here are some key points to consider:

1. **Fat intake**: Fat provides substances needed for growth and healthy skin. It's recommended to get 20% of your daily energy/calorie intake from fats/oils. Choose healthy options like omega-3-rich foods (fish, walnuts, seed oils) and limit saturated fats (butter, ghee, hydrogenated fats).

2. **Carbohydrate intake**: Carbohydrates are the main energy source for the human brain. Focus on complex carbohydrates (fruits, vegetables, whole grain cereals, millets, pulses, and legumes) which are rich in dietary fiber. Avoid simple carbohydrates high in calories and low in nutritional value, often found in junk foods.

3. **Dietary fiber**: Include foods high in dietary fiber (whole grain cereals, pulses, bran, gre

In [98]:
#Its giving information only from document this is Standard RAG appraoch
#If we connect any API calls then fetch the data this indicates agentic Behaviour

In [99]:
#Agentic RAG

In [100]:
#Step-10: Fetch the data from url
'''Why Use a WebLoader Instead of Requests + BeautifulSoup
Your current code works fine but scrapes only table data manually
Using LangChain loaders, we can:
Fetch the entire HTML or clean text
Integrate directly into RAG pipelines
Chunk, embed, and retrieve relevant pieces automatically'''

from langchain_community.document_loaders import WebBaseLoader

# URL to scrape
url = "https://crucible.io/insights/design/10-best-healthcare-websites-2024//"

# Initialize the loader
loader = WebBaseLoader(url)

# Load the content
docs = loader.load()

# Check the first document
print(docs[0].page_content[:11])  # Preview first 10 characters



 

Top 10


In [101]:
#!pip install langchain unstructured
#!pip install "unstructured[all-docs]"
#!pip install pdf2image pypdf pillow

In [102]:
#from langchain.document_loaders import UnstructuredURLLoader

In [103]:
#from langchain_community.document_loaders import UnstructuredURLLoader


In [104]:
#url = "https://crucible.io/insights/design/10-best-healthcare-websites-2024//"
#loader = UnstructuredURLLoader(urls=[url])
#docs = loader.load()

# Check the first document
#print(docs[0].page_content[:11])  # Preview first 10 characters

In [105]:
!pip install faiss-cpu



In [106]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Step 2a: Create embeddings
embeddings = HuggingFaceEmbeddings()

# Step 2b: Build FAISS vector store
vector_store = FAISS.from_documents(docs, embeddings)

# Step 2c: Create a retriever
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})


  embeddings = HuggingFaceEmbeddings()


In [107]:
from langchain_groq import ChatGroq

#llm = ChatGroq(
  #  model="llama-3.3-70b-versatile",
  #  temperature=0,
  #  api_key="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"
#)

llm = ChatGroq(
    api_key=groq_api,
    model="llama-3.3-70b-versatile",
    temperature=0.7,
    max_tokens=512
)



In [108]:
from langchain.chains import RetrievalQA

retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff"  # or "map_reduce" if the page is very long
)


In [109]:
from langchain.agents import Tool

web_tool = Tool(
    name="Healthcare Websites Info",
    func=retrieval_qa.run,  # The function the agent calls
    description="Use this tool to answer questions about healthcare website designs from the Crucible article."
)


In [110]:
from langchain.agents import initialize_agent, AgentType

agent = initialize_agent(
    tools=[web_tool],
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)


In [111]:
query = "Which websites are listed as best healthcare websites for 2024?"
response = agent.run(query)
print(response)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo find the best healthcare websites for 2024, I should first look for information on healthcare websites from a reliable source, such as the Crucible article.

Action: Healthcare Websites Info
Action Input: tags=["best healthcare websites", "2024"][0m
Observation: [36;1m[1;3mBased on the provided context, here are the top 10 best healthcare websites in 2024:

1. Mayo Clinic
2. Barts Health
3. CRGH (Centre for Reproductive and Genetic Health)
4. Hertility
5. Campaign Against Living Miserably (CALM)
6. WebMD
7. Unmind
8. Abortion Finder
9. Thanks Ben
10. Breastcancer.org

These websites were evaluated based on categories such as overall performance, user experience, accessibility, content, identity, and security & privacy.[0m
Thought:[32;1m[1;3mThought: I now know the final answer

Final Answer: The best healthcare websites for 2024 are:
1. Mayo Clinic
2. Barts Health
3. CRGH (Centre for Reproductive and Genetic Health)
