In [None]:
# 1. Install dependencies
# Installs LangChain and its community/core packages, the Groq and Hugging Face
# integrations, the PDF parser (pypdf), and the vector-store backends
# (faiss-cpu, chromadb) through the %pip magic.
# NOTE(review): no versions are pinned, so a fresh run may resolve to different
# releases than the ones this notebook was written against — consider pinning
# once a known-good set is established.
%pip install langchain langchain-community langchain-core langchain-groq faiss-cpu pypdf langchain-huggingface chromadb

In [9]:
# 2. Imports
import getpass
import os

from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS, Chroma
from langchain_groq import ChatGroq

In [3]:
# 3. Make sure a Groq API key is available
# The key is taken from the GROQ_API_KEY environment variable. When that
# variable is missing or empty, prompt for it without echoing the input instead
# of storing a placeholder string: a placeholder only defers the failure to the
# first LLM call, and editing it in place would put a real key in the notebook.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ").strip()

In [4]:
# 4. Load every PDF listed below into one flat list of documents
pdf_files = [
    "/content/financial_statements_of_yes_securities_india_limited-1.pdf",
    "/content/India - TIL - Employee Handbook (2)-1.pdf",
]

# Each loader returns a list of documents for its file; flatten them in file order.
docs = [
    loaded_doc
    for pdf_path in pdf_files
    for loaded_doc in PyPDFLoader(pdf_path).load()
]

In [5]:
# 5. Break the loaded documents into overlapping chunks
# Chunks are capped at 1000 with an overlap of 200 between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
documents = text_splitter.split_documents(docs)

In [6]:
# 6. Create embeddings + persistent vector DB
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Open (or create) the on-disk Chroma store rather than calling
# Chroma.from_documents unconditionally. from_documents inserts every chunk
# again on each run, so re-running this cell against the same directory would
# keep growing the store with duplicates and the retriever would hand back
# repeated chunks.
vectorstore = Chroma(
    persist_directory="./chroma_store",   # stored on disk
    embedding_function=embeddings,
)

# Index the chunks only when the store is still empty.
if not vectorstore.get(limit=1)["ids"]:
    vectorstore.add_documents(documents)
# NOTE: a non-empty store is reused as-is. Delete ./chroma_store to force a
# re-index after changing the PDF list, the chunking, or the embedding model.

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# 7. Create the Groq chat model used to answer questions
# temperature=0 is passed through to the model unchanged.
llm = ChatGroq(temperature=0, model="llama-3.3-70b-versatile")

In [13]:
# 8. Conversation memory shared with the retrieval chain
# The history is kept under "chat_history" as message objects, and only the
# chain's "answer" output is recorded.
memory_settings = {
    "memory_key": "chat_history",
    "return_messages": True,
    "output_key": "answer",
}
memory = ConversationBufferMemory(**memory_settings)

In [14]:
# 9. Conversational retrieval chain that also returns the retrieved chunks
# The retriever is limited to the 3 best-matching chunks per question.
top_k_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

qa_chain = ConversationalRetrievalChain.from_llm(
    return_source_documents=True,
    memory=memory,
    retriever=top_k_retriever,
    llm=llm,
)


In [15]:
# 10. Ask questions in a loop
def _format_sources(source_docs):
    """Return one " - ..." line per distinct (source, page) among the documents.

    Args:
        source_docs: iterable of objects exposing a ``metadata`` dict, as found
            under ``result["source_documents"]``.

    Returns:
        A list of strings in first-seen order. A document without a "source"
        entry is listed as "unknown"; the page is appended only when the
        metadata carries an integer "page".
    """
    seen = set()
    lines = []
    for doc in source_docs:
        source = doc.metadata.get("source", "unknown")
        page = doc.metadata.get("page")
        if (source, page) in seen:
            # Several retrieved chunks can come from the same page; cite it once.
            continue
        seen.add((source, page))
        if isinstance(page, int):
            # NOTE(review): assumes the loader's "page" metadata is zero-based,
            # hence the +1 for display — confirm for the installed loader version.
            lines.append(f" - {source} (page {page + 1})")
        else:
            lines.append(f" - {source}")
    return lines


while True:
    try:
        query = input("Ask a question (or type 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the session cleanly
        # instead of leaving a traceback in the notebook.
        break

    if not query:
        # Nothing typed: ask again rather than sending an empty question.
        continue
    if query.lower() == "exit":
        break

    result = qa_chain.invoke({"question": query})

    print("\nAnswer:\n", result["answer"])
    print("\nSources:")
    for line in _format_sources(result["source_documents"]):
        print(line)
    print("\n" + "-"*50)

Ask a question (or type 'exit'): summarizations of the pdf with all the points

Answer:
 Here is a summary of the provided text with all the points:

1. **Asset Values**:
	* Vehicle: 0.07
	* Sub-Total: 15.61 (with various components)
	* Intangible - Software: 12.36
	* Sub-Total (Intangible): 12.36
	* Total: 27.97

2. **Share Information**:
	* Number of shares at the beginning of the year: 80,000,000
	* Total number of equity shares outstanding at the end of the year: 80,000,000
	* Weighted average number of equity shares at the end of the year: 80,955,000 (current year), 80,346,255 (previous year)
	* Face value per share: 10

3. **Earnings Per Share (EPS)**:
	* Basic EPS: 2.30 (current year), 0.66 (previous year)
	* Diluted EPS: 2.27 (current year), 0.65 (previous year)

4. **Corporate Social Responsibility (CSR)**:
	* Expenditure related to CSR: Rs. 0.10 Cr (current year), Rs. Nil (previous year)

5. **Gross Block, Depreciation, and Amortization**:
	* Gross Block: Not explicitly state