In [None]:
# Handling scanned PDFs: copy the Gemini API key into GOOGLE_API_KEY and put
# the Poppler and Tesseract install directories on PATH.
import os
import shutil
import subprocess

from dotenv import load_dotenv


def _prepend_to_path(directory):
    """Put `directory` at the front of PATH unless it is already listed.

    The membership check keeps PATH from growing each time this cell is re-run.
    """
    entries = os.environ.get("PATH", "").split(os.pathsep)
    if directory not in entries:
        os.environ["PATH"] = directory + os.pathsep + os.environ.get("PATH", "")


load_dotenv()

# os.environ only accepts strings, so a missing key would otherwise surface as
# an opaque "TypeError: str expected, not NoneType".
gemini_api_key = os.getenv("GEMINI_API_KEY")
if gemini_api_key is None:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; define it in the environment or in a .env file."
    )
os.environ["GOOGLE_API_KEY"] = gemini_api_key

# Machine-specific install locations; adjust them if the tools live elsewhere.
poppler_bin = r"C:\Program Files\poppler-25.07.0\Library\bin"
tesseract_dir = r"C:\Program Files\Tesseract-OCR"
_prepend_to_path(poppler_bin)
_prepend_to_path(tesseract_dir)

# Fail with an actionable message instead of a bare subprocess error when the
# executable cannot be resolved.
tesseract_exe = shutil.which("tesseract")
print("tesseract:", tesseract_exe)
if tesseract_exe is None:
    raise FileNotFoundError(
        "tesseract was not found on PATH; install Tesseract-OCR or fix tesseract_dir."
    )
print(subprocess.check_output(["tesseract", "--version"], text=True))


In [None]:
# Data ingestion: load the PDF with the "hi_res" strategy, English OCR and
# image extraction switched on.
# NOTE(review): newer `unstructured` releases may prefer `languages=[...]` over
# `ocr_languages`, and a different option for image extraction — confirm
# against the installed version before changing these keyword arguments.
from langchain_community.document_loaders import UnstructuredPDFLoader

loader = UnstructuredPDFLoader(
    "panchtantra.pdf",
    strategy="hi_res",
    ocr_languages="eng",
    extract_images_in_pdf=True,
)
docs = loader.load()


In [None]:
# Breaking into chunks: split the loaded documents into overlapping pieces of
# at most 1000 characters (200 characters of overlap between neighbours).
from langchain.text_splitter import RecursiveCharacterTextSplitter

# `chunks` holds the splitter object, not the resulting chunks; the name is
# kept so that any other cell referring to it keeps working.
chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
pdf_documents = chunks.split_documents(docs)

# Stop here with a clear message rather than failing later inside the vector
# store when the loader/OCR step produced no text.
if not pdf_documents:
    raise ValueError(
        "No text chunks were produced from the loaded PDF; check the loader/OCR output."
    )

# Show the count and a small preview instead of dumping every chunk.
print(f"{len(pdf_documents)} chunks")
pdf_documents[:3]

In [None]:
# Creating the LLM: a Gemini chat model that the chains below are built on.
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
)

# Display the configured model object as the cell output.
llm

In [None]:
# Prompt: instructs the model to answer strictly from the retrieved context.
# `{context}` and `{input}` are the template variables filled in at run time.
# (A stray "kwargs" token that had been fused onto the end of the instruction
# sentence was removed from the prompt text.)
from langchain_core.prompts import ChatPromptTemplate
prompt=ChatPromptTemplate.from_template(
    """
    Answer the following question based only on the provided context. 
    Think step by step before providing a detailed answer.
    <context>
    {context}
    </context>
    Question:{input}
    
    """
)

In [None]:
# Combine the LLM and the prompt into a single "stuff documents" chain.
from langchain.chains.combine_documents import create_stuff_documents_chain

chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

In [None]:
# Chunks -> embeddings, stored in a Chroma vector store.
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# Stable, position-based ids. Without explicit ids each call generates fresh
# random ones, so re-running this cell in the same kernel can append a second
# copy of every chunk to the in-memory collection; with fixed ids a re-run
# overwrites the existing entries instead.
# If the number of chunks shrinks between runs, restart the kernel to drop
# the leftover higher-numbered entries.
chunk_ids = [f"chunk-{index}" for index in range(len(pdf_documents))]
db = Chroma.from_documents(pdf_documents, embeddings, ids=chunk_ids)

retriever = db.as_retriever()
retriever


In [None]:
# Wire the retriever in front of the (LLM + prompt) chain to form the full
# retrieval pipeline.
from langchain.chains import create_retrieval_chain

retriever_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=chain,
)

In [None]:
# Ask a question through the retrieval chain and show the generated answer.
# (The question text previously contained a duplicated word, "the the".)
question = "What happened in the jackal and the war drum?"
response = retriever_chain.invoke({"input": question})
response['answer']