In [3]:
!pip install langchain
!pip install langchain-community
!pip install langchain-openai
!pip install chromadb
!pip install gradio langchain langchain-community langchain-openai chromadb PyPDF2
!pip install pypdf
!pip install pytesseract pillow
!pip install sentence-transformers






In [12]:
# Install the Tesseract OCR engine as a system package (-y auto-confirms the prompt)
!apt-get install tesseract-ocr -y

# Install pytesseract, the Python wrapper this notebook imports to call Tesseract
!pip install pytesseract


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 30 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (2,746 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 126213 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

In [13]:
import gradio as gr
from PIL import Image
import pytesseract

from transformers import pipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer



from langchain.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain.embeddings import SentenceTransformerEmbeddings



In [22]:

# Sentence-embedding model handed to Chroma to vectorise document chunks.
embedding_model = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

# Splitter that cuts loaded documents into overlapping character windows.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
)

# Text-to-text generation pipeline used to answer questions from a prompt.
qa_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    max_length=512,
)

# Shared state: both stay None until an upload handler has built an index.
db = None
retriever = None

Device set to use cuda:0


In [14]:
# Tell pytesseract which Tesseract executable to invoke.
# NOTE(review): the path is hard-coded for the Ubuntu/Colab runtime used here — confirm on other systems.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

In [23]:
def process_pdf(file):
    """Index an uploaded PDF so questions can be asked about it.

    The PDF is loaded with ``PyPDFLoader``, split with the module-level
    ``text_splitter``, embedded with ``embedding_model`` and stored in the
    ``gradio_new_pdf`` Chroma collection. On success the module-level ``db``
    and ``retriever`` are replaced with the new index.

    Args:
        file: The value Gradio passes for the ``gr.File`` input. Either a
            path string or an object exposing the path as ``.name``;
            ``None`` when nothing was uploaded.

    Returns:
        str: A status message for the UI ("✅ ..." on success, "❌ ..." on
        failure). Exceptions are reported in the message, never raised.
    """
    global retriever, db

    # Clicking the button without an upload yields None; report that clearly
    # instead of surfacing an AttributeError message.
    if file is None:
        return "❌ No PDF uploaded. Please choose a PDF file first."

    persist_directory = "./chroma_db_gradioUI"
    collection_name = "gradio_new_pdf"

    try:
        # Accept both a plain path string and a file-like object with `.name`.
        pdf_path = file if isinstance(file, str) else file.name
        docs = PyPDFLoader(pdf_path).load()
        chunks = text_splitter.split_documents(docs)

        # A PDF without a text layer (e.g. scanned pages) produces no chunks;
        # Chroma cannot build an index from an empty list.
        if not chunks:
            return "❌ Error processing PDF: no extractable text was found in the file."

        # The collection is persisted on disk under a fixed name, so without a
        # reset every upload would be appended to the chunks of earlier PDFs
        # and answers would mix documents. Drop it before re-indexing.
        Chroma(
            collection_name=collection_name,
            embedding_function=embedding_model,
            persist_directory=persist_directory,
        ).delete_collection()

        db = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=persist_directory,
            collection_name=collection_name,
        )

        retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 50})

        return "✅ File processed successfully. Now ask your questions!"
    except Exception as e:
        return f"❌ Error processing PDF: {str(e)}"

In [24]:
def process_image(file):
    """OCR an uploaded image and index its text for question answering.

    The image is opened with PIL, its text is extracted with
    ``pytesseract.image_to_string``, split with the module-level
    ``text_splitter``, embedded with ``embedding_model`` and stored in the
    ``gradio_image`` Chroma collection. On success the module-level ``db``
    and ``retriever`` are replaced with the new index.

    Args:
        file: The value Gradio passes for the ``gr.File`` input (anything
            ``PIL.Image.open`` accepts); ``None`` when nothing was uploaded.

    Returns:
        str: A status message for the UI ("✅ ..." on success, "❌ ..." on
        failure). Exceptions are reported in the message, never raised.
    """
    global retriever, db

    # Clicking the button without an upload yields None; report that clearly.
    if file is None:
        return "❌ No image uploaded. Please choose an image file first."

    persist_directory = "./chroma_db_gradioUI"
    collection_name = "gradio_image"

    try:
        # Context manager releases the underlying file handle once OCR is done.
        with Image.open(file) as image:
            extracted_text = pytesseract.image_to_string(image)

        # OCR can legitimately find nothing; an empty document would yield no
        # chunks and Chroma cannot build an index from an empty list.
        if not extracted_text.strip():
            return "❌ Error processing image: no text could be recognised in the image."

        doc = Document(page_content=extracted_text)
        chunks = text_splitter.split_documents([doc])
        if not chunks:
            return "❌ Error processing image: no text could be recognised in the image."

        # The collection is persisted on disk under a fixed name, so without a
        # reset every upload would be appended to the text of earlier images
        # and answers would mix documents. Drop it before re-indexing.
        Chroma(
            collection_name=collection_name,
            embedding_function=embedding_model,
            persist_directory=persist_directory,
        ).delete_collection()

        db = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=persist_directory,
            collection_name=collection_name,
        )

        retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 50})
        return "✅ Image processed successfully. Now ask your questions!"
    except Exception as e:
        return f"❌ Error processing image: {str(e)}"

In [25]:
def ask_question(question):
    """Answer a question from the most recently indexed PDF or image.

    If the question contains "all details" or "everything" (case-insensitive)
    every stored chunk is used as context; otherwise the chunks returned by
    the module-level ``retriever`` are used. The context and question are
    combined into one prompt and passed to ``qa_pipeline``.

    Args:
        question: The text typed by the user. Empty, whitespace-only or
            ``None`` values are rejected with a hint.

    Returns:
        str: The generated answer with surrounding whitespace removed, or a
        hint / "❌ ..." error message when no answer could be produced.
    """
    # Reject blank input up front; it would otherwise crash on `.lower()`
    # (None) or trigger a pointless generation run.
    if not question or not question.strip():
        return "Please enter a question."

    if retriever is None:
        return "Please upload and process a PDF or image first."

    try:
        lowered = question.lower()
        if "all details" in lowered or "everything" in lowered:
            # Whole-document request: use every stored chunk.
            combined_text = "\n".join(db.get()['documents'])
        else:
            docs = retriever.invoke(question)
            combined_text = "\n".join(doc.page_content for doc in docs)

        # NOTE(review): the context can be far longer than the model was
        # trained to read (up to 50 chunks of ~1500 characters); consider
        # lowering k or trimming the context — confirm against answer quality.
        prompt = f"Answer the question based on the following context:\n{combined_text}\n\nQuestion: {question}"

        # Greedy decoding: `temperature` only applies when sampling, so it is
        # not passed together with do_sample=False.
        result = qa_pipeline(prompt, max_length=2048, do_sample=False)[0]['generated_text']
        return result.strip()
    except Exception as e:
        # Match the upload handlers: show failures in the UI instead of raising.
        return f"❌ Error answering question: {str(e)}"


In [26]:
# Assemble the Gradio interface: two upload panels, a shared status box and a
# question/answer area. Components are created in display order.
with gr.Blocks() as demo:
    gr.Markdown("# 📄🖼️ PDF & Image Question Answering App (Semantic Search Engine)")

    # Upload panels, shown side by side.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 📄 Upload a PDF")
            pdf_file = gr.File(file_types=[".pdf"], label="PDF File")
            upload_btn = gr.Button("Process PDF")

        with gr.Column():
            gr.Markdown("### 🖼️ Upload an Image")
            image_file = gr.File(file_types=[".jpg", ".jpeg", ".png"], label="Image File")
            image_btn = gr.Button("Process Image")

    # Receives the message returned by either processing handler.
    status = gr.Textbox(label="Status")

    # Question / answer area.
    gr.Markdown("### ❓ Ask a Question")
    question_input = gr.Textbox(label="Ask a question")
    answer_output = gr.Textbox(label="Answer", lines=10)
    ask_btn = gr.Button("Get Answer")

    # Event wiring: each button calls its handler and shows the returned text.
    upload_btn.click(process_pdf, inputs=pdf_file, outputs=status)
    image_btn.click(process_image, inputs=image_file, outputs=status)
    ask_btn.click(ask_question, inputs=question_input, outputs=answer_output)

# share=True additionally serves the app through a temporary public URL.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://841de124090c0011fa.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


