In [None]:
!pip install gradio PyMuPDF python-docx python-pptx pandas sentence-transformers llama-index llama-index-llms-openai

Collecting gradio
  Using cached gradio-5.8.0-py3-none-any.whl.metadata (16 kB)
Collecting PyMuPDF
  Using cached pymupdf-1.25.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting python-docx
  Using cached python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting python-pptx
  Using cached python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting llama-index
  Using cached llama_index-0.12.3-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-llms-openai
  Using cached llama_index_llms_openai-0.3.2-py3-none-any.whl.metadata (3.3 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Using cached aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Using cached fa

In [None]:
import gradio as gr
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
from docx import Document as DocxDocument
from pptx import Presentation
import json
import io, os
from llama_index.core.llms import ChatMessage
from llama_index.llms.openai import OpenAI
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex

# Initialize embedding model
# Created once at module level so that every call to query_documents() reuses the
# same SentenceTransformer instance instead of constructing a new one per query.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to extract text from various document types
def extract_text(file_path):
    """Extract plain text from a PDF, DOCX, PPTX, TXT or JSON document.

    The file type is chosen from the file extension, matched
    case-insensitively (``report.PDF`` is handled like ``report.pdf``).

    Args:
        file_path: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload object that exposes the path through a ``.name`` attribute.

    Returns:
        str: The extracted text. Failures are reported in-band rather than
        raised: the returned string then starts with
        ``"Error processing file: "`` followed by the exception message.
    """
    try:
        # pathlib.Path.name is only the basename, so real paths go through
        # os.fspath(); only non-path upload objects are read via ``.name``.
        if isinstance(file_path, (str, os.PathLike)):
            path = os.fspath(file_path)
        else:
            path = file_path.name

        # Lower-case once so extension checks are case-insensitive.
        lowered = path.lower()

        with open(path, "rb") as f:
            file_binary = f.read()

        if lowered.endswith(".pdf"):
            with fitz.open(stream=file_binary, filetype="pdf") as doc:
                return "".join(page.get_text() for page in doc)
        elif lowered.endswith(".docx"):
            doc = DocxDocument(io.BytesIO(file_binary))
            return "\n".join(para.text for para in doc.paragraphs)
        elif lowered.endswith(".pptx"):
            prs = Presentation(io.BytesIO(file_binary))
            # Not every shape carries text (e.g. pictures), hence the hasattr guard.
            return "\n".join(
                shape.text
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text")
            )
        elif lowered.endswith(".txt"):
            # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM.
            return file_binary.decode("utf-8-sig")
        elif lowered.endswith(".json"):
            # json.loads() rejects text that starts with a BOM, so strip it here too.
            data = json.loads(file_binary.decode("utf-8-sig"))
            return json.dumps(data, indent=2)
        else:
            raise ValueError("Unsupported file type.")
    except Exception as e:
        return f"Error processing file: {e}"

# Function to create embeddings and query OpenAI API
def query_documents(file_list, user_query):
    """Answer ``user_query`` using the most relevant uploaded document.

    Each file is converted to text with ``extract_text``, embedded with the
    module-level ``embedding_model``, and scored against the query embedding
    by dot product. The highest-scoring document is sent to the OpenAI chat
    model as context together with the query.

    Args:
        file_list: Iterable of uploaded files (entries may be ``None``), or
            ``None``/empty when nothing was uploaded.
        user_query: The question typed by the user.

    Returns:
        On success, the response object returned by ``llm.chat(...)``.
        On any failure, a ``str`` starting with ``"Error querying documents: "``.

    Environment:
        OPENAI_API_KEY: API key used for the OpenAI client. It is read from
        the environment so that no secret is stored in the notebook.
    """
    try:
        if not file_list:
            return "Error querying documents: No documents were uploaded."
        if not user_query or not user_query.strip():
            return "Error querying documents: The query is empty."

        # Extract texts from files. Only usable texts are kept, so the
        # `documents` and `embeddings` lists below stay index-aligned.
        documents = []
        for file in file_list:
            if file is None:
                continue
            text = extract_text(file)
            # extract_text reports failures in-band with this prefix; such
            # strings must not be embedded or passed to the LLM as context.
            if text and text.strip() and not text.startswith("Error processing file:"):
                documents.append(text)

        if not documents:
            return (
                "Error querying documents: "
                "No readable text could be extracted from the uploaded files."
            )

        # Create embeddings (one per kept document, same order)
        embeddings = [embedding_model.encode(doc) for doc in documents]
        query_embedding = embedding_model.encode(user_query)

        # Find the most relevant document (dot-product similarity)
        scores = [sum(a * b for a, b in zip(emb, query_embedding)) for emb in embeddings]
        most_relevant_doc = documents[scores.index(max(scores))]

        # Use OpenAI API to generate a response
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            return (
                "Error querying documents: "
                "The OPENAI_API_KEY environment variable is not set."
            )
        llm = OpenAI(model="gpt-4", api_key=api_key, max_tokens=200, temperature=0.7)
        messages = [
            ChatMessage(
                role="system",
                content="You are a helpful assistant that retrieves relevant information from provided documents.",
            ),
            ChatMessage(
                role="user",
                content=f"Query: {user_query}\nRelevant Context: {most_relevant_doc}",
            ),
        ]
        response = llm.chat(messages)
        return response
    except Exception as e:
        return f"Error querying documents: {e}"

# Gradio Interface
def interface(share=True, debug=True):
    """Build the Gradio UI for the document Q&A app and launch it.

    The UI has a multi-file upload, a query textbox, a read-only response
    textbox and a Submit button wired to ``query_documents``.

    Args:
        share: Passed to ``launch``; ``True`` requests a public share link.
        debug: Passed to ``launch``; ``True`` keeps the call running so errors
            and logs stay visible in the notebook cell.

    Returns:
        None. The call returns when ``launch`` returns.
    """
    with gr.Blocks() as app:
        gr.Markdown("# RAG System for Document Retrieval")

        file_input = gr.File(label="Upload Documents", file_count="multiple")
        query_input = gr.Textbox(label="Enter Your Query", placeholder="Type your question here...")
        output = gr.Textbox(label="Response")

        def wrapper(files, query):
            """Event handler: forward the uploaded files and query text."""
            return query_documents(files, query)

        query_button = gr.Button("Submit")
        query_button.click(wrapper,
                           inputs=[file_input, query_input],
                           outputs=output)

    # Launch only after the Blocks context has been closed, i.e. once the
    # layout and event wiring above are complete.
    app.launch(share=share, debug=debug)

# Entry point: start the app when this code is executed directly. The guard
# keeps interface() from running if these definitions are imported elsewhere.
if __name__ == "__main__":
    interface()


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://042b02c65236467c59.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://042b02c65236467c59.gradio.live
