In [None]:
# Install Dependencies
!pip install -U langchain langchain-community langchain-huggingface sentence-transformers chromadb pdfplumber openpyxl transformers accelerate unstructured[all] gradio

# Imports
import os
import pdfplumber
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain.schema import Document
from transformers import pipeline
import gradio as gr

# Embedding Model
# Sentence-transformers checkpoint used to embed document chunks.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-V2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Folder for Chroma DB
# Must stay in sync with the persist_directory passed to Chroma in
# create_vector_store.
CHROMA_DIR = "./chroma_store"
os.makedirs(CHROMA_DIR, exist_ok=True)

# Document Loading Function
def load_documents(uploaded_files):
    """Read uploaded PDF, DOCX and Excel files into LangChain documents.

    Args:
        uploaded_files: Iterable of uploaded files. Each item is either an
            object exposing its on-disk path as ``.name`` or a plain path
            string. ``None`` (nothing uploaded) is treated as an empty upload.

    Returns:
        list: One ``Document`` per PDF, one per Excel sheet, plus whatever
        ``UnstructuredWordDocumentLoader`` yields for each DOCX file.
        Unsupported files and files that fail to parse are reported on
        stdout and skipped, so one bad upload does not abort the batch.
    """
    docs = []
    for file_obj in uploaded_files or []:
        # Accept both file wrappers (path in ``.name``) and bare path strings.
        filename = str(getattr(file_obj, "name", file_obj))
        ext = filename.rpartition(".")[2].lower()

        try:
            # PDF Processing
            if ext == "pdf":
                with pdfplumber.open(filename) as pdf:
                    print(f"Reading {filename} ({len(pdf.pages)} pages)")
                    # Join pages with a newline so the last word of one page
                    # does not fuse with the first word of the next.
                    text = "\n".join(
                        page.extract_text() or "" for page in pdf.pages
                    )
                docs.append(Document(page_content=text, metadata={"source": filename}))

            # Word Document Processing
            elif ext == "docx":
                docs.extend(UnstructuredWordDocumentLoader(filename).load())

            # Excel File Processing (using pandas)
            elif ext in ("xls", "xlsx"):
                print(f"Reading Excel file: {filename}")
                # Context manager closes the workbook handle when done.
                with pd.ExcelFile(filename) as workbook:
                    for sheet_name in workbook.sheet_names:
                        sheet_text = workbook.parse(sheet_name).to_string(index=False)
                        docs.append(
                            Document(
                                page_content=sheet_text,
                                metadata={"source": f"{filename} - {sheet_name}"},
                            )
                        )

            else:
                print(f"Unsupported file type: {filename}")

        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Error processing {filename}: {e}")

    return docs


# Vector Store Setup
def create_vector_store(docs):
    """Split documents into chunks and index them in a persistent Chroma store.

    Args:
        docs: List of LangChain ``Document`` objects to index.

    Returns:
        The Chroma vector store built from the chunked documents, stored
        under ``./chroma_store``.

    Raises:
        ValueError: If ``docs`` is empty or yields no text chunks, instead of
            letting the vector store fail later with a less helpful error.
    """
    if not docs:
        raise ValueError("No documents to index; load at least one supported file first.")

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = splitter.split_documents(docs)
    if not texts:
        raise ValueError("Documents contain no extractable text to index.")

    vectordb = Chroma.from_documents(texts, embedding_model, persist_directory="./chroma_store")

    # NOTE(review): newer Chroma releases are understood to persist
    # automatically and some wrappers no longer expose persist(); only call
    # it when it exists - confirm against the installed version.
    persist = getattr(vectordb, "persist", None)
    if callable(persist):
        persist()
    return vectordb


# Build RAG Pipeline
def build_qa_chain(vectordb):
    """Assemble a retrieval-augmented QA chain on top of ``vectordb``.

    A ``google/flan-t5-large`` text2text-generation pipeline is wrapped as a
    LangChain LLM and combined with a retriever that fetches the top 3
    chunks per query.

    Args:
        vectordb: Vector store exposing ``as_retriever``.

    Returns:
        A ``RetrievalQA`` chain configured to return its source documents
        alongside the answer.
    """
    from langchain.chains import RetrievalQA
    from langchain.llms import HuggingFacePipeline

    # device=-1 selects the CPU in the transformers pipeline API.
    generator = pipeline(
        "text2text-generation",
        model="google/flan-t5-large",
        device=-1,
    )

    return RetrievalQA.from_chain_type(
        llm=HuggingFacePipeline(pipeline=generator),
        retriever=vectordb.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
    )


# Gradio Chat Logic
# Module-level QA chain shared by the upload handler and the chat handler;
# stays None until documents have been processed successfully.
qa_chain = None

def process_files(files):
    """Load the uploaded files, index them, and (re)build the global QA chain.

    Args:
        files: Uploaded files as delivered by the file input; may be ``None``
            or empty when the button is clicked before anything is uploaded.

    Returns:
        str: Status message for the UI. When nothing was uploaded or no
        document could be loaded, an explanatory message is returned and the
        existing ``qa_chain`` is left untouched.
    """
    global qa_chain
    if not files:
        return "No files uploaded. Please add documents and try again."

    docs = load_documents(files)
    if not docs:
        # Avoid building an empty vector store, which would raise downstream.
        return "No supported documents could be loaded. Please upload PDF, DOCX, or XLSX files."

    vectordb = create_vector_store(docs)
    qa_chain = build_qa_chain(vectordb)
    return f"Loaded {len(docs)} documents and initialized RAG system!"

def chat_with_docs(message, history):
    """Answer a chat message using the global RAG chain.

    Args:
        message: The user's question. The word "exit" (any case, surrounding
            whitespace ignored) ends the chat instead of querying.
        history: Prior chat turns supplied by the chat UI; unused.

    Returns:
        str: Markdown containing the answer followed by the distinct sources
        of the retrieved chunks, or a hint when no documents have been
        processed yet.
    """
    if message.strip().lower() == "exit":
        return "Exiting chat. Refresh to restart."
    if qa_chain is None:
        return "Please upload and process documents first."

    # invoke() is the supported entry point; calling the chain directly is
    # deprecated in current LangChain releases.
    result = qa_chain.invoke({"query": message})

    # Several retrieved chunks often come from the same file; list each
    # source once, in retrieval order.
    unique_sources = dict.fromkeys(
        doc.metadata.get("source", "unknown") for doc in result["source_documents"]
    )
    sources = "\n".join(f"- {source}" for source in unique_sources)
    return f"**Answer:** {result['result']}\n\n**Sources:**\n{sources}"


# Gradio Interface
# Components are declared in display order inside the Blocks context: a
# heading, the upload widget, the processing button, a read-only status box,
# and finally the chat interface.
with gr.Blocks() as app:
    gr.Markdown("RAG_bot (PDF, DOCX, XLSX)")
    # file_count="multiple" lets the user upload several files at once.
    file_input = gr.File(file_count="multiple", label="Upload your documents")
    process_button = gr.Button("Process Documents")
    # Read-only box that shows the string returned by process_files.
    status_output = gr.Textbox(label="Status", interactive=False)

    # Chat panel backed by chat_with_docs; the returned object is not used
    # again in this cell.
    chatbot = gr.ChatInterface(fn=chat_with_docs, title="Chat with your documents")

    # Clicking the button passes the uploaded files to process_files and
    # writes its return value into the status box.
    process_button.click(process_files, inputs=file_input, outputs=status_output)

# Start the app with default launch settings.
app.launch()

Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain)
  Downloading pydantic-2.11.10-py3-none-any.whl.metadata (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.6/68.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting urllib3<2.4.0,>=1.24.2 (from kubernetes>=28.1.0->chromadb)
  Using cached urllib3-2.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting pydantic-core==2.33.2 (from pydantic<3.0.0,>=2.7.4->langchain)
  Downloading pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Downloading pydantic-2.11.10-py3-none-any.whl (444 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m444.8/444.8 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Downloading pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-V2")
Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.
  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8f5df49715b5445f9a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


