In [None]:
import os

# Set the USER_AGENT environment variable
os.environ['USER_AGENT'] = 'PDF_QA/1.0'

# Third-party imports (placed after USER_AGENT is set above)
import gradio as gr
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import CharacterTextSplitter
import ollama
##############
from langchain.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader
##############
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings


In [None]:
def process_input(pdfs, question):
    """Answer a question using only the text of the uploaded PDF files.

    The PDFs are loaded page by page, split into overlapping chunks, embedded
    with the ``nomic-embed-text`` Ollama model into a temporary Chroma
    collection, and the retrieved chunks are passed to the ``mistral`` chat
    model together with the question.

    Args:
        pdfs: Uploaded files from the Gradio ``File`` component. Each item may
            be a plain path string or an object whose ``name`` attribute holds
            the path.
        question: The user's question.

    Returns:
        The model's answer as a string, or a short explanatory message when
        there is nothing to query (no files, blank question, no text found).
    """
    import uuid  # function-scope import: only used to name the temporary collection

    def _format_docs(retrieved_docs):
        # Feed the prompt the chunk text only, not the repr of Document objects.
        return "\n\n".join(doc.page_content for doc in retrieved_docs)

    # Validate cheap inputs first so no model or embedding work is wasted.
    if not pdfs:
        return "Please upload at least one PDF file."
    if not question or not question.strip():
        return "Please enter a question."

    model_local = ChatOllama(model="mistral")

    # Load PDFs (accept both path strings and file objects exposing `.name`)
    docs_list = []
    for pdf in pdfs:
        pdf_path = str(pdf) if isinstance(pdf, str) else pdf.name
        docs_list.extend(PyPDFLoader(pdf_path).load())

    # Split text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=7500,         # Upper bound on the size of each chunk
        chunk_overlap=100,       # Overlap to ensure context retention
    )
    doc_splits = text_splitter.split_documents(docs_list)
    if not doc_splits:
        return "No extractable text was found in the uploaded PDF files."

    # Define the prompt template
    after_rag_template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    """
    after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)

    # Add to vector database. A unique collection name keeps this request's
    # chunks separate; a fixed name can be reused across calls, letting chunks
    # from earlier uploads show up in later answers.
    vector_db = Chroma.from_documents(
        documents=doc_splits,
        embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
        collection_name=f"rag-chroma-{uuid.uuid4().hex}",
    )
    try:
        retriever = vector_db.as_retriever()
        after_rag_chain = (
            {"context": retriever | _format_docs, "question": RunnablePassthrough()}
            | after_rag_prompt
            | model_local
            | StrOutputParser()
        )

        # Run the chain and return the result
        return after_rag_chain.invoke(question)
    finally:
        # The collection is only needed for this one question; drop it again.
        vector_db.delete_collection()

In [None]:
# Define Gradio interface
iface = gr.Interface(
    fn=process_input,
    inputs=[
        # Gradio expects file extensions with a leading dot (".pdf", not "pdf").
        gr.File(label="Upload PDF files", file_count="multiple", file_types=[".pdf"]),
        gr.Textbox(label="Question"),
    ],
    outputs="text",
    title="PDF Query with Ollama",
    description="Upload PDF files and enter a question to query the documents.",
)
iface.launch()

In [None]:
import os
# Set the USER_AGENT environment variable
os.environ['USER_AGENT'] = 'DocumentQA/1.0'
import gradio as gr
from langchain_community.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
class DocumentProcessor:
    """Load uploaded files, split them into chunks and index them in Chroma.

    Supported extensions are ``pdf``, ``txt`` and ``docx``; files with any
    other extension are skipped.
    """

    def __init__(self):
        import uuid  # function-scope import: only needed to name the collection

        # Documents accumulated by load_documents(), in upload order.
        self.docs_list = []
        # Chroma vector store; stays None until process_documents() succeeds.
        self.vector_db = None
        # Per-instance collection name so separate processors do not keep
        # appending to one shared "rag-chroma" collection.
        self.collection_name = f"rag-chroma-{uuid.uuid4().hex}"

    def load_documents(self, files):
        """Load every supported file in ``files`` into ``self.docs_list``.

        Args:
            files: Iterable of uploads; each item is either a path string or
                an object whose ``name`` attribute holds the path. ``None`` is
                treated as "no files".
        """
        loaders = {
            'pdf': self._load_pdf_file,
            'txt': self._load_text_file,
            'docx': self._load_docx_file,
            # Add more file type loaders here if needed
        }

        for file in files or []:
            file_path = str(file) if isinstance(file, str) else file.name
            ext = os.path.splitext(file_path)[1].lstrip('.').lower()
            loader = loaders.get(ext)
            if loader is None:
                continue  # unsupported file types are skipped

            # Ensure docs are in the expected format
            for doc in loader(file_path):
                self.docs_list.append(self._as_document(doc, file_path))

    @staticmethod
    def _load_pdf_file(file_path):
        """Return one Document per page of the PDF at ``file_path``."""
        return PyPDFLoader(file_path).load()

    @staticmethod
    def _load_docx_file(file_path):
        """Return the Documents parsed from the Word file at ``file_path``."""
        # Imported lazily so the class works without the docx extras installed
        # as long as no .docx file is uploaded.
        from langchain_community.document_loaders import UnstructuredWordDocumentLoader

        return UnstructuredWordDocumentLoader(file_path).load()

    def _load_text_file(self, file_path):
        """Read a UTF-8 text file and return it as a single-Document list."""
        from langchain_core.documents import Document

        with open(file_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
        # split_documents() reads `.page_content`, so a real Document is required.
        return [Document(page_content=text, metadata={"source": file_path})]

    @staticmethod
    def _as_document(doc, file_path):
        """Coerce a loader result (Document, dict or str) into a Document."""
        if hasattr(doc, "page_content"):
            return doc

        from langchain_core.documents import Document

        if isinstance(doc, dict):
            return Document(
                page_content=doc.get("page_content", ""),
                metadata=doc.get("metadata", {"source": file_path}),
            )
        return Document(page_content=str(doc), metadata={"source": file_path})

    def process_documents(self):
        """Split the loaded documents and index the chunks in Chroma.

        Raises:
            ValueError: If nothing has been loaded, or the loaded documents
                produce no chunks.
        """
        if not self.docs_list:
            raise ValueError("No documents have been loaded. Call load_documents() with supported files first.")

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=7500,         # Upper bound on the size of each chunk
            chunk_overlap=100,       # Overlap to ensure context retention
        )
        doc_splits = text_splitter.split_documents(self.docs_list)
        if not doc_splits:
            raise ValueError("The loaded documents contain no text to index.")

        # Create the vectorstore only once
        self.vector_db = Chroma.from_documents(
            documents=doc_splits,
            embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
            collection_name=self.collection_name
        )

    def get_retriever(self):
        """Return a retriever over the indexed chunks.

        Raises:
            ValueError: If process_documents() has not been called yet.
        """
        if self.vector_db is None:
            raise ValueError("Documents have not been processed. Call process_documents() first.")
        return self.vector_db.as_retriever()

class QuestionAnsweringSystem:
    """Retrieval-augmented question answering over uploaded documents.

    Combines a DocumentProcessor (loading + vector index) with the local
    ``mistral`` Ollama chat model.
    """

    def __init__(self):
        self.processor = DocumentProcessor()
        self.model_local = ChatOllama(model="mistral")
        self.prompt_template = """Answer the question based only on the following context:
        {context}
        Question: {question}
        """

    def load_and_process(self, files):
        """Load ``files`` and build the vector index used for retrieval."""
        self.processor.load_documents(files)
        self.processor.process_documents()

    @staticmethod
    def _format_docs(docs):
        """Join the text of the retrieved chunks into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    def answer_question(self, question):
        """Answer ``question`` from the indexed documents.

        Args:
            question: The user's question.

        Returns:
            The model's answer, or a prompt to enter a question when
            ``question`` is empty or whitespace.
        """
        if not question or not question.strip():
            return "Please enter a question."

        retriever = self.processor.get_retriever()
        after_rag_prompt = ChatPromptTemplate.from_template(self.prompt_template)
        after_rag_chain = (
            # Without formatting, {context} would receive the repr of a list of
            # Document objects instead of the chunk text.
            {"context": retriever | self._format_docs, "question": RunnablePassthrough()}
            | after_rag_prompt
            | self.model_local
            | StrOutputParser()
        )
        return after_rag_chain.invoke(question)

In [None]:
# Define Gradio interface
def process_input(files, question):
    """Gradio callback: index the uploaded files and answer the question.

    Args:
        files: Uploaded files from the Gradio ``File`` component (may be
            ``None`` when nothing was uploaded).
        question: The user's question.

    Returns:
        The answer text, or a short message when input is missing.
    """
    # Check the inputs before any loading or embedding work is started.
    if not files:
        return "Please upload at least one document."
    if not question or not question.strip():
        return "Please enter a question."

    qa_system = QuestionAnsweringSystem()
    qa_system.load_and_process(files)
    return qa_system.answer_question(question)

iface = gr.Interface(
    fn=process_input,
    inputs=[
        # Gradio expects file extensions with a leading dot (".pdf", not "pdf").
        gr.File(label="Upload Documents", file_count="multiple", file_types=[".pdf", ".txt", ".docx"]),
        gr.Textbox(label="Question"),
    ],
    outputs="text",
    title="Document Query with Ollama",
    description="Upload documents in various formats and enter a question to query the content.",
)
iface.launch()

# Working perfectly fine

In [None]:
import os
# Set the USER_AGENT environment variable
os.environ['USER_AGENT'] = 'DocumentQA/1.0'
import gradio as gr
from langchain_community.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document  # Import the Document class



class DocumentProcessor:
    """Load uploaded files, split them into chunks and index them in Chroma.

    Supported extensions are ``pdf``, ``txt`` and ``docx``; files with any
    other extension are skipped.
    """

    def __init__(self):
        import uuid  # function-scope import: only needed to name the collection

        # Documents accumulated by load_documents(), in upload order.
        self.docs_list = []
        # Chroma vector store; stays None until process_documents() succeeds.
        self.vector_db = None
        # Per-instance collection name so separate processors do not keep
        # appending to one shared "rag-chroma" collection.
        self.collection_name = f"rag-chroma-{uuid.uuid4().hex}"

    def load_documents(self, files):
        """Load every supported file in ``files`` into ``self.docs_list``.

        Args:
            files: Iterable of uploads; each item is either a path string or
                an object whose ``name`` attribute holds the path. ``None`` is
                treated as "no files".
        """
        loaders = {
            'pdf': self._load_pdf_file,
            'txt': self._load_text_file,
            'docx': self._load_docx_file,
            # Add more file type loaders here if needed
        }

        for file in files or []:
            file_path = str(file) if isinstance(file, str) else file.name
            ext = os.path.splitext(file_path)[1].lstrip('.').lower()
            loader = loaders.get(ext)
            if loader is None:
                continue  # unsupported file types are skipped

            # Ensure docs are in the expected format
            for doc in loader(file_path):
                self.docs_list.append(self._as_document(doc, file_path))

    @staticmethod
    def _load_pdf_file(file_path):
        """Return one Document per page of the PDF at ``file_path``."""
        return PyPDFLoader(file_path).load()

    @staticmethod
    def _load_docx_file(file_path):
        """Return the Documents parsed from the Word file at ``file_path``."""
        # Imported lazily so the class works without the docx extras installed
        # as long as no .docx file is uploaded.
        from langchain_community.document_loaders import UnstructuredWordDocumentLoader

        return UnstructuredWordDocumentLoader(file_path).load()

    def _load_text_file(self, file_path):
        """Read a UTF-8 text file and return it as a single-Document list."""
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
        # Return in the expected document format
        return [Document(page_content=text, metadata={"source": file_path})]

    @staticmethod
    def _as_document(doc, file_path):
        """Coerce a loader result (Document, dict or str) into a Document."""
        if isinstance(doc, Document):
            return doc
        if isinstance(doc, dict):
            return Document(
                page_content=doc.get('page_content', ""),
                metadata=doc.get('metadata', {"source": file_path}),
            )
        # Plain strings (or anything else) have no .get(); wrap their text.
        return Document(page_content=str(doc), metadata={"source": file_path})

    def process_documents(self):
        """Split the loaded documents and index the chunks in Chroma.

        Raises:
            ValueError: If nothing has been loaded, or the loaded documents
                produce no chunks.
        """
        if not self.docs_list:
            raise ValueError("No documents have been loaded. Call load_documents() with supported files first.")

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=7500,         # Upper bound on the size of each chunk
            chunk_overlap=100,       # Overlap to ensure context retention
        )
        doc_splits = text_splitter.split_documents(self.docs_list)
        if not doc_splits:
            raise ValueError("The loaded documents contain no text to index.")

        # Create the vectorstore only once
        self.vector_db = Chroma.from_documents(
            documents=doc_splits,
            embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
            collection_name=self.collection_name
        )

    def get_retriever(self):
        """Return a retriever over the indexed chunks.

        Raises:
            ValueError: If process_documents() has not been called yet.
        """
        if self.vector_db is None:
            raise ValueError("Documents have not been processed. Call process_documents() first.")
        return self.vector_db.as_retriever()

class QuestionAnsweringSystem:
    """Retrieval-augmented question answering over uploaded documents.

    Combines a DocumentProcessor (loading + vector index) with the local
    ``mistral`` Ollama chat model.
    """

    def __init__(self):
        self.processor = DocumentProcessor()
        self.model_local = ChatOllama(model="mistral")
        self.prompt_template = """Answer the question based only on the following context:
        {context}
        Question: {question}
        """

    def load_and_process(self, files):
        """Load ``files`` and build the vector index used for retrieval."""
        self.processor.load_documents(files)
        self.processor.process_documents()

    @staticmethod
    def _format_docs(docs):
        """Join the text of the retrieved chunks into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    def answer_question(self, question):
        """Answer ``question`` from the indexed documents.

        Args:
            question: The user's question.

        Returns:
            The model's answer, or a prompt to enter a question when
            ``question`` is empty or whitespace.
        """
        if not question or not question.strip():
            return "Please enter a question."

        retriever = self.processor.get_retriever()
        after_rag_prompt = ChatPromptTemplate.from_template(self.prompt_template)
        after_rag_chain = (
            # Without formatting, {context} would receive the repr of a list of
            # Document objects instead of the chunk text.
            {"context": retriever | self._format_docs, "question": RunnablePassthrough()}
            | after_rag_prompt
            | self.model_local
            | StrOutputParser()
        )
        return after_rag_chain.invoke(question)

# Define Gradio interface
def process_input(files, question):
    """Gradio callback: index the uploaded files and answer the question.

    Args:
        files: Uploaded files from the Gradio ``File`` component (may be
            ``None`` when nothing was uploaded).
        question: The user's question.

    Returns:
        The answer text, or a short message when input is missing.
    """
    # Check the inputs before any loading or embedding work is started.
    if not files:
        return "Please upload at least one document."
    if not question or not question.strip():
        return "Please enter a question."

    qa_system = QuestionAnsweringSystem()
    qa_system.load_and_process(files)
    return qa_system.answer_question(question)

iface = gr.Interface(
    fn=process_input,
    inputs=[
        # Gradio expects file extensions with a leading dot (".pdf", not "pdf").
        gr.File(label="Upload Documents", file_count="multiple", file_types=[".pdf", ".txt", ".docx"]),
        gr.Textbox(label="Question"),
    ],
    outputs="text",
    title="Document Query with Ollama",
    description="Upload documents in various formats and enter a question to query the content.",
)
iface.launch()


# New implementation:

In [None]:
import os
# Set the USER_AGENT environment variable
os.environ['USER_AGENT'] = 'DocumentQA/1.0'
import gradio as gr
from langchain_community.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain_chroma import Chroma  # Updated import
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document



class DocumentProcessor:
    """Load uploaded files and add their chunks to an on-disk Chroma store.

    Supported extensions are ``pdf``, ``txt`` and ``docx``; files with any
    other extension are skipped.
    """

    def __init__(self, db_directory="vector_db"):
        """Open (or create) the "rag-chroma" collection under ``db_directory``."""
        # Initialize or load the vector database from disk
        self.embeddings = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)
        self.vector_db = Chroma(persist_directory=db_directory,
                                embedding_function=self.embeddings,
                                collection_name="rag-chroma")

    def load_documents(self, files):
        """Load every supported file in ``files`` and add it to the store.

        Args:
            files: Iterable of uploads; each item is either a path string or
                an object whose ``name`` attribute holds the path. ``None`` is
                treated as "no files".
        """
        docs_list = []
        loaders = {
            'pdf': self._load_pdf_file,
            'txt': self._load_text_file,
            'docx': self._load_docx_file,
            # Add more file type loaders here if needed
        }

        for file in files or []:
            file_path = str(file) if isinstance(file, str) else file.name
            ext = os.path.splitext(file_path)[1].lstrip('.').lower()
            loader = loaders.get(ext)
            if loader is None:
                continue  # unsupported file types are skipped

            # Ensure docs are in the expected format
            for doc in loader(file_path):
                docs_list.append(self._as_document(doc, file_path))

        # Process and add the documents to the vector store
        self._process_and_store_documents(docs_list)

    @staticmethod
    def _load_pdf_file(file_path):
        """Return one Document per page of the PDF at ``file_path``."""
        return PyPDFLoader(file_path).load()

    @staticmethod
    def _load_docx_file(file_path):
        """Return the Documents parsed from the Word file at ``file_path``."""
        # Imported lazily so the class works without the docx extras installed
        # as long as no .docx file is uploaded.
        from langchain_community.document_loaders import UnstructuredWordDocumentLoader

        return UnstructuredWordDocumentLoader(file_path).load()

    def _load_text_file(self, file_path):
        """Read a UTF-8 text file and return it as a single-Document list."""
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
        return [Document(page_content=text, metadata={"source": file_path})]

    @staticmethod
    def _as_document(doc, file_path):
        """Coerce a loader result (Document, dict or str) into a Document."""
        if isinstance(doc, Document):
            return doc
        if isinstance(doc, dict):
            return Document(
                page_content=doc.get('page_content', ""),
                metadata=doc.get('metadata', {"source": file_path}),
            )
        # Plain strings (or anything else) have no .get(); wrap their text.
        return Document(page_content=str(doc), metadata={"source": file_path})

    def _process_and_store_documents(self, docs_list):
        """Split ``docs_list`` into chunks and add them to the vector store.

        Does nothing when there are no documents or no resulting chunks, so
        the store is never called with an empty batch.
        """
        if not docs_list:
            return

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=7500,
            chunk_overlap=100,
        )
        doc_splits = text_splitter.split_documents(docs_list)
        if not doc_splits:
            return

        # Add documents to the vector store
        self.vector_db.add_documents(doc_splits)
        # No need to explicitly persist; the vector store should handle it automatically

    def get_retriever(self):
        """Return a retriever over everything stored in the collection."""
        return self.vector_db.as_retriever()

class QuestionAnsweringSystem:
    """Retrieval-augmented question answering over the stored documents.

    Combines a DocumentProcessor (loading + vector store) with the local
    ``mistral`` Ollama chat model.
    """

    def __init__(self):
        self.processor = DocumentProcessor()
        self.model_local = ChatOllama(model="mistral")
        self.prompt_template = """Answer the question based only on the following context:
        {context}
        Question: {question}
        """

    def load_and_process(self, files):
        """Load ``files`` and add their chunks to the vector store."""
        self.processor.load_documents(files)

    @staticmethod
    def _format_docs(docs):
        """Join the text of the retrieved chunks into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    def answer_question(self, question):
        """Answer ``question`` from the stored documents.

        Args:
            question: The user's question.

        Returns:
            The model's answer, or a prompt to enter a question when
            ``question`` is empty or whitespace.
        """
        if not question or not question.strip():
            return "Please enter a question."

        retriever = self.processor.get_retriever()
        after_rag_prompt = ChatPromptTemplate.from_template(self.prompt_template)
        after_rag_chain = (
            # Without formatting, {context} would receive the repr of a list of
            # Document objects instead of the chunk text.
            {"context": retriever | self._format_docs, "question": RunnablePassthrough()}
            | after_rag_prompt
            | self.model_local
            | StrOutputParser()
        )
        return after_rag_chain.invoke(question)

# Gradio Interfaces for Uploading Documents and Asking Questions
def upload_documents(files):
    """Gradio callback: add the uploaded files to the vector store.

    Args:
        files: Uploaded files from the Gradio ``File`` component (``None``
            or empty when the button is clicked without a selection).

    Returns:
        A status message for the "Upload Status" textbox.
    """
    # Do not report success (or build the QA system) when nothing was selected.
    if not files:
        return "No documents were selected. Please choose at least one file."

    qa_system = QuestionAnsweringSystem()
    qa_system.load_and_process(files)
    return "Documents uploaded and processed successfully!"

def query_documents(question):
    """Gradio callback: answer ``question`` from the stored documents.

    Args:
        question: Text from the "Question" textbox.

    Returns:
        The answer text, or a prompt to enter a question when it is blank.
    """
    # Skip opening the vector store and model for an empty question.
    if not question or not question.strip():
        return "Please enter a question."

    qa_system = QuestionAnsweringSystem()
    return qa_system.answer_question(question)

# Two-tab UI: one tab to add documents to the store, one to query them.
with gr.Blocks() as iface:
    with gr.Tab("Upload Documents"):
        # Gradio expects file extensions with a leading dot (".pdf", not "pdf").
        file_input = gr.File(label="Upload Documents", file_count="multiple", file_types=[".pdf", ".txt", ".docx"])
        upload_button = gr.Button("Upload")
        upload_output = gr.Textbox(label="Upload Status")
        upload_button.click(upload_documents, inputs=file_input, outputs=upload_output)

    with gr.Tab("Query Documents"):
        question_input = gr.Textbox(label="Question")
        query_button = gr.Button("Ask")
        query_output = gr.Textbox(label="Answer")
        query_button.click(query_documents, inputs=question_input, outputs=query_output)

iface.launch()
