# QA over documents
Step-by-step tutorial for making a [question-answering application](https://python.langchain.com/docs/use_cases/question_answering/) over PDF documents with Gradio.

In [1]:
import os
import time
from pathlib import Path
import gradio as gr
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CustomChatbot:
    """Question-answering chatbot over PDF documents.

    The class loads PDF files, splits them into chunks, indexes the chunks in
    a FAISS vector store persisted under ``<db_path>/index_store``, and answers
    questions through a LangChain ``RetrievalQA`` chain. Several methods return
    Gradio textbox updates so they can be wired directly to UI events.
    """

    # Pause (in seconds) between characters while streaming an answer to the
    # chat window; kept as a class attribute so it can be tuned or disabled.
    STREAM_DELAY_SECONDS = 0.01

    def __init__(self, api_key: str, prompt_template: str, db_path: str = None,
                 chunk_size: int = 2000, chunk_overlap: int = 100):
        """Initialize CustomChatbot instance.

        Args:
            api_key: OpenAI API key. It is exported to the ``OPENAI_API_KEY``
                environment variable for the OpenAI clients created later.
            prompt_template: The prompt for the QA chain.
            db_path: Path to the directory where the vector database is stored.
                When omitted, ``../index_store`` is used.
            chunk_size: Size of chunks for text splitting.
            chunk_overlap: Overlap between chunks for text splitting.

        Raises:
            ValueError: If ``api_key`` is missing or empty.
        """
        # Assigning None to os.environ raises an opaque TypeError, so fail
        # early with a message that names the actual problem.
        if not api_key:
            raise ValueError(
                "An OpenAI API key is required; set the OPENAI_API_KEY "
                "environment variable or pass api_key explicitly."
            )
        os.environ['OPENAI_API_KEY'] = api_key
        self.prompt = PromptTemplate.from_template(prompt_template)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # The QA chain is created once documents are uploaded or an existing
        # document store is loaded.
        self.chain = None
        base_dir = Path(db_path) if db_path else Path('..')
        self.directory = base_dir / 'index_store'

    def set_chain(self, chain):
        """Store the QA chain used by ``generate_response``."""
        self.chain = chain

    def load_and_save_vector_db(self, documents: list, model_id: str, k_search: int):
        """Index the documents, persist the index and initialize a QA chain.

        Args:
            documents: Loaded documents to index.
            model_id: Name of the chat model used by the QA chain.
            k_search: Number of chunks to retrieve per question.
        """
        vectordb = self.create_vector_db_from_docs(documents)
        vectordb.save_local(self.directory)
        self.set_chain(self.create_qa_chain(vectordb, model_id, k_search))

    def create_vector_db_from_docs(self, documents: list):
        """Create a vector database from a list of documents.

        The documents are split with the configured chunk size and overlap
        before being embedded with ``OpenAIEmbeddings``.
        """
        text_splitter = CharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        texts = text_splitter.split_documents(documents)
        return FAISS.from_documents(texts, OpenAIEmbeddings())

    def load_pdf_files(self, files: list):
        """Load PDF files and return their contents.

        Args:
            files: Paths of the PDF files to load.

        Returns:
            A flat list with the documents produced by every loader.
        """
        documents = []
        for pdf_path in files:
            documents.extend(PyPDFLoader(pdf_path).load())
        return documents

    def create_qa_chain(self, vectordb, model_id: str, k_search: int):
        """Create a QA chain based on a vector database and a model identifier.

        Args:
            vectordb: Vector store used to build the similarity retriever.
            model_id: Name of the chat model.
            k_search: Number of chunks to retrieve per question.

        Returns:
            A ``RetrievalQA`` chain that also returns its source documents.
        """
        llm = ChatOpenAI(temperature=0, model_name=model_id)
        retriever = vectordb.as_retriever(
            search_type="similarity", search_kwargs={"k": k_search}
        )
        return RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            chain_type_kwargs={"prompt": self.prompt},
            return_source_documents=True
        )

    def upload_files(self, files: list, model_id: str, k_search: int):
        """Upload files and update the vector database.

        Args:
            files: Uploaded file objects; each one exposes its path as ``name``.
            model_id: Name of the chat model used by the QA chain.
            k_search: Number of chunks to retrieve per question.

        Returns:
            A textbox update that enables the chat input.
        """
        documents = self.load_pdf_files([str(fn.name) for fn in files])
        self.load_and_save_vector_db(documents, model_id, k_search)
        return gr.Textbox.update(placeholder='Type your message here', lines=1, interactive=True)

    def get_vector_db(self, model_id: str, k_search: int):
        """Get the existing vector database and initialize a QA chain if existing.

        Returns:
            A textbox update: the chat input is enabled when the store exists,
            otherwise it stays disabled with a hint to upload documents.
        """
        if not self.directory.exists():
            return gr.Textbox.update(
                placeholder='Document store does not exist, please upload document(s).',
                lines=1,
                interactive=False
            )
        vectordb = FAISS.load_local(self.directory, OpenAIEmbeddings())
        self.set_chain(self.create_qa_chain(vectordb, model_id, k_search))
        return gr.Textbox.update(placeholder='Type your message here', lines=1, interactive=True)

    @staticmethod
    def user(user_message: str, history: list):
        """Handle user messages.

        Args:
            user_message: Text the user just submitted.
            history: Conversation so far as ``[user, bot]`` pairs; ``None`` is
                treated as an empty conversation.

        Returns:
            A tuple of an empty string (to clear the input box) and a new
            history list with the pending ``[user_message, None]`` pair added.
        """
        return "", (history or []) + [[user_message, None]]

    @staticmethod
    def extract_source_and_page(documents: list):
        """Extract the source and page number from the source documents used to generate answers.

        Args:
            documents: Objects whose ``metadata`` holds ``'source'`` (a file
                path) and ``'page'`` (zero-based page index).

        Returns:
            A dict mapping each source file stem to its one-based page
            numbers, in the order the documents were given.
        """
        results = {}
        for document in documents:
            source = Path(document.metadata['source']).stem
            # Stored page indices are zero-based; readers expect 1-based.
            page = document.metadata['page'] + 1
            results.setdefault(source, []).append(page)
        return results

    def build_response_message(self, result: str, source_and_page: dict):
        """Build a response message with information about the source and page(s).

        Args:
            result: Answer text produced by the QA chain.
            source_and_page: Mapping of source name to page numbers.

        Returns:
            The answer followed by one line per source listing its pages in
            ascending order without repetitions.
        """
        response_message = f"{result}\n\nInformasjonen er hentet fra: \n"
        # Several retrieved chunks can come from the same page; de-duplicate
        # so a page is cited only once.
        source_and_page_str = ''.join(
            f'- Kilde: {source}, Side {sorted(set(pages))}\n'
            for source, pages in source_and_page.items()
        )
        return f"{response_message} {source_and_page_str}"

    def generate_response(self, history: list):
        """Generate a response for the user based on their query and update the conversation history.

        The last entry of ``history`` holds the pending question. The answer
        is written into that entry one character at a time and the history is
        yielded after every character so the UI can show a typing effect.

        Args:
            history: Conversation as ``[user, bot]`` pairs; mutated in place.

        Yields:
            The same ``history`` list after each appended character.
        """
        import logging  # local import keeps this block self-contained

        try:
            response = self.chain({"query": history[-1][0]})
            source_and_page = self.extract_source_and_page(response["source_documents"])
            response_message = self.build_response_message(response['result'], source_and_page)
        except Exception:
            # UI boundary: the user gets a friendly message, but the real
            # cause is logged with its traceback instead of being discarded.
            logging.getLogger(__name__).exception("QA chain failed to answer the query")
            response_message = (
                "Beklager, tekstens lengde overstiger modellens maksimale kapasitet for kontekstlengde. "
                "Vennligst reduser k-search eller bytt til en modell med større kontekstlengde."
            )

        history[-1][1] = ""

        for char in response_message:
            history[-1][1] += char
            yield history
            time.sleep(self.STREAM_DELAY_SECONDS)

In [3]:
# Prompt for the QA chain: {context} receives the retrieved document chunks and
# {question} the user's query; the model is asked to answer in Norwegian.
template = """
You are a knowledge bot. Use the following pieces of context to answer the question at the end. 
Provide a detailed answer if possible.
Write at the end that the user is responsible for checking that the information provided is correct (in Norwegian).
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}

Question: {question}

Helpful answer in Norwegian:
"""
# Get OpenAI API key, see the readme for more info
api_key = os.environ.get('OPENAI_API_KEY')
if not api_key:
    # Fail early with a clear message instead of an obscure error later on.
    raise RuntimeError("The OPENAI_API_KEY environment variable is not set; see the readme.")

# Create a ChatBot instance
chatbot_obj = CustomChatbot(api_key, template)

# Define UI elements and interactions
with gr.Blocks() as blocks:
    # Top row: model selection and number of retrieved chunks.
    with gr.Row():
        with gr.Column():
            model_dropdown = gr.Dropdown(
                choices=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4"],
                value='gpt-3.5-turbo',
                interactive=True,
                label="Select model",
                info="Click on the 📁 or 🗄️ button if you want to change the model during the conversation."
            )
        with gr.Column(scale=0.05):
            k_search = gr.Number(
                minimum=1,
                maximum=5,
                value=3,
                step=1,
                precision=0,
                label='K search',
                interactive=True,
                info="Number of search results to retrieve from vector database."
            )

    # Conversation window.
    with gr.Row():
        chat_ui = gr.Chatbot(elem_id="chatbot", height=300)

    # Message input; disabled until a document store has been loaded.
    # (`height` is not a Textbox parameter and only triggered a warning,
    # so it is no longer passed.)
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                show_label=False,
                lines=2,
                placeholder="Please upload document(s) or use existing document store to use the chatbot",
                interactive=False,
                container=False
            )

    # Action buttons: both loaders (re)build the QA chain with the currently
    # selected model and k, then update the input box.
    with gr.Row():
        with gr.Column(scale=0.1):
            db_button = gr.Button("🗄️ Use document store")
            db_button.click(
                chatbot_obj.get_vector_db,
                [model_dropdown, k_search],
                outputs=[input_text]
            )
        with gr.Column(scale=0.1):
            upload_button = gr.UploadButton(
                "📁 Upload PDF document(s)",
                file_types=[".pdf"],
                file_count="multiple"
            )
            upload_button.upload(
                chatbot_obj.upload_files,
                [upload_button, model_dropdown, k_search],
                outputs=[input_text]
            )

        with gr.Column(scale=0.1):
            clear_button = gr.Button("🗑️ Clear chat")
            # Returning None resets the chat window.
            clear_button.click(lambda: None, None, chat_ui, queue=False)

    # On submit: first append the user message and clear the input box, then
    # stream the generated answer into the chat window.
    input_text.submit(
        chatbot_obj.user,
        [input_text, chat_ui],
        [input_text, chat_ui]
    ).then(
        chatbot_obj.generate_response,
        inputs=chat_ui,
        outputs=chat_ui
    )

# Launch UI
# Queuing is enabled because generate_response is a generator (streamed output).
blocks.queue()
blocks.launch()

  input_text = gr.Textbox(


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




3 <class 'int'>
