In [None]:
# Requirement
!pip install openai -q
!pip install langchain -q
!pip install chromadb -q
!pip install tiktoken -q
!pip install unstructured[local-inference] -q
!pip install pypdf
!pip install gradio -q
!pip install python-dotenv
!pip install tabulate

In [None]:
# NB: the default Google Colab runtime loads PIL 8.4.0, which does not work with the
# unstructured document loader (see the dependency installed above).
# Print the version currently loaded so it can be checked.
import PIL

loaded_pil_version = PIL.__version__
print(loaded_pil_version)

In [None]:
!pip uninstall Pillow
!pip install --upgrade Pillow
print(PIL.__version__)

In [None]:
import os

from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Load environment variables from a local .env file; OPENAI_API_KEY is expected to
# come from there. Alternatively, set os.environ["OPENAI_API_KEY"] before running
# this cell. Never print the key or commit it with the notebook.
load_dotenv()

# GPT-4 chat model with temperature 0.
# NOTE(review): no later cell visible here references `llm`; the retrieval chain
# constructs its own ChatOpenAI instance. Confirm which model is intended.
llm = ChatOpenAI(temperature=0, model_name="gpt-4")

In [None]:
# Data ingestion: load the PDF, text and Word files found under ./Reports.
from langchain.document_loaders import DirectoryLoader

# One DirectoryLoader per file type, each selecting files with a `**/*.<ext>` glob.
# NOTE(review): an earlier comment in this cell said DirectoryLoader was not working
# for PDF files and mentioned a temporary PDFMinerLoader/PyPDFLoader workaround, yet
# the PDF loader is still a DirectoryLoader. Confirm that PDFs are actually loaded.
pdf_loader, txt_loader, word_loader = (
    DirectoryLoader('./Reports', glob=pattern)
    for pattern in ("**/*.pdf", "**/*.txt", "**/*.docx")
)
loaders = [pdf_loader, txt_loader, word_loader]

# Flatten the per-loader results into a single list, keeping the loader order.
documents = [doc for loader in loaders for doc in loader.load()]

print(f"Total number of documents: {len(documents)}")

In [None]:
# Chunk and Embeddings
# Split the loaded documents using chunk_size=1000 and no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# Keep the chunks under their own name instead of rebinding `documents`, so that
# re-running this cell always starts from the originally loaded documents and the
# raw documents stay available for inspection.
chunks = text_splitter.split_documents(documents)

# Embed the chunks with OpenAI embeddings and index them in a Chroma vector store.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(chunks, embeddings)

In [None]:
from langchain.memory import ConversationBufferMemory

# Conversational retrieval chain over the vector store, with conversation memory.
# The memory exposes the dialogue under 'chat_history' as message objects.
# output_key='answer' names the chain output to record (presumably required because
# the chain also returns source documents).
memory = ConversationBufferMemory(
    memory_key='chat_history',
    output_key='answer',
    return_messages=True,
)

# NOTE(review): no model_name is passed here, so this chain runs on ChatOpenAI's
# default model rather than the gpt-4 `llm` created in the setup cell. Confirm
# that this is intended.
qa = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(temperature=0),
    retriever=vectorstore.as_retriever(),
    memory=memory,
    # Hand the stored history to the chain unchanged.
    get_chat_history=lambda history: history,
    return_source_documents=True,
)


In [None]:
# Front end web app
# NOTE(review): the four langchain imports below are not used by this cell; they are
# kept in case cells outside this view rely on them.
from langchain.memory import ChatMessageHistory
from langchain.schema import messages_from_dict, messages_to_dict
from langchain.llms import OpenAI
from langchain.chains import ConversationChain


import gradio as gr


def respond(user_message, chat_history):
    """Answer one user message and append the exchange to the visible transcript.

    Args:
        user_message: Text submitted from the textbox.
        chat_history: Current value of the Chatbot component, a list of
            (user, bot) pairs. May be empty or None.

    Returns:
        A pair of (update that empties the textbox, updated transcript).
    """
    chat_history = chat_history or []

    # Ignore blank submissions instead of sending an empty question to the chain.
    if not user_message or not user_message.strip():
        return gr.update(value=""), chat_history

    # The chain's own memory supplies 'chat_history', so only the question is passed.
    # The Gradio transcript is used purely for display.
    response = qa({"question": user_message})
    chat_history.append((user_message, response["answer"]))
    return gr.update(value=""), chat_history


def reset_conversation():
    """Clear the chain's conversation memory and empty the chat window.

    Without clearing the memory, the chain would keep answering with the old
    conversation as context even though the transcript looks empty.
    """
    if qa.memory is not None:
        qa.memory.clear()
    return None


# NOTE(review): `qa` and its memory are module-level, so every browser session that
# connects to this app shares one conversation.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    msg.submit(respond, [msg, chatbot], [msg, chatbot], queue=False)
    clear.click(reset_conversation, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch(debug=True)