### Overview

This project builds a chatbot that reads your own PDF files. It empowers users to chat with their own data and efficiently capture insights from their documents.

The demo uses an HR PDF document from Microsoft's fictional company, Contoso Electronics.

In [1]:
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader 
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template

In [9]:
def get_pdf_text(pdf_docs):
    """Return the concatenated text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources accepted by ``PdfReader`` (in this
            app, the files returned by the sidebar uploader). ``None`` or an
            empty iterable is treated as "no documents".

    Returns:
        One string holding the extracted text of all pages, in document and
        page order; ``''`` when there are no documents.
    """
    # Join once instead of growing a string page by page. ``or ''`` keeps a
    # page whose extraction yields nothing (e.g. None) from raising TypeError.
    return ''.join(
        page.extract_text() or ''
        for pdf in (pdf_docs or ())
        for page in PdfReader(pdf).pages
    )

In [12]:
def get_text_chunks(text):
    """Split ``text`` into chunks with ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, ``chunk_size`` 1000,
    ``chunk_overlap`` 200, and ``len`` as the length function.

    Args:
        text: The string to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        'separator': '\n',
        'chunk_size': 1000,
        'chunk_overlap': 200,
        'length_function': len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

In [None]:
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from ``text_chunks``.

    Args:
        text_chunks: The texts to index.

    Returns:
        The store produced by ``FAISS.from_texts`` using a freshly created
        ``OpenAIEmbeddings`` instance as the embedding.
    """
    return FAISS.from_texts(texts=text_chunks, embedding=OpenAIEmbeddings())

In [None]:
def get_conversation_chain(vectorstore):
    """Create a conversational retrieval chain backed by ``vectorstore``.

    Args:
        vectorstore: Object exposing ``as_retriever()``; its retriever is
            handed to the chain.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm`` with a
        ``ChatOpenAI`` model and a ``ConversationBufferMemory`` stored under
        the ``'chat_history'`` key.
    """
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    chat_model = ChatOpenAI()
    doc_retriever = vectorstore.as_retriever()
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=doc_retriever,
        memory=chat_memory,
    )

In [None]:
def handle_userinput(user_question):
    """Send ``user_question`` to the conversation chain and render the chat.

    The chain stored in ``st.session_state.conversation`` is called with the
    question, the returned ``'chat_history'`` is saved back to the session,
    and every message is written to the page: even positions use
    ``user_template``, odd positions use ``bot_template``.

    Args:
        user_question: The text the user typed.

    Returns:
        None. If no conversation chain exists yet, a warning is shown and
        nothing else happens.
    """
    # Local import so this function brings its own dependency into scope.
    import html

    conversation = st.session_state.conversation
    if conversation is None:
        # The session value starts as None until documents are processed;
        # calling it would raise "'NoneType' object is not callable".
        st.warning('Please upload and process your documents before asking a question.')
        return

    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        # The templates are rendered as raw HTML, so escape message text to
        # keep user/model content from being interpreted as markup.
        st.write(
            template.replace('{{MSG}}', html.escape(message.content)),
            unsafe_allow_html=True,
        )

In [None]:
# Sample text for trying out get_text_chunks() without uploading a PDF.
# NOTE: these are module-level statements, so the split below runs whenever
# this file is executed; main() assigns its own local raw_text/text_chunks and
# does not read these.
raw_text ='''
Welcome to Contoso Electronics! We are excited to offer our employees two comprehensive health insurance plans through Northwind Health.
Northwind Health Plus
Northwind Health Plus is a comprehensive plan that provides comprehensive coverage for medical, vision, and dental services. This plan also offers prescription drug coverage, mental health and substance abuse coverage, and coverage for preventive care services. With Northwind Health Plus, you can choose from a variety of in-network providers, including primary care physicians, specialists, hospitals, and pharmacies. This plan also offers coverage for emergency services, both in-network and out-of-network.
Northwind Standard
Northwind Standard is a basic plan that provides coverage for medical, vision, and dental services. This plan also offers coverage for preventive care services, as well as prescription drug coverage. With Northwind Standard, you can choose from a variety of in-network providers, including primary care physicians, specialists, hospitals, and pharmacies. This plan does not offer coverage for emergency services, mental health and substance abuse coverage, or out-of-network services.
Comparison of Plans
Both plans offer coverage for routine physicals, well-child visits, immunizations, and other preventive care services. The plans also cover preventive care services such as mammograms, colonoscopies, and other cancer screenings.
Northwind Health Plus offers more comprehensive coverage than Northwind Standard. This plan offers coverage for emergency services, both in-network and out-of-network, as well as mental health and substance abuse coverage. Northwind Standard does not offer coverage for emergency services, mental health and substance abuse coverage, or out-of-network services.
Both plans offer coverage for prescription drugs. Northwind Health Plus offers a wider range of prescription drug coverage than Northwind Standard. Northwind Health Plus covers generic, brand-name, and specialty drugs, while Northwind Standard only covers generic and brand-name drugs.
Both plans offer coverage for vision and dental services. Northwind Health Plus offers coverage for vision exams, glasses, and contact lenses, as well as dental exams, cleanings, and fillings. Northwind Standard only offers coverage for vision exams and glasses.
Both plans offer coverage for medical services. Northwind Health Plus offers coverage for hospital stays, doctor visits, lab tests, and X-rays. Northwind Standard only offers coverage for doctor visits and lab tests.
'''
# Split the sample into chunks using the same splitter settings as the app.
text_chunks = get_text_chunks(raw_text)

In [11]:
def _process_documents(pdf_docs):
    """Build the conversation chain from uploaded PDFs and store it in the session."""
    if not pdf_docs:
        st.warning('Please upload at least one PDF file before clicking "Process".')
        return

    # Show a spinner so the user sees progress while the documents are processed.
    with st.spinner("Processing..."):
        raw_text = get_pdf_text(pdf_docs)
        text_chunks = get_text_chunks(raw_text)
        if not text_chunks:
            # With no chunks there is nothing to embed or index, so stop
            # here instead of failing inside the vector store build.
            st.error('No text could be extracted from the uploaded files.')
            return

        vectorstore = get_vectorstore(text_chunks)
        st.session_state.conversation = get_conversation_chain(vectorstore)


def main():
    """Run the "Chat With PDF" Streamlit page.

    Loads environment variables, configures the page, seeds the session keys,
    answers the question typed in the main area, and offers a sidebar where
    PDF files can be uploaded and processed into a conversation chain.
    """
    load_dotenv()
    st.set_page_config(page_title='Chat With PDF', page_icon=':books:')
    st.write(css, unsafe_allow_html=True)

    # Seed the session keys so later code can read them unconditionally.
    if 'conversation' not in st.session_state:
        st.session_state.conversation = None
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = None

    st.header('Chat With PDF :books:')
    user_question = st.text_input('Ask a question about your document:')
    if user_question:
        handle_userinput(user_question)

    # Sidebar components go inside "with st.sidebar:" (no parentheses).
    with st.sidebar:
        st.subheader('Your Documents')
        pdf_docs = st.file_uploader(
            'Upload your PDF file and click on "Process"',
            type='pdf',  # only PDFs can be read by get_pdf_text
            accept_multiple_files=True,
        )
        # st.button returns True only on the run triggered by the click.
        if st.button('Process'):
            _process_documents(pdf_docs)
            
            

In [None]:
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == '__main__':
    main()