## Load .env file

In [299]:
import openai, os
from dotenv import load_dotenv

load_dotenv()
openai.api_type = "azure"
openai.api_version = "2023-07-01-preview"


## Load pdf file

In [300]:
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

# Load and process all pdf files in chosen directory
loader = DirectoryLoader('./data', glob="./*.pdf", loader_cls=PyPDFLoader)

original_documents = loader.load()


In [301]:
# Function to adjust page numbers in the original documents
def adjust_page_numbers(documents):
    for doc in documents:
        # Convert from 0-based to 1-based
        doc.metadata['page'] += 1  # Directly modify the original document
    return documents  # Return the modified documents (optional)

# Adjust page numbers
documents = adjust_page_numbers(original_documents)

## Text splitting

In [302]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the text into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=[".", ":", "&", "\n\n", "\n", " ", ""]
    )
texts = text_splitter.split_documents(documents)

texts[5]


Document(metadata={'source': 'data/BonBon FAQ.pdf', 'page': 3}, page_content='. These incidents might involve \nissues that impact individual productivity or specific tasks, but there are workarounds \navailable.  \n• Example: Inability to access a shared network folder required for a project, a software \napplication crashing intermittently, or a printer not functioning for a team that relies on \nhard copies for meetings.  \n \n3. Priority Level: High  \n• Description: Incidents categorized as "High" priority are critical and have a severe impact on \nthe user\'s ability to perform essential tasks. These incidents usually result in a significant \ndisruption of work for the affected individual or team and require  immediate attention.  \n• Example: Complete network outage preventing all users from accessing critical systems, a \nserver malfunction affecting multiple users\' access to essential data, or an email service \noutage preventing communication for the entire organization')

## Import embedding

In [303]:
from langchain_openai import AzureOpenAIEmbeddings

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
OPENAI_DEPLOYMENT_VERSION = "2024-06-01"
OPENAI_EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

embeddings = AzureOpenAIEmbeddings(
    model=OPENAI_EMBEDDING_MODEL_NAME,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    chunk_size=1,
    api_key=OPENAI_API_KEY,
    api_version=OPENAI_DEPLOYMENT_VERSION,
)

## Create DB

In [304]:
from langchain.vectorstores import Chroma

# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'vectorstore_db'

vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=embeddings,
                                 persist_directory=persist_directory)

In [305]:
# Persist the db to disk, you can run this code only once
vectordb.persist()
vectordb = None

In [306]:
# Now we can load the persisted database from disk, and use it as normal. 
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embeddings)

## Create retriever (for testing purpose)

In [307]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})
answers = retriever.get_relevant_documents("How can i reset my password?")

## Import the model

In [308]:
from langchain_openai import AzureChatOpenAI

OPENAI_MODEL_NAME = "gpt-4o"

llm = AzureChatOpenAI(
    deployment_name=OPENAI_MODEL_NAME,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    api_version=OPENAI_DEPLOYMENT_VERSION,
    api_key=OPENAI_API_KEY,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=3
)

## Use chaining to test the current llm and retriever

In [309]:
from langchain.chains import RetrievalQA

# Combine the retriever and LLM into a chain for question answering
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True
)

# Ask a question
question = "How can I reset my password?"
result = qa_chain({"query": question}) 

# Extract the answer and source documents
answer = result["result"]
source_documents = result["source_documents"]

# Print the top 2 source documents with page number and content snippet
for idx, doc in enumerate(source_documents, 1):
    # Adjust the page number from 0-indexed to 1-indexed
    page_num = doc.metadata.get("page", "Unknown Page")
    if isinstance(page_num, int):
        page_num += 1
    
    print(f"\nDocument {idx}:")
    print(f"Source: {doc.metadata.get('source', 'Unknown Source')} (Page {page_num})")
    print(f"Content Snippet: {doc.page_content[:1000]}...")




Document 1:
Source: data/BonBon FAQ.pdf (Page 3)
Content Snippet: .  
• Remember, the priority levels may vary based on the organization's policies and the nature 
of the incident. It's important to have well -defined criteria for determining the priority of 
incidents so that the Service Desk team can effectively allocate resour ces and provide timely 
support.  
 
Frequently Asked Questions:  
 
1) Q: How do I reset my password?  
A: Go to “Where to Reset my Password for which application” web page @ the following link – 
www.anycorp.intranet.passwordreset/com . There you will be able to select application for which you 
need to reset your password and will receive further instructions....

Document 2:
Source: data/BonBon FAQ.pdf (Page 4)
Content Snippet: .  
• Remember, the priority levels may vary based on the organization's policies and the nature 
of the incident. It's important to have well -defined criteria for determining the priority of 
incidents so that the Service Desk te

## Define tools

In [310]:
from langchain.agents import Tool
from duckduckgo_search import DDGS

# Custom DuckDuckGo Chat Function
def duckduckgo_chat_function(query: str) -> str:
    """Function to use DuckDuckGo's chat capabilities."""
    # Call DDGS().chat with the specified model
    return DDGS().chat(query, model='claude-3-haiku')

# DuckDuckGo Internet Search Tool
duckduckgo_tool = Tool(
    name="Internet Search",
    description="Search the internet for answers using DuckDuckGo",
    func=duckduckgo_chat_function
)

# Knowledge Base Search Tool (retriever from Chroma vector store)
knowledge_base_tool = Tool(
    name="Knowledge Base Search (BonBon FAQ)",
    description="Search the BonBon FAQ.pdf",
    func=vectordb.as_retriever(search_kwargs={"k": 1}).get_relevant_documents
)

tools = [duckduckgo_tool, knowledge_base_tool]


## Define prompt template

In [2]:
from langchain import hub

template = """
Answer the following questions as best you can. You can use history {chat_history} to fill in unknown context. You have access to the following tools:

{tools}

Use the following format:
Remember: If any invalid format occurs, terminate and return answer
Question: the input question you must answer
Thought: you should always think about what to do
Action (if existed): the action to take, find in Search BonBon tool first, then should be one of [{tool_names}] (one of [Internet Search, Knowledge Base Search (BonBon FAQ)]). If not, terminate answer
Action Input (if presented): the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat at most 5 times.)
Thought: I now know the final answer. Let's return the answer
Final Answer: the final answer to the original input question. If use Search BonBon then include the page of the PDF file that has the question

Important: These keywords must never be translated and transformed. If one or some of them are not found, 
do not show that line and continue to the next line. If the final answer is not found, return "Answer not found". Do 
not concatenate these lines with the previous or next lines. Do not show repetitive answers or questions.
- Question:
- Action:
- Thought:
- Action Input:
- Observation:
- Final Answer:

Remember: Remember to return the page number of the PDF file that has the question if you use the BonBon FAQ.

Remember: Add newline before print out the 'Final Answer' and the 'Thought', and after the action input has been provided.

Begin!

Current conversation:
Chat History:
{chat_history}
Last line:
Human: {input}
Thought: {agent_scratchpad}
"""
prompt_react = hub.pull("hwchase17/react")
prompt_react.template 

print(prompt_react.template)



Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}


## Define the Agent with Conversational Memory

In [312]:
from langchain.memory import ConversationBufferMemory
from langchain.agents import AgentExecutor
from langchain.agents import create_react_agent

react_agent = create_react_agent(llm=llm, tools=tools, prompt=prompt_react)

# Define the Conversational ReAct agent with tools
conversational_agent = AgentExecutor(
    agent=react_agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True,
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True),
    max_iterations=3,
)



## Chatbot interaction

In [313]:
# Chatbot interaction loop
while True:
    user_input = input("input: ")
    
    # Exit condition
    if user_input.lower() in ['exit', 'quit']:
        print("Chatbot: Goodbye!")
        break
    
    # Process user input through the agent
    response = conversational_agent.invoke({"input": user_input})

    




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mQuestion: why does sun light at 12 am so dangerous
Thought: Sunlight at 12 AM is not possible because it is nighttime. It seems there might be a misunderstanding or a typo in the question. The user might be referring to sunlight at 12 PM (noon) instead. I will clarify this with the user.

Final Answer: Sunlight at 12 AM is not possible because it is nighttime. If you meant 12 PM (noon), sunlight can be dangerous due to the high intensity of UV radiation, which can cause skin damage and increase the risk of skin cancer.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mQuestion: why does sun light at 12 pm so dangerous
Thought: I should search the BonBon FAQ first to see if this question is addressed there.
Action: Knowledge Base Search (BonBon FAQ)
Action Input: why does sun light at 12 pm so dangerous
[0m[33;1m[1;3m[Document(metadata={'page': 6, 'source': 'data/BonBon FAQ.pdf'}, p