# Project: Question-Answering on Private Documents


In [None]:
#pip install openai

In [None]:
#pip install langchain_openai

In [None]:
import os
import langchain_openai
from langchain_openai import OpenAI
import sys
# Read the API key from the local key file. The `with` block closes the file
# handle, and strip() removes the trailing newline/whitespace that would
# otherwise become part of the key.
with open(r'D:\Generative_AI\OpenAI\openai_key.txt') as f:
    api_key = f.read().strip()

# Export the key through the environment: OpenAIEmbeddings() and ChatOpenAI()
# are constructed later in this notebook without an explicit key argument.
# (The previous `openai.api_key = ...` line raised NameError because the
# `openai` module is never imported here.)
os.environ['OPENAI_API_KEY'] = api_key
llm = OpenAI(openai_api_key=api_key)

NameError: name 'openai' is not defined

### Loading Different Document Formats



In [None]:
!pip install chromadb -q

In [None]:
# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    """Load a PDF, DOCX or TXT file with the matching LangChain loader.

    The loader is chosen from the file extension, compared case-insensitively
    so that e.g. 'REPORT.PDF' is handled like 'report.pdf'.

    Args:
        file: path of the document to load.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` (after
        printing a message) when the extension is not .pdf, .docx or .txt.
    """
    import os

    extension = os.path.splitext(file)[1].lower()

    # Loader classes are imported lazily, only for the branch that is taken.
    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader as loader_cls
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader as loader_cls
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader as loader_cls
    else:
        print('Document format is not supported!')
        return None

    # Same progress message for every supported format (the .txt branch used
    # to be silent).
    print(f'Loading {file}')
    return loader_cls(file).load()

In [None]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        data: documents to split; passed straight to ``split_documents``.
        chunk_size: forwarded to the splitter as ``chunk_size``.
        chunk_overlap: forwarded to the splitter as ``chunk_overlap``. The
            default of 0 matches the value that was previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [None]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count of `texts` and an estimated embedding cost.

    Args:
        texts: iterable of objects exposing a ``page_content`` string.
        model: model name handed to ``tiktoken.encoding_for_model`` to pick
            the tokenizer.
        price_per_1k_tokens: USD price per 1000 tokens used for the estimate.
            The default is the rate this notebook originally hard-coded;
            NOTE(review): verify it against current pricing before relying
            on the figure.

    Returns:
        None. The two result lines are printed.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')


# print_embedding_cost(chunks)

In [None]:
# Embed the chunks with OpenAIEmbeddings() and index them in a Chroma vector store
def create_embeddings(chunks):
    """Build a Chroma vector store from `chunks`.

    Args:
        chunks: documents handed to ``Chroma.from_documents``.

    Returns:
        The object returned by ``Chroma.from_documents`` when called with the
        chunks and a default-constructed ``OpenAIEmbeddings`` instance.
    """
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import Chroma

    embedder = OpenAIEmbeddings()
    return Chroma.from_documents(chunks, embedder)

In [None]:
def ask_and_get_answer(vector_store, q, k=3):
    """Answer question `q` with a RetrievalQA "stuff" chain over `vector_store`.

    Args:
        vector_store: object whose ``as_retriever`` supplies the retriever.
        q: the question passed to the chain's ``run``.
        k: value placed under ``'k'`` in the retriever's ``search_kwargs``.

    Returns:
        The value returned by the chain's ``run(q)``.
    """
    from langchain.chat_models import ChatOpenAI
    from langchain.chains import RetrievalQA

    chat_model = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.2)
    top_k_retriever = vector_store.as_retriever(
        search_type='similarity',
        search_kwargs={'k': k},
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=top_k_retriever,
    )
    return qa_chain.run(q)


### RUNNING CODE

In [None]:
import warnings
# Installs a blanket 'ignore' filter, so every Python warning raised from here
# on is hidden. NOTE(review): presumably meant to quiet library deprecation
# notices; it will also hide warnings that matter.
warnings.filterwarnings('ignore')

In [None]:
# Load the PDF into `data`. load_document returns None for an unsupported
# extension, in which case len(data) below would raise.
# NOTE(review): the recorded output shows 'files/react.pdf', not this absolute
# path, so the output looks like it came from an earlier run; confirm.
data = load_document(r"D:\Generative_AI\Langchain\react.pdf")
print (f'You have {len(data)} pages in your data')

Loading files/react.pdf
You have 33 pages in your data


In [None]:
# Split the loaded pages with chunk_data's default chunk_size and report how
# many chunks were produced.
chunks = chunk_data(data)
print(len(chunks))

528


In [None]:
# Embed every chunk and keep the resulting Chroma store for the Q&A cells below.
vector_store = create_embeddings(chunks)

In [None]:
# One-off question against the vector store; the commented lines are
# alternative questions to try.
# NOTE(review): 'resoning' in the active question is misspelled.
# q = 'what is the whole document about?'
q = 'what is resoning and acting in LLMs?'
# q = 'Summarize the entire document in a few paragraphs.'

# k is passed to ask_and_get_answer as the retriever's 'k' search setting.
k = 5
answer = ask_and_get_answer(vector_store, q, k)
print(answer)

Reasoning and acting in LLMs refer to the integration of language models (LLMs) with the ability to reason and make decisions in a task-specific context. In this context, reasoning involves the process of logical thinking, inference, and problem-solving using the information provided in the language input. Acting, on the other hand, refers to the generation of task-specific actions based on the reasoning process. The goal is to combine these two capabilities in LLMs to create versatile and generalist agents that can perform a wide range of tasks requiring both reasoning and decision-making abilities.


In [None]:
import time

# Interactive Q&A loop: keeps asking for questions until the user types
# "quit" or "exit" (any letter case, surrounding whitespace ignored).
i = 1
print('Write Quit or Exit to quit.')
while True:
    q = input(f'Question #{i}: ').strip()
    if q.lower() in ('quit', 'exit'):
        print('Quitting ... Bye Bye!')
        time.sleep(2)
        break
    if not q:
        # Nothing was typed: ask again instead of sending an empty question.
        continue
    i = i + 1

    answer = ask_and_get_answer(vector_store, q, 5)
    print(answer)
    print(f'\n {"-"*50} \n')


Write Quit of Exit to quit.
Question #1: What is react?
ReAct is a general paradigm that combines reasoning and acting with language models to solve various language reasoning and decision-making tasks. It is a method that utilizes large language models and allows for interaction with external sources of information. ReAct has been shown to outperform other methods in multiple trials, with performance gains ranging from 33% to 90% and averaging 62%.

 -------------------------------------------------- 

Question #2: quit
Quitting ... Bye Bye!


In [None]:
def ask_with_memory(vector_store, question, chat_history=None):
    """Ask `question` through a ConversationalRetrievalChain, tracking history.

    Args:
        vector_store: object whose ``as_retriever`` supplies the retriever.
        question: the question sent to the chain.
        chat_history: list of ``(question, answer)`` tuples from earlier
            turns. When a list is passed it is extended in place. When omitted
            (``None``) a fresh list is created for this call, so separate
            conversations no longer share one default list.

    Returns:
        ``(result, chat_history)``: the chain's result (its ``"answer"`` entry
        holds the reply) and the history including this turn.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    # `is None` rather than truthiness: an empty list supplied by the caller
    # must still be the object that gets appended to.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=0.1)
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":5})
    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({"question": question, "chat_history": chat_history})
    chat_history.append((question, result["answer"]))

    return result, chat_history

In [None]:
# Start an empty conversation; ask_with_memory appends each
# (question, answer) pair to this list and returns it.
chat_history = list()

# NOTE(review): this question and its recorded answer do not relate to the
# react.pdf store built above; presumably vector_store was rebuilt from a
# different document in a cell that is no longer in the notebook. Confirm.
q = 'What did the president say about Ketanji Brown Jackson?'
result, chat_history = ask_with_memory(vector_store, q, chat_history)

print(result['answer'])
print('-' * 100)
print(chat_history)  # for debugging

The president said that Ketanji Brown Jackson is one of our nation's top legal minds and will continue Justice Breyer's legacy of excellence.
----------------------------------------------------------------------------------------------------
[('What did the president say about Ketanji Brown Jackson?', "The president said that Ketanji Brown Jackson is one of our nation's top legal minds and will continue Justice Breyer's legacy of excellence.")]


In [None]:
# Follow-up question: "he"/"she" are not defined in the question itself, so it
# relies on the chat_history carried over from the previous cell.
q = 'Did he mention who she succeeded?'
result, chat_history = ask_with_memory(vector_store, q, chat_history)

print(result['answer'])
print('-' * 100)
print(chat_history)

He mentioned that Circuit Court of Appeals Judge Ketanji Brown Jackson will succeed Justice Breyer.
----------------------------------------------------------------------------------------------------
[('What did the president say about Ketanji Brown Jackson?', "The president said that Ketanji Brown Jackson is one of our nation's top legal minds and will continue Justice Breyer's legacy of excellence."), ('Did he mention who she succeeded?', 'He mentioned that Circuit Court of Appeals Judge Ketanji Brown Jackson will succeed Justice Breyer.')]


### Ask with Memory Loop

In [None]:
import time
i = 1

# History of (question, answer) pairs for this interactive session.
chat_history = []

# Interactive loop with conversational memory: runs until the user types
# "quit" or "exit" (any letter case, surrounding whitespace ignored).
print("Write Quit or Exit to quit")
while True:
    q = input(f"Question #{i}: ").strip()
    if q.lower() in ("quit", "exit"):
        print("Quitting")
        time.sleep(2)
        break
    if not q:
        # Nothing was typed: ask again instead of sending an empty question.
        continue
    i = i + 1
    # Rebind the returned history rather than relying on the argument being
    # mutated in place.
    result, chat_history = ask_with_memory(vector_store, q, chat_history)
    print (result['answer'])
    print("----------------------------------------------------------------------")