In [1]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [None]:
# Load the handbook PDF from the local data directory.
# Alternative source (needs OnlinePDFLoader to be imported first):
#   OnlinePDFLoader("https://epoch.aisingapore.org/wp-content/uploads/2023/03/AI-Practitioner-Handbook-20230324.pdf")
loader = UnstructuredPDFLoader("../data/pdf_data/AI-Practitioner-Handbook.pdf")
data = loader.load()

print(
    f'You have {len(data)} document(s) in your data',
    f'There are {len(data[0].page_content)} characters in your document',
    sep="\n",
)

# Cut the loaded document(s) into chunks of at most 1000 characters with no overlap.
# RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0) is the
# alternative splitter that was tried here.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
texts = text_splitter.split_documents(data)

print(f'Now you have {len(texts)} documents')

In [2]:
import os

# Read the OpenAI API key from the environment instead of hard-coding it, so the
# secret never lands in the saved notebook or its outputs. Later cells pass
# OPENAI_API_KEY to OpenAIEmbeddings and ChatOpenAI, so it must be defined here
# for a fresh-kernel "Run All" to work.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("OPENAI_API_KEY is not set. Export it in your environment and re-run this cell.")

In [None]:
import tiktoken

# The estimate below is for *embedding* the chunks, so use the tokenizer of the
# embedding model rather than a chat model.
# NOTE(review): assumes the embeddings are produced with text-embedding-ada-002
# (tokenizer and price below) — confirm against the OpenAIEmbeddings configuration.
# tiktoken maps this model to the same encoding as "gpt-3.5-turbo", so the
# token counts are unchanged.
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
# USD charged per 1,000 tokens embedded; update if the published price changes.
EMBEDDING_USD_PER_1K_TOKENS = 0.0004

enc = tiktoken.encoding_for_model(EMBEDDING_MODEL_NAME)

# Whitespace-separated words and tokenizer tokens, summed over every chunk.
total_word_count = sum(len(doc.page_content.split()) for doc in texts)
total_token_count = sum(len(enc.encode(doc.page_content)) for doc in texts)

print(f"\nTotal word count: {total_word_count}")
print(f"\nEstimated tokens: {total_token_count}")
print(f"\nEstimated cost of embedding: ${total_token_count * EMBEDDING_USD_PER_1K_TOKENS / 1000}")

In [None]:
# --- Disabled: one-off build of the FAISS index ------------------------------
# When uncommented, this cell embeds `texts` with OpenAIEmbeddings, builds a
# FAISS store from them, runs a sample similarity search, renders the hits as
# Markdown and saves the index to disk. It is left commented out; a later cell
# loads a previously saved index from "../data/vector_data/" instead.
# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# vector_store = FAISS.from_documents(texts, embeddings)

# from IPython.display import display, Markdown

# search_result = vector_store.similarity_search_with_score("who is this pdf for?")
# search_result

# line_separator = "\n"# {line_separator}Source: {r[0].metadata['source']}{line_separator}Score:{r[1]}{line_separator}
# display(Markdown(f"""
# ## Search results:{line_separator}
# {line_separator.join([
#   f'''
#   ### Source:{line_separator}{r[0].metadata['source']}{line_separator}
#   #### Score:{line_separator}{r[1]}{line_separator}
#   #### Content:{line_separator}{r[0].page_content}{line_separator}
#   '''
#   for r in search_result
# ])}
# """))

# NOTE(review): the save path below is an absolute, machine-specific Windows
# path, while the load cell reads from the relative "../data/vector_data/".
# Point this at the same relative directory before re-enabling the cell.
# vector_store.save_local("C:/Users/meldr/aiap/langchain-question-answer/data/vector_data/")

In [3]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# System instructions for the QA chain; the retrieved context is substituted
# into the {summaries} placeholder.
system_template = """Use the following pieces of context to answer the users question.
Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}"""

# Chat prompt = system instructions first, then the user's question.
messages = []
messages.append(SystemMessagePromptTemplate.from_template(system_template))
messages.append(HumanMessagePromptTemplate.from_template("{question}"))

prompt = ChatPromptTemplate.from_messages(messages)

In [4]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

In [5]:
import os

path = "../data/vector_data/"

# Check for the two index files themselves, not just the directory, and fail
# fast: printing and carrying on would leave `vector_store` undefined and make
# a later cell fail with an unrelated NameError.
missing_index_files = [
    file_name
    for file_name in ("index.faiss", "index.pkl")
    if not os.path.exists(os.path.join(path, file_name))
]
if missing_index_files:
    raise FileNotFoundError(
        f"Missing files. Upload index.faiss and index.pkl files to {path} directory first"
    )

# NOTE(review): index.pkl is presumably a pickle file — only load index files
# from a source you trust.
vector_store = FAISS.load_local(
    path,
    OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
)

In [6]:
# Hand the custom chat prompt to the underlying "stuff" documents chain.
chain_type_kwargs = dict(prompt=prompt)

# Modify model_name if you have access to GPT-4.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

# Retrieval QA chain over the FAISS store; source documents are returned
# alongside the answer so they can be listed in the output.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=vector_store.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [7]:
from IPython.display import display, Markdown

def print_result(result):
    """Render a chain result as Markdown in the notebook output.

    Args:
        result: Mapping returned by calling the chain. The keys read here are
            'question', 'answer', 'sources' and 'source_documents'; every
            source document must expose ``metadata['source']``.

    The question is taken from ``result['question']`` rather than from a global
    variable, so the rendered question always belongs to the result shown.
    """
    # De-duplicate and sort so the rendered list is stable between runs
    # (iteration order of a set of strings is not).
    all_sources = sorted({doc.metadata['source'] for doc in result['source_documents']})

    # Build the text line by line without leading indentation: Markdown treats
    # lines indented by four spaces as a code block, which turned every heading
    # after the first into literal text.
    output_lines = [
        "### Question: ",
        str(result['question']),
        "### Answer: ",
        str(result['answer']),
        "### Sources: ",
        str(result['sources']),
        "### All relevant sources:",
        ' '.join(all_sources),
        "",
    ]
    display(Markdown("\n".join(output_lines)))

In [9]:
# Ask one question against the handbook index and render the formatted answer.
query = 'What are the different robustness testing tools?'
result = chain(query)
print_result(result)

### Question: 
    What are the different robustness testing tools?
    ### Answer: 
    There are several robustness testing tools mentioned in the AI Practitioner Handbook, including IBM ART, TextAttack, and Microsoft CheckList. The data type and model type supported by each tool varies. IBM ART is generally used for arrays of numeric data (i.e. images, audio), but is attack specific. TextAttack is used for text, while Microsoft CheckList can be used for any arbitrary format (as the mutation functions can be user-defined). The expectation functions can also be user-defined for any data type. For more information on robustness testing tools, you can refer to Chapter 6.4.4 of the AI Practitioner Handbook. (
    ### Sources: 
    ../pdf_data/AI-Practitioner-Handbook.pdf)
    ### All relevant sources:
    ../pdf_data/AI-Practitioner-Handbook.pdf
    

In [11]:
# Echo of the question stored in the chain result.
result["question"]

'What are the different robustness testing tools?'

In [12]:
# Raw answer text, before any Markdown formatting.
result["answer"]

'There are several robustness testing tools mentioned in the AI Practitioner Handbook, including IBM ART, TextAttack, and Microsoft CheckList. The data type and model type supported by each tool varies. IBM ART is generally used for arrays of numeric data (i.e. images, audio), but is attack specific. TextAttack is used for text, while Microsoft CheckList can be used for any arbitrary format (as the mutation functions can be user-defined). The expectation functions can also be user-defined for any data type. For more information on robustness testing tools, you can refer to Chapter 6.4.4 of the AI Practitioner Handbook. ('

In [13]:
# Sources string stored in the chain result.
result["sources"]

'../pdf_data/AI-Practitioner-Handbook.pdf)'

In [14]:
# Document chunks returned with the answer (return_source_documents=True).
result["source_documents"]

[Document(page_content='tions can be user-defined). Only some text\n\nmutations supported in tool.\n\nAny for black box attacks,\n\nspecific model type for spe-\n\ncific white box attacks\n\nAny for black box attacks.\n\nspecific model type for spe-\n\ncific white box attacks\n\nAny\n\nTask Type (Nature\n\nof model output)\n\nAttack-specific,\n\nMainly classification\n\nClassification,\n\nSequence-to-\n\nsequence\n\nAny (as the expecta-\n\ntion functions can be\n\nuser-defined)\n\n62\n\nChapter 6. Modelling\n\nAI Practitioner Handbook\n\nWhen To Perform Robustness Testing?\n\nUsing the robustness testing tools, you can test your model in the CI/CD pipeline, at the same level as model evaluation,\n\nto get a good evaluation of your model. Just like unit testing or integration testing, you only need to set it up once. With\n\nevery new retraining, you can compare the robustness between model versions.\n\n6.4.4 Words of Caution\n\nThe goal of robustness testing is to find areas where your