In [203]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain, ConversationalRetrievalChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI
from langchain import LLMChain

In [205]:
loader = UnstructuredPDFLoader("../data/pdf_data/AI-Practitioner-Handbook.pdf")
#loader = OnlinePDFLoader("https://epoch.aisingapore.org/wp-content/uploads/2023/03/AI-Practitioner-Handbook-20230324.pdf")

data = loader.load()
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

print (f'Now you have {len(texts)} documents')

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


You have 1 document(s) in your data
There are 182501 characters in your document
Now you have 217 documents


In [207]:
texts

[Document(page_content='AI Practitioner Handbook\n\nContributed by Engineers from AI Singapore\n\nEdited by Kenny WJ Chua, Ryzal Kamis, Siavash Sakhavi, Anand Natarajan, Kevin Oh, Najib Ninaba, Kim Hock Ng and Laurence Liew\n\nMar 24, 2023\n\nCONTENTS\n\n.\n\n.\n\n.\n\n1 Pre-project Phase 1.1 Overview . . 1.2 How can business challenges be translated into AI problems? . . . . . 1.3 What are some data considerations when framing an AI project? 1.4 What are the considerations for reducing technical debt? . . . . . 1.5 How can an engineer assess a client’s AI readiness? . . . . .\n\n. . . . .\n\n. . . . .\n\n. . .\n\n. . .\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n. . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n\n. . . . .\n\n. . .\n\n. . . . . . . . .\n\n. . . . . . . . . . . .\n\n. . .\n\n. . .\n\n. . . . .\n\n. . .\n\n. . . . .\n\n. . . . . . . . . . . . . . . . .\n\n. . . . . .\n\n2 Project Management & Technical Leadership . .\n\n. . .\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.', metadata=

In [210]:
import tiktoken
# create a GPT-3.5 encoder instance
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

total_word_count = sum(len(doc.page_content.split()) for doc in texts)
total_token_count = sum(len(enc.encode(doc.page_content)) for doc in texts)

print(f"\nTotal word count: {total_word_count}")
print(f"\nEstimated tokens: {total_token_count}")
print(f"\nEstimated cost of embedding: ${total_token_count * 0.002 / 1000}")


Total word count: 29592

Estimated tokens: 38501

Estimated cost of embedding: $0.077002


In [None]:
# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# vector_store = FAISS.from_documents(texts, embeddings)

# from IPython.display import display, Markdown

# search_result = vector_store.similarity_search_with_score("who is this pdf for?")
# search_result

# line_separator = "\n"# {line_separator}Source: {r[0].metadata['source']}{line_separator}Score:{r[1]}{line_separator}
# display(Markdown(f"""
# ## Search results:{line_separator}
# {line_separator.join([
#   f'''
#   ### Source:{line_separator}{r[0].metadata['source']}{line_separator}
#   #### Score:{line_separator}{r[1]}{line_separator}
#   #### Content:{line_separator}{r[0].page_content}{line_separator}
#   '''
#   for r in search_result
# ])}
# """))

# vector_store.save_local("C:/Users/meldr/aiap/langchain-question-answer/data/vector_data/")

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

system_template="""Use the following pieces of context to answer the users question.
Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

In [6]:
import os

path = "../data/vector_data/"
if os.path.exists(path):
  vector_store = FAISS.load_local(
      path,
      OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
  )
else:
  print(f"Missing files. Upload index.faiss and index.pkl files to {path} directory first")

In [None]:
chain_type_kwargs = {"prompt": prompt}
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=OPENAI_API_KEY)  # Modify model_name if you have access to GPT-4
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

In [None]:
from IPython.display import display, Markdown

def print_result(result):
    output_text = f"""### Question: 
    {query}
    ### Answer: 
    {result['answer']}
    ### Sources: 
    {result['sources']}
    ### All relevant sources:
    {' '.join(list(set([doc.metadata['source'] for doc in result['source_documents']])))}
    """
    display(Markdown(output_text))

In [None]:
query = "What are the different robustness testing tools?"
result = chain(query)
print_result(result)
# result

In [None]:
result['question']

In [None]:
result['answer']

In [None]:
result['sources']

In [None]:
result['source_documents']

In [5]:
path = "../data/vector_data/"
vector_store = FAISS.load_local(path, OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY))

In [None]:
system_template = """Use the following pieces of context to answer the users question.\
No matter what the question is, you should always answer it in the context of the AI Practitioner Handbook.\
Even if the question does not end in a question mark, you should still answer it as if it were a question.\
If you don't know the answer, just say that "I don't know", don't try to make up an answer.\
If the question is not related to the AI Practitioner Handbook, just say that "I don't know".\
----------------
{summaries}"""

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
prompt = ChatPromptTemplate.from_messages(messages)

llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
#question_generator = LLMChain(llm=llm, prompt=prompt)
#doc_chain = load_qa_with_sources_chain(llm, chain_type="stuff")

In [13]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=OPENAI_API_KEY)

In [14]:
from langchain.prompts.prompt import PromptTemplate

_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
You can assume the question about the most recent state of the AI Practioner Handbook.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [15]:
CONDENSE_QUESTION_PROMPT

PromptTemplate(input_variables=['chat_history', 'question'], output_parser=None, partial_variables={}, template='Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.\nYou can assume the question about the most recent state of the AI Practioner Handbook.\nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question:', template_format='f-string', validate_template=True)

In [17]:
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vector_store.as_retriever(),
    chain_type="stuff",
    #question_generator=question_generator,
    #combine_docs_chain=doc_chain,
    verbose=True,
    return_source_documents=True,
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
)

In [18]:
chat_history = []
query = "What is the role of an ai engineer?"
result = chain({"question": query, "chat_history": chat_history})



[1m> Entering new ConversationalRetrievalChain chain...[0m

[1m> Finished chain.[0m


In [11]:
result

{'question': 'What is the role of an ai engineer?',
 'chat_history': [],
 'answer': ' The role of an AI engineer is to build a solution for another company to take over and implement. They assess whether the client has the capability to take over the final solution, integrate and maintain it, as the goal is to enable companies to build their own AI capabilities in the long run. AI engineers also participate in pre-project scoping and gauge if the client is capable of taking over the AI solution.',
 'source_documents': [Document(page_content='2.2 How can I build an effective AI development team?\n\nContributor: Kenny WJ Chua, Senior AI Engineer\n\n2.2.1 Introduction\n\nThe responsbilities of a technical lead are different from those of an individual contributor. Engineers, especially those\n\nwho are who are unfamiliar with leadership responsibilities, may face a learning curve in fulfilling these responsibilities.\n\nA technical lead indirectly delivers an AI project on time and on tar

In [19]:
chat_history.append((query, result["answer"]))
query = "How can he build the team?"
result = chain({"question": query, "chat_history": chat_history})



[1m> Entering new ConversationalRetrievalChain chain...[0m

[1m> Finished chain.[0m


In [20]:
result

{'question': 'How can he build the team?',
 'chat_history': [('What is the role of an ai engineer?',
   'According to the context provided, an AI engineer is responsible for building and delivering AI projects on time and on target by building an effective AI development team. They work as technical leads for a small team of apprentices (i.e., ‘developers’), and they are supported by Project Managers and the AI and ML Ops teams to deliver the project on time, and with the right infrastructure and architecture powered by established CI/CD pipelines. Their responsibilities include preliminary data identification, cleaning and curation, followed by building, training and testing the models and finally deploying the model.')],
 'answer': 'The article provides suggestions on promoting effective teams by contextualizing general leadership principles. A technical lead indirectly delivers an AI project on time and on target by building an effective AI development team. The article requires som

In [21]:
chat_history.append((query, result["answer"]))
query = "How does he develop and deloy the solution?"
result = chain({"question": query, "chat_history": chat_history})



[1m> Entering new ConversationalRetrievalChain chain...[0m

[1m> Finished chain.[0m


In [196]:
result

{'question': 'How does he develop and deloy the solution?',
 'chat_history': [('What is the role of an ai practitioner?',
   " An AI Practitioner is someone who is responsible for developing and deploying AI projects. This includes tasks such as identifying AI ready projects, building and training models, and deploying models into production. AI Practitioners also need to assess the client's AI capabilities to ensure that they are able to take over the final solution, integrate and maintain it."),
  ('How can he build the team?',
   " Building an AI team requires recruiting the right people, setting engineering principles, cultivating a cohesive team, and simplifying and translating technical jargon. Additionally, setting up a project repository and assessing the client's AI readiness level are important considerations.")],
 'answer': ' An AI Practitioner develops and deploys an AI solution by first identifying an AI-ready project, then building an effective AI development team, and fi

In [197]:
chat_history.append((query, result["answer"]))
query = "How can he test models?"
result = chain({"question": query, "chat_history": chat_history})



[1m> Entering new ConversationalRetrievalChain chain...[0m

[1m> Finished chain.[0m


In [198]:
result

{'question': 'How can he test models?',
 'chat_history': [('What is the role of an ai practitioner?',
   " An AI Practitioner is someone who is responsible for developing and deploying AI projects. This includes tasks such as identifying AI ready projects, building and training models, and deploying models into production. AI Practitioners also need to assess the client's AI capabilities to ensure that they are able to take over the final solution, integrate and maintain it."),
  ('How can he build the team?',
   " Building an AI team requires recruiting the right people, setting engineering principles, cultivating a cohesive team, and simplifying and translating technical jargon. Additionally, setting up a project repository and assessing the client's AI readiness level are important considerations."),
  ('How does he develop and deloy the solution?',
   ' An AI Practitioner develops and deploys an AI solution by first identifying an AI-ready project, then building an effective AI dev

In [None]:
[
  [
    "What is the role of  an AI engineer?",
    "An AI engineer is responsible for developing and implementing artificial intelligence solutions. They work on AI projects, from conceptualization to deployment, and are involved in designing, building, testing, and maintaining AI systems. They also collaborate with other team members, such as data scientists and software developers, to ensure that the AI solutions meet the project requirements and are scalable, efficient, and effective. Additionally, AI engineers may also be responsible for assessing the client's AI capabilities and ensuring that the final solution can be integrated and maintained by the client."
  ],
  [
    "How does he work with the clients?",
    "According to the AI Practitioner Handbook, an AI engineer should assess a client's AI capabilities by focusing on three broad areas:\n\n1. Organizational readiness: The engineer should determine if there is an existing technical team who is able to integrate and maintain the AI models. If not, the engineer should inquire about the client's plans to hire the necessary resources. The engineer should also assess if the client's management is supportive of AI projects and team expansion if necessary, and if they allow room for experimentation and development.\n\n2. Technical readiness: The engineer should assess the client's technical infrastructure, including their data storage and processing capabilities, as well as their ability to handle the computational requirements of the AI models.\n\n3. Data readiness: The engineer should assess the quality and availability of the client's data, as well as their ability to collect and label data if necessary.\n\nBy assessing these areas, the AI engineer can determine if the client is capable of taking over the final solution, integrating and maintaining it, as the goal is to enable companies to build their own AI capabilities in the long run."
  ],
  [
    "Who are the contributors to this book?",
    "I'm sorry, I couldn't help you with that question. Is there anything else I can assist you with?"
  ],
  [
    "Who are the contributors to this book?",
    "The AI Practitioner Handbook was contributed by Engineers from AI Singapore and edited by Kenny WJ Chua, Ryzal Kamis, Siavash Sakhavi, Anand Natarajan, Kevin Oh, Najib Ninaba, Kim Hock Ng, and Laurence Liew."
  ],
  [
    "How many of them are there? Give me a number.",
    "There were 7 contributors involved in editing the AI Practitioner Handbook."
  ]
]

# AI counted the editors wrongly