In [26]:
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain, ConversationalRetrievalChain
from langchain.prompts.prompt import PromptTemplate
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.prompts.chat import (ChatPromptTemplate,SystemMessagePromptTemplate,HumanMessagePromptTemplate,)

In [13]:
OPENAI_API_KEY="enter_your_api_key_here_as_a_string"

In [2]:
# May need to do a "pip install pypdf" to get this to work
loader = PyPDFLoader("../data/pdf_data/AI-Practitioner-Handbook.pdf")

data = loader.load()
print (f'You have {len(data)} pages in your pdf')
# number of characters in whole data
print (f'There are {len(data[4].page_content)} characters in page 5 of your pdf')

You have 94 pages in your pdf
There are 711 characters in page 5 of your pdf


In [3]:
# Show page 5 content
data[4].page_content

'AIPractitionerHandbook\nIdentifying AI Ready Projects\nAIprojectsexecutedbyAISingaporeinthe100Eprogrammearereal-worldproblemstatementsfromtheindustryand\ngothrougharigorousreviewprocessbeforeitgetsapprovedandonboardedbeforeanyengineeringworkisdone.\nLeveragingthe AIReadinessIndex(AIRI) developedbyAISingapore,weareabletoquicklyidentifyorganizationsthat\nareAIUnaware,AIAware,AIReadyorAICompetent.\nInthe100Eprogramme,team’sworkfocusesonAIReadycompaniesonly1.\nDelivering AI Ready Projects\nProjectsapprovedunderthe100Eprogrammearedeliveredin7monthsfollowinganAgilemethodology,typicallyover\n10-15sprints.\n1AISingaporeandotherSingaporeagencieshaveotherprogrammeswhichassistcompaniesthatareAIUnawareandAIAware.\n2 CONTENTS'

In [5]:
pages = loader.load_and_split()

In [6]:
type(pages)

list

In [7]:
import tiktoken

# create a GPT-3.5 encoder instance, can use other models as well
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

# estimate the number of words and tokens. Check the cost of embedding in case it's too expensive
est_total_word_count = sum(len(doc.page_content.split()) for doc in pages)
est_total_token_count = sum(len(enc.encode(doc.page_content)) for doc in pages)

print(f"\nTotal word count: {est_total_word_count}")
print(f"\nEstimated tokens: {est_total_token_count}")
print(f"\nEstimated cost of embedding: ${est_total_token_count * 0.002 / 1000}") # 0.002 is the cost per 1k tokens


Total word count: 11323

Estimated tokens: 43236

Estimated cost of embedding: $0.08647200000000001


In [None]:
# This section is used if another type of document loader is used, for example UnstructuredPDFLoader. But UnstructuredPDFLoader is unstable at the moment with the packages.

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# texts = text_splitter.split_documents(data)

# print (f'Now you have {len(texts)} documents')

In [8]:
# Turn the PDF into embeddings, then store them in a FAISS index

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vector_store = FAISS.from_documents(pages, embeddings)

In [9]:
# Show the most similar documents to the query
search_result = vector_store.similarity_search_with_score("who is this pdf for?")
search_result

[(Document(page_content='AIPractitionerHandbook\nContributedbyEngineersfromAISingapore\nEditedbyKennyWJChua,RyzalKamis,\nSiavashSakhavi,AnandNatarajan,KevinOh,\nNajibNinaba,KimHockNgandLaurenceLiew\nMar24,2023', metadata={'source': '../data/pdf_data/AI-Practitioner-Handbook.pdf', 'page': 0}),
  0.5360665),
 (Document(page_content='CHAPTER\nNINE\nCITEAISINGAPORE’SAIPRACTITIONERHANDBOOK\nThishandbookisdevelopedbyAISingapore,withthemainpurposeofbenefitingthein-houseengineeringstaff. However,\nwehopethatitwouldbenefitthepublicaswellandifyouwouldliketocitethiscontent,youmaydosowiththefollowing\nsnippetforBibTeX:\n@book {aisg_aiprac_hbook ,\nauthor ={AI Singapore },\ntitle ={AI Practitioner Handbook },\nhowpublished ={\\url {https://aisingapore.github.io/ai-practitioner-handbook/ }},\nyear ={2023 }\n}\n91', metadata={'source': '../data/pdf_data/AI-Practitioner-Handbook.pdf', 'page': 93}),
  0.5409418),
 (Document(page_content='AIPractitionerHandbook\n[Source] Summary plot of feature importan

In [None]:
# vector_store.save_local("your_file_path_here")

In [None]:
import os

# Load vectore store saved locally
path = "../data/vector_data/"
if os.path.exists(path):
  vector_store = FAISS.load_local(
      path,
      OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
  )
else:
  print(f"Missing files. Upload index.faiss and index.pkl files to {path} directory first")

In [11]:
# Trying out simple prompt templates

system_template="""Use the following pieces of context to answer the users question.
Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

In [None]:
# Addtional tuning of the prompt template
system_template = """Use the following pieces of context to answer the users question.\
No matter what the question is, you should always answer it in the context of the AI Practitioner Handbook.\
Even if the question does not end in a question mark, you should still answer it as if it were a question.\
If you don't know the answer, just say that "I don't know", don't try to make up an answer.\
If the question is not related to the AI Practitioner Handbook, just say that "I don't know".\
----------------
{summaries}"""

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
prompt = ChatPromptTemplate.from_messages(messages)

In [15]:
vector_store

<langchain.vectorstores.faiss.FAISS at 0x22e89d5a640>

In [16]:
# Initialize the chain
chain_type_kwargs = {"prompt": prompt}
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=OPENAI_API_KEY)  # Modify model_name if you have access to GPT-4
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

In [25]:
query = "What is the job of an AI engineer?"
result = chain(query)
result

{'question': 'What is the job of an AI engineer?',
 'answer': 'An AI engineer is responsible for building and delivering AI projects on time and on target by building an effective AI development team. They work on identifying AI ready projects, preliminary data identification, cleaning and curation, building, training and testing the models, and finally deploying the model. They also work with AI and ML Ops teams to enable the whole end-to-end process. The AI engineer serves as a technical lead for a small team of apprentices (i.e., ‘developers’). They are responsible for assessing whether the client has the capability to take over the final solution, integrate and maintain it, as the goal is to enable companies to build their own AI capabilities in the long run. (',
 'sources': '../pdf_data/AI-Practitioner-Handbook.pdf)',
 'source_documents': [Document(page_content='2.2 How can I build an effective AI development team?\n\nContributor: Kenny WJ Chua, Senior AI Engineer\n\n2.2.1 Introdu

In [19]:
result['question']

'What is the job of an AI engineer?'

In [20]:
result['answer']

'An AI engineer is responsible for building and delivering AI projects on time and on target by building an effective AI development team. They work on identifying AI ready projects, preliminary data identification, cleaning and curation, building, training and testing the models, and finally deploying the model. They also work with AI and ML Ops teams to enable the whole end-to-end process. The AI engineer serves as a technical lead for a small team of apprentices (i.e., ‘developers’). They are responsible for assessing whether the client has the capability to take over the final solution, integrate and maintain it, as the goal is to enable companies to build their own AI capabilities in the long run. ('

In [21]:
result['sources']

'../pdf_data/AI-Practitioner-Handbook.pdf)'

In [22]:
result['source_documents']

[Document(page_content='2.2 How can I build an effective AI development team?\n\nContributor: Kenny WJ Chua, Senior AI Engineer\n\n2.2.1 Introduction\n\nThe responsbilities of a technical lead are different from those of an individual contributor. Engineers, especially those\n\nwho are who are unfamiliar with leadership responsibilities, may face a learning curve in fulfilling these responsibilities.\n\nA technical lead indirectly delivers an AI project on time and on target by building an effective AI development team.\n\nThis article provides suggestions on promoting effective teams by contextualising general leadership principles.\n\nIn AISG, an engineer serves as technical lead for a small team of apprentices (i.e., ‘developers’). Therefore, the contents\n\nof this article would be useful to any AI practitioner who is in a similar leadership role.\n\nThis article requires some basic knowledge of a AI project life cycle and typical activities that occur in each stage. The', metadata

In [27]:
# Experimenting with the prompt template for Condense Questions to perform chat history summarization. This is not used in the final product as it is not properly tuned.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
You can assume the question about the most recent state of the AI Practioner Handbook.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [28]:
CONDENSE_QUESTION_PROMPT

PromptTemplate(input_variables=['chat_history', 'question'], output_parser=None, partial_variables={}, template='Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.\nYou can assume the question about the most recent state of the AI Practioner Handbook.\nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question:', template_format='f-string', validate_template=True)

In [29]:
# Using a different chain type that is able to be context-aware based on chat history
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vector_store.as_retriever(),
    chain_type="stuff",
    verbose=True,
    return_source_documents=True,
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
)

In [30]:
chat_history = []
query = "What is the role of an ai engineer?"
result = chain({"question": query, "chat_history": chat_history})



[1m> Entering new ConversationalRetrievalChain chain...[0m

[1m> Finished chain.[0m


In [31]:
result

{'question': 'What is the role of an ai engineer?',
 'chat_history': [],
 'answer': 'According to the context provided, an AI engineer is responsible for building and delivering AI projects on time and on target by building an effective AI development team. They work as technical leads for a small team of apprentices (i.e., ‘developers’), and they are supported by Project Managers and the AI and ML Ops teams to deliver the project on time, and with the right infrastructure and architecture powered by established CI/CD pipelines. Their responsibilities include preliminary data identification, cleaning and curation, followed by building, training and testing the models and finally deploying the model.',
 'source_documents': [Document(page_content='2.2 How can I build an effective AI development team?\n\nContributor: Kenny WJ Chua, Senior AI Engineer\n\n2.2.1 Introduction\n\nThe responsbilities of a technical lead are different from those of an individual contributor. Engineers, especiall

In [32]:
chat_history.append((query, result["answer"]))
query = "How can he build the team?" # Testing to see if model knows that "he" refers to the ai engineer from previous question
result = chain({"question": query, "chat_history": chat_history})



[1m> Entering new ConversationalRetrievalChain chain...[0m

[1m> Finished chain.[0m


In [34]:
result["answer"]

"The article provides suggestions on promoting effective teams by contextualizing general leadership principles. A technical lead indirectly delivers an AI project on time and on target by building an effective AI development team. The article requires some basic knowledge of an AI project life cycle and typical activities that occur in each stage. The technical lead can model the way by demonstrating values and practices that he/she espouses. For example, a technical lead can cultivate a team habit of writing well-structured and documented code by taking on a few of such tasks himself/herself. This may be particularly fruitful when onboarding a new team of junior developers. The article also suggests that the technical lead should assess the team's strengths and weaknesses and provide opportunities for growth and development. Additionally, the technical lead should establish clear communication channels and foster a culture of collaboration and continuous learning."

In [35]:
result["chat_history"]

[('What is the role of an ai engineer?',
  'According to the context provided, an AI engineer is responsible for building and delivering AI projects on time and on target by building an effective AI development team. They work as technical leads for a small team of apprentices (i.e., ‘developers’), and they are supported by Project Managers and the AI and ML Ops teams to deliver the project on time, and with the right infrastructure and architecture powered by established CI/CD pipelines. Their responsibilities include preliminary data identification, cleaning and curation, followed by building, training and testing the models and finally deploying the model.')]

In [36]:
chat_history.append((query, result["answer"]))
query = "How does he work with clients?" # Testing to see if model knows that "he" refers to the ai engineer from previous previous question
result = chain({"question": query, "chat_history": chat_history})



[1m> Entering new ConversationalRetrievalChain chain...[0m

[1m> Finished chain.[0m


In [37]:
result["answer"] # The answer given here is not satisfactory. FURTHER tuning of prompt is required, so refer to the query_processor.py file where the tuned prompt is stored.

'Based on the given context, the most recent state of the AI Practitioner Handbook is not provided. The context only mentions that the handbook is a live project and will continue to evolve and be updated as they do more projects and learn more. The last date mentioned in the context is March 24, 2023, which could be the date of publication or the last update.'

In [38]:
result["chat_history"]

[('What is the role of an ai engineer?',
  'According to the context provided, an AI engineer is responsible for building and delivering AI projects on time and on target by building an effective AI development team. They work as technical leads for a small team of apprentices (i.e., ‘developers’), and they are supported by Project Managers and the AI and ML Ops teams to deliver the project on time, and with the right infrastructure and architecture powered by established CI/CD pipelines. Their responsibilities include preliminary data identification, cleaning and curation, followed by building, training and testing the models and finally deploying the model.'),
 ('How can he build the team?',
  "The article provides suggestions on promoting effective teams by contextualizing general leadership principles. A technical lead indirectly delivers an AI project on time and on target by building an effective AI development team. The article requires some basic knowledge of an AI project life

In [39]:
chat_history.append((query, result["answer"]))
query = "How can he test models?"
result = chain({"question": query, "chat_history": chat_history})



[1m> Entering new ConversationalRetrievalChain chain...[0m

[1m> Finished chain.[0m


In [40]:
result["answer"]

"AI engineers typically test models by performing various types of tests, such as API call tests, algorithmic correctness tests, and robustness tests. They also check for model performance and accuracy, and use metrics such as accuracy, confusion matrix, precision, recall, F1, ROC & AUC, and precision-recall curve to evaluate the model's performance. Additionally, they may use techniques such as metamorphic testing and denoising techniques to improve the model's performance. Finally, they may assess the client's AI readiness level and organizational readiness to ensure a successful project."

In [41]:
result["chat_history"]

[('What is the role of an ai engineer?',
  'According to the context provided, an AI engineer is responsible for building and delivering AI projects on time and on target by building an effective AI development team. They work as technical leads for a small team of apprentices (i.e., ‘developers’), and they are supported by Project Managers and the AI and ML Ops teams to deliver the project on time, and with the right infrastructure and architecture powered by established CI/CD pipelines. Their responsibilities include preliminary data identification, cleaning and curation, followed by building, training and testing the models and finally deploying the model.'),
 ('How can he build the team?',
  "The article provides suggestions on promoting effective teams by contextualizing general leadership principles. A technical lead indirectly delivers an AI project on time and on target by building an effective AI development team. The article requires some basic knowledge of an AI project life

In [None]:
# More experiments with questions and getting answers

# "What is the role of  an AI engineer?",
# "An AI engineer is responsible for developing and implementing artificial intelligence solutions. They work on AI projects, from conceptualization to deployment, and are involved in designing, building, testing, and maintaining AI systems. They also collaborate with other team members, such as data scientists and software developers, to ensure that the AI solutions meet the project requirements and are scalable, efficient, and effective. Additionally, AI engineers may also be responsible for assessing the client's AI capabilities and ensuring that the final solution can be integrated and maintained by the client."

# "How does he work with the clients?",
# "According to the AI Practitioner Handbook, an AI engineer should assess a client's AI capabilities by focusing on three broad areas:\n\n1. Organizational readiness: The engineer should determine if there is an existing technical team who is able to integrate and maintain the AI models. If not, the engineer should inquire about the client's plans to hire the necessary resources. The engineer should also assess if the client's management is supportive of AI projects and team expansion if necessary, and if they allow room for experimentation and development.\n\n2. Technical readiness: The engineer should assess the client's technical infrastructure, including their data storage and processing capabilities, as well as their ability to handle the computational requirements of the AI models.\n\n3. Data readiness: The engineer should assess the quality and availability of the client's data, as well as their ability to collect and label data if necessary.\n\nBy assessing these areas, the AI engineer can determine if the client is capable of taking over the final solution, integrating and maintaining it, as the goal is to enable companies to build their own AI capabilities in the long run."


# "Who are the contributors to this book?",
# "I'm sorry, I couldn't help you with that question. Is there anything else I can assist you with?"


# "Who are the contributors to this book?",
# "The AI Practitioner Handbook was contributed by Engineers from AI Singapore and edited by Kenny WJ Chua, Ryzal Kamis, Siavash Sakhavi, Anand Natarajan, Kevin Oh, Najib Ninaba, Kim Hock Ng, and Laurence Liew."


# "How many of them are there? Give me a number.",
# "There were 7 contributors involved in editing the AI Practitioner Handbook."
#  AI counted the editors wrongly!!!!