In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -qU langchain langchain-core langchain-community langchain-openai

In [2]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -qU qdrant-client

In [3]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -qU tiktoken pymupdf

In [4]:
import os
import getpass

# Ask for the key only when it is not already present in the environment, so
# re-running this cell (or Restart & Run All with the key exported) does not
# block on interactive input. getpass keeps the typed key out of the output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [5]:
from langchain_openai import ChatOpenAI

# Chat model used as the final, answer-generating step of the RAG chain.
openai_chat_model = ChatOpenAI(
    model="gpt-3.5-turbo",
)

In [6]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding model handed to the vector store when the chunks are indexed.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [11]:
import tiktoken

# Tokenizer encoding that tiktoken associates with the gpt-3.5-turbo model.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [7]:
# Community integrations such as PyMuPDFLoader live in `langchain_community`;
# the legacy `langchain.document_loaders` path is deprecated.
from langchain_community.document_loaders import PyMuPDFLoader

# Download and parse the PDF into LangChain documents.
# NOTE(review): presumably Meta's 10-K filing, judging by the collection name
# and the questions asked further down - confirm the URL is the intended one.
docs = PyMuPDFLoader(
    "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf"
).load()

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def tiktoken_len(text):
    """Return how many gpt-3.5-turbo tokens ``text`` encodes to.

    Used as the splitter's length function so chunk sizes are measured in
    tokens rather than characters.
    """
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return len(encoder.encode(text))

In [13]:
# chunk_size and chunk_overlap are counted with tiktoken_len, i.e. in tokens.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=10, length_function=tiktoken_len
)

# Break the loaded documents into chunks for embedding.
split_chunks = text_splitter.split_documents(docs)

In [14]:
# Sanity check: the longest chunk, measured in tokens, should stay within the
# splitter's chunk_size. `default=0` keeps the result 0 when there are no chunks.
max_chunk_length = max(
    (tiktoken_len(chunk.page_content) for chunk in split_chunks),
    default=0,
)

print(max_chunk_length)

199


In [15]:
from langchain_community.vectorstores import Qdrant

# Build the vector store from the chunks using the embedding model; the
# ":memory:" location means nothing is written to a server or to disk.
qdrant_vectorstore = Qdrant.from_documents(
    split_chunks, embedding_model, location=":memory:", collection_name="Facebook 10k"
)

In [16]:
# Expose the vector store as a retriever so it can be piped into the chain.
qdrant_retriever = qdrant_vectorstore.as_retriever()

In [18]:
from langchain_core.prompts import ChatPromptTemplate
# Prompt template for the RAG chain. It expects two variables:
#   {context}  - the retrieved documents
#   {question} - the user's question
# {context} is interpolated exactly once; repeating the placeholder inside the
# instruction sentence would send every retrieved chunk to the model twice.
RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}
Answer the question based on the CONTEXT above; if you can't figure it out, just say "I don't know"
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [19]:
from operator import itemgetter
# Import from langchain_core directly; the `langchain.schema.*` paths are
# legacy re-exports of these same classes.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
# RESULT:            {"response": <chat model message>, "context": <retrieved documents>}
retrieval_augmented_qa_chain = (
    # Step 1 - build the working dict:
    #   "context"  : the value of the "question" key, piped into the retriever
    #   "question" : the value of the "question" key, passed along unchanged
    {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
    # Step 2 - re-assigns "context" to itself, so the data is unchanged. This step
    #   also sits between the two dict literals: each `|` therefore has a Runnable
    #   operand and the dicts are coerced into chain steps. Piping the two dict
    #   literals directly would instead apply Python's dict-union operator.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Step 3 - produce the final output:
    #   "response" : "context" and "question" format the prompt, which is piped
    #                into the chat model
    #   "context"  : the retrieved documents from the previous step, returned
    #                alongside the answer so they can be inspected
    | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
)

In [20]:
# Ask for a specific figure to see whether retrieval surfaces the right chunk.
response = retrieval_augmented_qa_chain.invoke(
    {"question" : "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?"}
)

In [21]:
# Text of the chat model's reply stored under the "response" key.
response["response"].content

"The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41,862 billion."

In [22]:
# Show each retrieved document that was supplied to the model as context.
for retrieved_doc in response["context"]:
    print("Context:", retrieved_doc, "----", sep="\n")

Context:
page_content='Table of Contents\n\xa0\n\xa0\nFair Value Measurement at Reporting Date Using\nDescription\nDecember 31, 2022\nQuoted\xa0Prices in Active\nMarkets for\nIdentical\xa0Assets\n(Level 1)\nSignificant Other\nObservable Inputs\n(Level 2)\nSignificant Unobservable\nInputs\n(Level 3)\nCash\n$\n6,176\xa0\nCash equivalents:\nMoney market funds\n8,305\xa0\n$\n8,305\xa0\n$\n—\xa0\n$\n—\xa0\nU.S. government and agency securities\n16\xa0\n16\xa0\n—\xa0\n—\xa0\nTime deposits\n156\xa0\n—\xa0\n156\xa0\n—\xa0\nCorporate debt securities\n28\xa0\n—\xa0\n28\xa0\n—\xa0\nTotal cash and cash equivalents\n14,681\xa0\n8,321\xa0\n184\xa0\n—\xa0\nMarketable securities:\nU.S. government securities\n8,708' metadata={'source': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf', 'file_path': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf', 'page': 108, 'total_pages': 147, 'format': 'PDF 1.4', 'title': '

In [23]:
# Ask a question whose answer is a list of names from the document.
response = retrieval_augmented_qa_chain.invoke(
    {"question" : "Who are Meta's 'Directors' (i.e., members of the Board of Directors)?"}
)

In [24]:
# Text of the chat model's reply stored under the "response" key.
response["response"].content

"Meta's Directors (members of the Board of Directors) are:\n1. Aaron Anderson\n2. Peggy Alford\n3. Marc L. Andreessen\n4. Andrew W. Houston\n5. Nancy Killefer\n6. Robert M. Kimmitt\n7. Sheryl K. Sandberg\n8. Tracey T. Travis\n9. Tony Xu"

In [25]:
# Show each retrieved document that was supplied to the model as context.
for retrieved_doc in response["context"]:
    print("Context:", retrieved_doc, "----", sep="\n")

Context:
page_content="(I.R.S. Employer Identification Number)\n1 Meta Way, Menlo Park, California 94025\n(Address of principal executive offices and Zip Code)\n(650)\xa0543-4800\n(Registrant's telephone number, including area code)\n__________________________\nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each class\nTrading symbol(s)\nName of each exchange on which registered\nClass A Common Stock, $0.000006 par value\nMETA\nThe Nasdaq Stock Market LLC\nSecurities registered pursuant to Section 12(g) of the Act: None\nIndicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.\xa0\xa0\xa0\xa0Yes\xa0\xa0☒\xa0\xa0No\xa0\xa0 ☐" metadata={'source': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf', 'file_path': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf', 'page': 0, 'total_pages': 147, 'format': 'PDF 1.4', 't

In [26]:
# Ask something unrelated to the document to exercise the "I don't know" path.
response = retrieval_augmented_qa_chain.invoke(
    {"question" : "What is temperature of the Sun?"}
)

In [27]:
# Text of the chat model's reply stored under the "response" key.
response["response"].content

"I don't know"