## Setting up environnement

In [1]:
!pip install -qU langchain langchain-core langchain-community langchain-openai

In [2]:
!pip install -qU qdrant-client

In [3]:
!pip install -qU tiktoken pymupdf

In [4]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [17]:
from langchain_openai import ChatOpenAI

openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

## Loading the data

In [5]:
from langchain.document_loaders import PyMuPDFLoader

docs = PyMuPDFLoader("https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()

## Chunking the data

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

def tiktoken_len(text):
    tokens = tiktoken.encoding_for_model("gpt-3.5-turbo").encode(
        text,
    )
    return len(tokens)


In [10]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap = 50,
    length_function = tiktoken_len,
)

split_chunks = text_splitter.split_documents(docs)

In [11]:
len(split_chunks)

765

## Embedding and vectore storing

In [12]:
from langchain_openai.embeddings import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [13]:
from langchain_community.vectorstores import Qdrant

qdrant_vectorstore = Qdrant.from_documents(
    split_chunks,
    embedding_model,
    location=":memory:",
    collection_name="Meta 10-k Fillings",
)

In [14]:
qdrant_retriever = qdrant_vectorstore.as_retriever()

## RAG Prompt

In [15]:
from langchain_core.prompts import ChatPromptTemplate

In [16]:
RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

Answer the query if the context is related to it; otherwise, answer: 'Sorry, the context is unrelated to the query, I can't answer.'
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

## RAG Chain

In [18]:
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
)

## Response generation

In [19]:
response_1 = retrieval_augmented_qa_chain.invoke({"question" : "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?"})
response_1["response"].content

"The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41,862."

In [20]:
response_2 = retrieval_augmented_qa_chain.invoke({"question" : "Who are Meta's 'Directors' (i.e., members of the Board of Directors)?"})
response_2["response"].content

"Sorry, the context is unrelated to the query, I can't answer."

In [23]:
response_2["context"]

[Document(page_content='to having a skilled, inclusive and diverse workforce because we believe cognitive diversity fuels innovation. To aid in this effort, we have taken steps to reduce\nbias from our hiring processes and performance management systems, as well as offering learning and development courses for our employees.\nCorporate Information\nWe were incorporated in Delaware in July 2004. We completed our initial public offering in May 2012 and our Class\xa0A common stock is currently listed\non the Nasdaq Global Select Market under the symbol "META." Our principal executive offices are located at 1 Meta Way, Menlo Park, California 94025, and\nour telephone number is (650) 543-4800.\nMeta, the Meta logo, Meta Quest, Meta Horizon, Facebook, FB, Instagram, Oculus, WhatsApp, Reels, and our other registered or common law', metadata={'source': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf', 'file_path': 'https://d18rn0p25nwr6d.cloudfron

## First pipeline results analysis

- The answer to the first question is right.
- The answer to the second question is false. It shows that the pipeline is not able to retrieve the context needed to answer.
- I will have to upgrade the context retrieval part of the pipeline.

## Upgrading the chunking strategy
- As we're dealing with a large PDF including tables, let's try to adapt the chunking size to a larger value

In [24]:
text_splitter_2 = RecursiveCharacterTextSplitter(
    chunk_size = 800,
    chunk_overlap = 100,
    length_function = tiktoken_len,
)

split_chunks_2 = text_splitter_2.split_documents(docs)

In [25]:
len(split_chunks_2)

220

In [26]:
qdrant_vectorstore_2 = Qdrant.from_documents(
    split_chunks_2,
    embedding_model,
    location=":memory:",
    collection_name="Meta 10-k Fillings",
)

In [27]:
qdrant_retriever_2 = qdrant_vectorstore.as_retriever()

In [28]:
retrieval_augmented_qa_chain = (

    {"context": itemgetter("question") | qdrant_retriever_2, "question": itemgetter("question")}

    | RunnablePassthrough.assign(context=itemgetter("context"))

    | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
)

In [29]:
response_1b = retrieval_augmented_qa_chain.invoke({"question" : "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?"})
response_1b["response"].content

"The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41,862."

In [30]:
response_2b = retrieval_augmented_qa_chain.invoke({"question" : "Who are Meta's 'Directors' (i.e., members of the Board of Directors)?"})
response_2b["response"].content

"Sorry, the context is unrelated to the query, I can't answer."

## Second pipeline results analysis
- The second pipeline shows the same results as the first one. The context retrieval is still not working properly, despite the larger chunking size.

## Upgrading the retrieval strategy
- While conserving the same chunking strategy as in the first pipeline, I will try to upgrade the retrieval strategy by using the MultiQueryRetriever.

In [36]:
from langchain.retrievers import MultiQueryRetriever

multiquery_retriever = MultiQueryRetriever.from_llm(retriever=qdrant_retriever_2, llm=openai_chat_model)

In [37]:
retrieval_augmented_qa_chain_3 = (

    {"context": itemgetter("question") | multiquery_retriever, "question": itemgetter("question")}

    | RunnablePassthrough.assign(context=itemgetter("context"))

    | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
)

In [38]:
response_1c = retrieval_augmented_qa_chain.invoke({"question" : "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?"})
response_1c["response"].content

"The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41,862."

In [39]:
response_2c = retrieval_augmented_qa_chain.invoke({"question" : "Who are Meta's 'Directors' (i.e., members of the Board of Directors)?"})
response_2c["response"].content

"Sorry, the context is unrelated to the query, I can't answer."

In [40]:
response_2c["context"]

[Document(page_content='to having a skilled, inclusive and diverse workforce because we believe cognitive diversity fuels innovation. To aid in this effort, we have taken steps to reduce\nbias from our hiring processes and performance management systems, as well as offering learning and development courses for our employees.\nCorporate Information\nWe were incorporated in Delaware in July 2004. We completed our initial public offering in May 2012 and our Class\xa0A common stock is currently listed\non the Nasdaq Global Select Market under the symbol "META." Our principal executive offices are located at 1 Meta Way, Menlo Park, California 94025, and\nour telephone number is (650) 543-4800.\nMeta, the Meta logo, Meta Quest, Meta Horizon, Facebook, FB, Instagram, Oculus, WhatsApp, Reels, and our other registered or common law', metadata={'source': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf', 'file_path': 'https://d18rn0p25nwr6d.cloudfron