# url: https://python.langchain.com/docs/tutorials/retrievers/

In [7]:
# DOCUMENT: https://python.langchain.com/api_reference/core/documents/langchain_core.documents.base.Document.html

# A Document is an object that holds a chunk of text belonging to some source (e.g. a file or a web page). It also carries metadata, which can be used to identify that source (see the tutorial URL above for more).

from langchain_core.documents import Document
from dotenv import load_dotenv
load_dotenv()  # load environment variables from a .env file (python-dotenv)

# Documents can be written by hand, as here, or extracted from a file, as in
# the next cell.

documents = [
    Document(page_content=text, metadata={"source": "mammal-pets-doc"})
    for text in (
        "Dogs are great companions, known for their loyalty and friendliness.",
        "Cats are independent pets that often enjoy their own space.",
    )
]

# A Document exposes the following three fields:
# 1. page_content - the text of the document
# 2. metadata - the metadata of the document (here, just its source)
# 3. id - the id of the document (None below, since none was supplied)
#
# Each field is read with ordinary dot notation.

print(documents[0].page_content, documents[0].metadata, documents[0].id, sep="\n")

Dogs are great companions, known for their loyalty and friendliness.
{'source': 'mammal-pets-doc'}
None


In [8]:
# loading a pdf file's contents into a list of documents

from langchain_community.document_loaders import PyPDFLoader

file_path = "/Users/ketankunkalikar/Desktop/langchain/example_data/nke-10k-2023.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()

# PyPDFLoader is one of the many document loaders available in LangChain.
# With it, each Document is one page of the PDF, so this is the page count.
print(len(docs))

# Inspect the first page: its text, then its metadata, then its id.
print(docs[0].page_content, docs[0].metadata, docs[0].id, sep="\n")

107
Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM                         TO                         .
Commission File No. 1-10635
NIKE, Inc.
(Exact name of Registrant as specified in its charter)
Oregon 93-0584541
(State or other jurisdiction of incorporation) (IRS Employer Identification No.)
One Bowerman Drive, Beaverton, Oregon 97005-6453
(Address of principal executive offices and zip code)
(503) 671-6453
(Registrant's telephone number, including area code)
SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:
Class B Common Stock NKE New York Stock Exchange
(Title of each class) (Trading symbol) (Name of each exchange on which registered)
SECURITIES REGISTERED P

In [9]:
# sometimes, for information retrieval purposes, it makes sense to further split the documents into smaller chunks. This holds true for pypdfloader especially, where each document is an entire page. You want the chunks to be 'contextually atomic', such that each chunk is a coherent piece of information that could be the answer to a query, and not just contain random pieces of text that do not fit together.

# We use RecursiveCharacterTextSplitter to split the documents into smaller chunks. NOTE: the resulting chunks are still Document objects.

from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,  # surfaces as 'start_index' in each chunk's metadata
)
all_splits = text_splitter.split_documents(docs)

# Splitting yields far more Documents than there were pages, because every
# page is cut into pieces bounded by the chunk_size configured above.
print(len(all_splits))

# print(all_splits[0].page_content)
print(all_splits[4].metadata, all_splits[4].id, sep="\n")

516
{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': '/Users/ketankunkalikar/Desktop/langchain/example_data/nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1', 'start_index': 3276}
None


In [10]:
# creating embeddings for the documents. Once you've created embeddings for the documents, you can apply vector search on them. This is how basic Text RAG works.

from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by the vector store and by the manual embed_query
# calls in the cells that follow.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [11]:
# Embed the first two chunks and confirm that both vectors have the same length.
vector_1, vector_2 = (
    embeddings.embed_query(all_splits[i].page_content) for i in (0, 1)
)

assert len(vector_2) == len(vector_1)
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])

# Once embeddings exist for all the documents they need to be stored somewhere.
# A vector store keeps them in a form that allows efficient similarity search
# at query time.

Generated vectors of length 768

[0.04747237637639046, 0.021675799041986465, -0.00901806354522705, 0.005356728099286556, 0.02555772289633751, -0.010230298154056072, -0.008413928560912609, 0.03930393233895302, 0.021570468321442604, -0.02409539930522442]


In [12]:
from langchain_chroma import Chroma

# Chroma collection that uses `embeddings` as its embedding function.
vector_store = Chroma(
    embedding_function=embeddings,
    collection_name="tutorial_collection",
    # Where to save data locally; remove this argument if not necessary.
    persist_directory="/Users/ketankunkalikar/Desktop/langchain/vector_stores",
)

In [None]:
# Alternatively, you can use an in-memory vector store instead of Chroma:

# from langchain_core.vectorstores import InMemoryVectorStore

# vector_store = InMemoryVectorStore(embeddings)

In [13]:
# Add every chunk to the vector store. `ids` holds whatever add_documents
# returns (presumably one id per inserted chunk - confirm against the API docs).
# NOTE(review): the store is created with a persist_directory and no explicit
# ids are passed here, so re-running this cell presumably inserts the same
# chunks a second time - confirm, and consider passing stable ids if so.
ids = vector_store.add_documents(documents=all_splits)

In [None]:
# Once we've instantiated a VectorStore that contains documents, we can query it. VectorStore includes methods for querying:

# Synchronously and asynchronously;
# By string query and by vector;
# With and without returning similarity scores;
# By similarity and maximum marginal relevance (to balance similarity with query to diversity in retrieved results).
# The methods will generally include a list of Document objects in their outputs.

In [14]:
# Synchronous query by plain string, without scores. The call returns a
# sequence of Documents; only the first hit is printed (presumably the closest
# match - confirm the ordering guarantee in the API docs).
results = vector_store.similarity_search(
    "How many distribution centers does Nike have in the US?"
)

print(results[0])

page_content='operations. We also lease an office complex in Shanghai, China, our headquarters for our Greater China geography, occupied by employees focused on implementing our
wholesale, NIKE Direct and merchandising strategies in the region, among other functions.
In the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are
leased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics
providers. One distribution center for Converse is located in Ontario, California, which is leased. NIKE has a number of distribution facilities outside the United States,
some of which are leased and operated by third-party logistics providers. The most significant distribution facilities outside the United States are located in Laakdal,' metadata={'author': 'EDGAR Online, a division of Donnelley Fi

In [15]:
# Async query: the same kind of search through the store's `a`-prefixed
# coroutine method.
# NOTE: a bare top-level `await` only works in a notebook / IPython session;
# in a plain script this call has to run inside an `async def` that is driven
# by asyncio.run().

results = await vector_store.asimilarity_search("When was Nike incorporated?")

print(results[0])

page_content='Table of Contents
PART I
ITEM 1. BUSINESS
GENERAL
NIKE, Inc. was incorporated in 1967 under the laws of the State of Oregon. As used in this Annual Report on Form 10-K (this "Annual Report"), the terms "we," "us," "our,"
"NIKE" and the "Company" refer to NIKE, Inc. and its predecessors, subsidiaries and affiliates, collectively, unless the context indicates otherwise.
Our principal business activity is the design, development and worldwide marketing and selling of athletic footwear, apparel, equipment, accessories and services. NIKE is
the largest seller of athletic footwear and apparel in the world. We sell our products through NIKE Direct operations, which are comprised of both NIKE-owned retail stores
and sales through our digital platforms (also referred to as "NIKE Brand Digital"), to retail accounts and to a mix of independent distributors, licensees and sales' metadata={'page_label': '4', 'page': 3, 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 20

In [16]:
# Scores are provider-specific. The score returned here is a distance metric,
# so it varies inversely with similarity.

results = vector_store.similarity_search_with_score("What was Nike's revenue in 2023?")
doc, score = results[0]  # each hit is a (Document, score) pair
print(f"Score: {score}\n", doc, sep="\n")

Score: 0.37252235412597656

page_content='Table of Contents
YEAR ENDED MAY 31,
(Dollars in millions) 2023 2022 2021
REVENUES
North America $ 21,608 $ 18,353 $ 17,179 
Europe, Middle East & Africa 13,418 12,479 11,456 
Greater China 7,248 7,547 8,290 
Asia Pacific & Latin America 6,431 5,955 5,343 
Global Brand Divisions 58 102 25 
Total NIKE Brand 48,763 44,436 42,293 
Converse 2,427 2,346 2,205 
Corporate 27 (72) 40 
TOTAL NIKE, INC. REVENUES $ 51,217 $ 46,710 $ 44,538 
EARNINGS BEFORE INTEREST AND TAXES
North America $ 5,454 $ 5,114 $ 5,089 
Europe, Middle East & Africa 3,531 3,293 2,435 
Greater China 2,283 2,365 3,243 
Asia Pacific & Latin America 1,932 1,896 1,530 
Global Brand Divisions (4,841) (4,262) (3,656)
Converse 676 669 543 
Corporate (2,840) (2,219) (2,261)
Interest expense (income), net (6) 205 262 
TOTAL NIKE, INC. INCOME BEFORE INCOME TAXES $ 6,201 $ 6,651 $ 6,661 
ADDITIONS TO PROPERTY, PLANT AND EQUIPMENT
North America $ 283 $ 146 $ 98 
Europe, Middle East & Africa 2

In [17]:
# You can also embed the query yourself and then search the vector store by
# vector instead of by string.

# Uses the same `embeddings` object that was handed to the vector store.
embedding = embeddings.embed_query("How were Nike's margins impacted in 2023?")

results = vector_store.similarity_search_by_vector(embedding)
print(results[0])

page_content='Table of Contents
GROSS MARGIN
FISCAL 2023 COMPARED TO FISCAL 2022
For fiscal 2023, our consolidated gross profit increased 4% to $22,292 million compared to $21,479 million for fiscal 2022. Gross margin decreased 250 basis points to
43.5% for fiscal 2023 compared to 46.0% for fiscal 2022 due to the following:
*Wholesale equivalent
The decrease in gross margin for fiscal 2023 was primarily due to:
• Higher NIKE Brand product costs, on a wholesale equivalent basis, primarily due to higher input costs and elevated inbound freight and logistics costs as well as
product mix;
• Lower margin in our NIKE Direct business, driven by higher promotional activity to liquidate inventory in the current period compared to lower promotional activity in
the prior period resulting from lower available inventory supply;
• Unfavorable changes in net foreign currency exchange rates, including hedges; and
• Lower off-price margin, on a wholesale equivalent basis.
This was partially offset by:'

In [21]:
# RETRIEVERS.

# Retrievers are Runnables, so they expose a standard set of methods for calling them. They are typically used to standardise how information is extracted from various sources. Even if the only source you're working with is a vector store, it still makes sense to put a retriever on top of it, so that you can query the store in a more user-friendly and more customisable way.

# NOTE: retrievers are not just for vector stores; they can also wrap APIs, databases, etc.

# below is a custom implementation of a retriever, note that there's a bunch of retrievers already implemented in langchain, especially for vector stores.

from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain  # decorator that turns the function into a Runnable (hence .batch() below)
def retriever(query: str) -> List[Document]:
    """Look up the stored chunk closest to ``query``.

    Args:
        query: Free-text search string.

    Returns:
        The result of a ``k=1`` similarity search on ``vector_store``.
    """
    top_hit = vector_store.similarity_search(query, k=1)
    return top_hit


# Run the retriever over several queries in one call; the result holds one
# list of Documents per query, in the same order as the queries.
answer_docs_stack = retriever.batch(
    [
        "How many distribution centers does Nike have in the US?",
        "When was Nike incorporated?",
    ],
)

# The loop variables are deliberately NOT called `docs` / `doc`: those names
# are notebook globals (the loaded PDF pages and the scored search hit), and
# rebinding them here would make a later re-run of the splitting cell operate
# on retriever hits instead of the PDF pages.
for hits in answer_docs_stack:
    for hit in hits:
        print(hit)
        print("-" * 100)

page_content='operations. We also lease an office complex in Shanghai, China, our headquarters for our Greater China geography, occupied by employees focused on implementing our
wholesale, NIKE Direct and merchandising strategies in the region, among other functions.
In the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are
leased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics
providers. One distribution center for Converse is located in Ontario, California, which is leased. NIKE has a number of distribution facilities outside the United States,
some of which are leased and operated by third-party logistics providers. The most significant distribution facilities outside the United States are located in Laakdal,' metadata={'page': 26, 'moddate': '2023-07-20T16:22:08-04:00',

In [None]:
# Built-in alternative to a hand-written retriever: ask the vector store for
# one directly. Configured for similarity search with k=1.
# NOTE: this rebinds the name `retriever`, replacing the @chain function
# defined in the earlier cell.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)

# Final expression of the cell, so the notebook displays the batch result
# for the two queries.
retriever.batch(
    [
        "How many distribution centers does Nike have in the US?",
        "When was Nike incorporated?",
    ],
)