In [2]:
import os
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model

# Pull variables from a local .env file into the process environment, then
# read the Qwen credentials; either value is None when the variable is unset.
load_dotenv()

api_key, base_url = (
    os.environ.get(var_name)
    for var_name in ("QWEN_API_KEY", "QWEN_API_BASE_URL")
)

In [3]:
from langchain_core.documents import Document

# Two hand-written sample documents; each gets its own metadata dict that
# tags it with the same "source" label.
documents = [
    Document(page_content=text, metadata={"source": "mammal-pets-doc"})
    for text in (
        "Dogs are great companions, known for their loyalty and friendliness.",
        "Cats are independent pets that often enjoy their own space.",
    )
]

In [4]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the Nike 10-K PDF. The original hard-coded path is kept as the
# default so existing behaviour is unchanged, but it can now be overridden by
# setting NKE_10K_PDF_PATH (in the shell or in .env) so the notebook also runs
# on machines where the file lives elsewhere.
file_path = os.getenv(
    "NKE_10K_PDF_PATH", r"C:\Users\WLZX\Downloads\nke-10k-2023.pdf"
)
loader = PyPDFLoader(file_path)

# The recorded run produced 107 Document objects, matching the 'total_pages'
# value in the metadata printed by the next cell.
docs = loader.load()

print(len(docs))

107


In [5]:
# Preview the first loaded page: its leading 200 characters, a blank line,
# then the metadata dict attached by the loader.
first_page = docs[0]
print(f"{first_page.page_content[:200]}\n")
print(first_page.metadata)

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
F

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': 'C:\\Users\\WLZX\\Downloads\\nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1'}


In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configuration gathered in one place: chunk size, overlap between
# neighbouring chunks, and start-index tracking (chunk metadata printed later
# in the notebook carries a 'start_index' key).
splitter_kwargs = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)
all_splits = text_splitter.split_documents(docs)

# Last expression of the cell: displays how many chunks were produced.
len(all_splits)

516

In [30]:
# Show the first 100 characters of the first chunk as the cell's output.
first_split = all_splits[0]
first_split.page_content[:100]

'Table of Contents\nUNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-K\n'

LangChain 的 OpenAI 接口不支持阿里云的 API（返回结果的解析格式可能不匹配），因此这里改用社区提供的 DashScope 接口。

In [26]:
from langchain_community.embeddings import DashScopeEmbeddings
# Embedding client for the "text-embedding-v4" model, authenticated with the
# API key read from the environment earlier in the notebook.
embeddings = DashScopeEmbeddings(
    model="text-embedding-v4",
    dashscope_api_key=api_key,
)

In [27]:
# Embed the first two chunks (in order) as a smoke test of the embedding client.
vector_1, vector_2 = (
    embeddings.embed_query(all_splits[position].page_content)
    for position in (0, 1)
)

# Both embeddings must have the same dimensionality.
assert len(vector_2) == len(vector_1)
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])

Generated vectors of length 1024

[-0.024219179525971413, 0.044021837413311005, 0.03432842716574669, 0.0021132633555680513, 0.03424239158630371, 0.11247794330120087, 0.02024717628955841, 0.006122906692326069, -0.024391252547502518, 0.07003343850374222]


In [28]:
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory vector store that embeds documents and queries with the
# DashScope client created above.
vector_store = InMemoryVectorStore(embedding=embeddings)

阿里云的 embedding 服务单次请求最多只能处理 10 条文本，所以需要手动把文档拆分成每批 10 个再依次添加。

In [32]:
# Add every chunk to the vector store in small batches.
#
# The batch size was chosen from the API error message: the embedding service
# accepts at most 10 inputs per request, so the chunks are sent 10 at a time.
batch_size = 10
total_docs = len(all_splits)
ids = []

print(f"总共有 {total_docs} 个文档块需要添加。")

for i in range(0, total_docs, batch_size):
    # Chunks belonging to the current batch (the last batch may be shorter).
    batch = all_splits[i:i + batch_size]

    # Deterministic, position-based IDs make this cell idempotent: without
    # explicit IDs each run stores the chunks under new random IDs, so
    # re-running the cell would duplicate every chunk and skew search results.
    # With stable IDs a re-run overwrites the existing entries instead.
    chunk_ids = [f"chunk-{i + offset}" for offset in range(len(batch))]

    # Embed and store the current batch; collect the IDs the store reports.
    batch_ids = vector_store.add_documents(documents=batch, ids=chunk_ids)
    if batch_ids:
        ids.extend(batch_ids)

    print(f"成功添加了批次 {i//batch_size + 1}，包含 {len(batch)} 个文档。")

print("\n所有文档已成功添加到向量存储中。")

总共有 516 个文档块需要添加。
成功添加了批次 1，包含 10 个文档。
成功添加了批次 2，包含 10 个文档。
成功添加了批次 3，包含 10 个文档。
成功添加了批次 4，包含 10 个文档。
成功添加了批次 5，包含 10 个文档。
成功添加了批次 6，包含 10 个文档。
成功添加了批次 7，包含 10 个文档。
成功添加了批次 8，包含 10 个文档。
成功添加了批次 9，包含 10 个文档。
成功添加了批次 10，包含 10 个文档。
成功添加了批次 11，包含 10 个文档。
成功添加了批次 12，包含 10 个文档。
成功添加了批次 13，包含 10 个文档。
成功添加了批次 14，包含 10 个文档。
成功添加了批次 15，包含 10 个文档。
成功添加了批次 16，包含 10 个文档。
成功添加了批次 17，包含 10 个文档。
成功添加了批次 18，包含 10 个文档。
成功添加了批次 19，包含 10 个文档。
成功添加了批次 20，包含 10 个文档。
成功添加了批次 21，包含 10 个文档。
成功添加了批次 22，包含 10 个文档。
成功添加了批次 23，包含 10 个文档。
成功添加了批次 24，包含 10 个文档。
成功添加了批次 25，包含 10 个文档。
成功添加了批次 26，包含 10 个文档。
成功添加了批次 27，包含 10 个文档。
成功添加了批次 28，包含 10 个文档。
成功添加了批次 29，包含 10 个文档。
成功添加了批次 30，包含 10 个文档。
成功添加了批次 31，包含 10 个文档。
成功添加了批次 32，包含 10 个文档。
成功添加了批次 33，包含 10 个文档。
成功添加了批次 34，包含 10 个文档。
成功添加了批次 35，包含 10 个文档。
成功添加了批次 36，包含 10 个文档。
成功添加了批次 37，包含 10 个文档。
成功添加了批次 38，包含 10 个文档。
成功添加了批次 39，包含 10 个文档。
成功添加了批次 40，包含 10 个文档。
成功添加了批次 41，包含 10 个文档。
成功添加了批次 42，包含 10 个文档。
成功添加了批次 43，包含 10 个文档。
成功添加了批次 44，包含 10 个文档。
成功添加了批次 45，包含 10 个文档。
成

In [33]:
# Plain-text similarity search; print only the top-ranked chunk.
query = "How many distribution centers does Nike have in the US?"
results = vector_store.similarity_search(query)

print(results[0])

page_content='direct to consumer operations sell products through the following number of retail stores in the United States:
U.S. RETAIL STORES NUMBER
NIKE Brand factory stores 213 
NIKE Brand in-line stores (including employee-only stores) 74 
Converse stores (including factory stores) 82 
TOTAL 369 
In the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.
2023 FORM 10-K 2' metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': 'C:\\Users\\WLZX\\Downloads\\nke-10k-2023.pdf', 'total_pages': 107, 'page': 4, 'page_label': '5', 'start_index': 3125}


In [34]:
results = await vector_store.asimilarity_search("When was Nike incorporated?")

print(results[0])

page_content='Table of Contents
PART I
ITEM 1. BUSINESS
GENERAL
NIKE, Inc. was incorporated in 1967 under the laws of the State of Oregon. As used in this Annual Report on Form 10-K (this "Annual Report"), the terms "we," "us," "our,"
"NIKE" and the "Company" refer to NIKE, Inc. and its predecessors, subsidiaries and affiliates, collectively, unless the context indicates otherwise.
Our principal business activity is the design, development and worldwide marketing and selling of athletic footwear, apparel, equipment, accessories and services. NIKE is
the largest seller of athletic footwear and apparel in the world. We sell our products through NIKE Direct operations, which are comprised of both NIKE-owned retail stores
and sales through our digital platforms (also referred to as "NIKE Brand Digital"), to retail accounts and to a mix of independent distributors, licensees and sales' metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'cr

In [35]:
# Score semantics differ between vector-store providers.
# NOTE(review): for InMemoryVectorStore the score looks like a cosine
# similarity (higher = more similar) rather than a distance — confirm against
# the langchain-core documentation before interpreting the number.

query = "What was Nike's revenue in 2023?"
results = vector_store.similarity_search_with_score(query)

# Each result is a (document, score) pair; unpack the top-ranked one.
top_result = results[0]
doc, score = top_result
print(f"Score: {score}\n")
print(doc)

Score: 0.6764825167702695

page_content='Enterprise Resource Planning Platform, data and analytics, demand sensing, insight gathering, and other areas to create an end-to-end technology foundation, which we
believe will further accelerate our digital transformation. We believe this unified approach will accelerate growth and unlock more efficiency for our business, while driving
speed and responsiveness as we serve consumers globally.
FINANCIAL HIGHLIGHTS
• In fiscal 2023, NIKE, Inc. achieved record Revenues of $51.2 billion, which increased 10% and 16% on a reported and currency-neutral basis, respectively
• NIKE Direct revenues grew 14% from $18.7 billion in fiscal 2022 to $21.3 billion in fiscal 2023, and represented approximately 44% of total NIKE Brand revenues for
fiscal 2023
• Gross margin for the fiscal year decreased 250 basis points to 43.5% primarily driven by higher product costs, higher markdowns and unfavorable changes in foreign
currency exchange rates, partially offset 

In [36]:
# Embed the question explicitly, then search with the raw vector instead of
# letting the store embed the query text itself.
query_text = "How were Nike's margins impacted in 2023?"
embedding = embeddings.embed_query(query_text)

results = vector_store.similarity_search_by_vector(embedding)
print(results[0])

page_content='Table of Contents
GROSS MARGIN
FISCAL 2023 COMPARED TO FISCAL 2022
For fiscal 2023, our consolidated gross profit increased 4% to $22,292 million compared to $21,479 million for fiscal 2022. Gross margin decreased 250 basis points to
43.5% for fiscal 2023 compared to 46.0% for fiscal 2022 due to the following:
*Wholesale equivalent
The decrease in gross margin for fiscal 2023 was primarily due to:
• Higher NIKE Brand product costs, on a wholesale equivalent basis, primarily due to higher input costs and elevated inbound freight and logistics costs as well as
product mix;
• Lower margin in our NIKE Direct business, driven by higher promotional activity to liquidate inventory in the current period compared to lower promotional activity in
the prior period resulting from lower available inventory supply;
• Unfavorable changes in net foreign currency exchange rates, including hedges; and
• Lower off-price margin, on a wholesale equivalent basis.
This was partially offset by:'