# Prerequisites

##### Install all the needed dependencies

In [1]:
%pip install --upgrade --quiet  langchain langchain-core langchain-community langchain-text-splitters langchain-milvus langchain-openai sec-api


Note: you may need to restart the kernel to use updated packages.


# Data Extraction and Preprocessing

In [1]:
%pip install sec-api

Collecting sec-api
  Downloading sec_api-1.0.25-py3-none-any.whl.metadata (51 kB)
Downloading sec_api-1.0.25-py3-none-any.whl (19 kB)
Installing collected packages: sec-api
Successfully installed sec-api-1.0.25
Note: you may need to restart the kernel to use updated packages.


##### https://sec-api.io
##### Using this API key to extract the filings from the SEC

In [1]:
from sec_api import XbrlApi

In [2]:
xbrlApi = XbrlApi("Your-API-Key") #Replace with your API Key
htm_url="<https://www.sec.gov/ix?doc=/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm>" 
xbrl_json = xbrlApi.xbrl_to_json(htm_url=htm_url)

##### We want only the ‘Cover Page’ section:

In [3]:
xbrl_json=xbrlApi.xbrl_to_json(htm_url=htm_url)   
result = [f"{key}: {value}\\n" for key, value in xbrl_json["CoverPage"].items()]
print(result)

['DocumentType: 10-K\\n', 'DocumentAnnualReport: true\\n', 'CurrentFiscalYearEndDate: --09-28\\n', 'DocumentPeriodEndDate: 2024-09-28\\n', 'DocumentTransitionReport: false\\n', 'EntityFileNumber: 001-36743\\n', 'EntityRegistrantName: Apple Inc.\\n', 'EntityIncorporationStateCountryCode: CA\\n', 'EntityTaxIdentificationNumber: 94-2404110\\n', 'EntityAddressAddressLine1: One Apple Park Way\\n', 'EntityAddressCityOrTown: Cupertino\\n', 'EntityAddressStateOrProvince: CA\\n', 'EntityAddressPostalZipCode: 95014\\n', 'CityAreaCode: 408\\n', 'LocalPhoneNumber: 996-1010\\n', "Security12bTitle: [{'period': {'startDate': '2023-10-01', 'endDate': '2024-09-28'}, 'segment': {'dimension': 'us-gaap:StatementClassOfStockAxis', 'value': 'us-gaap:CommonStockMember'}, 'value': 'Common Stock, $0.00001 par value per share'}, {'period': {'startDate': '2023-10-01', 'endDate': '2024-09-28'}, 'segment': {'dimension': 'us-gaap:StatementClassOfStockAxis', 'value': 'aapl:A0.000Notesdue2025Member'}, 'value': '0.000

# Building a RAG pipeline

### Data Ingestion with the text splitting:

In [4]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

In [5]:
class SimpleDocument:
    def __init__(self, content):
        self.page_content = content
        self.metadata = {}  # Initialize metadata attribute with an empty dictionary as its default value

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Concatenate all strings in the result list into a single long string
long_text = "".join(result)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
simple_doc = SimpleDocument(long_text)
texts = text_splitter.split_documents([simple_doc]) 

### Create Embeddings:

In [6]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

### Store the embeddings in the Milvus DB:

In [8]:
from langchain_milvus import Milvus

vectorstore = Milvus.from_documents(  
    documents=texts,
    embedding=embeddings,
    connection_args={
        "uri": "http://localhost:19530",
    },
    drop_old=True,  # Drop the old Milvus collection if it exists
)


### Loading the Language Model:

In [19]:
query = "Please summarize the 10-K of Apple to stakeholders."
vectorstore.similarity_search(query, k=1)

[Document(metadata={'pk': 454737724161656766}, page_content="DocumentType: 10-K\\nDocumentAnnualReport: true\\nCurrentFiscalYearEndDate: --09-28\\nDocumentPeriodEndDate: 2024-09-28\\nDocumentTransitionReport: false\\nEntityFileNumber: 001-36743\\nEntityRegistrantName: Apple Inc.\\nEntityIncorporationStateCountryCode: CA\\nEntityTaxIdentificationNumber: 94-2404110\\nEntityAddressAddressLine1: One Apple Park Way\\nEntityAddressCityOrTown: Cupertino\\nEntityAddressStateOrProvince: CA\\nEntityAddressPostalZipCode: 95014\\nCityAreaCode: 408\\nLocalPhoneNumber: 996-1010\\nSecurity12bTitle: [{'period': {'startDate': '2023-10-01', 'endDate': '2024-09-28'}, 'segment': {'dimension': 'us-gaap:StatementClassOfStockAxis', 'value': 'us-gaap:CommonStockMember'}, 'value': 'Common Stock, $0.00001 par value per share'}, {'period': {'startDate': '2023-10-01', 'endDate': '2024-09-28'}, 'segment': {'dimension': 'us-gaap:StatementClassOfStockAxis', 'value': 'aapl:A0.000Notesdue2025Member'}, 'value': '0.000%

In [20]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)



PROMPT_TEMPLATE = """
Human: You are an AI assistant, and provides answers to questions by using fact based and statistical information when possible.
Use the following pieces of information to provide a concise answer to the question enclosed in <question> tags.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
{context}
</context>

<question>
{question}
</question>

The response should be specific and use statistics or numbers when possible.

Assistant:"""

prompt = PromptTemplate(
    template=PROMPT_TEMPLATE, input_variables=["context", "question"]
)
retriever = vectorstore.as_retriever()


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)



In [21]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


res = rag_chain.invoke(query)
res


"The 10-K of Apple for the fiscal year ending on September 28, 2024, shows that the company is a Large Accelerated Filer and a well-known seasoned issuer. Apple's common stock shares outstanding as of October 18, 2024, were 15,115,823,000. The company's public float was valued at $2,628,553,000,000 as of March 29, 2024. Apple's financial statements did not have any error corrections, and the auditor attested to the effectiveness of the company's internal control over financial reporting. The 10-K also includes information about the company's securities listed on NASDAQ and the incorporation of proxy statement details for the 2025 annual meeting of shareholders."