In [1]:
!pip3 install --upgrade --quiet langchain langchain-community
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

In [2]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv
import tiktoken



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Load variables from a local .env file into the process environment so that the
# os.environ lookup in the next cell (OPENAI_API_KEY) can find them.
load_dotenv()

True

In [4]:
# Read the OpenAI key from the environment. .get() yields None instead of raising
# when the variable is missing, so a missing key is NOT reported by this cell.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## Defining LLM

In [5]:
# Chat model shared by the rest of the notebook (manual prompt and LCEL chain below).
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
# Smoke test: one round-trip to confirm the key and model name are accepted.
llm.invoke("Tell me a joke about cats")

  llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)


AIMessage(content='Why did the cat sit on the computer?\n\nBecause it wanted to keep an eye on the mouse!', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 13, 'total_tokens': 34, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini', 'system_fingerprint': 'fp_b8bc95a0ac', 'finish_reason': 'stop', 'logprobs': None}, id='run-076ac7c5-547b-480a-b62b-fbdc91174233-0')

## Processing of PDF Documents

In [6]:
# Load the PDF with PyPDFLoader. Judging by the output below, each returned Document
# carries 'page' / 'total_pages' metadata, i.e. there appears to be one Document per page.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs on the
# author's machine — consider a path relative to a configurable data directory.
loader = PyPDFLoader('/Users/mukundshrivas/Downloads/Python Projects GRAD/.vscode/data/aapl-20240928.pdf')
pages = loader.load()
# NOTE(review): displaying `pages` dumps every loaded Document into the cell output;
# a small slice or len(pages) would keep the notebook readable.
pages

Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 228 0 (offset 0)


[Document(metadata={'producer': 'macOS Version 15.3.2 (Build 24D81) Quartz PDFContext', 'creator': 'Safari', 'creationdate': "D:20250324152430Z00'00'", 'title': 'aapl-20240928', 'author': 'Mukund Shrivastava', 'moddate': "D:20250324152430Z00'00'", 'source': '/Users/mukundshrivas/Downloads/Python Projects GRAD/.vscode/data/aapl-20240928.pdf', 'total_pages': 97, 'page': 0, 'page_label': '1'}, page_content='24/03/25, 11:24\u202fAMaapl-20240928\nPage 1 of 97https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm\nUNITED STATES\nSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549\nFORM 10-K\n(Mark One)\n☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the ﬁscal year ended September\xa028, 2024\nor\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 to \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\x

### Split Documents

In [7]:
# Split the loaded pages into overlapping chunks. Sizes are measured in characters
# because length_function is the built-in len. Separators are tried in the listed
# order: paragraph break, line break, then single space.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    length_function=len,
    chunk_overlap=250,
    chunk_size=1500,
)
chunks = text_splitter.split_documents(pages)

In [8]:
# Locate the chunk(s) that contain the balance-sheet line item; the match is
# case-insensitive because the chunk text is lower-cased before comparing.
for i, chunk in enumerate(chunks):
    chunk_text = chunk.page_content
    if "total current assets" not in chunk_text.lower():
        continue
    print(f"Found in chunk {i}:")
    print(chunk_text)


Found in chunk 126:
24/03/25, 11:24 AMaapl-20240928
Page 52 of 97https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm
Apple Inc.
CONSOLIDATED BALANCE SHEETS
(In millions, except number of shares, which are reﬂected in thousands, and par value)
September 28,2024 September 30,2023
ASSETS:
Current assets:
Cash and cash equivalents $ 29,943  $ 29,965 
Marketable securities 35,228  31,590 
Accounts receivable, net 33,410  29,508 
Vendor non-trade receivables 32,833  31,477 
Inventories 7,286  6,331 
Other current assets 14,287  14,695 
Total current assets 152,987  143,566 
Non-current assets:
Marketable securities 91,479  100,544 
Property, plant and equipment, net 45,680  43,715 
Other non-current assets 74,834  64,758 
Total non-current assets 211,993  209,017 
Total assets $ 364,980  $ 352,583 
LIABILITIES AND SHAREHOLDERS’ EQUITY:
Current liabilities:
Accounts payable $ 68,960  $ 62,611 
Other current liabilities 78,304  58,829 
Deferred revenue 8,249  8,

In [9]:
def get_embedding_function():
    """Return the OpenAI embedding client used throughout this notebook.

    The client is configured for the ``text-embedding-ada-002`` model and is
    given the module-level ``OPENAI_API_KEY`` as its API key.
    """
    return OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY,
        model="text-embedding-ada-002",
    )

# Instantiate the shared embedding client once; later cells reuse this object.
embedding_function = get_embedding_function()
# Smoke test: embed a single word to confirm the embeddings endpoint responds.
# The resulting vector is not used anywhere else in the visible notebook.
test_vector = embedding_function.embed_query("cat")

  embeddings = OpenAIEmbeddings(


In [10]:
from langchain.evaluation import load_evaluator

# Build an "embedding_distance" evaluator that reuses the notebook's embedding client.
evaluator = load_evaluator(evaluator="embedding_distance", 
                            embeddings=embedding_function)

# Sanity check on the embeddings: compare two finance-related words.
# NOTE(review): the score is presumably a distance (lower = more similar) — confirm
# against the LangChain evaluator documentation.
evaluator.evaluate_strings(prediction="markets", reference="capital")

{'score': 0.15376928728742412}

In [11]:
# Second sanity check, this time with the two halves of the phrase "Wall Street".
evaluator.evaluate_strings(prediction="Street", reference="Wall")

{'score': 0.1429715643438052}

In [12]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Build a persisted Chroma vector store from de-duplicated document chunks.

    Each chunk gets a deterministic id: the UUIDv5 (DNS namespace) of its
    ``page_content``. Chunks with identical text therefore share an id, and
    only the first occurrence is kept.

    Args:
        chunks: Iterable of documents exposing a ``page_content`` string.
        embedding_function: Embedding object forwarded to Chroma as ``embedding``.
        vectorstore_path: Directory forwarded to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    seen_ids = set()      # fast membership test only; never used for ordering
    unique_ids = []       # kept in lock-step with unique_chunks
    unique_chunks = []

    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        if chunk_id in seen_ids:
            continue
        seen_ids.add(chunk_id)
        unique_ids.append(chunk_id)
        unique_chunks.append(chunk)

    # The ids must be passed in the same order as the documents. Building the id
    # list from a set (arbitrary iteration order) would pair documents with the
    # ids of other documents.
    vectorstore = Chroma.from_documents(documents=unique_chunks,
                                        ids=unique_ids,
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)

    # Explicit persist() is deprecated in recent Chroma wrappers and absent in
    # some; call it only when the returned store still provides it.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()

    return vectorstore

In [13]:
# Build the Chroma index for the chunks (create_vectorstore drops chunks with
# duplicate text) and persist it under the local "vectorstore_test" directory.
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 vectorstore_path="vectorstore_test")

  vectorstore.persist()


## Query the Vector Database for Relevant Chunks

In [14]:
# Load the vectorstore that create_vectorstore(...) persisted. The directory must match
# the `vectorstore_path` used when the index was written ("vectorstore_test"); a
# different directory would not contain the chunks indexed above.
vectorstore = Chroma(persist_directory="vectorstore_test", embedding_function=embedding_function)

  vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)


In [15]:
# Rebuild the index as a named collection ("aapl").
# Reuse the shared embedding client so the model and API key are configured in one
# place (get_embedding_function) instead of relying on library defaults here.
# NOTE(review): this replaces the `vectorstore` loaded in the previous cell and embeds
# every chunk again (duplicates included); no persist_directory is given.
embeddings = embedding_function
vectorstore = Chroma.from_documents(chunks, embeddings, collection_name="aapl")

In [16]:
# Verify the index: fetch everything stored in the collection and count the documents.
# NOTE(review): `_collection` is a private attribute of the LangChain Chroma wrapper
# and may change between releases.
data = vectorstore._collection.get()
print("Number of documents indexed:", len(data['documents']))

Number of documents indexed: 216


In [17]:
# Fetch the five nearest chunks for a keyword-style query and print the first
# 300 characters of each as a quick relevance check.
results = vectorstore.similarity_search(
    "Total current assets September 28 2024",
    k=5,
)
for res in results:
    preview = res.page_content[:300]
    print(preview)


24/03/25, 11:24 AMaapl-20240928
Page 52 of 97https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm
Apple Inc.
CONSOLIDATED BALANCE SHEETS
(In millions, except number of shares, which are reﬂected in thousands, and par value)
September 28,2024 September 30,2023
ASSETS:
C
September 28, 2024 and September 30, 2023 (in millions):
Lease-Related Assets and Liabilities Financial Statement Line Items 2024 2023
Right-of-use assets:
Operating leases Other non-current assets $ 10,234  $ 10,661 
Finance leases Property, plant and equipment, net 1,069  1,015 
Total right-of-use
(1) Includes general and administrative compensation costs, various nonrecurring charges, and other separately managed
costs.
The following tables show net sales for 2024, 2023 and 2022 and long-lived assets as of September  28, 2024 and
September 30, 2023 for countries that individually accounted f
24/03/25, 11:24 AMaapl-20240928
Page 66 of 97https://www.sec.gov/Archives/edgar/data/320193/00003

In [18]:
# Create retriever and get relevant chunks.
# search_type="similarity" returns the nearest chunks by embedding distance.
# NOTE(review): the metric is whatever the Chroma collection was created with. Chroma
# defaults to squared L2 unless "hnsw:space" is set to "cosine", and no such setting
# appears in this notebook — so this is probably not cosine distance; confirm.
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("What were the Total Current Assets for the September 28,2024")
relevant_chunks

[Document(metadata={'author': 'Mukund Shrivastava', 'creationdate': "D:20250324152430Z00'00'", 'creator': 'Safari', 'moddate': "D:20250324152430Z00'00'", 'page': 51, 'page_label': '52', 'producer': 'macOS Version 15.3.2 (Build 24D81) Quartz PDFContext', 'source': '/Users/mukundshrivas/Downloads/Python Projects GRAD/.vscode/data/aapl-20240928.pdf', 'title': 'aapl-20240928', 'total_pages': 97}, page_content='24/03/25, 11:24\u202fAMaapl-20240928\nPage 52 of 97https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm\nApple Inc.\nCONSOLIDATED BALANCE SHEETS\n(In millions, except number of shares, which are reﬂected in thousands, and par value)\nSeptember 28,2024 September 30,2023\nASSETS:\nCurrent assets:\nCash and cash equivalents $ 29,943\xa0 $ 29,965\xa0\nMarketable securities 35,228\xa0 31,590\xa0\nAccounts receivable, net 33,410\xa0 29,508\xa0\nVendor non-trade receivables 32,833\xa0 31,477\xa0\nInventories 7,286\xa0 6,331\xa0\nOther current assets 14,287\xa0

In [19]:
# Prompt template for the answer-generation step. It has two placeholders:
# {context} receives the retrieved chunk text and {question} the user's question.
# The instructions tell the model to say it does not know rather than invent an answer.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

In [20]:
# Join the text of the retrieved chunks, separating them with a horizontal rule
# so chunk boundaries stay visible inside the prompt.
context_text = "\n\n---\n\n".join(doc.page_content for doc in relevant_chunks)

# Fill the template's two placeholders and show the exact text sent to the model.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(
    question="What were the Total Current Assets for the September 28,2024",
    context=context_text,
)
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

24/03/25, 11:24 AMaapl-20240928
Page 52 of 97https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm
Apple Inc.
CONSOLIDATED BALANCE SHEETS
(In millions, except number of shares, which are reﬂected in thousands, and par value)
September 28,2024 September 30,2023
ASSETS:
Current assets:
Cash and cash equivalents $ 29,943  $ 29,965 
Marketable securities 35,228  31,590 
Accounts receivable, net 33,410  29,508 
Vendor non-trade receivables 32,833  31,477 
Inventories 7,286  6,331 
Other current assets 14,287  14,695 
Total current assets 152,987  143,566 
Non-current assets:
Marketable securities 91,479  100,544 
Property, plant and equipment, net 45,680  43,715 
Other non-current assets 74,834  64,758 
Total non-current assets 211,993  209,017 
Total asse

## Generate Responses

In [21]:
# Send the fully formatted prompt string to the chat model; the reply shown below
# is an AIMessage whose `content` holds the answer.
llm.invoke(prompt)

AIMessage(content='The Total Current Assets for September 28, 2024, were $152,987 million.', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 1875, 'total_tokens': 1896, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini', 'system_fingerprint': 'fp_b8bc95a0ac', 'finish_reason': 'stop', 'logprobs': None}, id='run-b2190a5e-a745-4c4d-a347-b3022bc98c1e-0')

## Using LangChain Expression Language

In [22]:
def format_docs(docs):
    """Merge the text of several documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in input order, separated by a blank line.
    """
    texts = [doc.page_content for doc in docs]
    return "\n\n".join(texts)

# LCEL pipeline: the input question is sent to the retriever, whose documents are
# flattened by format_docs into {context}, while RunnablePassthrough forwards the
# question itself unchanged into {question}; the filled prompt then goes to the LLM.
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
        )
rag_chain.invoke("What’s the company’s debt situation?")

AIMessage(content="As of September 28, 2024, the company's debt situation includes the following key points:\n\n1. **Total Debt**: The company has outstanding fixed-rate notes totaling $97.3 billion, with $10.9 billion of this amount payable within the next 12 months. The total term debt, after accounting for unamortized premiums and discounts, is $96.7 billion.\n\n2. **Interest Payments**: Future interest payments associated with these notes total $38.5 billion, with $2.6 billion payable within the next 12 months.\n\n3. **Commercial Paper**: The company has $10.0 billion of commercial paper outstanding, all of which is also payable within the next 12 months. The weighted-average interest rate for this commercial paper is 5.00%.\n\n4. **Lease Obligations**: The company has fixed lease payment obligations totaling $15.6 billion, with $2.0 billion payable within 12 months.\n\n5. **Cash Resources**: The company believes that its cash balances, totaling $140.8 billion, along with cash gene