### Loading docs with LangChain PyPDFLoader

In [14]:

# !pip install langchain-community pypdf
from langchain_community.document_loaders import PyPDFLoader

# Location of the PDF to load.
file_path = "data/May Apple 10q.pdf"

# Load the PDF; the notebook treats each returned document as one page.
loader = PyPDFLoader(file_path)
docs = loader.load()

# Summarise what was loaded: page count, a short preview of the first page,
# and the metadata attached to that page.
page_count = len(docs)
first_page = docs[0]
preview = first_page.page_content[:200]

print(f"The number of pages is {page_count}", end="\n\n")
print(f"The first page's first 200 characters is:\n{preview}\n")
print(f"The metadata is {first_page.metadata}")



The number of pages is 29

The first page's first 200 characters is:
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-Q
(Mark One)
☒  QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the quarterl

The metadata is {'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2025-05-02T06:05:09-04:00', 'title': '0000320193-25-000057', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-Q filed on 2025-05-02 for the period ending 2025-03-29', 'keywords': '0000320193-25-000057; ; 10-Q', 'moddate': '2025-05-02T06:07:30-04:00', 'source': 'data/May Apple 10q.pdf', 'total_pages': 29, 'page': 0, 'page_label': '1'}


### Splitting up docs into chunks

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Character-based splitting is used here; token-based splitting is also available.
# Consecutive chunks overlap so that context is not lost at chunk boundaries.
splitter_options = dict(chunk_size=1000, chunk_overlap=200, add_start_index=True)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

all_splits = text_splitter.split_documents(docs)

print("Performing character based splitting")
split_count = len(all_splits)
print(f"The number of total splits(chunks) is {split_count}")

Performing character based splitting
The number of total splits(chunks) is 108


### Making Embeddings

In [3]:
# Pull the embedding model from Ollama
#!ollama pull nomic-embed-text
 
# Install dependency
# !pip install -U langchain_ollama


In [4]:
from langchain_ollama import OllamaEmbeddings

# Embedding client used for both the document chunks and the user's questions.
EMBED_MODEL_NAME = "nomic-embed-text"
embed = OllamaEmbeddings(model=EMBED_MODEL_NAME)


In [None]:
# Embed the first two chunks as a quick sanity check of the embedding model.
vector_1, vector_2 = [
    embed.embed_query(all_splits[position].page_content) for position in (0, 1)
]

print("Preview what vector of first chunk looks like")
# Both vectors come from the same model, so they must have the same length.
assert len(vector_1) == len(vector_2)
vector_length = len(vector_1)
print(f"Generated vectors of length {vector_length}\n")
# Only show the first ten components to keep the output short.
print(vector_1[:10])

Preview what vector of first chunk looks like
Generated vectors of length 768

[0.007543127, 0.06163994, -0.20194702, -0.032155994, 0.03740007, -0.027106242, 0.055874683, 0.032962874, 0.043230098, 0.0026530274]


In [7]:
# Collect the raw text of every chunk, then embed all of them in one call.
all_pages = [chunk.page_content for chunk in all_splits]
all_vectors = embed.embed_documents(all_pages)


### Saving in a FAISS Vector Store

In [None]:
# Dependencies
# pip install -qU langchain-community
# pip install faiss-cpu


In [8]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS


# Facebook AI Similarity Search (FAISS) is a library for efficient similarity search
# and clustering of dense vectors. It contains algorithms that search in sets of
# vectors of any size, up to ones that possibly do not fit in RAM. It also includes
# supporting code for evaluation and parameter tuning.

# Determine the dimension of the embedding vectors dynamically by embedding a sample string.
embedding_dim = len(embed.embed_query("hello world"))
# Initialize a flat FAISS index that compares vectors by L2 (Euclidean) distance.
index = faiss.IndexFlatL2(embedding_dim)
# Construct an empty vector store around that index. `embed` is still needed
# here so that queries can be embedded at search time.
vector_store = FAISS(
    embedding_function=embed,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Guard against stale kernel state: the precomputed vectors must line up
# one-to-one with the current chunks.
assert len(all_vectors) == len(all_splits), (
    "all_vectors is out of sync with all_splits; re-run the embedding cell"
)

# Populate the store with the vectors already computed for every chunk.
# Using add_documents here would embed all chunks a second time.
ids = vector_store.add_embeddings(
    text_embeddings=zip(all_pages, all_vectors),
    metadatas=[chunk.metadata for chunk in all_splits],
)



In [None]:
# Fixed test question used to smoke-test retrieval.
question = "What does apple sell?"

print(f"\nSearching for documents relevant to: '{question}'")
# The vector store embeds the question and compares it to the stored chunk vectors.
results = vector_store.similarity_search(query=question)

# Inspect the results: print each retrieved chunk under a numbered header.
print("\nFound the following relevant chunks:")
for rank, hit in enumerate(results, start=1):
    print(f"\n--- Result {rank} ---")
    print(hit.page_content)




Searching for documents relevant to: 'What does apple sell?'

Found the following relevant chunks:

--- Result 1 ---
iPhone $ 46,841 $ 45,963 2 % $ 115,979 $ 115,665 — %
Mac 7,949 7,451 7 % 16,936 15,231 11 %
iPad 6,402 5,559 15 % 14,490 12,582 15 %
Wearables, Home and Accessories 7,522 7,913 (5)% 19,269 19,866 (3)%
Services 26,645 23,867 12 % 52,985 46,984 13 %
Total net sales $ 95,359 $ 90,753 5 % $ 219,659 $ 210,328 4 %
iPhone
iPhone net sales increased during the second quarter of 2025 compared to the second quarter of 2024 due primarily to higher net sales of Pro models. Year-over-year iPhone net sales were relatively flat during the first six months of 2025.
Mac
Mac net sales increased during the second quarter and first six months of 2025 compared to the same periods in 2024 due primarily to higher net sales of bothlaptops and desktops.
iPad
iPad net sales increased during the second quarter and first six months of 2025 compared to the same periods in 2024 due primarily to high

### Making The Augmented Prompt

In [11]:
# Ask the user for a question interactively.
my_prompt = input("Ask a question about the document: ")

print(f"\nSearching for documents relevant to: '{my_prompt}'")
# The vector store embeds the question and compares it to the stored chunk vectors.
results = vector_store.similarity_search(query=my_prompt)

# Assemble the retrieved chunks into one context string: a header line followed
# by each chunk under a numbered "Result" marker.
context_parts = ["\nFound the following relevant chunks from documents"]
for rank, hit in enumerate(results, start=1):
    context_parts.append(f"\n--- Result {rank} ---")
    context_parts.append(f"\n{hit.page_content}")  # newline keeps chunks readable
    # Chunk metadata is intentionally left out of the context.
retrieved_context = "".join(context_parts)

# Final prompt: the question first, then the retrieved context.
augmented_prompt = "".join(
    ["Answer this question: ", my_prompt, '\n Using this context \n', retrieved_context]
)


Searching for documents relevant to: 'How did iPhone sales perform compared to the same quarter last year? '


In [12]:
# Show the full prompt (question plus retrieved context) for inspection.
print(augmented_prompt)

Answer this question: How did iPhone sales perform compared to the same quarter last year? 
 Using this context 

Found the following relevant chunks from documents
--- Result 1 ---
iPhone $ 46,841 $ 45,963 2 % $ 115,979 $ 115,665 — %
Mac 7,949 7,451 7 % 16,936 15,231 11 %
iPad 6,402 5,559 15 % 14,490 12,582 15 %
Wearables, Home and Accessories 7,522 7,913 (5)% 19,269 19,866 (3)%
Services 26,645 23,867 12 % 52,985 46,984 13 %
Total net sales $ 95,359 $ 90,753 5 % $ 219,659 $ 210,328 4 %
iPhone
iPhone net sales increased during the second quarter of 2025 compared to the second quarter of 2024 due primarily to higher net sales of Pro models. Year-over-year iPhone net sales were relatively flat during the first six months of 2025.
Mac
Mac net sales increased during the second quarter and first six months of 2025 compared to the same periods in 2024 due primarily to higher net sales of bothlaptops and desktops.
iPad
iPad net sales increased during the second quarter and first six months of

### Generating Response

In [99]:
# pip install ollama

In [13]:
import ollama

# Build the conversation to send to the model.
# Each message has a 'role' ('system', 'user', or 'assistant') and its 'content'.
system_message = {'role': 'system', 'content': 'You are a helpful assistant.'}
user_message = {'role': 'user', 'content': augmented_prompt}
messages = [system_message, user_message]

# Send the request to the Llama 3.1 model.
# The model name must match what was pulled with 'ollama pull'.
response = ollama.chat(model='llama3.1', messages=messages)

# Extract the generated text from the response and print it.
answer = response['message']['content']
print(answer)




According to the provided information, iPhone sales increased by 2% during the second quarter of 2025 compared to the same quarter in 2024. This means that the company sold slightly more iPhones in 2025 than it did in 2024, but not significantly more. The year-over-year growth was relatively flat for the first six months of 2025, with a minimal increase of -0.35% ($115,979 compared to $115,665).
