### Dependancies


Create and activate virtual enviroment first!

```
python3 -m venv venv
source venv/bin/activate
```

Select Kernel in IDE

Pip installs:
```
!pip install jupyter
!pip install
!pip install -U langchain langchain-ollama langchain-community langchain-text-splitters faiss-cpu pypdf
```

Download and run ollama from internet. Import modesl:
```
!ollama pull llama3.1
!ollama pull nomic-embed-text
```

In [None]:
# a
# !python3 -m venv venv
# !source venv/bin/activate
# !pip install jupyter
# !pip install
# !pip install -U langchain langchain-ollama langchain-community langchain-text-splitters faiss-cpu pypdf
# !ollama pull llama3.1
# !ollama pull nomic-embed-text


### Load Document

In [None]:

from langchain_community.document_loaders import PyPDFLoader

file_path = "Apple10k.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()

# The number of pages
print(f"The number of pages is {len(docs)}")
print()
print(f"The first page's first 200 characters is:\n{docs[0].page_content[:200]}\n")
 
print(f"The metadata is {docs[0].metadata}")



The number of pages is 121

The first page's first 200 characters is:
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☒  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year

The metadata is {'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2024-11-01T06:05:37-04:00', 'title': '0000320193-24-000123', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-K filed on 2024-11-01 for the period ending 2024-09-28', 'keywords': '0000320193-24-000123; ; 10-K', 'moddate': '2024-11-01T06:06:09-04:00', 'source': 'Apple10k.pdf', 'total_pages': 121, 'page': 0, 'page_label': '1'}


### Split the document into chunks

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
#Character based splitting, token based splitting is also avaible
#Overlap ensures context is not lost
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)
print("Performing character based splitting")
print(f"The number of total splits(chunks) is {len(all_splits)}")

Performing character based splitting
The number of total splits(chunks) is 549


### Initialize Embedding Model

In [10]:
from langchain_ollama import OllamaEmbeddings

embed = OllamaEmbeddings(
    model="nomic-embed-text"
)


### Create the vector store (Embed Chunks)

In [11]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS


""" Facebook AI Similarity Search (FAISS) is a library for efficient similarity search and clustering of dense vectors.
It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. 
It also includes supporting code for evaluation and parameter tuning."""

#Determine dimnesions of embedding vectors dynamically by testing an example string
embedding_dim = len(embed.embed_query("hello world"))
#initlize a flat faiis store that uses L1
index = faiss.IndexFlatL2(embedding_dim)
#Constructing a vector store instance
vector_store = FAISS(
    embedding_function=embed,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
#Populating with our vectors
ids = vector_store.add_documents(documents=all_splits)



In [None]:
#test question
question = "How much does apple make?"

print(f"\nSearching for documents relevant to: '{question}'")
# Use the vector store to find similar documents
# It automatically embeds the user's question and compares it to the chunk vectors.
results = vector_store.similarity_search(query=question,k=5)


#  Inspect the Results ---

print("\nFound the following relevant chunks:")
for i, doc in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content)




Searching for documents relevant to: 'How much does apple make?'

Found the following relevant chunks:

--- Result 1 ---
Apple Inc.
CONSOLIDATED STATEMENTS OF OPERATIONS(In millions, except number of shares, which are reflected in thousands, and per-share amounts)
Years ended
September 28,2024 September 30,2023 September 24,2022
Net sales:
   Products $ 294,866 $ 298,085 $ 316,199 
   Services 96,169 85,200 78,129 
Total net sales 391,035 383,285 394,328 
Cost of sales:
   Products 185,233 189,282 201,471 
   Services 25,119 24,855 22,075 
Total cost of sales 210,352 214,137 223,546 
Gross margin 180,683 169,148 170,782 
Operating expenses:
Research and development 31,370 29,915 26,251 
Selling, general and administrative 26,097 24,932 25,094 
Total operating expenses 57,467 54,847 51,345 
Operating income 123,216 114,301 119,437 
Other income/(expense), net 269 (565) (334)
Income before provision for income taxes 123,485 113,736 119,103 
Provision for income taxes 29,749 16,741 19,30

### Setting Up LLM

In [21]:
from langchain_community.llms import Ollama

#Doing smaller model for dev speed
llm = Ollama(model="llama3.1")


  llm = Ollama(model="llama3.1")


### Setting Up System Prompt

In [None]:
from langchain_core.prompts import ChatPromptTemplate


system_prompt = """
        You are an AI that will answer questions about Apple's financial history.
        If an answer is unknown, simply state that and refrain from speculation.
        Cite relevant  sections, acts, or provisions in your response.

        Previous conversations:
        {history}

        Document context:
        {context}
    """

qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )


: 

In [22]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.messages import HumanMessage, AIMessage


def chainingFunction():
    retriever = vector_store.as_retriever()
    history = []



    while True:
        query = input("Question:")

        print("User:", query, "\n")
        if query.lower() in ["quit","exit","bye"]:
            print("Bot: Goodbye!")
            break

        history.append({"role": "user", "content": HumanMessage(content=query)})

        if query:
            relevant_docs = retriever.invoke(query)
            context_documents_str = "\n\n".join(doc.page_content for doc in relevant_docs)
        else:
            context_documents_str = ""

        qa_prompt_local  = qa_prompt.partial(
            history=history,
            context=context_documents_str
        )

        llm_chain = { "input": RunnablePassthrough() } | qa_prompt_local  | llm

        result = llm_chain.invoke(query)

        history.append({"role": "assistant", "content": AIMessage(content=result)})

        print("Bot:", result, "\n\n")



In [23]:
chainingFunction()


User: Hi 

Bot: Welcome to our conversation about Apple's financial history! We can discuss various aspects of their financial performance, including their revenue growth, product introductions, and geographic expansion.

What would you like to know about Apple's financial history? 


User: exit 

Bot: Goodbye!
