In [6]:
from pypdf import PdfReader
import os

# Locate the annual-report PDF relative to this notebook.
# NOTE: abspath() resolves "based.ipynb" against the current working directory,
# so this assumes the kernel was started in the notebook's own folder.
current_file_path = os.path.abspath("based.ipynb")

# Directory that contains the notebook file
parent_directory = os.path.dirname(current_file_path)

# One level above the notebook's directory; the PDF is looked up there.
# (Variable name kept unchanged for compatibility with any other cells.)
Child_DIR = os.path.dirname(parent_directory)

# Full path to the annual-report PDF. Despite its historical name, this
# variable does not point at a secrets file.
CLIENT_SECRET_FILE = os.path.join(Child_DIR, '2023_Annual_Report.pdf')
print(f"client file: {CLIENT_SECRET_FILE}")

reader = PdfReader(CLIENT_SECRET_FILE)

# Extract and strip the text of every page, keeping only non-empty results.
# Built in a single pass so `pdf_texts` is bound exactly once.
pdf_texts = [
    text
    for text in (page.extract_text().strip() for page in reader.pages)
    if text
]

# Fail with a clear message rather than an IndexError on the preview below
if not pdf_texts:
    raise ValueError(f"No extractable text found in {CLIENT_SECRET_FILE}")

print(pdf_texts[0])


client file: d:\Dell\Documents\code\langchain\RAG\chromadb\2023_Annual_Report.pdf
1 Dear shareholders, colleagues, customers, and partners,  
We are living through a time of historic challenge and opportunity. As I write this, the world faces ongoing economic, social , 
and geopolitical volatility. At the same time, we have entered a new age of AI that will fundamentally transform productivity  
for every individual, organization, and industry on earth, and help us address some of our most pressing challenges.  
This next generation of AI will reshape every software category and every business, including our own. Forty -eight years 
after its founding, Microsoft remains a consequential company because time and time again —from PC/Server, to 
Web/Internet, to Cloud/Mob ile—we have adapted to technological paradigm shifts. Today, we are doing so once again, as 
we lead this new era.  
Amid this transformation, our mission to empower every person and every organization on the planet to ac

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

# Separators are listed from coarsest (blank line) to finest (empty string).
split_separators = ["\n\n", "\n", ". ", " ", ""]
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=split_separators,
)

# Glue the page texts together with a blank line, then split the whole report.
joined_report_text = '\n\n'.join(pdf_texts)
character_split_texts = character_splitter.split_text(joined_report_text)

# Preview one chunk and report how many chunks were produced.
print(character_split_texts[10])
print(f"\nTotal chunks: {len(character_split_texts)}")

3 Industry  
Across industries, we are rapidly becoming the partner of choice for any organization looking to generate real value from 
AI. In healthcare, for example, we introduced the world’s first fully automated clinical documentation application, DAX 
Copilot. The a pplication helps physicians reduce documentation time by half, freeing them to spend more time face to face 
with patients. And Epic will integrate it directly into its electronic health records system.  
And, in retail, we introduced new tools to help companies manage their day -to-day operations and digitize their physical 
stores.  
Modern work  
We are rapidly evolving Microsoft 365 into an AI -first platform that enables every individual to amplify their creativity and 
productivity, with both our established applications like Office and Teams, as well as new apps like Designer, Stream, and 
Loop. M icrosoft 365 is designed for today’s digitally connected, distributed workforce.

Total chunks: 344


In [9]:
import chromadb.utils.embedding_functions as embedding_functions
# from langchain_openai import OpenAIEmbeddings
import chromadb
import os

# Load variables from a local .env file (if present) before reading the key,
# so this cell does not rely on a later cell having populated the environment.
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Embedding function that Chroma calls to embed documents and queries
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-3-small"
)

chroma_client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection with this name already exists (e.g. when the cell
# is executed a second time in the same kernel).
chroma_collection = chroma_client.get_or_create_collection(
    "2023_annual_reports", embedding_function=openai_ef
)

# Use each chunk's position in the list as its string id
ids = [str(i) for i in range(len(character_split_texts))]

# upsert (rather than add) so re-running overwrites entries with the same ids
chroma_collection.upsert(ids=ids, documents=character_split_texts)
print(chroma_collection.count())


344


In [11]:
# Question to run against the indexed report chunks
query = "What was the total revenue?"

# Fetch the two closest chunks; index 0 picks the documents returned for the
# first (and only) query text.
results = chroma_collection.query(n_results=2, query_texts=[query])
retrieved_documents = results['documents'][0]

# Print each retrieved chunk followed by blank lines as a visual separator
for document in retrieved_documents:
    print(document, end='\n\n\n')

Year Ended June  30, 2023  2022  2021  
        
United States (a) $ 106,744   $ 100,218   $ 83,953   
Other countries   105,171    98,052    84,135         
Total  $  211,915   $  198,270   $  168,088           
(a) Includes billings to OEMs and certain multinational organizations because of the nature of these businesses and the 
impracticability of determining the geographic source of the revenue.  
Revenue, classified by significant product and service offerings, was as follows:  
  
(In millions)          
        
Year Ended June  30, 2023  2022  2021  
        
Server products and cloud services  $ 79,970   $ 67,350   $ 52,589   
Office products and cloud services   48,728    44,862    39,872   
Windows   21,507    24,732    22,488   
Gaming   15,466    16,230    15,370   
LinkedIn   15,145    13,816    10,289   
Search and news advertising   12,208    11,591    9,267   
Enterprise Services   7,722    7,407    6,943   
Devices   5,521    7,306    7,143


and expenses are transla

In [13]:
import os
from langchain_openai import ChatOpenAI

from dotenv import load_dotenv, find_dotenv
# Read the local .env file into the process environment
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Chat model used for answer generation; the API key comes from the environment
openai_api_key = os.environ.get("OPENAI_API_KEY")
llm = ChatOpenAI(model="gpt-4o-mini", api_key=openai_api_key)

In [14]:
def rag(query, retrieved_documents, chat_model=None):
    """Answer ``query`` using only the supplied report excerpts.

    Args:
        query: The user's question as a string.
        retrieved_documents: Iterable of text chunks; they are joined with
            blank lines and passed to the model as the only information source.
        chat_model: Optional object exposing ``invoke(messages)`` whose result
            has a ``content`` attribute. When omitted, the notebook-level
            ``llm`` is used, which matches the original behaviour.

    Returns:
        The ``content`` attribute of the model's response.
    """
    model = llm if chat_model is None else chat_model
    information = "\n\n".join(retrieved_documents)

    # The two literals are concatenated implicitly; the trailing space on the
    # first one keeps the sentences from running together in the prompt.
    system_prompt = (
        "You are a helpful expert financial research assistant. Your users are asking questions about information contained in an annual report. "
        "You will be shown the user's question, and the relevant information from the annual report. Answer the user's question using only this information."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Question: {query}. \n Information: {information}"},
    ]

    response = model.invoke(messages)
    return response.content

In [15]:
# Generate an answer to the query from the retrieved chunks, then display it
output = rag(query, retrieved_documents)

print(output)

The total revenue for the year ended June 30, 2023, was $211,915 million.
