In [1]:
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

In [2]:
import pickle

In [3]:
# Load the pickled objects stored under ../artifacts/summaries (one file for
# tables, one for text). Paths are relative to the notebook's working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files if they come from a trusted source.
with open("../artifacts/summaries/table_summaries.pkl", "rb") as table_file:
    table_summaries = pickle.load(table_file)
with open("../artifacts/summaries/text_summaries.pkl", "rb") as text_file:
    text_summaries = pickle.load(text_file)

In [4]:
# Load the pickled objects stored under ../artifacts/original (one file for
# tables, one for text).
# NOTE(review): presumably these are the un-summarised counterparts of the
# summaries and are positionally aligned with them — confirm against the
# notebook/script that produced the artifacts.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files if they come from a trusted source.
with open("../artifacts/original/table_original.pkl", "rb") as table_file:
    table = pickle.load(table_file)
with open("../artifacts/original/text_original.pkl", "rb") as text_file:
    text = pickle.load(text_file)

In [5]:
def create_multi_vector_retriever(vectorstore, text_summaries, texts, table_summaries, tables):
    """Build a MultiVectorRetriever populated with summaries and their originals.

    For every (summary, original) pair a fresh UUID is generated. The summary is
    added to ``vectorstore`` as a ``Document`` carrying that UUID in its metadata
    under the retriever's ``id_key``; the original is written to an
    ``InMemoryStore`` docstore under the same UUID.

    Args:
        vectorstore: Vector store that receives the summary documents.
        text_summaries: Summaries of the text elements. A falsy value (e.g. an
            empty list) skips indexing of texts entirely.
        texts: Original text elements, positionally aligned with
            ``text_summaries``.
        table_summaries: Summaries of the table elements. A falsy value skips
            indexing of tables entirely.
        tables: Original table elements, positionally aligned with
            ``table_summaries``.

    Returns:
        The ``MultiVectorRetriever`` wired to ``vectorstore`` and the in-memory
        docstore.

    Raises:
        ValueError: If a summary collection and its originals differ in length.
    """
    store = InMemoryStore()
    id_key = "fintech-rag"

    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    def add_documents(retriever, doc_summaries, doc_contents):
        """Add summaries to the vector store and originals to the docstore."""
        # Materialise both inputs so they can be measured and iterated twice.
        summaries = list(doc_summaries)
        contents = list(doc_contents)

        # Summaries and originals are paired by position. Check before writing
        # anything so a mismatch neither fails with an opaque IndexError nor
        # leaves originals in the docstore that no summary points to.
        if len(summaries) != len(contents):
            raise ValueError(
                f"Got {len(summaries)} summaries but {len(contents)} original "
                "documents; the two collections must have the same length."
            )

        doc_ids = [str(uuid.uuid4()) for _ in contents]

        summary_docs = [
            Document(page_content=str(summary), metadata={id_key: doc_id})
            for doc_id, summary in zip(doc_ids, summaries)
        ]

        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, contents)))

    if text_summaries:
        add_documents(retriever, text_summaries, texts)

    if table_summaries:
        add_documents(retriever, table_summaries, tables)

    return retriever


In [6]:
# Create the Chroma vector store (collection "rag-model") that will hold the
# summary embeddings, using OpenAIEmbeddings as the embedding function.
# NOTE(review): OpenAIEmbeddings presumably needs OpenAI credentials available
# in the environment — confirm before running on a fresh kernel.
# NOTE(review): the variable is spelled "vectorestore" (extra "e"); the cell
# that builds the retriever refers to it by this exact spelling, so any rename
# must change both places together.
vectorestore = Chroma(
    collection_name = "rag-model",
    embedding_function = OpenAIEmbeddings()
)

  vectorestore = Chroma(


In [7]:
# Populate the retriever: summaries are indexed in the vector store, the
# originals are kept in the docstore. Keywords follow the signature order.
retriever = create_multi_vector_retriever(
    vectorstore=vectorestore,
    text_summaries=text_summaries,
    texts=text,
    table_summaries=table_summaries,
    tables=table,
)

In [8]:
# Display the retriever's repr to check how it was configured.
retriever

MultiVectorRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x00000294B5B614D0>, docstore=<langchain_core.stores.InMemoryStore object at 0x00000294B6DD9C90>, id_key='fintech-rag', search_kwargs={})

In [9]:
# Sanity-check retrieval on its own, before any LLM is involved: pass a sample
# question straight to the retriever and keep whatever it returns.
query = "what is total current assets as of dec 31 2022 for alphabet inc?"
docs = retriever.invoke(query)

In [10]:
# Inspect the retrieved results for the sample question.
docs

['<table><thead><tr><th></th><th colspan="2">As of December 31,</th></tr><tr><th></th><th>2022</th><th>2023 (unaudited)</th></tr></thead><tbody><tr><td colspan="3">Assets</td></tr><tr><td colspan="3">Current assets:</td></tr><tr><td>Cash and cash equivalents</td><td>21,879 $</td><td>24,048</td></tr><tr><td>Marketable securities</td><td>91,883</td><td>86,868</td></tr><tr><td>Total cash, cash equivalents, and marketable securities</td><td>113,762</td><td>110,916</td></tr><tr><td>Accounts receivable, net</td><td>40,258</td><td>47,964</td></tr><tr><td>Other current assets</td><td>10,775</td><td>12,650</td></tr><tr><td>Total current assets</td><td>164,795</td><td>171,530</td></tr><tr><td>Non-marketable securities</td><td>30,492</td><td>31,008</td></tr><tr><td>Deferred income taxes</td><td>5,261</td><td>12,169</td></tr><tr><td>Property and equipment, net</td><td>112,668</td><td>134,345</td></tr><tr><td>Operating lease assets</td><td>14,381</td><td>14,091</td></tr><tr><td>Goodwill</td><td>28,

In [11]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [15]:
from operator import itemgetter
from langchain.schema.runnable import RunnablePassthrough

# Prompt: restrict the model to the retrieved context (text and/or tables).
template = (
    "Answer the question based only on the following context, which can include text and tables:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Chat model; temperature 0 is requested for the answers.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# RAG pipeline: the incoming question is sent to the retriever to fill
# {context} and passed through unchanged to fill {question}; the model's reply
# is then parsed down to a plain string.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [16]:
# Run the full RAG chain on the first question; the chain ends in
# StrOutputParser, so the response is printed as plain text.
response = chain.invoke(query)
print(response)

Total current assets as of December 31, 2022 for Alphabet Inc. is $164,795.


In [17]:
# Second question for the RAG chain. Note that `response` is overwritten by
# each of these question cells.
query2 = "What non-GAAP financial measures does the company use?"

response = chain.invoke(query2)
print(response)

The company uses the following non-GAAP financial measures: free cash flow, constant currency revenues, and percentage change in constant currency revenues.


In [21]:
# Third question for the RAG chain.
# NOTE(review): the question does not say which quarter of 2022 is meant, so
# the printed answer should be checked against the source table.
query3 = "What is the net income in the quarter ended in 2022 in cash flow?"

response = chain.invoke(query3)
print(response)

The net income in the quarter ended December 31, 2022, is $13,624.
