### LangTrace AI

In [1]:
from dotenv import load_dotenv, find_dotenv
import os

In [2]:
# Locate a .env file and load its variables into the process environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Langtrace API key; None when the variable is not set.
LANGTRACE_API_KEY = os.environ.get("LANGTRACE_API_KEY")

In [3]:
from langtrace_python_sdk import langtrace

# Start Langtrace instrumentation using the API key read from the environment.
# NOTE(review): the recorded output of this cell reports that the OpenAI
# instrumentation was skipped ("No module named 'openai.resources.responses'"),
# so OpenAI calls may not be traced — confirm the installed openai version.
langtrace.init(api_key = LANGTRACE_API_KEY)

[32mInitializing Langtrace SDK..[39m
[37m⭐ Leave our github a star to stay on top of our updates - https://github.com/Scale3-Labs/langtrace[39m
Skipping openai due to error while instrumenting: No module named 'openai.resources.responses'
[34mExporting spans to Langtrace cloud..[39m


  from .autonotebook import tqdm as notebook_tqdm


### Multi-Vector Retriever (MVR)

In [1]:
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
import pickle

In [2]:
# Load the pre-computed table and text summaries from disk.
# NOTE: pickle.load can execute arbitrary code while deserialising — only load
# artifacts that were produced by a trusted pipeline.
with open("../artifacts/summaries/table_summaries.pkl", "rb") as table_file:
    table_summaries = pickle.load(table_file)
with open("../artifacts/summaries/text_summaries.pkl", "rb") as text_file:
    text_summaries = pickle.load(text_file)

In [3]:
# Load the original (un-summarised) tables and texts from disk.
# NOTE(review): presumably index-aligned with the summary lists loaded from
# ../artifacts/summaries — confirm against the pipeline that wrote these files.
# NOTE: pickle.load can execute arbitrary code while deserialising — only load
# artifacts that were produced by a trusted pipeline.
with open("../artifacts/original/table_original.pkl", "rb") as table_file:
    table = pickle.load(table_file)
with open("../artifacts/original/text_original.pkl", "rb") as text_file:
    text = pickle.load(text_file)

In [4]:
def create_multi_vector_retriever(vectorstore, text_summaries, texts, table_summaries, tables):
    """Build a MultiVectorRetriever that indexes summaries and stores the originals.

    Every summary is added to ``vectorstore`` as a ``Document`` whose metadata
    carries a freshly generated UUID under the ``"fintech-rag"`` key; the
    original content at the same position is stored in an ``InMemoryStore``
    docstore under that same UUID.

    Args:
        vectorstore: Vector store that receives the summary documents
            (``add_documents`` is called on it).
        text_summaries: Summaries of the text chunks; skipped when empty or None.
        texts: Original text chunks, positionally aligned with ``text_summaries``.
        table_summaries: Summaries of the tables; skipped when empty or None.
        tables: Original tables, positionally aligned with ``table_summaries``.

    Returns:
        The populated ``MultiVectorRetriever``.

    Raises:
        ValueError: If a non-empty summary collection and its originals differ
            in length. All pairs are checked before anything is written, so a
            failure never leaves the stores partially populated.

    Note:
        The docstore is in-memory and receives new UUIDs on every call.
        NOTE(review): if ``vectorstore`` is persistent and already contains
        summaries from an earlier run, those older vectors presumably reference
        IDs that are absent from this docstore — confirm the collection is
        fresh before indexing.
    """
    id_key = "fintech-rag"

    # Validate every (summaries, originals) pair up front. Previously a length
    # mismatch surfaced either as a bare IndexError or, worse, silently stored
    # originals that no summary vector pointed to.
    batches = []
    for label, doc_summaries, doc_contents in (
        ("text", text_summaries, texts),
        ("table", table_summaries, tables),
    ):
        if not doc_summaries:
            continue

        summaries = list(doc_summaries)
        contents = list(doc_contents) if doc_contents is not None else []

        if len(summaries) != len(contents):
            raise ValueError(
                f"{label}: got {len(summaries)} summaries but {len(contents)} "
                "original documents; the two must be the same length"
            )

        batches.append((summaries, contents))

    retriever = MultiVectorRetriever(
        vectorstore = vectorstore,
        docstore = InMemoryStore(),
        id_key = id_key,
    )

    for summaries, contents in batches:
        # One shared ID links a summary vector to its original document.
        doc_ids = [str(uuid.uuid4()) for _ in contents]

        summary_docs = [
            Document(page_content = str(summary), metadata = {id_key: doc_id})
            for summary, doc_id in zip(summaries, doc_ids)
        ]

        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, contents)))

    return retriever


In [5]:
# Directory passed to Chroma as its on-disk location (relative to the notebook's working directory).
persist_directory = "../Database"

In [6]:
# Chroma collection that holds the summary embeddings.
# The variable keeps its original spelling because later cells refer to it.
embedding_model = OpenAIEmbeddings()

vectorestore = Chroma(
    embedding_function = embedding_model,
    collection_name = "rag-model",
    persist_directory = persist_directory,
)

  vectorestore = Chroma(


In [7]:
# Index the summaries in the vector store and keep the originals in the docstore.
retriever = create_multi_vector_retriever(
    vectorstore = vectorestore,
    text_summaries = text_summaries,
    texts = text,
    table_summaries = table_summaries,
    tables = table,
)

In [8]:
# Ask Chroma to write the collection to persist_directory.
# NOTE(review): the recorded cell output suggests this call triggers a warning;
# presumably newer Chroma versions persist automatically — confirm before removing.
vectorestore.persist()

  vectorestore.persist()


### LLM

In [10]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from operator import itemgetter
from langchain.schema.runnable import RunnablePassthrough
from langchain.memory import ConversationBufferMemory

In [11]:
# Conversation buffer from which the chain loads its "history" variable.
# NOTE(review): with return_messages = True the history is presumably a list of
# message objects rather than a plain string — confirm this renders as intended
# inside the string prompt template.
memory = ConversationBufferMemory(return_messages = True)

  memory = ConversationBufferMemory(return_messages = True)


In [13]:
# Instructions for the answering model. The {history}, {context} and {question}
# placeholders are filled in when the prompt is formatted.
PROMPT_TEMPLATE = """
    Conversation History:
    {history}

    Document Context:
    {context}

    Current Question:
    {question}

    Generate a response that thoughtfully integrates both the conversation history \
    and the provided document context, emphasizing finance-specific details when applicable. \
    The answer should be concise and clear, ranging between 10 and 200 words based on the \
    complexity of the question. Only include information directly supported by the given context.

    Important Instructions:
    - If the current question relates solely to previous inquiries or lacks new context, first verify the conversation history(especially the last question asked). If it is not connected to any prior question, reply with: "The pdf doesn't contain context regarding the question."
    - For finance-related inquiries, incorporate appropriate financial terminology and domain expertise.
    - Do not add any external details not present in the document context.
    - Highlight all critical numbers and percentages in **bold**.

    Policies:
    - NEVER infer relationships between financial concepts.
    - PRESERVE the original context's numerical precision.
    - Strictly adhere to the provided document context (mvr); avoid introducing external details.
    - Use clear, user-friendly language throughout the response.
    - Ensure all information is derived solely from the given context and conversation history.
    - Maintain accuracy and clarity without unnecessary elaboration.
    - Use bullet points where necessary.

    IF THE OUTPUT CANNOT BE GENERATED FROM THE CONTEXT, JUST REPLY WITH - "The pdf doesn't contain context regarding the question."
    """

prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

# Chat model that writes the answer; temperature 0 keeps sampling as stable as possible.
model = ChatOpenAI(model = "gpt-4o-mini", temperature = 0)

In [14]:
# Retrieval-augmented chain: build the prompt variables, format the prompt,
# call the model and reduce the reply to a plain string.
question_of = itemgetter("question")

chain_inputs = {
    # The question is used as the retrieval query.
    "context": question_of | retriever,
    "question": question_of,
    # The chain input is ignored here; history always comes from the shared memory.
    "history": lambda _: memory.load_memory_variables({})["history"],
}

chain = chain_inputs | prompt | model | StrOutputParser()

In [15]:
# Ask one question through the chain.
user_input = "What was the revenue in Q1 2023?"
output = chain.invoke({"question": user_input})

# Record the turn in memory. The chain only reads from `memory`; without this
# call the {history} placeholder stays empty on every subsequent question.
memory.save_context({"input": user_input}, {"output": output})

In [16]:
# Display the answer returned by the chain.
output

"The pdf doesn't contain context regarding the question."