## RAG with Business Document Analysis

In [None]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably the OpenAI credentials needed by later cells -- confirm).
load_dotenv()

In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader

# Location of the report, relative to the notebook's working directory.
# Forward slashes are used on purpose: they work on both Windows and POSIX
# and avoid backslash escapes (inside a normal string literal "\b" is read
# as a backspace character, which silently corrupts the path).
file_path = "./data/businessReport.pdf"

# Parse the PDF into LangChain documents for the splitting step.
loader = UnstructuredPDFLoader(file_path)
docs = loader.load()

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- 1. Chunking ---------------------------------------------------------
# Sizes are measured in characters; add_start_index asks the splitter to
# track each chunk's index in the original document.
splitter_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
all_splits = text_splitter.split_documents(docs)

# --- 2. Embeddings + in-memory vector store ------------------------------
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = InMemoryVectorStore(embeddings)

# --- 3. Indexing ---------------------------------------------------------
# Store every chunk in the vector store and keep the returned ids.
document_ids = vector_store.add_documents(documents=all_splits)

In [None]:
# Show the chunks as the cell output, for manual inspection.
all_splits

In [None]:
# The user query that drives both retrieval and the final LLM prompt.
question = (
    "What are the key business strategies and financial performance "
    "indicators mentioned in this report?"
)

In [None]:
# Retrieve the 10 chunks most similar to the question; each hit is a
# (document, score) pair.
search_results = vector_store.similarity_search_with_score(question, k=10)

# Combine the text of the retrieved chunks into one long string, with a
# blank line between chunks. The scores are not used here.
chunk_texts = [hit[0].page_content for hit in search_results]
doc_content = "\n\n".join(chunk_texts)

In [None]:
# Prompt skeleton with two placeholders, {question} and {context}, filled in
# via str.format. Written as explicit pieces so the line breaks and the
# trailing spaces after each placeholder are visible.
prompt_template = (
    "You are a business analyst consultant. "
    "Use the following document excerpts to provide strategic insights and analysis. "
    "Focus on key business metrics, market trends, and strategic recommendations. "
    "Keep your analysis under 500 words.\n"
    "Business Question: {question} \n"
    "Document Context: {context} \n"
    "Strategic Analysis:"
)

from langchain.chat_models import init_chat_model

# Chat model used to produce the analysis.
llm = init_chat_model("o1-mini", model_provider="openai")

# Fill the template with the question and the retrieved context, then send
# the finished prompt to the model.
filled_prompt = prompt_template.format(question=question, context=doc_content)
response = llm.invoke(filled_prompt)

# Plain-text view of the model's answer.
print(response.content)

In [None]:
from IPython.display import Markdown
# Wrap the model's answer in a Markdown display object; as the last
# expression of the cell it becomes the cell's rich output.
Markdown(response.content)