# RAG Data Retrieval and Re-Ranking

## Overview
Intelligent document retrieval for Financial docs

### Steps
1. **Set-up** - Initialize ChromaDB vector store, embeddings and LLM <br>
2. **Metadata Extraction** - Extract company name, doc type, fiscal year/quarter from user query <br>
3. **Keyword Generation** - Generate 5 SEC filing-specific keywords for ranking <br>
4. **Filtered Search** - Retrieve documents using metadata filters and content keywords (MMR Search) <br>
5. **BM25 Re-Ranking** - Re-rank results using BM25Plus on heading + content chunks to improve relevance <br>

### 1. Imports

In [1]:
# 1. Imports
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_core.documents import Document
from pathlib import Path

from scripts.schemas import ChunkMetadata, RankingKeywords

from docling.document_converter import DocumentConverter
import hashlib


load_dotenv()

True

### 2. Configurations

In [2]:
DATA_DIR = "data"
CHROMA_DIR = "./chroma_financial_db"
COLLECTION_NAME = "financial_docs"
EMBEDDING_MODEL = "nomic-embed-text:latest"
BASE_URL = "http://localhost:11434"
LLM_MODEL = "qwen3:latest"

### 3. Embeddings and Vector Store

In [3]:
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL, base_url=BASE_URL)

vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=CHROMA_DIR
)

llm = ChatOllama(model=LLM_MODEL, base_url=BASE_URL)

### 4. Extract Filters and Ranking Keywords

In [4]:
def extract_filters(user_query: str):
    llm_structured = llm.with_structured_output(ChunkMetadata)

    prompt = f"""Extract metadata filters from the query. Return None for fields not mentioned.

                USER QUERY: {user_query}

                COMPANY MAPPINGS:
                - Amazon/AMZN -> amazon
                - Google/Alphabet/GOOGL/GOOG -> google
                - Apple/AAPL -> apple
                - Microsoft/MSFT -> microsoft
                - Tesla/TSLA -> tesla
                - Nvidia/NVDA -> nvidia
                - Meta/Facebook/FB -> meta

                DOC TYPE:
                - Annual report -> 10-k
                - Quarterly report -> 10-q
                - Current report -> 8-k

                EXAMPLES:
                "Amazon Q3 2024 revenue" -> {{"company_name": "amazon", "doc_type": "10-q", "fiscal_year": 2024, "fiscal_quarter": "q3"}}
                "Apple 2023 annual report" -> {{"company_name": "apple", "doc_type": "10-k", "fiscal_year": 2023}}
                "Tesla profitability" -> {{"company_name": "tesla"}}

                Extract metadata:
                """
    metadata = llm_structured.invoke(prompt)
    filters = metadata.model_dump(exclude_none=True)

    return filters

In [5]:
extract_filters("What is Google's annual Revenue in 2024?")

{'company_name': 'google', 'doc_type': '10-k', 'fiscal_year': 2024}

In [6]:
extract_filters("What is Google's annual Revenue in 2024?")

{'company_name': 'google', 'doc_type': '10-k', 'fiscal_year': 2024}

### 5. Generate Ranking Keywords

In [7]:
def generate_ranking_keywords(user_query: str):
    prompt = f"""Generate EXACTLY 5 financial keywords from SEC filings terminology.

                USER QUERY: {user_query}

                USE EXACT TERMS FROM 10-K/10-Q FILINGS:

                STATEMENT HEADINGS:
                "consolidated statements of operations", "consolidated balance sheets", "consolidated statements of cash flows", "consolidated statements of stockholders equity"

                INCOME STATEMENT:
                "revenue", "net revenue", "cost of revenue", "gross profit", "operating income", "net income", "earnings per share"

                BALANCE SHEET:
                "total assets", "cash and cash equivalents", "total liabilities", "stockholders equity", "working capital", "long-term debt"

                CASH FLOWS:
                "cash flows from operating activities", "net cash provided by operating activities", "cash flows from investing activities", "free cash flow", "capital expenditures"

                RULES:
                - Return EXACTLY 5 keywords
                - Use exact phrases from SEC filings
                - Match query topic (revenue -> revenue terms, cash -> cash flow terms)
                - Use "cash flows" (plural), "stockholders equity"

                EXAMPLES:
                "revenue analysis" -> ["revenue", "net revenue", "total revenue", "consolidated statements of operations", "net sales"]
                "cash flow performance" -> ["consolidated statements of cash flows", "cash flows from operating activities", "net cash provided by operating activities", "free cash flow", "operating activities"]
                "balance sheet strength" -> ["consolidated balance sheets", "total assets", "stockholders equity", "cash and cash equivalents", "long-term debt"]

                Generate EXACTLY 5 keywords:
                """
    llm_structured = llm.with_structured_output(RankingKeywords)
    result = llm_structured.invoke(prompt)

    return result.keywords

In [8]:
generate_ranking_keywords("What is Google's annual Revenue in 2024?")

['consolidated statements of operations',
 'revenue',
 'net revenue',
 'gross profit',
 'operating income']

### 6. Search the Doc from vectorDB

In [9]:
def build_search_kwargs(filters, ranking_keywords, k=3):
    search_kwargs = {"k": k, "fetch_k": k*20}

    if filters:
        if len(filters) == 1:
            search_kwargs['filter'] = filters
        else:
            filters_condition = [{k:v} for k, v in filters.items()]
            search_kwargs['filter'] = {"$and": filters_condition}
    
    # Add document content filters using ranking keywords
    if ranking_keywords:
        if len(ranking_keywords) == 1:
            search_kwargs['where_documnets'] = {"$contains": ranking_keywords[0]}
        else:
            search_kwargs['where_documents'] = {
                "$or": [
                    {"$contains": keyword} for keyword in ranking_keywords
                ]
            }
    return search_kwargs

In [10]:
def search_docs(query, filters={}, ranking_keywords=[], k=3):
    """
        Search documents with metadata and content filters.

        Args:
            query (str): Search query text
            filters (dict): Metadata filters (e.g., {"compnay_name": "amazon", "fiscal_year": 2023})
            ranking_keywords (list): Keywords for content filtering (document must contain at least one)
            k (int): Number of results (default = 5)
        
        Returns:
            list: Matching documents objects
        
        Example:
            docs = search_docs(
                query="Analyze cash flow"
                filters={"company_name": "amazon", "doc_type": "10-k"}
                ranking_keywords=["cash flow", "liquidity"]
                k=10
            )
    """

    search_kwargs = build_search_kwargs(filters, ranking_keywords)

    retriever = vector_store.as_retriever(
        search_type = "mmr",
        search_kwargs = search_kwargs
    )

    return retriever.invoke(query)

In [11]:
query = "Show me amazon's cashflow in 2023"
search_docs(query)

[Document(id='f363a4fd-6fa9-4f7d-83c1-b3bd1bb09435', metadata={'company_name': 'amazon', 'doc_type': '10-q', 'page': 23, 'fiscal_year': 2024, 'source_file': 'amazon 10-q q2 2024.pdf', 'file_hash': '552615e47708aa125f69140f69d6bbd4c45f99981e4ed384f2d727d0665da8a8', 'fiscal_quarter': 'q2'}, page_content="\n\nHealthcare, Inc. (One Medical) in 2023 with cash on hand. In Q3 2023, we invested $1.25 billion in a convertible note from Anthropic. In Q1 2024, we invested $2.75 billion in a second convertible note.\n\nCash provided by (used in) financing activities was $(6.5) billion and $(4.5) billion for Q2 2023 and Q2 2024, and $(185) million and $(5.7) billion for the six months ended June 30, 2023 and 2024. Cash inflows from financing activities resulted from proceeds from short-term debt, and other and long-term debt of $4.4 billion and $525 million for Q2 2023 and Q2 2024, and $17.2 billion and $863 million for the six months ended June 30, 2023 and 2024. Cash outflows from financing activ

In [12]:
query = "Show me amazon's cashflow in 2023"
filters = extract_filters(query)
ranking_keywords = generate_ranking_keywords(query)

results = search_docs(query, filters, ranking_keywords, k=20)

In [13]:
results

[Document(id='824a1603-ee65-4984-96f4-f3978bd01757', metadata={'company_name': 'amazon', 'fiscal_year': 2023, 'source_file': 'amazon 10-k 2023.pdf', 'file_hash': '6e5549c7b20b0fbc5f482397070a1e85cbf8643c801ff570903f52366b11154f', 'doc_type': '10-k', 'page': 25}, page_content='\n\nAWS sales increased 13% in 2023, compared to the prior year. The sales growth primarily reflects increased customer usage, partially offset by pricing changes, primarily driven by long-term customer contracts.\n\n## Operating Income (Loss)\n\nOperating income (loss) by segment is as follows (in millions):\n\n|                         | Year Ended December 31,   | Year Ended December 31,   |\n|-------------------------|---------------------------|---------------------------|\n|                         | 2022                      | 2023                      |\n| Operating Income (Loss) |                           |                           |\n| North America           | $ (2,847)                 | $ 14,877     

In [14]:
ranking_keywords

['consolidated statements of cash flows',
 'cash flows from operating activities',
 'net cash provided by operating activities',
 'free cash flow',
 'capital expenditures']

### 7. Extract Documents Headings and Sub Headings for Re-Ranking

In [15]:
import re

def extract_heading_with_content(text):
    """
    Extract markdown headings with one paragraph of content after them.

    Args:
        text: Document text content
    
    Returns:
        List of extracted heading + content chunks
    """
    chunks = []

    sections = text.split('\n\n')
    i = 0
    while i < len(sections):
        section = sections[i].strip()
        pattern = r"^#+\s+"

        if re.match(pattern, section):
            heading = section

            if i + 1 < len(sections):
                next_content = sections[i+1].strip()
                chunk = f"{heading}\n\n{next_content}"
                i = i + 2

            else:
                chunk = heading
                i = i + 1
            
            chunks.append(chunk)
        
        else:
            i = i + 1
    
    return chunks

In [16]:
heading_content = extract_heading_with_content(results[2].page_content)

In [17]:
heading_content

["## Available Information\n\nOur investor relations website is amazon.com/ir and we encourage investors to use it as a way of easily finding information about us. We promptly make available on this website, free of charge, the reports that we file or furnish with the Securities and Exchange Commission ('SEC'), corporate governance information (including our Code of Business Conduct and Ethics), and select press releases.",
 '## Executive Officers and Directors\n\nThe following tables set forth certain information regarding our Executive Officers and Directors as of January 24, 2024:',
 '## Information About Our Executive Officers\n\n| Name                  |   Age | Position                                                               |\n|-----------------------|-------|------------------------------------------------------------------------|\n| Jeffrey P. Bezos      |    60 | Executive Chair                                                        |\n| Andrew R. Jassy       |    56 | 

### Rank Documents using BM25Plus

In [18]:
from rank_bm25 import BM25Plus
def rank_documents_by_keywords(docs, keywords, k=5):
    """
    Rank documents using BM25Plus on heading+content chunks.
    
    Args:
        docs: List of Document objects to rank
        keywords: List of keywords to rank by
        k: Number of top documents to return
    
    Returns:
        List of top-k Document objects sorted by BM25 score
    """

    if not docs or not keywords:
        print("Either no doc or keyword found!")
        return docs
    
    query_tokens = " ".join(keywords).lower().split(" ")

    doc_chunks = []
    for doc in docs:
        chunks = extract_heading_with_content(doc.page_content)
        combined = " ".join(chunks) if chunks else doc.page_content

        doc_chunks.append(combined.lower().split(' '))
    
    # Rank Using BM25plus
    bm25 = BM25Plus(doc_chunks)
    doc_scores = bm25.get_scores(query_tokens)

    ranked_indices = sorted(range(len(docs)), key=lambda i: doc_scores[i], reverse=True)

    for rank, idx in enumerate(ranked_indices[:k], 1):
        print(f"  [{rank}] Doc {idx}: Score={doc_scores[idx]:.4f}")

    return [docs[i] for i in ranked_indices[:k]]

In [19]:
reranked_results = rank_documents_by_keywords(results, ranking_keywords)

  [1] Doc 1: Score=40.9868
  [2] Doc 0: Score=24.5054
  [3] Doc 2: Score=19.0782
