In [7]:
!pip install pdfplumber
!pip install camelot-py[cv]
!pip install --upgrade pymupdf
!pip install google-generativeai
!pip install faiss-cpu
!pip install transformers tqdm pandas pytesseract pillow easyocr langchain langchain-community langchain_openai faiss-cpu rank_bm25 pdf2image
!pip install sentence_transformers





Collecting sentence_transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn (from sentence_transformers)
  Downloading scikit_learn-1.7.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence_transformers)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence_transformers)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-5.1.1-py3-none-any.whl (486 kB)
Downloading scikit_learn-1.7.2-cp311-cp311-win_amd64.whl (8.9 MB)
   ---------------------------------------- 0.0/8.9 MB ? eta -:--:--
   ---------------------------------------- 8.9/8.9 MB 61.7 MB/s eta 0:00:00
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn, sentence_transformers
Successfu

In [3]:
import re
import os, glob
import pdfplumber
import camelot
import pymupdf
import numpy as np
import pandas as pd
from pathlib import Path
import google.generativeai as genai
import time
import faiss, json
import collections


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
  from .autonotebook import tqdm as notebook_tqdm


# Agent CFO — Performance Optimization & Design

---
This is the starter notebook for your project. Follow the required structure below.


You will design and optimize an Agent CFO assistant for a listed company. The assistant should answer finance/operations questions using RAG (Retrieval-Augmented Generation) + agentic reasoning, with response time (latency) as the primary metric.

Your system must:
*   Ingest the company’s public filings.
*   Retrieve relevant passages efficiently.
*   Compute ratios/trends via tool calls (calculator, table parsing).
*   Produce answers with valid citations to the correct page/table.


## 1. Config & Secrets

Fill in your API keys in secrets. **Do not hardcode keys** in cells.

In [1]:
# Example:
# os.environ['GEMINI_API_KEY'] = 'your-key-here'
# os.environ['OPENAI_API_KEY'] = 'your-key-here'

COMPANY_NAME = "Google"


## 2. Data Download (Dropbox)

*   Annual Reports: last 3–5 years.
*   Quarterly Results Packs & MD&A (Management Discussion & Analysis).
*   Investor Presentations and Press Releases.
*   These files must be submitted later as a deliverable in the Dropbox data pack.
*   Upload them under `/content/data/`.

Scope limit: each team will ingest minimally 15 PDF files total.


In [4]:
DATA_DIR = "00-data"

# Annual reports (10-Ks)
annual_files = glob.glob(f"{DATA_DIR}/annuals/*.pdf")

# # Quarterly reports (10-Qs)
quarterly_files = glob.glob(f"{DATA_DIR}/quarterlies/*.pdf")

# # Press releases
# press_files = glob.glob(f"{DATA_DIR}/press_releases/*.pdf")

# Presentations
presentation_files = glob.glob(f"{DATA_DIR}/presentations/*.pdf")

# Supplements
supplement_files = glob.glob(f"{DATA_DIR}/supplements/*.pdf")

# # Transcripts
# transcript_files = glob.glob(f"{DATA_DIR}/transcripts/*.pdf")

In [5]:
# for folder in ["annuals", "quarterlies", "press_releases", "presentations", "supplements", "transcripts"]:
for folder in ["annuals", "quarterlies", "presentations", "supplements"]:

    files = glob.glob(f"{DATA_DIR}/{folder}/*.pdf")
    print(f"{folder}: {len(files)} files")

annuals: 2 files
quarterlies: 8 files
presentations: 2 files
supplements: 0 files


## 3. System Requirements

**Retrieval & RAG**
*   Use a vector index (e.g., FAISS, LlamaIndex) + a keyword filter (BM25/ElasticSearch).
*   Citations must include: report name, year, page number, section/table.

**Agentic Reasoning**
*   Support at least 3 tool types: calculator, table extraction, multi-document compare.
*   Reasoning must follow a plan-then-act pattern (not a single unstructured call).

**Instrumentation**
*   Log timings for: T_ingest, T_retrieve, T_rerank, T_reason, T_generate, T_total.
*   Log: tokens used, cache hits, tools invoked.
*   Record p50/p95 latencies.

### Embeddings

In [8]:
from sentence_transformers import SentenceTransformer, util

# load E5-base-v2
model = SentenceTransformer("intfloat/e5-base-v2")

def embed_text_query(s):
    # E5 expects prefix, and stripping/normalizing helps
    return model.encode(f"query: {s.strip().lower()}", normalize_embeddings=True)

def embed_text_passage(s):
    # E5 expects prefix, and stripping/normalizing helps
    return model.encode([f"passage: {chunk_text.strip().lower()}" for chunk_text in s],
                        convert_to_numpy=True,
                        normalize_embeddings=True,
                        show_progress_bar=True)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


### Ingestion pipeline (Table)

In [9]:
# TODO: Implement ingestion pipeline (TABLE)
def extract_tables_from_page(pdf_path, page_num):
    """
    Extract tables from a PDF page by detecting colored header fills.
    """

    # Load the document and page
    doc = pymupdf.open(pdf_path)
    page = doc[page_num - 1]
    page_height = page.rect.height

    def is_nonwhite(rgb, thr=0.05):
        r, g, b = rgb
        return abs(1-r) + abs(1-g) + abs(1-b) > thr

    # --- 1. Get all fills ---
    fills = [
        (d["rect"], d["fill"]) for d in page.get_drawings()
        if d["type"] == "f" and d.get("fill")
    ]

    # --- 2. Keep only colored fills (blue/gray) ---
    colored = [f for f in fills if is_nonwhite(f[1]) and f[0].x1 - f[0].x0 > 100]
    colored.sort(key=lambda f: f[0].y0)

    # --- 3. Group colored fills into tables ---
    tables = []
    if colored:
        cur = [colored[0]]
        for f in colored[1:]:
            if abs(f[0].y0 - cur[-1][0].y1) < 25:  # stacked fills = same table
                cur.append(f)
            else:
                tables.append(cur)
                cur = [f]
        tables.append(cur)

    # --- 4. Process each detected table ---
    extracted_tables = []

    for idx, tgroup in enumerate(tables, 1):
        first_color = tgroup[0]
        y_bottom = max(f[0].y1 for f in tgroup) + 10
        y_top = first_color[0].y0

        # Find header region above this table
        header_y0 = y_top - 40
        x_left = min(f[0].x0 for f in tgroup)
        x_right = max(f[0].x1 for f in tgroup) + 100

        clip = pymupdf.Rect(x_left, header_y0 - 5, x_right, y_bottom + 5)

        try:
            # Convert clip to Camelot coords
            y1_cam = page_height - clip.y1
            y2_cam = page_height - clip.y0
            table_area = f"{clip.x0},{y1_cam},{clip.x1},{y2_cam}"

            tables_camelot = camelot.read_pdf(
                pdf_path,
                flavor="stream",
                table_areas=[table_area],
                pages=str(page_num)
            )

            if tables_camelot:
                # print(f"Camelot Table {idx} extracted")
                df = tables_camelot[0].df
                extracted_tables.append({
                    'source': pdf_path,
                    'table_num': idx,
                    'page': page_num,
                    'clip': clip,
                    'dataframe': df
                })
            else:
                print(f"Table {idx}: No table found")
        except Exception as e:
            print(f"Table {idx} failed: {e}")

    doc.close()
    return extracted_tables

### Ingestion pipeline (Text)

In [10]:
# --- Helpers ---
def clean_table(table):
    """Clean raw Camelot table output."""
    print ("Raw table:", table)
    return [
        [(cell or "").strip().replace("\n", " ") for cell in row]
        for row in table
    ]

def _normalize(s: str) -> str:
    s = (s or "").lower()
    # unify whitespace & quotes
    s = s.replace("\n", " ").replace("’", "'").replace("–", "-").replace("—", "-")
    s = " ".join(s.split())
    return s


def is_valid_table(table, numeric_threshold: float = 0.25) -> bool:
    """Return True if the table has enough numeric-looking cells to be considered real data."""
    if not table or not table[0]:
        return False

    cells = sum(len(r) for r in table)
    numeric_cells = 0
    num_pattern = re.compile(r"^\(?[+-]?\d[\d,\.]*\)?$")  # matches 5,439 or (1,200) etc.

    for row in table:
        for cell in row:
            cell = str(cell).strip().replace("$", "").replace("%", "")
            if num_pattern.match(cell):
                numeric_cells += 1

    return (numeric_cells / cells) >= numeric_threshold

In [11]:
# @title
# TODO: Implement ingestion pipeline (TEXT)
SECTION_EXAMPLES = {
    # --- Cover / Administrative ---
    "cover_page": [
        "united states securities and exchange commission form 10 k annual report pursuant to section 13 or 15d",
        "united states securities and exchange commission form 10 q quarterly report pursuant to section 13 or 15d",
        "cover page showing registrant name commission file number and state of incorporation",
        "front page identifying registrant address telephone number and fiscal year end",
    ],

    # --- Management Discussion ---
    "mdna": [
        "managements discussion and analysis of financial condition and results of operations",
        "md&a explaining liquidity capital resources and operating performance",
        "discussion and analysis of results of operations comparing current and prior periods",
        "analysis of changes in revenues costs cash flows and capital expenditures",
    ],

    # --- Risk Factors ---
    "risk_factors": [
        "risk factors that may affect future financial performance or share price",
        "discussion of material risks and uncertainties facing the company",
        "factors that could cause actual results to differ materially from forward looking statements",
    ],

    # --- Financial Highlights / Summary Data ---
    "summary_financial_data": [
        "selected financial data summarizing key performance indicators for the past five years",
        "summary of consolidated financial information and operating results",
        "selected financial highlights including revenue net income and earnings per share",
    ],

    # --- Income Statement ---
    "income_statement": [
        "consolidated statements of income showing revenue expenses and net income",
        "statement of operations or profit and loss reporting revenues and operating income",
        "consolidated statements of comprehensive income including other comprehensive income items",
        "income statement presenting total revenues cost of goods sold gross profit and net earnings",
    ],

    # --- Balance Sheet ---
    "balance_sheet": [
        "consolidated balance sheets showing assets liabilities and shareholders equity",
        "statement of financial position listing current assets long term liabilities and total equity",
        "balance sheet detailing cash accounts receivable inventories property plant and equipment",
    ],

    # --- Cash Flow Statement ---
    "cash_flow": [
        "consolidated statements of cash flows showing cash inflows and outflows from operating investing and financing activities",
        "statement of cash flows reconciling net income to net cash provided by operating activities",
        "cash flow statement detailing capital expenditures debt repayment and dividend payments",
    ],

    # --- Shareholders’ Equity ---
    "equity": [
        "consolidated statements of shareholders equity showing changes in retained earnings dividends and stock issuance",
        "statement of changes in stockholders equity presenting share repurchases and comprehensive income",
        "equity statement showing common stock treasury stock retained earnings and accumulated other comprehensive income",
    ],

    # --- Notes to Financial Statements ---
    "financial_statements": [
        "notes to consolidated financial statements providing accounting policies commitments contingencies and segment information",
        "footnotes accompanying consolidated financial statements describing significant accounting policies",
        "notes to financial statements detailing income taxes stock compensation and earnings per share",
        "supplementary information supporting consolidated financial statements",
    ],

    # --- Market Risk Disclosures ---
    "market_risk_disclosures": [
        "quantitative and qualitative disclosures about market risk",
        "discussion of exposure to interest rate foreign currency commodity and credit risk",
        "sensitivity analysis of market risk instruments",
    ],

    # --- Controls and Procedures ---
    "controls_procedures": [
        "controls and procedures section discussing disclosure controls and internal control over financial reporting",
        "evaluation of disclosure controls and procedures and changes in internal control",
        "managements report on internal control over financial reporting",
    ],

    # --- Legal Proceedings ---
    "legal_proceedings": [
        "description of material pending legal proceedings and litigation",
        "legal proceedings section detailing lawsuits claims and regulatory actions",
        "information about legal matters affecting the company",
    ],

    # --- Segment Information ---
    "segment_info": [
        "segment information describing operating segments geographic areas and major customers",
        "disclosure of business segments including revenue and profit by segment",
        "note providing details of segment performance and intersegment eliminations",
    ],

    # --- Signatures ---
    "signatures": [
        "signatures section signed on behalf of the registrant and principal officers",
        "signatures of directors executive officers and principal accounting officer",
        "signed by the registrant pursuant to the securities exchange act of 1934",
    ],

    # --- Exhibits ---
    "exhibits": [
        "exhibits and financial statement schedules",
        "list of exhibits and certifications required by form 10k or 10q",
        "exhibit index listing contracts and subsidiary information",
    ],

    # --- Fallback ---
    "other": [
        "miscellaneous sections not classified elsewhere including general disclosures appendices or cover letters",
    ],
}

In [12]:
SECTION_EMBS = {
    sec: [embed_text_query(ex) for ex in examples]
    for sec, examples in SECTION_EXAMPLES.items()
}


def classify_section(text, table):
    page_text = _normalize(text)
    headers = _normalize(" ".join(table[0])) if table else ""
    first_col = _normalize(" ".join(row[0] for row in table[1:])) if table else ""

    combined = f"{page_text} {headers} {first_col}"
    emb = embed_text_query(combined)

    scores = {
        sec: max(util.cos_sim(emb, e).item() for e in embs)
        for sec, embs in SECTION_EMBS.items()
    }

    best = max(scores, key=scores.get)
    return best if scores[best] > 0.35 else "other"

In [13]:
pdf_path = []
for folder in ["annuals", "quarterlies", "supplements"]:
    files = glob.glob(f"{DATA_DIR}/{folder}/*.pdf")
    pdf_path.extend(files)

print(f"Processing {len(pdf_path)} PDFs from all folders")
print("PDF paths:", pdf_path[:3], "...")

# keep track sections
sections = {}
output = {}

for pdfFile in pdf_path:
    pdf_name = os.path.basename(pdfFile)
    output[pdf_name] = {}

    print(f"\n=== Processing: {pdf_name} ===")

    # Step 1: extract raw text with pdfplumber
    with pdfplumber.open(pdfFile) as pdf:
        for i, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""
            section_text = classify_section(text, [[]])  # classifying page text only without tables
            sections[section_text] = sections.get(section_text, 0) + 1

            # Step 2: extract tables
            tables_pymupdf = extract_tables_from_page(pdfFile, i)

            tables = []
            for t in tables_pymupdf:
                df = t["dataframe"]
                raw_table = df.values.tolist()

                # if not is_valid_table(raw_table):
                #     # Skip tables that are mostly text, like footnotes or headers
                #     print(f"[SKIP] Page {i} – Non-numeric table filtered out")
                #     continue

                cleaned_table = clean_table(raw_table)
                section_table = classify_section(text, cleaned_table)

                # Track section counts
                sections[section_table] = sections.get(section_table, 0) + 1

                # Skip noise like signatures
                if section_table == "other" and "signature" in text.lower():
                    continue

                markdown_text = pd.DataFrame(cleaned_table).to_markdown(index=False)

                tables.append({
                    "section": section_table,
                    "header" : cleaned_table[0] if cleaned_table else [],
                    "rows" : cleaned_table[1:] if len(cleaned_table) > 1 else [],
                    "markdown": markdown_text
                })

            print(f"Page {i} → Text length: {len(text) if text else 0}, Tables Kept: {len(tables)}")

            output[pdf_name][i] = {
                "page_section": section_text,
                "text": text,
                "tables": tables
            }

print ("Section distribution:", sections)

# Step 3: Create directory if it doesn't exist and dump to JSON
output_path = f"{DATA_DIR}/test.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w") as f:
    json.dump(output, f, indent=4)

print(f"\nOutput saved to: {output_path}")

Processing 10 PDFs from all folders
PDF paths: ['00-data/annuals\\goog-10-k-2023-final.pdf', '00-data/annuals\\goog-10-k-2024.pdf', '00-data/quarterlies\\goog-10-q-q1-2023.pdf'] ...

=== Processing: goog-10-k-2023-final.pdf ===
Page 1 → Text length: 2689, Tables Kept: 0
Page 2 → Text length: 3256, Tables Kept: 0
Page 3 → Text length: 1439, Tables Kept: 0
Page 4 → Text length: 3836, Tables Kept: 0
Page 5 → Text length: 4988, Tables Kept: 0
Page 6 → Text length: 5387, Tables Kept: 0
Page 7 → Text length: 4960, Tables Kept: 0
Page 8 → Text length: 4765, Tables Kept: 0
Page 9 → Text length: 4034, Tables Kept: 0
Page 10 → Text length: 5339, Tables Kept: 0
Page 11 → Text length: 5262, Tables Kept: 0
Page 12 → Text length: 5572, Tables Kept: 0
Page 13 → Text length: 5822, Tables Kept: 0
Page 14 → Text length: 5441, Tables Kept: 0
Page 15 → Text length: 5912, Tables Kept: 0
Page 16 → Text length: 5236, Tables Kept: 0
Page 17 → Text length: 6050, Tables Kept: 0
Page 18 → Text length: 6081, Tabl

In [14]:
# Step 3: Create directory if it doesn't exist and dump to JSON
output_path = f"{DATA_DIR}/test.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w") as f:
    json.dump(output, f, indent=4)

print(f"\nOutput saved to: {output_path}")


Output saved to: 00-data/test.json


### Ingestion pipeline (Slides)

In [None]:
# TODO: Implement ingestion pipeline (SLIDES)


### Chunk

In [15]:
# load the json file
with open(f"{DATA_DIR}/test.json", "r") as f:
    doc = json.load(f)

chunks = []

for fileDoc , docContent in doc.items():
    for page_num, content in docContent.items():
        page_section = content.get("page_section", "unknown")
        text = content.get("text", "")
        tables = content.get("tables", [])

        if text.strip():
            chunks.append({
                "id": f"{fileDoc}-page-{page_num}-text",
                "text": f"Financial filing text section: {text}",
                "metadata": {"document": fileDoc, "page_number": page_num, "page_section": page_section, "chunk_type": "prose"}
            })

        if tables:
            for t_index, table in enumerate(tables):
                    table_text = "\n".join([", ".join(row) for row in table.get("rows", [])])
                    table_markdown = table.get("markdown", "")

                    chunks.append({
                        "id": f"{fileDoc}-page-{page_num}-table-{t_index}",
                        "text": f"Financial statement table: {table_text}",
                        "markdown": table_markdown,
                        "metadata": {
                            "document": fileDoc,
                            "page_number": page_num,
                            "page_section": page_section,
                            "chunk_type": "table",
                            "table_index": t_index
                            }
                    })

print(f"Created {len(chunks)} chunks")

Created 1355 chunks


In [16]:
text = [ chunk["text"] for chunk in chunks ]

embeddings = embed_text_passage (text)

print (f"Embeddings shape: {embeddings.shape}")

# Create a FAISS index - IP for normalized embeddings
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
print (f"FAISS index contains {index.ntotal} vectors.")

# save it locally
output_dir = f"{DATA_DIR}/base"
os.makedirs(output_dir, exist_ok=True)

# storing the index
faiss.write_index(index, f"{output_dir}/base.faiss")
print(f"Index saved to {output_dir}/base.faiss")

# store the chunks
with open(f"{DATA_DIR}/base/chunks.json", "w") as f:
    json.dump(chunks, f, indent=4)

Batches: 100%|██████████| 43/43 [00:04<00:00,  9.13it/s]

Embeddings shape: (1355, 768)
FAISS index contains 1355 vectors.
Index saved to 00-data/base/base.faiss





### Retrieval

In [17]:
index = {}

def init_indexes():
    global index
    documents_base_dir = f"{DATA_DIR}/base/base.faiss"
    index["index"] = faiss.read_index(documents_base_dir)
    index["chunks"] = json.load(open(f"{DATA_DIR}/base/chunks.json"))
    print (f"chunks type : {type(index['chunks'])}, length: {len(index['chunks'])}")

def search_query(query, k=5):
    global index
    query_embedding = embed_text_query(query)  # Convert query to vector

    D, I = index["index"].search(np.array([query_embedding]), k=k)
    # D = distances/scores
    # I = indices of top k matching chunks

    results = [
        {
            "rank": rank + 1,
            "score": float(D[0][rank]),
            "text": index["chunks"][identified_chunk_idx]["text"],
            "markdown": index["chunks"][identified_chunk_idx].get("markdown", ""),
            "metadata": index["chunks"][identified_chunk_idx]["metadata"]
        }
        for rank, identified_chunk_idx in enumerate(I[0])
    ]

    return results

#### Output print helper

In [19]:
from textwrap import shorten
from tabulate import tabulate

def pretty_print_results(results, show_table=False):
    table_data = []
    for r in results:
        meta = r["metadata"]
        chunk_type = meta.get("chunk_type", "unknown")
        section = meta.get("page_section", "unknown")
        doc = meta.get("document", "unknown")
        page = meta.get("page_number", "?")
        score = f"{r['score']:.3f}"
        
        # shorten text for preview
        preview = shorten(r['text'], width=120, placeholder="…")
        table_data.append([r['rank'], score, chunk_type, section, doc, page, preview])

    headers = ["Rank", "Score", "Type", "Section", "Document", "Page", "Preview"]
    print(tabulate(table_data, headers=headers, tablefmt="github"))

    # print markdown tables
    if show_table:
        for r in results:
            if r["metadata"]["chunk_type"] == "table" and r.get("markdown"):
                print(f"\n Table from {r['metadata']['document']} (p.{r['metadata']['page_number']}):\n")
                print(r["markdown"])
                print("\n" + "-"*80 + "\n")


In [20]:
# Initialize once
init_indexes()

# Search
results = search_query("Show Operating Expenses for 2023 and 2024", k=5)

# print(results)
pretty_print_results(results, show_table=True)

chunks type : <class 'list'>, length: 1355
|   Rank |   Score | Type   | Section                | Document              |   Page | Preview                                                                                                                  |
|--------|---------|--------|------------------------|-----------------------|--------|--------------------------------------------------------------------------------------------------------------------------|
|      1 |   0.888 | table  | income_statement       | goog-10-q-q3-2024.pdf |     42 | Financial statement table: , , September 30,, September 30, , , 2023 2024, 2023 2024 General and administrative…         |
|      2 |   0.882 | table  | summary_financial_data | goog-10-q-q2-2024.pdf |     41 | Financial statement table: , , June 30,, , June 30, , , 2023 2024, , 2023 2024 General and administrative expenses, $,…  |
|      3 |   0.876 | table  | summary_financial_data | goog-10-q-q1-2024.pdf |     37 | Financial statement table

## 4. Baseline Pipeline

### Agent Config and Imports

In [30]:
from langchain.agents import initialize_agent, AgentType
from langchain.tools import tool
from langchain_openai import ChatOpenAI
from langchain.agents import AgentExecutor
from langchain.chains import LLMMathChain
from langchain.schema import SystemMessage, HumanMessage

In [None]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0) # consider 4o (will be a lot better but bye bye johnathan's credits) 

#### Retriever Tool

In [23]:
@tool("retriever", return_direct=False)
def retriever_tool(query: str) -> str:
    """Retrieve the most relevant text or table snippets for a given finance question."""
    results = search_query(query, k=5)
    formatted = []
    for r in results:
        meta = r["metadata"]
        src = f"{meta['document']} (p.{meta['page_number']}) [{meta['page_section']}]"
        formatted.append(f"{src}\n{r['text'][:800]}")
    return "\n\n---\n\n".join(formatted)


#### Calculator Tool

In [24]:
@tool("calculator", return_direct=False)
def calculator_tool(expression: str) -> str:
    """Safely evaluate a mathematical expression, e.g. (165 - 150) / 150 * 100."""
    try:
        result = eval(expression, {"__builtins__": {}})
        return str(result)
    except Exception as e:
        return f"Error evaluating: {e}"


#### Agent

In [None]:
tools = [retriever_tool, calculator_tool]

# ReAct agent
# agent = initialize_agent(
#     tools=tools,
#     llm=llm,
#     agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
#     verbose=True
# )

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    verbose=True
)

In [29]:
# limit runtime + iterations
agent_executor = AgentExecutor.from_agent_and_tools(
    agent=agent.agent,
    tools=tools,
    max_iterations=3,
    max_execution_time=60
)

# guiding prompt
system_prompt = """
You are a financial analyst assistant that can use tools.

Rules:
- Use the retriever tool **once** to fetch financial data.
- If you need to compute ratios, use the calculator tool.
- After obtaining the necessary data, STOP and provide the Final Answer.
- Do NOT call retriever more than once.
- Return both JSON and readable prose output.

Output format:
{
  "query": "...",
  "data_values": [...],
  "computed_values": [...],
  "citations": [{"report": "...", "page": ..., "section": "..."}],
  "tools": ["retriever", "calculator"],
  "tools_count": 2
}

Then provide a short summary table and explanation.
"""


In [32]:
query = "Show Operating Expenses for 2023 and 2024."

response = agent.invoke({
    "input": [
        SystemMessage(content=system_prompt),
        HumanMessage(content=query)
    ]
})
print(response["output"])




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer the question about Operating Expenses for 2023 and 2024, I will first retrieve the relevant financial data using the retriever tool.

Action: retriever  
Action Input: "Operating Expenses for 2023 and 2024"  
[0m
Observation: [36;1m[1;3mgoog-10-q-q3-2024.pdf (p.42) [income_statement]
Financial statement table: , , September 30,, September 30,
, , 2023 2024, 2023 2024
General and administrative expenses, $, 3,979  $  3,599, $  11,219  $  9,783
General and administrative expenses as a percentage of, , , 

---

goog-10-k-2024.pdf (p.40) [financial_statements]
Financial statement table: , , , Year Ended December 31,, , , 
, , 2023, , , 2024, 
General and administrative expenses, $, 16,425, , $, , 14,188
General and administrative expenses as a percentage of revenues, , , 5 %, , , 4 %

---

goog-10-k-2024.pdf (p.40) [financial_statements]
Financial statement table: , , , Year Ended December 31,, 
, , 2023, , 2024
Rese

## 5. Benchmark Runner

Run these 3 standardized queries. Produce JSON then prose answers with citations. These are the standardized queries.

*   Gross Margin Trend (or NIM if Bank)
    *   Query: "Report the Gross Margin (or Net Interest Margin, if a bank) over the last 5 quarters, with values."
    *   Expected Output: A quarterly table of Gross Margin % (or NIM % if bank).

*   Operating Expenses (Opex) YoY for 3 Years
    *   Query: "Show Operating Expenses for the last 3 fiscal years, year-on-year comparison."
    *   Expected Output: A 3-year Opex table (absolute numbers and % change).

*   Operating Efficiency Ratio
    *   Query: "Calculate the Operating Efficiency Ratio (Opex ÷ Operating Income) for the last 3 fiscal years, showing the working."
    *   Expected Output: Table with Opex, Operating Income, and calculated ratio for 3 years.

In [None]:
# TODO: Implement benchmark runner


## 6. Instrumentation

Log timings: T_ingest, T_retrieve, T_rerank, T_reason, T_generate, T_total. Log tokens, cache hits, tools.

In [None]:
# Example instrumentation schema
import pandas as pd
logs = pd.DataFrame(columns=['Query','T_ingest','T_retrieve','T_rerank','T_reason','T_generate','T_total','Tokens','CacheHits','Tools'])
logs

## 7. Optimizations

**Required Optimizations**

Each team must implement at least:
*   2 retrieval optimizations (e.g., hybrid BM25+vector, smaller embeddings, dynamic k).
*   1 caching optimization (query cache or ratio cache).
*   1 agentic optimization (plan pruning, parallel sub-queries).
*   1 system optimization (async I/O, batch embedding, memory-mapped vectors).

In [None]:
# TODO: Implement optimizations


## 8. Results & Plots

Show baseline vs optimized. Include latency plots (p50/p95) and accuracy tables.

In [None]:
# TODO: Generate plots with matplotlib
