In [None]:
!pip install pdfplumber
!pip install camelot-py[cv]
!pip install --upgrade pymupdf
!pip install google-generativeai
!pip install faiss-cpu
!pip install transformers tqdm pandas pytesseract pillow easyocr langchain langchain-community langchain_openai faiss-cpu rank_bm25 pdf2image
!pip install sentence_transformers
!pip install python-dotenv

^C



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-win_amd64.whl (18.4 MB)
   ---------------------------------------- 0.0/18.4 MB ? eta -:--:--
   ---------------------------------------- 0.1/18.4 MB 1.7 MB/s eta 0:00:12
   - -------------------------------------- 0.7/18.4 MB 8.4 MB/s eta 0:00:03
   ---- ----------------------------------- 1.9/18.4 MB 15.3 MB/s eta 0:00:02
   --------- ------------------------------ 4.4/18.4 MB 25.7 MB/s eta 0:00:01
   ------------------- -------------------- 9.0/18.4 MB 40.9 MB/s eta 0:00:01
   ------------------------------- -------- 14.3/18.4 MB 93.0 MB/s eta 0:00:01
   ---------------------------------------  18.4/18.4 MB 108.8 MB/s eta 0:00:01
   ---------------------------------------- 18.4/18.4 MB 92.9 MB/s eta 0:00:00
Installing collected packages: pymupdf
  Attempting uninstall: pymupdf
    Found existing installation: PyMuPDF 1.26.5
    Uninstalling PyMuPDF-1.26.5:


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from dotenv import load_dotenv
import re
import os, glob
import pdfplumber
import camelot
import pymupdf
import numpy as np
import pandas as pd
from pathlib import Path
import time
import faiss, json
import collections
import fitz
import io
from PIL import Image
import pytesseract
from langchain_core.documents import Document
from openai import OpenAI
from TableRetrieval.table_ingestion import stage1_extract_and_save
from TableRetrieval.table_ingestion import store_in_faiss, save_metadata_mapping
from TableRetrieval.table_agentic_rag import TableAgenticRAG

load_dotenv()

# Agent CFO — Performance Optimization & Design

---
This is the starter notebook for your project. Follow the required structure below.


You will design and optimize an Agent CFO assistant for a listed company. The assistant should answer finance/operations questions using RAG (Retrieval-Augmented Generation) + agentic reasoning, with response time (latency) as the primary metric.

Your system must:
*   Ingest the company’s public filings.
*   Retrieve relevant passages efficiently.
*   Compute ratios/trends via tool calls (calculator, table parsing).
*   Produce answers with valid citations to the correct page/table.


## 1. Config & Secrets

Fill in your API keys in secrets. **Do not hardcode keys** in cells.

In [None]:
import os

# Example:
# os.environ['GEMINI_API_KEY'] = 'your-key-here'
# os.environ['OPENAI_API_KEY'] = 'your-key-here'

# Name of the company under analysis.
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
COMPANY_NAME = "Google"

# NOTE(review): the visible text-ingestion cell calls create_chunks() with no
# arguments — confirm the chunker actually reads this value.
CHUNK_SIZE = 500  # number of words per chunk

def generate_test_log_path_name(base_path: str):
    """Return the path of the next unused ``test_<n>.json`` file in ``base_path``.

    The directory is created if it does not exist yet. Existing entries that
    start with ``test_`` and end with ``.json`` are scanned; for each one the
    text between the first ``_`` and the following ``.`` is taken as its index
    when it consists only of digits (other entries are ignored). The result
    uses the highest index found plus one, or 1 when there is none.
    """
    os.makedirs(base_path, exist_ok=True)

    index_tokens = (
        entry.split("_")[1].split(".")[0]
        for entry in os.listdir(base_path)
        if entry.startswith("test_") and entry.endswith(".json")
    )
    # Indices are non-negative, so a default of 0 yields 1 for an empty folder.
    highest = max((int(token) for token in index_tokens if token.isdigit()), default=0)

    return f"{base_path}/test_{highest + 1}.json"


## 2. Data Download (Dropbox)

*   Annual Reports: last 3–5 years.
*   Quarterly Results Packs & MD&A (Management Discussion & Analysis).
*   Investor Presentations and Press Releases.
*   These files must be submitted later as a deliverable in the Dropbox data pack.
*   Upload them under `/content/data/` (this notebook reads them from the local `00-data/` folder; see `DATA_DIR` in the next cell).

Scope: each team must ingest a minimum of 15 PDF files in total.


In [None]:
# Root folder holding the downloaded filings, one sub-folder per document type.
DATA_DIR = "00-data"

# glob returns matches in a filesystem-dependent order, so each list is sorted
# to make the file order identical on every run and every machine.

# Annual reports (10-Ks)
annual_files = sorted(glob.glob(f"{DATA_DIR}/annuals/*.pdf"))

# Quarterly reports (10-Qs)
quarterly_files = sorted(glob.glob(f"{DATA_DIR}/quarterlies/*.pdf"))

# Presentations
presentation_files = sorted(glob.glob(f"{DATA_DIR}/presentations/*.pdf"))

In [None]:
# Sanity check: print how many PDFs were found in each source folder under DATA_DIR.
for folder in ["annuals", "quarterlies", "presentations"]:

    files = glob.glob(f"{DATA_DIR}/{folder}/*.pdf")
    print(f"{folder}: {len(files)} files")

## 3. System Requirements

**Retrieval & RAG**
*   Use a vector index (e.g., FAISS, LlamaIndex) + a keyword filter (BM25/ElasticSearch).
*   Citations must include: report name, year, page number, section/table.

**Agentic Reasoning**
*   Support at least 3 tool types: calculator, table extraction, multi-document compare.
*   Reasoning must follow a plan-then-act pattern (not a single unstructured call).

**Instrumentation**
*   Log timings for: T_ingest, T_retrieve, T_rerank, T_reason, T_generate, T_total.
*   Log: tokens used, cache hits, tools invoked.
*   Record p50/p95 latencies.

### Embeddings

In [None]:
def create_embeddings(chunks, model="text-embedding-3-small", batch_size=50, delay_seconds=0.6):
    """
    Create embeddings for chunks using OpenAI.

    Args:
        chunks: list of dicts, each with a 'content' entry holding the text to
            embed. The dicts are updated in place.
        model: name of the OpenAI embedding model.
        batch_size: number of chunks sent per embeddings request (must be >= 1).
        delay_seconds: pause between consecutive requests, to respect rate limits.

    Returns: the same ``chunks`` list, with an 'embedding' field added to every chunk.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print(f"Creating embeddings with {model}...\n")
    print(f"   Total chunks: {len(chunks)}")

    if not chunks:
        # Nothing to embed: return before constructing the client so an empty
        # input never creates an API client or sends a request.
        print(f"Created {len(chunks)} embeddings\n")
        return chunks

    client = OpenAI()
    total_batches = (len(chunks) + batch_size - 1) // batch_size

    for batch_num, start in enumerate(range(0, len(chunks), batch_size), start=1):
        batch = chunks[start:start + batch_size]
        print(f"   Processing batch {batch_num} of {total_batches}...")

        # Embed the whole batch in one request.
        response = client.embeddings.create(
            model=model,
            input=[chunk['content'] for chunk in batch],
        )

        # Results are matched to chunks by position.
        # NOTE(review): relies on response.data being in input order — confirm against the API reference.
        for j, chunk in enumerate(batch):
            chunk['embedding'] = response.data[j].embedding

        # Small delay to respect rate limits; skipped after the final batch.
        if start + batch_size < len(chunks):
            time.sleep(delay_seconds)

    print(f"Created {len(chunks)} embeddings\n")
    return chunks

### Table Ingestion

In [None]:
def stage2_create_embeddings(json_file):
    """
    Stage 2 of table ingestion: embed previously extracted tables and persist them.

    Reads the table chunks stored in ``json_file``, adds an embedding to each
    via ``create_embeddings`` (the step that calls OpenAI and spends tokens),
    then hands the result to ``store_in_faiss`` and ``save_metadata_mapping``
    with output paths under ``00-data/base/``. Progress is printed along the
    way; nothing is returned.
    """
    load_dotenv()

    data_dir = "00-data"
    index_path = f"{data_dir}/base/faiss_table_index"
    metadata_path = f"{data_dir}/base/faiss_table_metadata.json"
    rule = "=" * 80

    # Banner followed by a blank line.
    print(rule, "STAGE 2: CREATING EMBEDDINGS", rule, "", sep="\n")

    # Load the extracted tables
    print(f"Loading: {json_file}")
    with open(json_file, 'r') as handle:
        tables = json.load(handle)
    print(f"Loaded {len(tables)} tables")

    print("\nCreating embeddings...")
    tables_with_vectors = create_embeddings(tables)

    print("\nStoring in FAISS...")
    store_in_faiss(tables_with_vectors, faiss_index_path=index_path)

    print("\nSaving metadata...")
    save_metadata_mapping(tables_with_vectors, mapping_path=metadata_path)

    print(f"\n{rule}", "COMPLETE!", rule, sep="\n")
    print(f"FAISS index: {index_path}")
    print(f"Metadata: {metadata_path}")
    print()

In [None]:
# Stage 1 of table ingestion (imported from TableRetrieval.table_ingestion).
# NOTE(review): presumably extracts the tables from the PDFs and writes
# `00-data/extracted_tables.json`, which the stage-2 cell loads — confirm against the module.
stage1_extract_and_save()

In [None]:
# Stage 2: embed the tables stored in extracted_tables.json, then write the
# FAISS index and its metadata mapping under 00-data/base/ (calls OpenAI).
stage2_create_embeddings(f"{DATA_DIR}/extracted_tables.json")

### Slides Ingestion

In [None]:
from ImageRetrieval.ImageRetrieval import ImageRetriever
from ImageRetrieval.slides_extractor import extract_slides_fitz
from ImageRetrieval.ocr_extractor import ocr_folder
from ImageRetrieval.image_faiss_builder import (
    create_image_embeddings,
    store_image_faiss,
    save_image_metadata
)

# Output locations for the slide-image index and its metadata.
IMAGE_INDEX_PATH = "00-data/base/faiss_image_index"
IMAGE_META_PATH  = "00-data/base/faiss_image_metadata.json"

slide_docs = []

# glob returns matches in a filesystem-dependent order; sorting makes
# slide_docs come out in the same order on every run and every machine.
pdf_files = sorted(glob.glob("00-data/presentations/*.pdf"))
print(f"[ImagePipeline] Found {len(pdf_files)} presentation PDFs.")

for pdf in pdf_files:
    # One output folder per deck, named after the PDF (without extension).
    pdf_name = os.path.splitext(os.path.basename(pdf))[0]
    out_dir = f"00-data/presentations/slides_{pdf_name}"

    extract_slides_fitz(pdf, out_dir)
    slide_docs.extend(ocr_folder(out_dir, label=pdf_name))

print(f"[ImagePipeline] Total OCR slide documents: {len(slide_docs)}")

# Embed
embeddings = create_image_embeddings(slide_docs)

# FAISS + metadata
store_image_faiss(embeddings, IMAGE_INDEX_PATH)
save_image_metadata(slide_docs, IMAGE_META_PATH)


### Text Ingestion

In [None]:
from TextRetrieval.TextExtractor import extract_text_from_pdf
# Run the text-extraction step. It is called with no arguments, so its input
# and output locations are whatever the module defines — NOTE(review): confirm
# them in TextRetrieval/TextExtractor. The trailing `;` suppresses the cell's
# output display.
extract_text_from_pdf(); 


## TEXT FAISS BUILDER
### CREATE THE CHUNKS 
### BUILD THE INDICES BASED ON THE CHUNKS

In [None]:
from TextRetrieval.TextFaissBuilder import create_chunks, built_indices

# Two steps: create the text chunks, then build the indices from them.
# NOTE(review): both helpers are opaque from this notebook; where they read
# from and write to is presumably defined in TextFaissBuilder — confirm there.
chunks = create_chunks();
built_indices(chunks); 

## 4. Baseline Pipeline

**Baseline (starting point)**
*   Naive chunking.
*   Single-pass vector search.
*   One LLM call, no caching.

In [None]:
# TODO: Implement baseline retrieval + generation


## 5. Benchmark Runner

Run the 3 standardized queries below. For each one, produce the JSON output first, then a prose answer with citations.

*   Gross Margin Trend (or NIM if Bank)
    *   Query: "Report the Gross Margin (or Net Interest Margin, if a bank) over the last 5 quarters, with values."
    *   Expected Output: A quarterly table of Gross Margin % (or NIM % if bank).

*   Operating Expenses (Opex) YoY for 3 Years
    *   Query: "Show Operating Expenses for the last 3 fiscal years, year-on-year comparison."
    *   Expected Output: A 3-year Opex table (absolute numbers and % change).

*   Operating Efficiency Ratio
    *   Query: "Calculate the Operating Efficiency Ratio (Opex ÷ Operating Income) for the last 3 fiscal years, showing the working."
    *   Expected Output: Table with Opex, Operating Income, and calculated ratio for 3 years.

In [None]:
# TODO: Implement benchmark runner


## 6. Instrumentation

Log timings: T_ingest, T_retrieve, T_rerank, T_reason, T_generate, T_total. Log tokens, cache hits, tools.

In [None]:
# Example instrumentation schema: an empty frame whose columns mirror the
# timings and counters listed in the Instrumentation section above.
import pandas as pd

_LOG_FIELDS = (
    'Query',
    'T_ingest', 'T_retrieve', 'T_rerank', 'T_reason', 'T_generate', 'T_total',
    'Tokens', 'CacheHits', 'Tools',
)
logs = pd.DataFrame(columns=list(_LOG_FIELDS))
logs

## 7. Optimizations

**Required Optimizations**

Each team must implement at least:
*   2 retrieval optimizations (e.g., hybrid BM25+vector, smaller embeddings, dynamic k).
*   1 caching optimization (query cache or ratio cache).
*   1 agentic optimization (plan pruning, parallel sub-queries).
*   1 system optimization (async I/O, batch embedding, memory-mapped vectors).

In [None]:
# TODO: Implement optimizations


### Table Agentic Optimization

In [None]:
# Locations of the table index and metadata; these are the same paths that
# stage2_create_embeddings writes under {DATA_DIR}/base/.
FAISS_INDEX = f"{DATA_DIR}/base/faiss_table_index"
METADATA_JSON = f"{DATA_DIR}/base/faiss_table_metadata.json"

# Create the agent (kept in `table_agent`, which the next cell reuses)
table_agent = TableAgenticRAG(
    faiss_index_path=FAISS_INDEX,
    metadata_json_path=METADATA_JSON
)

# Standardized benchmark query #2 (Opex YoY), see section 5.
query = "Show Operating Expenses for the last 3 fiscal years, year-on-year comparison."


print("\nRunning agent query...\n")
result = table_agent.query(query, verbose=True)


# `result` is read through its "answer" and "sources" keys below.
print("\n====================== ANSWER ======================")
print(result["answer"])
print("===================================================\n")


print("Sources:", result["sources"])

In [None]:
# Paraphrase of the previous cell's query, run through the same `table_agent`.
# Requires the previous cell to have been executed; `result` is overwritten here.
query2 = "What is the operating expense for the last 3 fiscal years, year-on-year comparison."
print("\nRunning agent query 2...\n")
result = table_agent.query(query2, verbose=True)

print("\n====================== ANSWER ======================")
print(result["answer"])
print("===================================================\n")


print("Sources:", result["sources"])

### Text Agentic Optimization

In [None]:
# from TextRetrieval.Agent import text_agent_executor

# query = "" \
# "What is the operating expense for the last 3 fiscal years, year-on-year comparison."
# text_agent_executor(query)

### Image Query

In [None]:
from PIL import Image
# Same index/metadata files that the Slides Ingestion cell writes.
IMAGE_INDEX = "00-data/base/faiss_image_index"
IMAGE_META  = "00-data/base/faiss_image_metadata.json"

image_retriever = ImageRetriever(IMAGE_INDEX, IMAGE_META)
query = "What are the operating expenses?"
results = image_retriever.search(query, k=5)

# A bare `results` expression in the middle of a cell is not displayed (by
# default only the cell's last expression is), so it was dropped; evaluate
# `results` in its own cell to inspect all hits.

# Fail with an explicit message instead of an unexplained IndexError on results[0].
if len(results) == 0:
    raise IndexError(f"No slides retrieved for query: {query!r}")

# Show the top-ranked slide image.
path = results[0]["metadata"]["image_path"]
Image.open(path)


## Unified Agent Query Response

In [None]:
# from UnifiedCFOAgent import create_cfo_agent

# agent = create_cfo_agent()

# query = "Show Operating Expenses for the last 3 fiscal years, year-on-year comparison."

# # response = agent.invoke({"input": query})

# response = agent.invoke({"input": query}, return_intermediate_steps=True)

# print(response)


In [1]:
from UnifiedCFOAgent import query_cfo_agent
# Standardized benchmark query #2 (Opex YoY) through the unified agent.
# The saved output for this run starts with "Cache MISS".
question = "Show Operating Expenses for the last 3 fiscal years, year-on-year comparison."
# NOTE(review): cache_threshold is presumably the minimum similarity at which a
# cached answer is reused (the next cell reports a hit at 0.9201) — confirm in UnifiedCFOAgent.
response = query_cfo_agent(
    question,
    cache_threshold=0.85,
    use_cache=True
)

# The response is read through its "answer" and "metadata" keys.
print(response["answer"])
print(response["metadata"])

Cache MISS

[INFO] Initializing FAISS indices from disk...
[TextSectionRetriever] Loaded 15 section indices from 00-data\sections


  agent = initialize_agent(




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To provide the Operating Expenses for the last 3 fiscal years and a year-on-year comparison, I need to retrieve the financial tables that include Operating Expenses data from the company's annual or quarterly filings.
Action: retrieve_table
Action Input: Operating Expenses last 3 fiscal years year-on-year comparison[0m
Observation: [33;1m[1;3m{"query": "Operating Expenses last 3 fiscal years year-on-year comparison", "answer": "To perform a year-on-year comparison of Google's operating expenses over the last three fiscal years, we will focus on the General and Administrative (G&A) expenses as provided in the retrieved data. The data for the fiscal years 2021, 2022, and 2023 is available, and we will calculate the percentage change year-on-year.\n\n### General and Administrative Expenses (G&A) Year-on-Year Comparison\n\n#### Data Summary:\n- **2021 G&A Expenses:** $13,510 million (Source: goog-10-k-2022)\n- **2022 

In [2]:
# Paraphrase of the previous question, used to exercise the query cache: the
# saved output shows "Cache HIT" with similarity 0.9201 against the previous
# cell's question, so this cell depends on that one having run first.
question = "What is the operating expense for the last 3 fiscal years, year-on-year comparison."
response = query_cfo_agent(
    question,
    cache_threshold=0.85,
    use_cache=True
)

print(response["answer"])
print(response["metadata"])

Cache HIT:
  Similarity: 0.9201
  Cached Query: Show Operating Expenses for the last 3 fiscal years, year-on-year comparison.

The Operating Expenses (specifically General and Administrative Expenses) for the last three fiscal years are as follows:

- **2021**: $13,510 million
- **2022**: $15,724 million (16.39% increase from 2021)
- **2023**: $16,425 million (4.46% increase from 2022)

This indicates a consistent increase in operating expenses, with a notable slowdown in growth from 2022 to 2023.
{'sources': ['goog-10-k-2024', 'goog-10-q-q1-2023', 'goog-10-q-q2-2025', 'goog-10-q-q3-2022', 'goog-10-q-q1-2025', 'goog-10-k-2023-final', 'goog-10-q-q1-2024', 'goog-10-k-2022'], 'image_paths': []}


In [None]:
# Standardized benchmark query #3 (Operating Efficiency Ratio), see section 5.
# The saved output for this run starts with "Cache MISS".
question = "Calculate the Operating Efficiency Ratio (Opex ÷ Operating Income) for the last 3 fiscal years, showing the working."
response = query_cfo_agent(
    question,
    cache_threshold=0.85,
    use_cache=True
)

# Unlike the two cells above, this prints the whole response rather than
# only its "answer" and "metadata" entries.
print(response)

Cache MISS

[INFO] FAISS indices already initialized, reusing existing indices.
[TextSectionRetriever] Loaded 15 section indices from 00-data\sections


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo calculate the Operating Efficiency Ratio (Opex ÷ Operating Income) for the last 3 fiscal years, I need to retrieve the operating expenses (Opex) and operating income for those years. I will first look for the relevant financial tables that contain this information.

Action: retrieve_table  
Action Input: "Operating Expenses and Operating Income for the last 3 fiscal years"  [0m
Observation: [33;1m[1;3m{"query": "Operating Expenses and Operating Income for the last 3 fiscal years", "answer": "To address the query regarding Operating Expenses and Operating Income for the last three fiscal years, we will extract and calculate the necessary data from the provided documents.\n\n### Operating Expenses\n\nOperating expenses typically include costs such as General and Administrat

In [3]:
# Short, open-ended variant of the Opex question.
# The saved output for this run starts with "Cache MISS".
question = "What are the operating expenses?"
response = query_cfo_agent(
    question,
    cache_threshold=0.85,
    use_cache=True
)

# Prints the whole response.
print(response)

Cache MISS

[INFO] FAISS indices already initialized, reusing existing indices.
[TextSectionRetriever] Loaded 15 section indices from 00-data\sections


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo find the operating expenses, I should retrieve the relevant financial table that contains this information. 
Action: retrieve_table 
Action Input: "operating expenses" [0m
Observation: [33;1m[1;3m{"query": "operating expenses", "answer": "To address the query regarding operating expenses, we will focus on the General and Administrative (G&A) expenses as they are a component of operating expenses. We will analyze the data provided for the first two quarters of 2025 and compare it with the corresponding periods in 2024.\n\n### General and Administrative Expenses Analysis\n\n#### Q1 2025 vs. Q1 2024\n- **Q1 2025 G&A Expenses**: $3,539 million\n- **Q1 2024 G&A Expenses**: $3,026 million\n\n**Calculation of Change:**\n\\[ \\text{Change} = \\frac{\\text{Q1 2025} - \\text{Q1 202

In [5]:
# Query aimed at the slide images; the saved response lists slide PNG paths
# under metadata['image_paths'].
# NOTE(review): the saved output is a "Cache HIT" with similarity 1.0000 against
# this exact question, i.e. it was captured on a repeat run — re-run with a
# cold cache if the uncached trace is needed.
question = "Where can i get a graphic representation of the operating expenses?"
response = query_cfo_agent(
    question,
    cache_threshold=0.85,
    use_cache=True
)

# The printed dict has the keys 'query', 'answer', 'metadata' and 'cache_hit'.
print(response)

Cache HIT:
  Similarity: 1.0000
  Cached Query: Where can i get a graphic representation of the operating expenses?

{'query': 'Where can i get a graphic representation of the operating expenses?', 'answer': "You can find a graphic representation of the operating expenses in Alphabet's earnings presentation decks, specifically in slides such as slide 4 and slide 5 from the 2025 Q1 and Q2 earnings slides.", 'metadata': {'sources': ['2025q2-alphabet-earnings-slides', '2025q1-alphabet-earnings-slides'], 'image_paths': ['00-data/presentations/slides_2025q2-alphabet-earnings-slides\\slide_05.png', '00-data/presentations/slides_2025q1-alphabet-earnings-slides\\slide_10.png', '00-data/presentations/slides_2025q1-alphabet-earnings-slides\\slide_04.png', '00-data/presentations/slides_2025q2-alphabet-earnings-slides\\slide_04.png', '00-data/presentations/slides_2025q1-alphabet-earnings-slides\\slide_05.png']}, 'cache_hit': True}


In [2]:
# Narrative question; the saved trace shows the agent choosing the
# `retrieve_text` action for it.
# NOTE(review): the question text contains literal typographic quotes (“ ”)
# inside the Python string, so they are sent to the agent as part of the query
# (the saved trace echoes them) — likely paste residue; confirm before removing,
# since changing the text changes what the query cache matches on.
question = "“Explain the key drivers behind the company’s Operating Expenses over the last three fiscal years, using details from the MD&A and footnotes.”"
response = query_cfo_agent(
    question,
    cache_threshold=0.85,
    use_cache=True
)

print(response)

Cache MISS

[INFO] FAISS indices already initialized, reusing existing indices.
[TextSectionRetriever] Loaded 15 section indices from 00-data\sections


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mQuestion: “Explain the key drivers behind the company’s Operating Expenses over the last three fiscal years, using details from the MD&A and footnotes.”
Thought: To answer this question, I need to retrieve textual evidence from the Management’s Discussion and Analysis (MD&A) section and footnotes of the company’s annual filings that discuss operating expenses and their drivers over the last three fiscal years.
Action: retrieve_text
Action Input: "Operating Expenses drivers MD&A last three fiscal years"[0m[TextSectionRetriever] Searching sections: ['mdna', 'income_statement', 'financial_statements']
[TextSectionRetriever] Using k=40 for query: 'Operating Expenses drivers MD&A last three fiscal years'
[INFO] Searching 3 sections for: '"Retrieve information on operating expenses 

## 8. Results & Plots

Show baseline vs optimized. Include latency plots (p50/p95) and accuracy tables.

In [None]:
# TODO: Generate plots with matplotlib
