In [1]:
import pageindex_rs

In [2]:
from datasets import load_dataset

ds = load_dataset("PatronusAI/financebench", token=True)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
ds

DatasetDict({
    train: Dataset({
        features: ['financebench_id', 'company', 'doc_name', 'question_type', 'question_reasoning', 'domain_question_num', 'question', 'answer', 'justification', 'dataset_subset_label', 'evidence', 'gics_sector', 'doc_type', 'doc_period', 'doc_link'],
        num_rows: 150
    })
})

In [3]:
print(ds)
print(ds['train'][0].keys())
print(ds['train'][0])

DatasetDict({
    train: Dataset({
        features: ['financebench_id', 'company', 'doc_name', 'question_type', 'question_reasoning', 'domain_question_num', 'question', 'answer', 'justification', 'dataset_subset_label', 'evidence', 'gics_sector', 'doc_type', 'doc_period', 'doc_link'],
        num_rows: 150
    })
})
dict_keys(['financebench_id', 'company', 'doc_name', 'question_type', 'question_reasoning', 'domain_question_num', 'question', 'answer', 'justification', 'dataset_subset_label', 'evidence', 'gics_sector', 'doc_type', 'doc_period', 'doc_link'])
{'financebench_id': 'financebench_id_03029', 'company': '3M', 'doc_name': '3M_2018_10K', 'question_type': 'metrics-generated', 'question_reasoning': 'Information extraction', 'domain_question_num': None, 'question': 'What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the question by relying on the details shown in the cash flow statement.', 'answer': '$1577.00', 'justification': 'The metric cap

In [4]:
import re

def text_to_markdown(text: str) -> str:
    lines = text.split('\n')
    md_lines = []
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Detect section headers — all caps or title case short lines
        # Financial docs use patterns like "Cash Flows from Operating Activities"
        if re.match(r'^[A-Z][A-Za-z\s]+Activities$', line) or \
           re.match(r'^[A-Z][A-Za-z\s]+(Statement|Schedule|Summary|Table).*$', line) or \
           (len(line) < 60 and line.istitle() and not any(c.isdigit() for c in line)):
            md_lines.append(f'## {line}')
        else:
            md_lines.append(line)
    
    return '\n'.join(md_lines)

# Test it on the first row
sample = ds['train'][0]
text = sample['evidence'][0]['evidence_text']
md = text_to_markdown(text)
print(md[:500])

Table of Contents
3M Company and Subsidiaries
## Consolidated Statement of Cash Flow s
Years ended December 31
## (Millions)
2018
2017
2016
## Cash Flows from Operating Activities
Net income including noncontrolling interest
$
5,363
$
4,869
$
5,058
Adjustments to reconcile net income including noncontrolling interest to net cash
provided by operating activities
Depreciation and amortization
1,488
1,544
1,474
Company pension and postretirement contributions
(370)
(967)
(383)
Company pension and p


In [14]:
import os

os.environ['GROQ_API_KEY'] = "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"

In [15]:
import re
import pageindex_rs
from groq import Groq

client = Groq()  # picks up GROQ_API_KEY from env

def text_to_markdown(text: str) -> str:
    lines = text.split('\n')
    md_lines = ['# Financial Document\n']
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if (re.match(r'^[A-Z][A-Za-z ]+$', line)
                and len(line) > 15
                and not any(c.isdigit() for c in line)):
            md_lines.append(f'## {line}')
        else:
            md_lines.append(line)
    return '\n'.join(md_lines)

def retrieve_and_answer(row):
    question = row['question']
    expected = row['answer']
    evidence_text = row['evidence'][0]['evidence_text']

    md = text_to_markdown(evidence_text)
    index = pageindex_rs.PageIndex.from_markdown(row['doc_name'], md)
    outline = index.outline()

    # Step 1: pick a node
    pick_response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        max_tokens=50,
        messages=[{
            "role": "user",
            "content": f"""Document outline:
{outline}

Question: {question}

Return only the node_id most relevant to answer this question. Nothing else."""
        }]
    )
    node_id = pick_response.choices[0].message.content.strip()

    # Step 2: retrieve that node
    node = index.get_node(node_id)
    if node is None:
        node = index.get_node_with_children("1")  # fallback

    # Step 3: answer
    answer_response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        max_tokens=200,
        messages=[{
            "role": "user",
            "content": f"""Context:
{node.text}

Question: {question}

Answer concisely."""
        }]
    )
    predicted = answer_response.choices[0].message.content.strip()

    return {
        "question": question,
        "expected": expected,
        "predicted": predicted,
        "node_id": node_id,
        "breadcrumb": node.breadcrumb if node else []
    }

# Run on first 5 rows
for i in range(5):
    result = retrieve_and_answer(ds['train'][i])
    print(f"Q: {result['question'][:80]}")
    print(f"Expected:  {result['expected']}")
    print(f"Predicted: {result['predicted'][:100]}")
    print(f"Node: {result['node_id']} -> {result['breadcrumb']}")
    print("---")

Q: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a r
Expected:  $1577.00
Predicted: The FY2018 capital expenditure amount for 3M is $1,577 million, as shown under "Purchases of propert
Node: [1.6] -> ['Financial Document']
---
Q: Assume that you are a public equities analyst. Answer the following question by 
Expected:  $8.70
Predicted: The year-end FY2018 net PPNE (Property, Plant, and Equipment, net) for 3M is $8.738 billion.
Node: [1.9] -> ['Financial Document']
---
Q: Is 3M a capital-intensive business based on FY2022 data?
Expected:  No, the company is managing its CAPEX and Fixed Assets pretty efficiently, which is evident from below key metrics:
CAPEX/Revenue Ratio: 5.1%
Fixed assets/Total Assets: 20%
Return on Assets= 12.4%
Predicted: The provided data is insufficient to determine if 3M is a capital-intensive business, as it only sho
Node: [1.1] -> ['Financial Document']
---
Q: What drove operating margin change as of FY2022 for 3M? If operating m

In [16]:
# Our Rust implementation — in your notebook
import pageindex_rs
index = pageindex_rs.PageIndex.from_file("test_doc", "/Volumes/ExtraStorage/PageIndexRust/tests/test_doc.md")
print(index.outline())
print(index.to_json())

[1] Introduction to Machine Learning
  [1.1] Supervised Learning
    [1.1.1] Classification
    [1.1.2] Regression
  [1.2] Unsupervised Learning
    [1.2.1] Clustering
    [1.2.2] Dimensionality Reduction
  [1.3] Reinforcement Learning
    [1.3.1] Policy Gradient Methods
    [1.3.2] Q-Learning
{
  "doc_id": "test_doc",
  "title": "Introduction to Machine Learning",
  "description": null,
  "root": {
    "node_id": "1",
    "title": "Introduction to Machine Learning",
    "depth": 1,
    "text": "",
    "summary": null,
    "children": [
      {
        "node_id": "1.1",
        "title": "Supervised Learning",
        "depth": 2,
        "text": "Supervised learning uses labeled data to train models. The algorithm learns a mapping from inputs to outputs based on example input-output pairs.",
        "summary": null,
        "children": [
          {
            "node_id": "1.1.1",
            "title": "Classification",
            "depth": 3,
            "text": "Classification predicts