### Extract data from PDF

In [None]:
from llm_synthesis.transformers.pdf_extraction.mistral_pdf_extractor import (
    MistralPDFExtractor,
)

pdf_path = "<path_to_your_pdf_file>"  # Replace with your PDF file path

# Read the PDF inside a context manager so the file handle is closed as soon
# as the bytes are loaded, instead of being left open until garbage collection.
with open(pdf_path, "rb") as pdf_file:
    pdf_bytes = pdf_file.read()

# Hand the raw PDF bytes to the Mistral-based extractor.
extracted_text = MistralPDFExtractor().forward(pdf_bytes)

print(extracted_text)

### Extract figures from extracted data

In [None]:
from llm_synthesis.transformers.figure_extraction.regex_figure_extractor import (
    FigureExtractorMarkdown,
)

# Instantiate the markdown figure extractor with its default settings.
figure_extractor = FigureExtractorMarkdown()

# Run it over the text produced by the PDF extraction cell; the result is
# indexed as a sequence of figure objects by the following cells.
figures = figure_extractor.forward(extracted_text)

In [None]:
import base64

from IPython.display import Image

# Decode the first figure's base64 payload into raw bytes and wrap it in an
# IPython Image; as the last expression of the cell, the notebook renders it.
# NOTE(review): figures[0] assumes at least one figure was extracted — confirm.
Image(base64.b64decode(figures[0].base64_data))

In [None]:
from llm_synthesis.models.figure import FigureInfoWithPaper

# Copy every attribute of the first extracted figure into a
# FigureInfoWithPaper and attach the full paper text as context.
# No supplementary-information text is supplied, so si_text stays empty.
figure = FigureInfoWithPaper(
    paper_text=extracted_text,
    si_text="",
    **vars(figures[0]),
)

### Call Claude API for extraction

In [None]:
# Claude Cost Tracking Helper
def track_claude_costs(extractor, operation_name="Claude Operation"):
    """
    Report the cost currently recorded on a Claude extractor.

    Reads the cost through ``extractor.get_cost()``, prints a one-line
    summary, and returns the same information as a dictionary.

    Args:
        extractor: Object exposing ``get_cost()`` (e.g. a
            ClaudeLinePlotDataExtractor instance).
        operation_name: Label used in the printed line and in the result.

    Returns:
        dict: ``operation`` (the label), ``cost_usd`` (the value returned by
        ``get_cost()``) and ``formatted_cost`` (a six-decimal dollar string,
        or a "no cost data available" placeholder when the cost is not
        greater than zero).
    """
    current_cost = extractor.get_cost()

    # Anything that is not strictly positive is shown as the placeholder.
    if current_cost > 0:
        display_cost = f"${current_cost:.6f}"
    else:
        display_cost = "$0.00 (no cost data available)"

    print(f"💰 {operation_name} Cost: {display_cost}")

    return {
        "operation": operation_name,
        "cost_usd": current_cost,
        "formatted_cost": display_cost,
    }

def reset_claude_costs(extractor):
    """Call ``extractor.reset_cost()``, print the value it returns as the previous total, and return it."""
    total_before_reset = extractor.reset_cost()
    message = f"🔄 Cost counter reset. Previous total: ${total_before_reset:.6f}"
    print(message)
    return total_before_reset

def create_cost_summary(cost_records):
    """
    Print a framed listing of the tracked costs and return their sum.

    Args:
        cost_records: Re-iterable collection of dicts carrying the keys
            ``operation``, ``cost_usd`` and ``formatted_cost``. It is
            traversed twice: once to total, once to list.

    Returns:
        The sum of every record's ``cost_usd`` (``0`` for an empty collection).
    """
    # Total first, so a malformed record fails before anything is printed.
    grand_total = sum([entry["cost_usd"] for entry in cost_records])

    heavy_rule = "=" * 50
    light_rule = "-" * 50

    print("\n" + heavy_rule)
    print("📊 CLAUDE API COST SUMMARY")
    print(heavy_rule)
    for entry in cost_records:
        operation, shown_cost = entry["operation"], entry["formatted_cost"]
        print(f"   • {operation}: {shown_cost}")
    print(light_rule)
    print(f"   💰 TOTAL COST: ${grand_total:.6f}")
    print(heavy_rule)

    return grand_total

In [None]:
from llm_synthesis.transformers.plot_extraction.claude_extraction.plot_data_extraction import (
    ClaudeLinePlotDataExtractor,
)

# Create the Claude-backed line-plot extractor for the chosen model.
extractor = ClaudeLinePlotDataExtractor(model_name="claude-sonnet-4-20250514")

# Optional: reset the cost counter before the extraction call.
reset_claude_costs(extractor)

# Run the extraction on the prepared figure and show the result.
extracted_data = extractor.forward(figure)
print("Extracted data:", extracted_data, sep="\n")

# Record what the extractor reports as the cost of this operation, and keep
# it in a list so the summary cell can total it later (optional).
cost_info = track_claude_costs(extractor, "Plot Data Extraction")
cost_records = [cost_info]

### Visualize the extracted data series with their labels and axes for the chart

In [None]:
from llm_synthesis.utils.visualization import visulize_line_chart

# Plot the extracted series using the project's visualization helper.
# NOTE(review): "visulize" mirrors the spelling used in the import above —
# confirm it matches the helper's actual name in llm_synthesis.utils.visualization.
visulize_line_chart(extracted_data)


In [None]:
# Final Cost Summary
# json is only needed by the optional save snippet at the bottom of the cell.
import json
from datetime import datetime

print("📈 Final Claude API Cost Report")
total_session_cost = create_cost_summary(cost_records)

# Cross-check: read the running total straight from the extractor as well.
final_extractor_cost = extractor.get_cost()
print(f"\n🔍 Extractor total cost: ${final_extractor_cost:.6f}")

# Bundle the session's cost data into one JSON-serialisable report.
cost_report = dict(
    timestamp=datetime.now().isoformat(),
    notebook="claude_coor_extractor.ipynb",
    total_cost_usd=total_session_cost,
    operations=cost_records,
    model_used="claude-sonnet-4-20250514",
)

# Uncomment to save cost report to file
# with open("claude_cost_report.json", "w") as f:
#     json.dump(cost_report, f, indent=2)
# print("💾 Cost report saved to claude_cost_report.json")