# Parsing Code using Llama Parse v2.0
UCB MFE FT ID GROUP - RL \\
In order to run this code, you need an API key from LlamaCloud: \\
set it as LLAMA_CLOUD_API_KEY; a key can be created at https://cloud.llamaindex.ai/project/bf1f404e-ee25-4695-a2e6-eb5519ac661a \\
Documentation: https://docs.cloud.llamaindex.ai/llamaparse/getting_started/python

For the data porting, we chose to use OpenAI to handle all of the information extraction, data cleaning and organisation, so that the output looks exactly like the Excel format that was provided by Luis.

In [1]:
from llama_cloud_services import LlamaParse, LlamaReport, LlamaExtract
from pydantic import BaseModel, Field
from typing import Optional
from dotenv import load_dotenv
import os
import pandas as pd
import nest_asyncio
from typing import List, Optional, Dict
from pathlib import Path
import json

# Read key=value pairs from a local .env file (if one exists) into os.environ.
load_dotenv()
# Patch asyncio so an event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs a
# loop while the LlamaCloud client drives its own — confirm.
nest_asyncio.apply()

In [2]:
# SECURITY: the LlamaCloud API key must never be written into the notebook.
# It is expected to already be in the process environment, e.g. from a
# `LLAMA_CLOUD_API_KEY=...` line in the untracked .env file that load_dotenv()
# reads in the first cell. Any key that was previously pasted into this cell
# should be treated as leaked: revoke it and issue a new one.
import warnings

if not os.environ.get("LLAMA_CLOUD_API_KEY"):
    # Warn instead of raising so the cell itself stays harmless; creating the
    # LlamaCloud client later is what actually needs the key.
    warnings.warn(
        "LLAMA_CLOUD_API_KEY is not set; add it to your .env file or export it "
        "before creating the LlamaExtract client."
    )

In [3]:
# NOTE(review): this rebinds `LlamaParse`, which the first cell already imports
# from `llama_cloud_services`. None of the cells visible here use LlamaParse,
# so this import may be redundant — confirm before removing.
from llama_parse import LlamaParse

In [4]:
# Agent 1: Financial Position
# One record per reporting year; judging by the field names these are the
# statement-of-financial-position line items (assets, liabilities, net assets).
# Declares 32 fields: `year` is a plain int, every other field is Optional[int]
# (the value may be None).
# `year` is the key the Excel export later pivots on (`set_index("year")`).
# NOTE(review): no field has a default. Under pydantic v2 an Optional[...]
# annotation without `= None` is still a required key — confirm the installed
# pydantic version and whether missing line items should be tolerated.
# NOTE(review): monetary units (dollars vs. thousands) are not stated — confirm.
# Documented with comments rather than a docstring on purpose: pydantic copies
# a model's docstring into the generated schema as its description.
class FinancialPosition(BaseModel):
    year: int
    cash_and_short_term_investments: Optional[int]
    student_receivables_net: Optional[int]
    accounts_receivable: Optional[int]
    contributions_receivable: Optional[int]
    notes_receivable: Optional[int]
    loans_receivable_net: Optional[int]
    other_assets: Optional[int]
    investments: Optional[int]
    right_of_use_assets: Optional[int]
    land_buildings_equipment_net: Optional[int]
    accumulated_depreciation: Optional[int]
    rou_assets_finance_lease: Optional[int]
    rou_assets_operating_lease: Optional[int]
    current_portion_long_term_debt: Optional[int]
    current_portion_operating_lease: Optional[int]
    short_term_debt: Optional[int]
    total_assets: Optional[int]
    accounts_payable: Optional[int]
    student_deposits_and_deferred_revenue: Optional[int]
    tenant_capital_improvements: Optional[int]
    bonds_payable_net: Optional[int]
    refundable_advances_us_govt: Optional[int]
    lease_obligations: Optional[int]
    liabilities_under_split_interest_agreements: Optional[int]
    liabilities_associated_with_investments: Optional[int]
    non_controlling_interests: Optional[int]
    total_liabilities: Optional[int]
    net_assets_with_donor_restrictions: Optional[int]
    net_assets_without_donor_restrictions: Optional[int]
    total_net_assets: Optional[int]
    total_liabilities_and_net_assets: Optional[int]


# Agent 2: Statement of Activities
# One record per reporting year; judging by the field names these are revenue,
# expense and change-in-net-assets line items.
# Declares 22 fields: `year` is a plain int, every other field is Optional[int]
# (the value may be None).
# NOTE(review): no field has a default, so under pydantic v2 every key is
# required even though its value may be None — confirm this is intended.
# Comments are used instead of a docstring so the generated schema is unchanged.
class StatementOfActivities(BaseModel):
    year: int
    tuition_and_fees_net: Optional[int]
    auxiliary_enterprises: Optional[int]
    government_grants: Optional[int]
    contributions_with_donor_restrictions: Optional[int]
    contributions_without_donor_restrictions: Optional[int]
    investment_income_operations_with_donor: Optional[int]
    investment_income_operations_without_donor: Optional[int]
    other_investment_income: Optional[int]
    other_sources: Optional[int]
    total_revenue: Optional[int]
    net_assets_released_from_restrictions: Optional[int]
    compensation: Optional[int]
    fringe_benefits: Optional[int]
    supplies_services_general: Optional[int]
    depreciation: Optional[int]
    interest: Optional[int]
    utilities_occupancy: Optional[int]
    total_expenses: Optional[int]
    change_in_net_assets_with_donor_restrictions: Optional[int]
    change_in_net_assets_without_donor_restrictions: Optional[int]
    total_change_in_net_assets: Optional[int]


# Agent 3: Cash Flows
# One record per reporting year; judging by the field names these are the
# operating / investing / financing cash-flow lines plus opening and closing
# cash.
# Declares 14 fields: `year` is a plain int, every other field is Optional[int]
# (the value may be None).
# NOTE(review): sign convention (e.g. whether purchases are stored as negative
# numbers) is not stated anywhere in this file — confirm against the source PDF.
# Comments are used instead of a docstring so the generated schema is unchanged.
class CashFlows(BaseModel):
    year: int
    net_cash_from_operating_activities: Optional[int]
    purchases_of_investments: Optional[int]
    sales_of_investments: Optional[int]
    purchases_of_capex: Optional[int]
    proceeds_from_sale_of_assets: Optional[int]
    net_cash_from_investing_activities: Optional[int]
    bond_repayment: Optional[int]
    govt_grant_reduction: Optional[int]
    restricted_contributions_for_long_term: Optional[int]
    net_cash_from_financing_activities: Optional[int]
    net_change_in_cash: Optional[int]
    cash_beginning_of_year: Optional[int]
    cash_end_of_year: Optional[int]


# Agent 4: Endowment Breakdown
# One record per reporting year; the field names suggest a roll-forward of the
# endowment from `*_start` to `*_end` balances, split into donor-restricted and
# board-designated portions.
# Declares 12 fields: `year` is a plain int, every other field is Optional[int]
# (the value may be None).
# NOTE(review): nothing here enforces that start + movements == end — any such
# reconciliation would have to happen downstream.
# Comments are used instead of a docstring so the generated schema is unchanged.
class EndowmentBreakdown(BaseModel):
    year: int
    donor_restricted_start: Optional[int]
    board_designated_start: Optional[int]
    investment_income_donor: Optional[int]
    investment_gains_donor: Optional[int]
    investment_income_board: Optional[int]
    investment_gains_board: Optional[int]
    new_contributions: Optional[int]
    appropriated_donor: Optional[int]
    appropriated_board: Optional[int]
    donor_restricted_end: Optional[int]
    board_designated_end: Optional[int]


# Agent 5: Investment Valuation
# One record per reporting year; the field names suggest investment balances by
# asset class together with fair-value-hierarchy totals (`level_1_total`,
# `nav_total`).
# Declares 10 fields: `year` is a plain int, every other field is Optional[int]
# (the value may be None).
# NOTE(review): the `*_nav` suffix presumably means "measured at net asset
# value" — confirm against the source notes.
# Comments are used instead of a docstring so the generated schema is unchanged.
class InvestmentValuation(BaseModel):
    year: int
    total_investments: Optional[int]
    domestic_fixed_income: Optional[int]
    domestic_equity: Optional[int]
    international_equity: Optional[int]
    private_equity_nav: Optional[int]
    real_assets_nav: Optional[int]
    hedge_funds_nav: Optional[int]
    level_1_total: Optional[int]
    nav_total: Optional[int]


# Agent 6: Notes Summary (lease_payments_schedule removed)
# One record per reporting year holding figures taken from the notes: liquidity,
# lease liability and the `*_with_donor` amounts.
# Declares 9 fields: `year` is a plain int, every other field is Optional[int]
# (the value may be None).
# The commented-out variant of this model in the next cell additionally carried
# `lease_payments_schedule: Optional[List[LeasePaymentEntry]]`; it is omitted
# here, and `LeasePaymentEntry` is not defined in the visible cells.
# Comments are used instead of a docstring so the generated schema is unchanged.
class NotesSummary(BaseModel):
    year: int
    financial_assets_within_12_months: Optional[int]
    lines_of_credit: Optional[int]
    available_liquidity_total: Optional[int]
    lease_liability: Optional[int]
    instruction_with_donor: Optional[int]
    academic_support_with_donor: Optional[int]
    student_aid_with_donor: Optional[int]
    general_institutional_with_donor: Optional[int]


# Final schema container
# Top-level model assigned to the extraction agent as its data schema. Each
# attribute is a list of per-year records of the corresponding section model;
# all six lists are required (no defaults).
# NOTE(review): the six section models declare 32 + 22 + 14 + 12 + 10 + 9 = 99
# fields and this container adds 6 more, i.e. 105 properties in total. The
# recorded output of the agent-update cell is a 400 "Schema exceeds maximum of
# 100 properties" error, so at least 5 fields still need to be dropped or moved
# to a second agent — assuming the service counts nested and container
# properties alike; confirm against the schema-restrictions documentation.
# Comments are used instead of a docstring so the generated schema is unchanged.
class FinancialStatement_2425(BaseModel):
    financial_position: List[FinancialPosition]
    statement_of_activities: List[StatementOfActivities]
    cash_flows: List[CashFlows]
    endowment: List[EndowmentBreakdown]
    investment_valuation: List[InvestmentValuation]
    notes_summary: List[NotesSummary]

In [5]:
# The extraction schema is limited to 100 fields, so the models below had to be excluded.

# # === Agent 6: Notes Summary ===
# class NotesSummary(BaseModel):
#     year: int
#     financial_assets_within_12_months: Optional[int]
#     lines_of_credit: Optional[int]
#     available_liquidity_total: Optional[int]
#     lease_liability: Optional[int]
#     lease_payments_schedule: Optional[List[LeasePaymentEntry]]
#     instruction_with_donor: Optional[int]
#     academic_support_with_donor: Optional[int]
#     student_aid_with_donor: Optional[int]
#     general_institutional_with_donor: Optional[int]


# # === Agent 7: Financial Aid Summary ===
# class FinancialAidSummary(BaseModel):
#     year: int
#     financial_aid_gifts_grants: Optional[int]
#     financial_aid_loans: Optional[int]
#     financial_aid_work_study: Optional[int]
#     total_financial_aid: Optional[int]
#     institutional_funds: Optional[int]
#     restricted_grants: Optional[int]
#     total_student_aid: Optional[int]

# # === Agent 9: Operating Budget ===
# class OperatingBudget(BaseModel):
#     year: int
#     operating_budget_total: Optional[int]
#     operating_budget_bloomington: Optional[int]
#     operating_budget_indianapolis: Optional[int]
#     operating_budget_regionals: Optional[int]
#     operating_budget_admin: Optional[int]


# # === Agent 10: State Appropriations ===
# class StateAppropriations(BaseModel):
#     year: int
#     state_appropriations: Optional[int]
#     capital_appropriations: Optional[int]


# # === Agent 11: Cash Position Breakout ===
# class CashPositionBreakout(BaseModel):
#     year: int
#     restricted_cash_end_of_year: Optional[int]
#     unrestricted_cash_end_of_year: Optional[int]
#     right_to_use_asset_lease: Optional[int]
#     unrestricted_cash_and_cash_equivalents: Optional[int]
#     total_cash_and_investments: Optional[int]


# # === Agent 12: Debt Service Summary ===
# class DebtServiceSummary(BaseModel):
#     year: int
#     long_term_debt_total: Optional[int]
#     annual_debt_service: Optional[int]
#     net_revenue_for_debt_service: Optional[int]
#     debt_service_coverage_ratio: Optional[float]
#     change_in_net_assets: Optional[int]
#     investment_losses: Optional[int]
#     post_retirement_benefit_obligation: Optional[int]
#     depreciation: Optional[int]
#     interest_expense: Optional[int]


#     operating_budget: List[OperatingBudget]
#     state_appropriations: List[StateAppropriations]
#     cash_position_breakout: List[CashPositionBreakout]
#     debt_service_summary: List[DebtServiceSummary]


# # === Combined Schema ===
# class FinancialStatement_2425(BaseModel):
#     financial_position: List[FinancialPosition]
#     statement_of_activities: List[StatementOfActivities]
#     cash_flows: List[CashFlows]
#     endowment: List[EndowmentBreakdown]
#     investment_valuation: List[InvestmentValuation]
#     notes_summary: List[NotesSummary]
#     financial_aid_summary: List[FinancialAidSummary]
#     tuition_cost: List[TuitionCost]
#     operating_budget: List[OperatingBudget]
#     state_appropriations: List[StateAppropriations]
#     cash_position_breakout: List[CashPositionBreakout]
#     debt_service_summary: List[DebtServiceSummary]

In [6]:
# Client for the LlamaExtract service. It is built with no arguments; the
# recorded output of this cell shows it falling back to the default project.
# NOTE(review): presumably authenticates through LLAMA_CLOUD_API_KEY — confirm.
extractor = LlamaExtract()

# Look up the already-existing extraction agent by its hard-coded id
# (this call does not create an agent).
agent = extractor.get_agent(id="d48a8828-497b-4cf7-88fe-aef9e1c4b95e")

# Replace the agent's schema with the pydantic container model and persist it.
# NOTE(review): the recorded output of this cell is a 400 ApiError ("Schema
# exceeds maximum of 100 properties"), i.e. the update was rejected for the
# schema as written; the cells further down then run against whatever schema
# the agent had stored before.
agent.data_schema = FinancialStatement_2425
agent.save()

No project_id provided, fetching default project.


ApiError: status_code: 400, body: {'detail': 'schema_validation: Schema exceeds maximum of 100 properties.\nSee: https://docs.cloud.llamaindex.ai/llamaextract/features/schema_restrictions'}

In [None]:

# 1. Source PDF. This is an absolute path on the author's machine — change it
#    to wherever the file lives in your environment.
pdf_path = Path("/Users/lirichard/Desktop/FT/Franklin Templeton Industrial Project/PDF/P21856488-P21421176-P21864772.pdf")

# 2. Send the raw file (not pre-parsed documents) to the extraction agent.
#    The path is passed as a plain string.
extractions = agent.extract(str(pdf_path))

# 3. Pretty-print the structured output. The extraction payload is read from
#    `.data`, the same attribute the following cells use; the previous
#    `.result` lookup did not match them.
print(json.dumps(extractions.data, indent=2))

In [None]:
# Show the raw extraction payload (unformatted) for a quick sanity check.
print(extractions.data)

In [None]:
!pip install xlsxwriter

In [None]:
import xlsxwriter
def save_extraction_to_excel(data: dict, output_path: str):
    all_blocks = []
    for key, value in data.items():
        # Convert list of dicts to DataFrame and transpose
        if isinstance(value, list) and all(isinstance(item, dict) for item in value):
            df = pd.DataFrame(value)
            df = df.set_index("year").T.reset_index().rename(columns={"index": "field"})
        else:
            df = pd.DataFrame([value])
        # Insert section header as the first row of the block
        section_header = pd.DataFrame([[f"== {key.upper()} =="] + [""] * (df.shape[1] - 1)], columns=df.columns)
        all_blocks.extend([section_header, df, pd.DataFrame([[""] * df.shape[1]])])  # add empty row after each section

    # Combine all blocks
    combined_df = pd.concat(all_blocks, ignore_index=True)

    # Write to Excel
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        combined_df.to_excel(writer, sheet_name="Financials", index=False)

    return output_path

In [None]:
# Relative path, so the workbook is written to the notebook's current working
# directory.
output_path = "yale_extracted_2024.xlsx"
# Export the extraction payload; the returned path is not used here.
save_extraction_to_excel(extractions.data, output_path)