# Parsing Code using Llama Parse v2.0
UCB MFE FT ID GROUP - RL \\
In order to run this code, you need api from llama Cloud: \\
you would need LLAMA_CLOUD_API_KEY @ https://cloud.llamaindex.ai/project/bf1f404e-ee25-4695-a2e6-eb5519ac661a \\
Documentation: https://docs.cloud.llamaindex.ai/llamaparse/getting_started/python

For the data porting, we choose to use openai to handel all the information and data cleaning and organisation, so that it looks exactly like the excel format which was provided by Luis.

In [1]:
from llama_cloud_services import LlamaParse, LlamaReport, LlamaExtract
from pydantic import BaseModel, Field
from typing import Optional
from dotenv import load_dotenv
import os
import pandas as pd
import nest_asyncio
from typing import List, Optional, Dict
from pathlib import Path
import json
import xlsxwriter

load_dotenv()
nest_asyncio.apply()

In [2]:
os.environ["LLAMA_CLOUD_API_KEY"]="llx-kS6nFu8QgI84j17LsTPY7n1uNMtrVVxKEdsZqRpT3KegS06w"

In [47]:
class FinancialPosition(BaseModel):
    year: int = Field(
        description=(
            "Fiscal year for which the balance sheet data applies. "
            "Only extract data where year == 2024; ignore entries for any other years."
        )
    )
    cash_and_short_term_investments: Optional[int] = Field(
        description=(
            "Total unrestricted cash and short-term investments at year-end (2024). "
            "Exclude restricted cash or endowments. Include cash equivalents and highly liquid instruments."
        )
    )
    student_receivables_net: Optional[int] = Field(
        description=(
            "Net student receivables (2024) after allowance for doubtful accounts. "
            "Only include the net figure; ignore separate gross and allowance values."
        )
    )
    accounts_receivable: Optional[int] = Field(
        description=(
            "Total accounts receivable at year-end 2024. Exclude intercompany or intra-campus balances."
        )
    )
    contributions_receivable: Optional[int] = Field(
        description=(
            "Pledges and donations receivable for 2024 that are not yet collected. "
            "Exclude amounts already recognized as income or for prior/future periods."
        )
    )
    notes_receivable: Optional[int] = Field(
        description=(
            "Outstanding notes receivable (2024), usually loan-based. Extract only current-year figures."
        )
    )
    loans_receivable_net: Optional[int] = Field(
        description=(
            "Student or institutional loans receivable for 2024, net of allowances. "
            "Exclude gross figures or those from past fiscal years."
        )
    )
    other_assets: Optional[int] = Field(
        description=(
            "Other assets not listed separately for 2024. Review footnotes to avoid duplication. "
            "Only include items related to the 2024 reporting period."
        )
    )
    investments: Optional[int] = Field(
        description=(
            "Investments held at fair value as of 2024, including equities and bonds. "
            "Exclude endowment funds if reported separately."
        )
    )
    right_of_use_assets: Optional[int] = Field(
        description=(
            "Total right-of-use (ROU) assets under ASC 842 for 2024, including both operating and finance leases. "
            "Confirm they are assets, not liabilities."
        )
    )
    land_buildings_equipment_net: Optional[int] = Field(
        description=(
            "Net book value of fixed assets (land, buildings, equipment) as of 2024, after accumulated depreciation."
        )
    )
    accumulated_depreciation: Optional[int] = Field(
        description=(
            "Total accumulated depreciation recorded against PP&E as of year-end 2024. "
            "Confirm this is a negative contra-asset."
        )
    )
    rou_assets_finance_lease: Optional[int] = Field(
        description=(
            "Right-of-use assets under finance leases for 2024 (not operating leases). Confirm lease classification."
        )
    )
    rou_assets_operating_lease: Optional[int] = Field(
        description=(
            "Right-of-use assets under operating leases for 2024. Confirm segregation from finance leases."
        )
    )
    current_portion_long_term_debt: Optional[int] = Field(
        description=(
            "Portion of long-term debt due within one year as of 2024. Exclude total debt figures."
        )
    )
    current_portion_operating_lease: Optional[int] = Field(
        description=(
            "Current-year portion (2024) of operating lease liabilities."
        )
    )
    short_term_debt: Optional[int] = Field(
        description=(
            "Short-term borrowings due within 2024 (e.g., commercial paper, credit lines)."
        )
    )
    total_assets: Optional[int] = Field(
        description=(
            "Sum of all reported assets as of December 31, 2024. "
            "Use only if explicitly reported; otherwise infer via component summation."
        )
    )
    accounts_payable: Optional[int] = Field(
        description=(
            "Total trade payables and invoices owed as of year-end 2024."
        )
    )
    student_deposits_and_deferred_revenue: Optional[int] = Field(
        description=(
            "Unearned tuition or fees collected in advance for 2024. "
            "Exclude other deferred revenues not related to students."
        )
    )
    tenant_capital_improvements: Optional[int] = Field(
        description=(
            "Leasehold obligations for tenant improvements as of 2024. Extract only if explicitly stated."
        )
    )
    bonds_payable_net: Optional[int] = Field(
        description=(
            "Long-term bonds payable, net of discounts/premiums, as of 2024. "
            "Exclude gross issuance values if not amortized."
        )
    )
    refundable_advances_us_govt: Optional[int] = Field(
        description=(
            "Federal grants or advances (e.g., Pell grants) that are refundable at year-end 2024."
        )
    )
    lease_obligations: Optional[int] = Field(
        description=(
            "Total lease liabilities recognized (current + non-current) under ASC 842 for 2024."
        )
    )
    liabilities_under_split_interest_agreements: Optional[int] = Field(
        description=(
            "Liabilities under split-interest trusts (e.g., charitable remainder trusts) reported for 2024. "
            "Refer to notes for detail."
        )
    )
    liabilities_associated_with_investments: Optional[int] = Field(
        description=(
            "Liabilities tied to investments (e.g., margin debt, derivatives) as of 2024."
        )
    )
    non_controlling_interests: Optional[int] = Field(
        description=(
            "Minority equity interests in consolidated subsidiaries as of 2024."
        )
    )
    total_liabilities: Optional[int] = Field(
        description=(
            "Sum of all liabilities reported for 2024. Extract explicitly if available."
        )
    )
    net_assets_with_donor_restrictions: Optional[int] = Field(
        description=(
            "Year-end 2024 net assets restricted by donors for time or purpose. "
            "Exclude board-designated or unrestricted funds."
        )
    )
    net_assets_without_donor_restrictions: Optional[int] = Field(
        description=(
            "Unrestricted net assets as of 2024, not subject to donor conditions."
        )
    )
    total_net_assets: Optional[int] = Field(
        description=(
            "Sum of restricted and unrestricted net assets for 2024."
        )
    )
    total_liabilities_and_net_assets: Optional[int] = Field(
        description=(
            "Balance sheet total as of 2024. Should match total assets; verify consistency."
        )
    )

In [48]:

class StatementOfActivities(BaseModel):
    year: int = Field(
        description=(
            "Fiscal year for which the income statement data applies. "
            "Only extract data where year == 2024; ignore entries for any other years."
        )
    )
    tuition_and_fees_net: Optional[int] = Field(
        description=(
            "Tuition and fees revenue (net of scholarships, discounts, allowances) for 2024."
        )
    )
    auxiliary_enterprises: Optional[int] = Field(
        description=(
            "Revenue from auxiliary services (housing, dining, bookstores, etc.) in 2024."
        )
    )
    government_grants: Optional[int] = Field(
        description=(
            "Operating grants and appropriations received from government sources in 2024."
        )
    )
    contributions_with_donor_restrictions: Optional[int] = Field(
        description=(
            "Gifts/donations restricted by donor conditions for 2024. Exclude unrestricted funds."
        )
    )
    contributions_without_donor_restrictions: Optional[int] = Field(
        description=(
            "Unrestricted donations available for general use in 2024."
        )
    )
    contributions: Optional[int] = Field(
        description=(
            "Total contributions for the year 2024 (sum of restricted + unrestricted)."
        )
    )
    investment_income_operations_with_donor: Optional[int] = Field(
        description=(
            "Income earned on donor-restricted investments (e.g., endowment) during 2024."
        )
    )
    investment_income_operations_without_donor: Optional[int] = Field(
        description=(
            "Income from investments without donor restriction (operating fund earnings) in 2024."
        )
    )
    investment_income_operations: Optional[int] = Field(
        description=(
            "Total investment income from operations for 2024. "
            "Includes both donor-restricted and unrestricted lines."
        )
    )
    other_investment_income: Optional[int] = Field(
        description=(
            "Gains or losses not categorized under operating income (realized/unrealized) in 2024."
        )
    )
    other_sources: Optional[int] = Field(
        description=(
            "Other non-operating or miscellaneous revenues for 2024."
        )
    )
    total_revenue: Optional[int] = Field(
        description=(
            "Total revenues for 2024. Extract only if explicitly stated."
        )
    )
    net_assets_released_from_restrictions: Optional[int] = Field(
        description=(
            "Donor-restricted funds reclassified as unrestricted after satisfying restrictions in 2024."
        )
    )
    compensation: Optional[int] = Field(
        description=(
            "Salaries and wages for faculty, staff, and student workers during 2024."
        )
    )
    fringe_benefits: Optional[int] = Field(
        description=(
            "Employee benefits (healthcare, pension, etc.) for 2024."
        )
    )
    supplies_services_general: Optional[int] = Field(
        description=(
            "General expenditures on supplies and services (excluding compensation) in 2024."
        )
    )
    depreciation: Optional[int] = Field(
        description=(
            "Depreciation expense related to capital assets for the fiscal year 2024."
        )
    )
    interest: Optional[int] = Field(
        description=(
            "Interest expense on debt instruments (bonds, loans) during 2024."
        )
    )
    utilities_occupancy: Optional[int] = Field(
        description=(
            "Utilities costs and building occupancy expenses (rent, maintenance, etc.) for 2024."
        )
    )
    total_expenses: Optional[int] = Field(
        description=(
            "Total operating and non-operating expenses for 2024. Only extract if clearly labeled."
        )
    )
    change_in_net_assets_with_donor_restrictions: Optional[int] = Field(
        description=(
            "Net increase/decrease in donor-restricted net assets from operations/investment in 2024."
        )
    )
    change_in_net_assets_without_donor_restrictions: Optional[int] = Field(
        description=(
            "Net increase/decrease in unrestricted net assets for 2024."
        )
    )
    total_change_in_net_assets: Optional[int] = Field(
        description=(
            "Overall change in net assets for the fiscal year 2024."
        )
    )

In [49]:
# Final schema container
class FinancialStatement_2425(BaseModel):
    financial_position: List[FinancialPosition]
    statement_of_activities: List[StatementOfActivities]

In [7]:
extractor = LlamaExtract()
agent_id = "c6827d74-e760-4a28-ba87-c12bfc70d7af"
# agent = extractor.create_agent(name = "fs-parser-2425", data_schema=FinancialStatement_2425)

No project_id provided, fetching default project.


In [9]:
agent = extractor.get_agent(id = agent_id)

In [50]:
#uncomment the following lines if you updated the schema
agent.data_schema = FinancialStatement_2425
agent.save()
agent = extractor.get_agent(id = agent_id)

# This is a single school test to see if the extraction works accrodingly. Can choose to skip this part.

In [27]:
# ✅ 1. Use actual file path (Path object ensures compatibility)
pdf_path = Path("/Users/richard_li/Franklin_Templton_Industry_Project/scrapping/university_pdfs/ARIZONA_STATE_UNIVERSITY/FY_2024_Arizona_State_University_Annual_Comprehensive_Financial_Report_for_the_year_ended_06_30_2024__4.1_MB_.pdf")

# ✅ 2. Run agent.extract with raw file path (not parsed documents)
extractions = agent.extract(str(pdf_path))  # <-- pass as string

Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.90s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.22s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:44<00:00, 44.88s/it]


In [28]:
print(extractions.data)

{'financial_position': [{'year': 2024, 'cash_and_short_term_investments': 645869, 'student_receivables_net': 203500000, 'accounts_receivable': 462300000, 'contributions_receivable': 242195, 'notes_receivable': None, 'loans_receivable_net': 2649, 'other_assets': 262480, 'investments': 2302337000, 'right_of_use_assets': 234616, 'land_buildings_equipment_net': 4087901, 'accumulated_depreciation': -2426794, 'rou_assets_finance_lease': None, 'rou_assets_operating_lease': 234616, 'current_portion_long_term_debt': 183414, 'current_portion_operating_lease': 21135, 'short_term_debt': None, 'total_assets': 6952605, 'accounts_payable': 139132, 'student_deposits_and_deferred_revenue': 407266, 'tenant_capital_improvements': None, 'bonds_payable_net': 2573772, 'refundable_advances_us_govt': None, 'lease_obligations': 303749, 'liabilities_under_split_interest_agreements': None, 'liabilities_associated_with_investments': None, 'non_controlling_interests': None, 'total_liabilities': 4478833, 'net_asset

In [20]:
def save_extraction_to_excel(data: dict, output_path: str):
    all_blocks = []
    for key, value in data.items():
        # Convert list of dicts to DataFrame and transpose
        if isinstance(value, list) and all(isinstance(item, dict) for item in value):
            df = pd.DataFrame(value)
            df = df.set_index("year").T.reset_index().rename(columns={"index": "field"})
        else:
            df = pd.DataFrame([value])
        # Insert section header as the first row of the block
        section_header = pd.DataFrame([[f"== {key.upper()} =="] + [""] * (df.shape[1] - 1)], columns=df.columns)
        all_blocks.extend([section_header, df, pd.DataFrame([[""] * df.shape[1]])])  # add empty row after each section

    # Combine all blocks
    combined_df = pd.concat(all_blocks, ignore_index=True)

    # Write to Excel
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        combined_df.to_excel(writer, sheet_name="Financials", index=False)

    return output_path

In [21]:
output_path = "ASU_extracted_2024.xlsx"
save_extraction_to_excel(extractions.data, output_path)

'yale_extracted_2024.xlsx'

In [22]:
PDF_ROOT = "/Users/richard_li/Franklin_Templton_Industry_Project/scrapping/university_pdfs"
OUTPUT_ROOT = "fs_output_1"
os.makedirs(OUTPUT_ROOT, exist_ok=True)
load_dotenv()

False

# ------------------- Here is the entire school list run----------------------------

In [51]:
# === Configure your agent ===
extractor = LlamaExtract()
agent = extractor.get_agent(id = agent_id)

No project_id provided, fetching default project.


In [52]:
def process_school(school_name, school_dir):
    combined   = {}
    first_keys = None
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")
    
    # ─── INSERT DIAGNOSTIC HERE ───────────────────────────────────────────────────
    # Right after the loop above has populated “combined,” but before you build the DataFrame:
    if first_keys:
        for k, v in combined.items():
            if isinstance(v, (list, tuple)) and len(v) != 1:
                print(f"→ '{k}' is a {type(v).__name__} of length {len(v)}: {v!r}")
    # ────────────────────────────────────────────────────────────────────────────────
    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        outfile = os.path.join(OUTPUT_ROOT, f"{school_name}.xlsx")
        df.to_excel(outfile)
        print(f"Saved output to {outfile}")
    else:
        print(f"No PDF data found for {school_name}")

In [53]:
# Loop over schools
for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue
    print(f"Processing school: {school}")
    process_school(school, school_dir)

print("Extraction complete.")

Processing school: ARIZONA_STATE_UNIVERSITY
Extracting data from FY_2024_Arizona_State_University_Annual_Comprehensive_Financial_Report_for_the_year_ended_06_30_2024__4.1_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:04<00:00,  4.77s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.27s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:32<00:00, 32.57s/it]


Extracting data from FY_2024_Arizona_State_University_Continuing_Disclosure_Undertaking_for_the_year_ended_06_30_2024__160_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.02s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.44it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:13<00:00, 13.56s/it]


Saved output to fs_output_1/ARIZONA_STATE_UNIVERSITY.xlsx
Processing school: BRADLEY_UNIVERSITY
Extracting data from Annual_Financial_Information_and_Operating_Data__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__227_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.51s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.42it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.92s/it]


Extracting data from Audited_Financial_Statements_or_ACFR__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__541_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.82s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.35s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.98s/it]


Saved output to fs_output_1/BRADLEY_UNIVERSITY.xlsx
Processing school: CALIFORNIA_STATE_UNIVERSITY
Extracting data from Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__15.6_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.67s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:03<00:00,  3.13s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:37<00:00, 37.27s/it]


Extracting data from Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__308_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.01s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.74s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.48s/it]


Extracting data from Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__1.2_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.64s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.11it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:22<00:00, 22.26s/it]


Extracting data from Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:06<00:00,  6.79s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:03<00:00,  3.19s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:21<00:00, 21.60s/it]


Saved output to fs_output_1/CALIFORNIA_STATE_UNIVERSITY.xlsx
Processing school: CORNELL_UNIVERSITY
Extracting data from 2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__788_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.53s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.40s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:19<00:00, 19.65s/it]


Extracting data from 2024_Operating_Data_for_the_year_ended_06_30_2024__109_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.00it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.96it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.85s/it]


Extracting data from Incorporate_OS_by_Reference_as_of_04_25_2024__2.4_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.60s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.49s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:22<00:00, 22.06s/it]


Saved output to fs_output_1/CORNELL_UNIVERSITY.xlsx
Processing school: CULINARY_INSTITUTE_OF_AMERICA_THE
Extracting data from 2024_Annual_Report_-_Corrected_for_the_year_ended_05_31_2024__130_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.40s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.07s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.61s/it]


Extracting data from 2024_Annual_Report_for_the_year_ended_05_31_2024__129_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.45it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.46it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.48s/it]


Extracting data from 2024_Audited_Financial_Statements_for_the_year_ended_05_31_2024__277_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.11it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.88it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:17<00:00, 17.40s/it]


Saved output to fs_output_1/CULINARY_INSTITUTE_OF_AMERICA_THE.xlsx
Processing school: GANNON_UNIVERSITY
Extracting data from Audited_Financial_Statements_for_the_year_ended_06_30_2024__786_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.65s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.78it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:16<00:00, 16.02s/it]


Extracting data from Continued_Disclosures_Fall_2024_for_the_year_ended_06_30_2024_Document1__203_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.86s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.15it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:48<00:00, 48.15s/it]


Saved output to fs_output_1/GANNON_UNIVERSITY.xlsx
Processing school: LEWIS_UNIVERSITY
Extracting data from Audited_Financial_Statements_for_the_year_ended_06_30_2024__430_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.98s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.48s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:22<00:00, 22.24s/it]


Extracting data from Continuing_Disclosure_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.25s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.50it/s]
Extracting files: 100%|███████████████████████████| 1/1 [01:39<00:00, 99.35s/it]


Saved output to fs_output_1/LEWIS_UNIVERSITY.xlsx
Processing school: MICHIGAN_STATE_UNIVERSITY
Extracting data from Updates_of_Tables_of_Operating_Information_for_the_year_ended_06_30_2024_Document1__143_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.51s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.15s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.57s/it]


Extracting data from Updates_of_Tables_of_Operating_Information_for_the_year_ended_06_30_2024_Document2__8.4_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.58s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.68it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:38<00:00, 38.07s/it]


Saved output to fs_output_1/MICHIGAN_STATE_UNIVERSITY.xlsx
Processing school: MOLLOY_COLLEGE
Extracting data from Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document1__304_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:03<00:00,  3.02s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.05it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.24s/it]


Extracting data from Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document2__142_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.05it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.37it/s]
Extracting files: 100%|███████████████████████████| 1/1 [01:00<00:00, 60.55s/it]


Saved output to fs_output_1/MOLLOY_COLLEGE.xlsx
Processing school: MOUNT_ST_MARY_S_UNIVERSITY_INC
Extracting data from Audited_Annual_Financials_for_the_year_ended_06_30_2024_Document1__29.1_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:04<00:00,  4.89s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.80it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:38<00:00, 38.66s/it]


Extracting data from Audited_Annual_Financials_for_the_year_ended_06_30_2024_Document2__5_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.94s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.46it/s]
Extracting files: 100%|██████████████████████████| 1/1 [02:28<00:00, 148.07s/it]


Saved output to fs_output_1/MOUNT_ST_MARY_S_UNIVERSITY_INC.xlsx
Processing school: NEW_YORK_UNIVERSITY
Extracting data from 2024_Certificate_of_Compliance_-_Audit_for_the_year_ended_06_30_2024__330_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.24it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.77it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.50s/it]


Extracting data from 2024_Certificate_of_Compliance_for_the_year_ended_06_30_2024__323_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.84s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.43s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.86s/it]


Extracting data from 2024_Consolidated_Financial_Statements_New_York_University_for_the_year_ended_06_30_2024__466_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.54s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:04<00:00,  4.77s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:20<00:00, 20.93s/it]


Extracting data from 2024_Operating_Data_for_the_year_ended_06_30_2024__244_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.02it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.67it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:55<00:00, 55.10s/it]


Saved output to fs_output_1/NEW_YORK_UNIVERSITY.xlsx
Processing school: OHIO_STATE_UNIVERSITY_THE
Extracting data from The_Ohio_State_University_Annual_Update_to_Bondholders_for_the_year_ended_06_30_2024__577_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.00it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.00it/s]
Extracting files: 100%|███████████████████████████| 1/1 [01:15<00:00, 75.96s/it]


Extracting data from The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_03_31_2024__577_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.40s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:10<00:00, 10.19s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.79s/it]


Extracting data from The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_06_30_2024__576_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.01it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:07<00:00,  7.49s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.25s/it]


Extracting data from The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_09_30_2024__574_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.06s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.40it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.12s/it]


Extracting data from The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_12_31_2024__815_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.24s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.39it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:13<00:00, 13.33s/it]


Saved output to fs_output_1/OHIO_STATE_UNIVERSITY_THE.xlsx
Processing school: PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE
Extracting data from Harvard_University_Audited_Financial_Information_for_the_year_ended_06_30_2024__10.6_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:04<00:00,  4.91s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.50s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:26<00:00, 26.19s/it]


Extracting data from Harvard_University_Financial_Report_for_the_year_ended_06_30_2024__10.6_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.38s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.10it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:22<00:00, 22.20s/it]


Extracting data from Harvard_University_Student_Applications_and_Enrollment_for_the_year_ended_06_30_2024__557_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.36s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.14it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:06<00:00,  6.77s/it]


Saved output to fs_output_1/PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE.xlsx
Processing school: STEVENSON_UNIVERSITY_INC
Extracting data from 2024_Annual_Compliance_Certificate__EagleBank__for_the_year_ended_06_30_2024__3_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.16s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.17s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:27<00:00, 27.17s/it]


Extracting data from 2024_Annual_Compliance_Certificate_for_the_year_ended_06_30_2024__2.8_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.29s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.84it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:12<00:00, 12.49s/it]


Extracting data from 2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__348_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.75s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:03<00:00,  3.04s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:13<00:00, 13.49s/it]


Extracting data from 2024_Operating_Data_for_the_year_ended_06_30_2024__196_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.33s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:06<00:00,  6.53s/it]
Extracting files: 100%|███████████████████████████| 1/1 [01:10<00:00, 70.19s/it]


Saved output to fs_output_1/STEVENSON_UNIVERSITY_INC.xlsx
Processing school: STEVENS_INSTITUTE_OF_TECHNOLOGY
Extracting data from Annual_Report_for_the_year_ended_06_30_2024__216_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:04<00:00,  4.47s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.23s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:44<00:00, 44.10s/it]


Extracting data from Audit_Financial_Statement_for_the_year_ended_06_30_2024__626_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.98s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:07<00:00,  7.14s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.03s/it]


Saved output to fs_output_1/STEVENS_INSTITUTE_OF_TECHNOLOGY.xlsx
Processing school: ST_LOUIS_UNIVERSITY_US
Extracting data from Amendment_to_Continuing_Disclosure_Undertaking_dated_01_05_2024__392_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.31s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:03<00:00,  3.59s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:06<00:00,  6.99s/it]


Extracting data from Audited_Financials_and_Operating_Data_for_the_year_ended_06_30_2024_Document1__561_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:06<00:00,  6.83s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.26it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.46s/it]


Extracting data from Audited_Financials_and_Operating_Data_for_the_year_ended_06_30_2024_Document2__174_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.46s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:16<00:00, 16.73s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:06<00:00,  6.41s/it]


Saved output to fs_output_1/ST_LOUIS_UNIVERSITY_US.xlsx
Processing school: TEXAS_A_M_UNIVERSITY
Extracting data from FY_2024_Continuing_Disclosure_Annual_Report_for_the_year_ended_08_31_2024__2.3_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.40s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.44it/s]
Extracting files: 100%|██████████████████████████| 1/1 [03:43<00:00, 223.16s/it]


Extracting data from Texas_A_M_University_System_Unaudited_Annual_Financial_Reports_for_the_year_ended_08_31_2024__2_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.78s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.68s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:29<00:00, 29.73s/it]


Saved output to fs_output_1/TEXAS_A_M_UNIVERSITY.xlsx
Processing school: UNIVERSITY_OF_COLORADO
Extracting data from Financial_and_Operating_Data__Fiscal_Year_2024_for_the_year_ended_06_30_2024__278_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.02s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.31it/s]
Extracting files: 100%|███████████████████████████| 1/1 [01:01<00:00, 61.27s/it]


Saved output to fs_output_1/UNIVERSITY_OF_COLORADO.xlsx
Processing school: UNIVERSITY_OF_MINNESOTA
Extracting data from Annual_Financial_Information_and_Operating_Data_for_the_year_ended_06_30_2024_Document1__1.3_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:03<00:00,  3.01s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.25s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:21<00:00, 21.32s/it]


Extracting data from Annual_Financial_Information_and_Operating_Data_for_the_year_ended_06_30_2024_Document2__315_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.85s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.56s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:51<00:00, 51.89s/it]

Saved output to fs_output_1/UNIVERSITY_OF_MINNESOTA.xlsx
Extraction complete.





In [56]:
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")

writer = pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl")

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue

    combined   = {}
    first_keys = None
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        sheet_name = school[:31]
        df.to_excel(writer, sheet_name=sheet_name)
    else:
        print(f"No data for {school}.")

writer.close()
print(f"All schools written to {OUTPUT_FILE}")

Extracting data from ARIZONA_STATE_UNIVERSITY/FY_2024_Arizona_State_University_Annual_Comprehensive_Financial_Report_for_the_year_ended_06_30_2024__4.1_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.18s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.11it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:28<00:00, 28.37s/it]


Extracting data from ARIZONA_STATE_UNIVERSITY/FY_2024_Arizona_State_University_Continuing_Disclosure_Undertaking_for_the_year_ended_06_30_2024__160_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.24s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.23it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.13s/it]


Extracting data from BRADLEY_UNIVERSITY/Annual_Financial_Information_and_Operating_Data__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__227_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.42s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.18s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.14s/it]


Extracting data from BRADLEY_UNIVERSITY/Audited_Financial_Statements_or_ACFR__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__541_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.11s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.49it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:12<00:00, 12.09s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__15.6_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:07<00:00,  7.68s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.58it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:30<00:00, 30.49s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__308_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.78s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.84it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.67s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__1.2_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.21s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.92it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:20<00:00, 20.67s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.07s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.16it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.95s/it]


Extracting data from CORNELL_UNIVERSITY/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__788_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:04<00:00,  4.86s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.46it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:18<00:00, 18.13s/it]


Extracting data from CORNELL_UNIVERSITY/2024_Operating_Data_for_the_year_ended_06_30_2024__109_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.11it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.63it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.18s/it]


Extracting data from CORNELL_UNIVERSITY/Incorporate_OS_by_Reference_as_of_04_25_2024__2.4_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.69s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.64it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:18<00:00, 18.47s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Annual_Report_-_Corrected_for_the_year_ended_05_31_2024__130_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.22s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.60it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.00s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Annual_Report_for_the_year_ended_05_31_2024__129_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.34it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.08it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.86s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Audited_Financial_Statements_for_the_year_ended_05_31_2024__277_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.39s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.78it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:12<00:00, 12.17s/it]


Extracting data from GANNON_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__786_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.10s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.83it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.98s/it]


Extracting data from GANNON_UNIVERSITY/Continued_Disclosures_Fall_2024_for_the_year_ended_06_30_2024_Document1__203_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.31s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.27it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.71s/it]


Extracting data from LEWIS_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__430_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.41s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.04s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.82s/it]


Extracting data from LEWIS_UNIVERSITY/Continuing_Disclosure_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.08s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.04it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.58s/it]


Extracting data from MICHIGAN_STATE_UNIVERSITY/Updates_of_Tables_of_Operating_Information_for_the_year_ended_06_30_2024_Document1__143_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.24it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.64it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.31s/it]


Extracting data from MICHIGAN_STATE_UNIVERSITY/Updates_of_Tables_of_Operating_Information_for_the_year_ended_06_30_2024_Document2__8.4_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:03<00:00,  3.84s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.04s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:39<00:00, 39.54s/it]


Extracting data from MOLLOY_COLLEGE/Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document1__304_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.71s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.24it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:19<00:00, 19.87s/it]


Extracting data from MOLLOY_COLLEGE/Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document2__142_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.44s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.71s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.43s/it]


Extracting data from MOUNT_ST_MARY_S_UNIVERSITY_INC/Audited_Annual_Financials_for_the_year_ended_06_30_2024_Document1__29.1_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:18<00:00, 18.39s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.97it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:23<00:00, 23.96s/it]


Extracting data from MOUNT_ST_MARY_S_UNIVERSITY_INC/Audited_Annual_Financials_for_the_year_ended_06_30_2024_Document2__5_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.31s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.08it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:29<00:00, 29.32s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Certificate_of_Compliance_-_Audit_for_the_year_ended_06_30_2024__330_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.07s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.52s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.98s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Certificate_of_Compliance_for_the_year_ended_06_30_2024__323_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.08s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.19it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:06<00:00,  6.91s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Consolidated_Financial_Statements_New_York_University_for_the_year_ended_06_30_2024__466_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.10s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.21it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:16<00:00, 16.51s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Operating_Data_for_the_year_ended_06_30_2024__244_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.07s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.58it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.42s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Annual_Update_to_Bondholders_for_the_year_ended_06_30_2024__577_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.21s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.33it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.69s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_03_31_2024__577_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.14s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.74it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:06<00:00,  6.61s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_06_30_2024__576_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.55s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.58it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:12<00:00, 12.58s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_09_30_2024__574_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.26s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:03<00:00,  3.99s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.31s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_12_31_2024__815_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.38s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.09it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.30s/it]


Extracting data from PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE/Harvard_University_Audited_Financial_Information_for_the_year_ended_06_30_2024__10.6_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:03<00:00,  3.51s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.46it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:29<00:00, 29.84s/it]


Extracting data from PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE/Harvard_University_Financial_Report_for_the_year_ended_06_30_2024__10.6_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:04<00:00,  4.99s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.31it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:20<00:00, 20.30s/it]


Extracting data from PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE/Harvard_University_Student_Applications_and_Enrollment_for_the_year_ended_06_30_2024__557_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.04s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.13it/s]
Extracting files: 100%|███████████████████████████| 1/1 [01:09<00:00, 69.18s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Annual_Compliance_Certificate__EagleBank__for_the_year_ended_06_30_2024__3_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:03<00:00,  3.71s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.13it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:13<00:00, 13.02s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Annual_Compliance_Certificate_for_the_year_ended_06_30_2024__2.8_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.51s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.62it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:12<00:00, 12.04s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__348_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.06s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.09s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:17<00:00, 17.14s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Operating_Data_for_the_year_ended_06_30_2024__196_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.19s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.07it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:12<00:00, 12.95s/it]


Extracting data from STEVENS_INSTITUTE_OF_TECHNOLOGY/Annual_Report_for_the_year_ended_06_30_2024__216_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.00it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.78it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.49s/it]


Extracting data from STEVENS_INSTITUTE_OF_TECHNOLOGY/Audit_Financial_Statement_for_the_year_ended_06_30_2024__626_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.14s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.17it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:16<00:00, 16.01s/it]


Extracting data from ST_LOUIS_UNIVERSITY_US/Amendment_to_Continuing_Disclosure_Undertaking_dated_01_05_2024__392_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.22s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.27it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.29s/it]


Extracting data from ST_LOUIS_UNIVERSITY_US/Audited_Financials_and_Operating_Data_for_the_year_ended_06_30_2024_Document1__561_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.73s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:05<00:00,  5.06s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 12.00s/it]


Extracting data from ST_LOUIS_UNIVERSITY_US/Audited_Financials_and_Operating_Data_for_the_year_ended_06_30_2024_Document2__174_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.06it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.89it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:40<00:00, 40.43s/it]


Extracting data from TEXAS_A_M_UNIVERSITY/FY_2024_Continuing_Disclosure_Annual_Report_for_the_year_ended_08_31_2024__2.3_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.99s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.16it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:24<00:00, 24.56s/it]


Extracting data from TEXAS_A_M_UNIVERSITY/Texas_A_M_University_System_Unaudited_Annual_Financial_Reports_for_the_year_ended_08_31_2024__2_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.33s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.82it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:31<00:00, 31.15s/it]


Extracting data from UNIVERSITY_OF_COLORADO/Financial_and_Operating_Data__Fiscal_Year_2024_for_the_year_ended_06_30_2024__278_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.08s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.39s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.47s/it]


Extracting data from UNIVERSITY_OF_MINNESOTA/Annual_Financial_Information_and_Operating_Data_for_the_year_ended_06_30_2024_Document1__1.3_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.14s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.80it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:16<00:00, 16.52s/it]


Extracting data from UNIVERSITY_OF_MINNESOTA/Annual_Financial_Information_and_Operating_Data_for_the_year_ended_06_30_2024_Document2__315_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.70s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.94it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.78s/it]

All schools written to fs_output_1/all_schools.xlsx





In [59]:
#Combine all the tabs into one sheet if wanted
file_path   = "fs_output_1/all_schools.xlsx"
output_path = "fs_output_1/all_schools_combined.xlsx"

raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

school_series = {
    school: df.iloc[:, 0]                      # first (only) value column
    for school, df in raw.items()
}

df_comb = pd.DataFrame(school_series).T
df_comb.index.name = "School"                 
df_comb.insert(0, "Year", "2024‑2025")

# df_comb.loc['Texas_A&M', ['Total_Headcount','Undergraduate_Headcount']] = \
#     df_comb.loc['Texas_A&M', ['Undergraduate_Headcount','Total_Headcount']].values

# df_comb.loc['California_state_university', 'Undergraduate_Headcount'] = None 
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

print("Saved:", output_path)

Saved: fs_output_1/all_schools_combined.xlsx


In [62]:
# 1. Read the existing Excel file
df_all = pd.read_excel('fs_output_1/all_schools_combined.xlsx')

# 2. Each row in 'financial_position' is a dict → expand to columns
fp_expanded = df_all['financial_position'].apply(pd.Series)
# (Optional) Add a prefix so you know these came from FP
fp_expanded = fp_expanded.add_prefix('FP_')

# 3. Each row in 'statement_of_activities' is a dict → expand to columns
soa_expanded = df_all['statement_of_activities'].apply(pd.Series)
soa_expanded = soa_expanded.add_prefix('SOA_')

# 4. Concatenate the original 'School' & 'Year' columns with the two new DataFrames
expanded_df = pd.concat([df_all[['School', 'Year']], fp_expanded, soa_expanded], axis=1)

# 5. (Optional) Save back out to a new Excel file
expanded_df.to_excel('all_schools_expanded.xlsx', index=False)                    

In [64]:
import pandas as pd
import ast

# 1) Load the combined file
df_all = pd.read_excel('fs_output_1/all_schools_combined.xlsx')

# 2) Convert each “dict‐string” back into a Python dict.
#    If the cell is already a dict (unlikely after Excel→read_excel), leave it as-is.
def to_dict_if_str(x):
    if isinstance(x, str):
        try:
            # literal_eval will transform a string like "{'year': 2024, ...}" into a dict
            return ast.literal_eval(x)
        except Exception:
            # If literal_eval fails for any reason, just return x unchanged
            return x
    else:
        return x

df_all['financial_position'] = df_all['financial_position'].apply(to_dict_if_str)
df_all['statement_of_activities'] = df_all['statement_of_activities'].apply(to_dict_if_str)

# 3) Now expand each column of real dicts into separate columns:
fp_expanded = df_all['financial_position'].apply(pd.Series)
fp_expanded = fp_expanded.add_prefix('FP_')

soa_expanded = df_all['statement_of_activities'].apply(pd.Series)
soa_expanded = soa_expanded.add_prefix('SOA_')

# 4) Concatenate “School” & “Year” with the two expanded pieces
expanded_df = pd.concat([df_all[['School', 'Year']], fp_expanded, soa_expanded], axis=1)

# 5) Inspect the first few rows:
print(expanded_df.head())

# 6) (Optional) Save out to a new Excel so you can open it in Excel directly:
expanded_df.to_excel('all_schools_expanded.xlsx', index=False)

                            School       Year  FP_year  \
0         ARIZONA_STATE_UNIVERSITY  2024‑2025   2024.0   
1               BRADLEY_UNIVERSITY  2024‑2025   2024.0   
2      CALIFORNIA_STATE_UNIVERSITY  2024‑2025   2024.0   
3               CORNELL_UNIVERSITY  2024‑2025   2024.0   
4  CULINARY_INSTITUTE_OF_AMERICA_T  2024‑2025   2024.0   

   FP_cash_and_short_term_investments  FP_student_receivables_net  \
0                            645869.0                         NaN   
1                             17444.0                       678.0   
2                           5793096.0                         NaN   
3                            775832.0                         NaN   
4                          54216000.0                   4477927.0   

   FP_accounts_receivable  FP_contributions_receivable  FP_notes_receivable  \
0                462300.0                      20285.0                  NaN   
1                  3909.0                       2041.0                  NaN   

In [63]:
expanded_df.head

<bound method NDFrame.head of                              School       Year  \
0          ARIZONA_STATE_UNIVERSITY  2024‑2025   
1                BRADLEY_UNIVERSITY  2024‑2025   
2       CALIFORNIA_STATE_UNIVERSITY  2024‑2025   
3                CORNELL_UNIVERSITY  2024‑2025   
4   CULINARY_INSTITUTE_OF_AMERICA_T  2024‑2025   
5                 GANNON_UNIVERSITY  2024‑2025   
6                  LEWIS_UNIVERSITY  2024‑2025   
7         MICHIGAN_STATE_UNIVERSITY  2024‑2025   
8                    MOLLOY_COLLEGE  2024‑2025   
9    MOUNT_ST_MARY_S_UNIVERSITY_INC  2024‑2025   
10              NEW_YORK_UNIVERSITY  2024‑2025   
11        OHIO_STATE_UNIVERSITY_THE  2024‑2025   
12  PRESIDENT___FELLOWS_OF_HARVARD_  2024‑2025   
13         STEVENSON_UNIVERSITY_INC  2024‑2025   
14  STEVENS_INSTITUTE_OF_TECHNOLOGY  2024‑2025   
15           ST_LOUIS_UNIVERSITY_US  2024‑2025   
16             TEXAS_A_M_UNIVERSITY  2024‑2025   
17           UNIVERSITY_OF_COLORADO  2024‑2025   
18          UNIVERSI