In [45]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from dotenv import load_dotenv
from typing import Optional
from pydantic import ValidationError

from pydantic import BaseModel, Field
from typing import Optional, Type

import BS_Schema
from importlib import reload

# Reload FIRST so that edits made to BS_Schema.py since the kernel started are
# picked up, and only then bind the factory function. Importing the name before
# the reload (as was done previously) leaves it pointing at the stale,
# pre-reload function object, because reload() does not rebind names that were
# copied out of the module with `from ... import ...`.
BS_Schema = reload(BS_Schema)

from BS_Schema import make_StatementOfFinancialPosition_model

In [47]:
## Build the schema class
# Ask the factory for the Statement of Financial Position schema for 2024.
SFP = make_StatementOfFinancialPosition_model(2024)
# Sanity check: show the generated class name followed by its field names.
print(SFP.__name__)           # → "StatementOfFinancialPosition_2024"
print([field_name for field_name in SFP.__fields__])

StatementOfFinancialPosition_2024
['year', 'cash_and_short_term_investments_unrestricted_and_restricted', 'accumulated_depreciation', 'net_fixed_assets', 'long_term_investments_unrestricted_and_restricted', 'total_assets', 'short_term_debt', 'long_term_debt', 'pension_and_opeb_liability', 'total_liabilities', 'net_assets_without_donor_restrictions', 'expendable_net_assets_with_donor_restrictions', 'perpetual_net_assets_with_donor_restrictions', 'net_assets_with_donor_restrictions', 'total_net_assets', 'total_liabilities_and_net_assets', 'net_receivables', 'rou_assets_finance_lease', 'rou_assets_operating_lease', 'other_assets', 'current_portion_finance_lease', 'current_portion_long_term_debt', 'current_portion_operating_lease', 'accounts_payable', 'deferred_revenue', 'long_term_finance_lease', 'long_term_operating_lease', 'swap_obligation_fmv', 'pension_liability', 'opeb_liability', 'other_liabilities', 'net_assets', 'noncontrolling_interest']


In [51]:
# --- Paths -------------------------------------------------------------------
PDF_ROOT = "scrapping/university_pdfs/"      # one sub-directory of PDFs per school
OUTPUT_ROOT = "output_Balance_Sheet_DS"      # where Excel outputs are written
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# --- LlamaCloud identifiers (differ per LlamaCloud account) --------------------
AGENT_ID = "bcb15a18-67ac-4772-9965-2654ecaff88c"
PROJECT_ID = "8c10e62e-3810-4193-915d-d2d11105826d"
Year = 2024

# --- Credentials ---------------------------------------------------------------
# SECURITY: the API key must never be written in the notebook. It is read from
# the environment; load_dotenv() populates the environment from a local .env
# file (which must stay out of version control). A key was previously
# hardcoded in this cell -- treat that key as leaked and rotate it.
load_dotenv()
api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
if not api_key:
    raise RuntimeError(
        "LLAMA_CLOUD_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this cell."
    )
extractor = LlamaExtract(api_key=api_key, project_id=PROJECT_ID)

# Uncomment the line below only when creating the agent for the first time.
# agent = extractor.create_agent(name = "balance-sheet-parser-v1", data_schema=SFP)

agent = extractor.get_agent(id=AGENT_ID)

# Push the current SFP schema to the agent and re-fetch it. These lines are
# only needed after the schema has changed; comment them out otherwise.
agent.data_schema = SFP
agent.save()
agent = extractor.get_agent(id=AGENT_ID)

In [53]:
# Display the data schema currently attached to the agent so it can be checked
# by eye against the SFP model (e.g. after the schema was re-saved).
agent.data_schema

{'additionalProperties': False,
 'properties': {'year': {'description': 'Only extract from the full Statement of Financial Position (Balance Sheet); do not use condensed or net financial position tables, and do not perform any calculations—capture the number exactly as shown. The fiscal year for all line‐items: 2024.',
   'type': 'integer'},
  'cash_and_short_term_investments_unrestricted_and_restricted': {'anyOf': [{'type': 'integer'},
    {'type': 'null'}],
   'description': 'Only extract from the full Statement of Financial Position (Balance Sheet); do not use condensed or net financial position tables, and do not perform any calculations—capture the number exactly as shown. Cash & Short-Term Investments (both unrestricted & restricted) as of June 30, 2024.'},
  'accumulated_depreciation': {'anyOf': [{'type': 'integer'},
    {'type': 'null'}],
   'description': 'Only extract from the full Statement of Financial Position (Balance Sheet); do not use condensed or net financial position

In [57]:
def process_school(school_name, school_dir, column_label="2024-25"):
    """Extract every PDF of one school and save the merged result to Excel.

    Each ``.pdf`` file in ``school_dir`` (processed in sorted filename order)
    is sent to the module-level ``agent``. The per-file results are merged
    into a single dict: a later file overwrites an earlier value for the same
    key only when its own value is not empty (``None``, ``""`` or ``[]``).
    The merged dict is written to ``<OUTPUT_ROOT>/<school_name>.xlsx`` with
    one row per metric.

    Parameters
    ----------
    school_name : str
        Used for the output file name and in log messages.
    school_dir : str
        Directory containing the school's PDF files.
    column_label : str, optional
        Header of the single value column in the Excel sheet.

    Notes
    -----
    Relies on the module-level names ``agent`` and ``OUTPUT_ROOT``.
    A file whose extraction raises is reported and skipped (best effort).
    """
    combined = {}

    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {fname}")
        try:
            run = agent.extract(path)
            data = run.data or {}
            for k, v in data.items():
                # Register every key so the output keeps a row for it even
                # when no file provided a value.
                combined.setdefault(k, None)
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            # Deliberate best effort: one bad PDF must not abort the school.
            print(f"Skipped {fname}: {err}")

    # Gate on the merged dict itself. The previous check used the key list of
    # the FIRST extraction only, so an empty first result caused data merged
    # from later PDFs to be silently discarded.
    if combined:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=[column_label])
        df.index.name = "Metric"
        outfile = os.path.join(OUTPUT_ROOT, f"{school_name}.xlsx")
        df.to_excel(outfile)
        print(f"Saved output to {outfile}")
    else:
        print(f"No PDF data found for {school_name}")

In [65]:
# Run the extraction agent over every school directory under PDF_ROOT and
# collect one merged dict of metrics per school.
results = {}

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue

    # 1) Gather all PDFs in the directory
    pdf_files = [
        f for f in sorted(os.listdir(school_dir))
        if f.lower().endswith(".pdf")
    ]

    # 2) Decide which PDFs to read: with several PDFs, prefer those whose
    #    file name mentions "financial"; fall back to all of them otherwise.
    if len(pdf_files) == 1:
        to_read = pdf_files
    else:
        financial_only = [f for f in pdf_files if "financial" in f.lower()]
        to_read = financial_only if financial_only else pdf_files

    # 3) Extract each selected PDF and merge the results. Files are processed
    #    in sorted order and a later file's non-empty value overwrites an
    #    earlier one for the same key.
    combined = {}

    for fname in to_read:
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run = agent.extract(path)
            data = run.data or {}
            for k, v in data.items():
                # Register every key so the column exists even without a value.
                combined.setdefault(k, None)
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            # Deliberate best effort: one bad PDF must not abort the whole run.
            print(f"Skipped {fname}: {err}")

    # Gate on the merged dict itself. The previous check used the key list of
    # the FIRST extraction only, so an empty first result caused data merged
    # from later PDFs of the same school to be silently discarded.
    if combined:
        # store this school's combined dict in results
        results[school] = combined
    else:
        print(f"No data for {school}.")


# Build one DataFrame: rows=schools, columns=metrics
df_all = pd.DataFrame.from_dict(results, orient="index")




Extracting data from ARIZONA_STATE_UNIVERSITY/FY_2024_Arizona_State_University_Annual_Comprehensive_Financial_Report_for_the_year_ended_06_30_2024__4.1_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.11s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.24s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:24<00:00, 24.51s/it]


Extracting data from BRADLEY_UNIVERSITY/Annual_Financial_Information_and_Operating_Data__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__227_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.39it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.76s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.21s/it]


Extracting data from BRADLEY_UNIVERSITY/Audited_Financial_Statements_or_ACFR__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__541_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.29it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.97s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.05s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__15.6_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.90s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.85s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:23<00:00, 23.64s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__308_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.76it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.15s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.01s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__1.2_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.30it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.23it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.36s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.76it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.36s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.11s/it]


Extracting data from CORNELL_UNIVERSITY/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__788_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.28s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.56s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.14s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Audited_Financial_Statements_for_the_year_ended_05_31_2024__277_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.76it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.02s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.72s/it]


Extracting data from GANNON_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__786_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.64it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.33s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.82s/it]


Extracting data from LEWIS_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__430_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.50it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.05s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:12<00:00, 12.09s/it]


Extracting data from MICHIGAN_STATE_UNIVERSITY/Updates_of_Tables_of_Operating_Information_for_the_year_ended_06_30_2024_Document1__143_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.51it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.33s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.70s/it]


Extracting data from MICHIGAN_STATE_UNIVERSITY/Updates_of_Tables_of_Operating_Information_for_the_year_ended_06_30_2024_Document2__8.4_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.28s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.18it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:30<00:00, 30.82s/it]


Extracting data from MOLLOY_COLLEGE/Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document1__304_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.06s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.12s/it]
Extracting files: 100%|██████████████████████████| 1/1 [05:14<00:00, 314.78s/it]


Extracting data from MOLLOY_COLLEGE/Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document2__142_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.08s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.14s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.70s/it]


Extracting data from MOUNT_ST_MARY_S_UNIVERSITY_INC/Audited_Annual_Financials_for_the_year_ended_06_30_2024_Document1__29.1_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.52s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.04s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:24<00:00, 24.57s/it]


Extracting data from MOUNT_ST_MARY_S_UNIVERSITY_INC/Audited_Annual_Financials_for_the_year_ended_06_30_2024_Document2__5_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.02it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.55s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:24<00:00, 24.45s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Consolidated_Financial_Statements_New_York_University_for_the_year_ended_06_30_2024__466_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.75it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.10it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.98s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Annual_Update_to_Bondholders_for_the_year_ended_06_30_2024__577_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.14it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.66s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.85s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_03_31_2024__577_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.30it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.09it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:43<00:00, 43.32s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_06_30_2024__576_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.50it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.54s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.54s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_09_30_2024__574_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.48it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.23s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.61s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_12_31_2024__815_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.47it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.25s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.14s/it]


Extracting data from PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE/Harvard_University_Audited_Financial_Information_for_the_year_ended_06_30_2024__10.6_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.69s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.53s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.98s/it]


Extracting data from PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE/Harvard_University_Financial_Report_for_the_year_ended_06_30_2024__10.6_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.00s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.86s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:22<00:00, 22.22s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__348_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.28s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.66s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.13s/it]


Extracting data from STEVENS_INSTITUTE_OF_TECHNOLOGY/Audit_Financial_Statement_for_the_year_ended_06_30_2024__626_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.48it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.64s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:14<00:00, 14.44s/it]


Extracting data from ST_LOUIS_UNIVERSITY_US/Audited_Financials_and_Operating_Data_for_the_year_ended_06_30_2024_Document1__561_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.39s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.43s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.57s/it]


Extracting data from ST_LOUIS_UNIVERSITY_US/Audited_Financials_and_Operating_Data_for_the_year_ended_06_30_2024_Document2__174_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.50it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.04s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.49s/it]


Extracting data from TEXAS_A_M_UNIVERSITY/Texas_A_M_University_System_Unaudited_Annual_Financial_Reports_for_the_year_ended_08_31_2024__2_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.50it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.94it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:36<00:00, 36.42s/it]


Extracting data from UNIVERSITY_OF_COLORADO/Financial_and_Operating_Data__Fiscal_Year_2024_for_the_year_ended_06_30_2024__278_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.42s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.05s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.40s/it]


Extracting data from UNIVERSITY_OF_MINNESOTA/Annual_Financial_Information_and_Operating_Data_for_the_year_ended_06_30_2024_Document1__1.3_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.59s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.23s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.79s/it]


Extracting data from UNIVERSITY_OF_MINNESOTA/Annual_Financial_Information_and_Operating_Data_for_the_year_ended_06_30_2024_Document2__315_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.08s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.87s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:14<00:00, 14.37s/it]


In [83]:

from typing import List

def add_plug_accounts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add balancing ("plug") columns for assets and liabilities.

    Given a DataFrame where each row is a university and the columns include
    'total_assets', 'total_liabilities' and any subset of the known asset /
    liability component columns listed below, this computes:

      * other_assets_plug      = total_assets - sum(known asset components)
      * other_liabilities_plug = total_liabilities - sum(known liability components)

    so that:
      sum(asset components) + other_assets_plug == total_assets
      sum(liability components) + other_liabilities_plug == total_liabilities

    Component columns missing from ``df`` are ignored, and missing values in
    a component are skipped when summing (a row with no component values has
    a component sum of 0).

    The input DataFrame is NOT modified; a copy carrying the two new columns
    is returned. (Previously the function added and dropped temporary columns
    on the caller's frame in place.)

    Raises
    ------
    KeyError
        If 'total_assets' or 'total_liabilities' is not a column of ``df``.
    """
    # define the "known" component column names
    asset_components = [
        "cash_and_short_term_investments_unrestricted_and_restricted",
        "net_receivables",
        "net_fixed_assets",
        "long_term_investments_unrestricted_and_restricted",
        "rou_assets_finance_lease",
        "rou_assets_operating_lease"
    ]
    # NOTE(review): 'pension_and_opeb_liability' is summed together with
    # 'pension_liability' and 'opeb_liability'. If a row can have both the
    # combined figure and the separate figures populated, the liability sum
    # would double count -- confirm against the extraction schema.
    liability_components = [
        "short_term_debt",
        "current_portion_finance_lease",
        "current_portion_long_term_debt",
        "current_portion_operating_lease",
        "accounts_payable",
        "deferred_revenue",
        "long_term_debt",
        "long_term_finance_lease",
        "long_term_operating_lease",
        "swap_obligation_fmv",
        "pension_and_opeb_liability",
        "pension_liability",
        "opeb_liability"
    ]

    # pick only those actually in the DataFrame
    assets_cols = [c for c in asset_components if c in df.columns]
    liabs_cols = [c for c in liability_components if c in df.columns]

    # Row-wise component sums, kept as local Series so that no temporary
    # columns ever appear on a frame.
    asset_sum = df[assets_cols].sum(axis=1, skipna=True)
    liability_sum = df[liabs_cols].sum(axis=1, skipna=True)

    # Work on a copy so the caller's DataFrame is left untouched.
    result = df.copy()
    result["other_assets_plug"] = df["total_assets"] - asset_sum
    result["other_liabilities_plug"] = df["total_liabilities"] - liability_sum

    return result


In [99]:
# Add the plug columns, then derive the expendable donor-restricted net assets
# as (total with donor restrictions) - (perpetual part). The value extracted
# from the PDF is kept next to it under an "_extracted" suffix for comparison.
df_allv1 = add_plug_accounts(df_all)
donor_restricted_total = df_allv1['net_assets_with_donor_restrictions']
donor_restricted_perpetual = df_allv1['perpetual_net_assets_with_donor_restrictions']
df_allv1['expendable_net_assets_with_donor_restrictions_calculated'] = (
    donor_restricted_total - donor_restricted_perpetual
)
extracted_column_names = {
    'expendable_net_assets_with_donor_restrictions':
        'expendable_net_assets_with_donor_restrictions_extracted',
}
df_allv1.rename(columns=extracted_column_names, inplace=True)

In [101]:
# save output
# OUTPUT_FILE was not defined in any cell of this notebook, so this cell raised
# NameError on a fresh kernel (Restart & Run All). Define it here, next to the
# per-run outputs under OUTPUT_ROOT.
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "balance_sheet_all_schools.xlsx")
df_allv1.to_excel(OUTPUT_FILE)