In [2]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from schemas import StatementOfCashFlows2024  #This could be adjusted through schemas.py
from dotenv import load_dotenv

In [3]:
# LlamaCloud extraction agent to reuse; the ID differs per LlamaCloud account.
AGENT_ID = "54164e43-e77a-4add-89a0-b99a31c1ed87"

# Input folder that holds the source PDFs.
PDF_ROOT = "university_pdfs"

# Output folder for the generated Excel workbooks; created up front so later
# cells can write into it without checking.
OUTPUT_ROOT = "output_cash_flow"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

In [4]:
load_dotenv() # reads variables from the local .env file; the LlamaCloud API key must be defined there
extractor = LlamaExtract(project_id = '8c10e62e-3810-4193-915d-d2d11105826d') # NOTE(review): project id is presumably account-specific like AGENT_ID - confirm

# First-time setup only: uncomment the line below to create the agent from the schema
# agent = extractor.create_agent(name = "statement_of_cash_flows-2024", data_schema=StatementOfCashFlows2024)

agent = extractor.get_agent(id = AGENT_ID) # fetch the existing agent by its ID

# NOTE: the two lines below are active, so the schema imported from schemas.py is
# assigned to the agent and agent.save() is called on every run (presumably
# persisting it to LlamaCloud). Comment them out if the schema has not changed.
agent.data_schema = StatementOfCashFlows2024
agent.save()

Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.02s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.86it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:21<00:00, 21.07s/it]
Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.00s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.46it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.68s/it]
Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.85s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.93it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.05s/it]
Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.14s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.62it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:13<00:00, 13.01s/it]
Uploading files: 100%|██████

In [5]:
# Display the schema currently attached to the agent (bare last expression, so the
# notebook renders it as the cell output).
agent.data_schema

{'additionalProperties': False,
 'description': "Statement of Cash Flows for the fiscal year 2024.\nOnly extract data from the 2024 fiscal period (e.g. statements labeled ‘Fiscal Year 2024').\nIgnore any figures outside this period. Do not extract anything from 2023.\nValue should be in thousands.\nDo not extract anything from the condensed or summary table or statement. Only from the Long, fullly elborated statement or table.\nDo not derive or calculate values unless they appear explicitly in the document.",
 'properties': {'total_change_in_net_assets': {'anyOf': [{'type': 'number'},
    {'type': 'null'}],
   'description': "Cash amount labeled 'Total Change in Net Assets' for the 2024 fiscal year, in US dollars. Search around the statement or in the statement titlte for the right unit amount (thousands, millions).If not in thousands, convert it to thousands.Only extract the exact figure for that period."},
  'total_non_cash_exp': {'anyOf': [{'type': 'number'}, {'type': 'null'}],
   '

In [6]:
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")

# Values treated as "nothing extracted"; they never overwrite a value found earlier.
EMPTY_VALUES = (None, "", [])
# Excel limits worksheet titles to 31 characters.
MAX_SHEET_NAME_LEN = 31


def merge_extraction(combined, data):
    """Fold one extraction result into ``combined`` (in place) and return it.

    Every key of ``data`` is registered with a default of None so the metric
    still gets a row in the sheet; a non-empty value replaces whatever an
    earlier PDF of the same school supplied (last non-empty value wins).
    """
    for key, value in data.items():
        combined.setdefault(key, None)
        if value not in EMPTY_VALUES:
            combined[key] = value
    return combined


def unique_sheet_name(school, used_names):
    """Return a sheet title of at most 31 characters that is not in ``used_names``.

    The school name is truncated to the Excel limit. If two schools truncate
    to the same title, a numeric suffix (``_2``, ``_3``, ...) is appended so
    one school's sheet cannot silently overwrite another's. ``used_names`` is
    updated with the lower-cased result, because Excel compares sheet names
    case-insensitively.
    """
    base = school[:MAX_SHEET_NAME_LEN]
    candidate = base
    counter = 1
    while candidate.lower() in used_names:
        counter += 1
        suffix = f"_{counter}"
        candidate = base[:MAX_SHEET_NAME_LEN - len(suffix)] + suffix
    used_names.add(candidate.lower())
    return candidate


# Frames are collected in memory and the workbook is written exactly once.
# The previous version opened one writer up front and then re-opened the same
# file once per school, which left two writers on one path and kept stale
# sheets from earlier runs.
sheets = {}
used_sheet_names = set()

try:
    for school in sorted(os.listdir(PDF_ROOT)):
        school_dir = os.path.join(PDF_ROOT, school)
        if not os.path.isdir(school_dir):
            continue

        combined = {}
        for fname in sorted(os.listdir(school_dir)):
            if not fname.lower().endswith(".pdf"):
                continue
            path = os.path.join(school_dir, fname)
            print(f"Extracting data from {school}/{fname}")
            try:
                run = agent.extract(path)
                merge_extraction(combined, run.data or {})
            except Exception as err:
                # Best effort: one bad PDF must not abort the whole batch.
                print(f"Skipped {fname}: {err}")

        # Decide on the merged result, not on the first PDF alone: an empty
        # first extraction must not hide values found in later PDFs.
        if not combined:
            print(f"No data for {school}.")
            continue

        school_df = pd.DataFrame.from_dict(combined, orient="index", columns=["2023-24"])
        school_df.index.name = "Metric"
        sheets[unique_sheet_name(school, used_sheet_names)] = school_df
finally:
    # Runs even if the loop is interrupted, so finished extractions are kept.
    # Skipped when there is nothing to write: a workbook needs at least one sheet.
    if sheets:
        with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
            for sheet_name, school_df in sheets.items():
                school_df.to_excel(writer, sheet_name=sheet_name)

if sheets:
    print(f"All schools written to {OUTPUT_FILE}")
else:
    print(f"No data extracted; {OUTPUT_FILE} was not written.")

Extracting data from ARIZONA_STATE_UNIVERSITY/FY_2024_Arizona_State_University_Annual_Comprehensive_Financial_Report_for_the_year_ended_06_30_2024__4.1_MB_.pdf
Extracting data from ARIZONA_STATE_UNIVERSITY/FY_2024_Arizona_State_University_Continuing_Disclosure_Undertaking_for_the_year_ended_06_30_2024__160_KB_.pdf
Extracting data from BRADLEY_UNIVERSITY/Annual_Financial_Information_and_Operating_Data__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__227_KB_.pdf
Extracting data from BRADLEY_UNIVERSITY/Audited_Financial_Statements_or_ACFR__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__541_KB_.pdf
Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__15.6_MB_.pdf
Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__308_KB_.pdf
Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statemen

In [7]:
# Optional: flatten the per-school sheets into a single sheet, one row per school.
# Paths are derived from OUTPUT_ROOT so they stay in sync with the config cell.
file_path   = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")
output_path = os.path.join(OUTPUT_ROOT, "all_schools_combined.xlsx")

# sheet_name=None loads every sheet into a dict keyed by sheet name;
# index_col=0 restores the "Metric" labels as the index.
raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

school_series = {
    school: sheet.iloc[:, 0]          # first (only) value column
    for school, sheet in raw.items()
    if sheet.shape[1] > 0             # skip sheets that have no value column
}

# Schools become rows, metrics become columns.
df_comb = pd.DataFrame(school_series).T
df_comb.index.name = "School"
# Plain ASCII hyphen: the earlier label used U+2011 (non-breaking hyphen),
# which breaks exact-match filtering on the Year column.
df_comb.insert(0, "Year", "2023-2024")

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

print("Saved:", output_path)

Saved: output_cash_flow/all_schools_combined.xlsx
