In [343]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from schemas10 import StatementOfCashFlows2024  #This could be adjusted through schemas2.py
from dotenv import load_dotenv

In [344]:
# --- Notebook configuration -------------------------------------------------
# LlamaCloud extraction agent to reuse; this ID is specific to your LlamaCloud account.
AGENT_ID = "efd1376e-694e-46e7-aed1-628587461bfe"

# Input folder (one sub-folder per school, each holding that school's PDFs)
# and the folder that receives the generated Excel workbooks.
PDF_ROOT = "private_universities/university_pdfs"
OUTPUT_ROOT = "output_cash_flow"

# Create the output folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(OUTPUT_ROOT, exist_ok=True)

In [345]:
# Connect to LlamaCloud and fetch the extraction agent used by the rest of the notebook.
load_dotenv() # loads variables from .env; make sure the API key is defined in that file
# NOTE(review): the project ID is hard-coded here; presumably it is account-specific like AGENT_ID -- confirm.
extractor = LlamaExtract(project_id = '8c10e62e-3810-4193-915d-d2d11105826d')

# First-time setup only: uncomment ONE of the create_agent lines below (and skip get_agent)
# when the agent does not exist yet.
#agent = extractor.create_agent(name = "statement_of_cash_flows-2024", data_schema=StatementOfCashFlows2024)
#agent = extractor.create_agent(name="statement_of_cash_flows-2024-10", data_schema=StatementOfCashFlows2024)
agent = extractor.get_agent(id = AGENT_ID)

# The two lines below are ACTIVE: every run of this cell assigns the locally imported
# schema to the agent and calls save(). Comment them out if the schema has not changed.
agent.data_schema = StatementOfCashFlows2024
agent.save()


In [346]:
# Display the schema currently attached to the agent, to verify what the extraction will use.
agent.data_schema

{'additionalProperties': False,
 'description': "Statement of Cash Flows for the fiscal year {year}.\nOnly extract data from the {year} fiscal period (e.g. statements labeled ‘Fiscal Year {year}').\nIgnore any figures outside this period. Do not extract anything from {year-1}.\nDo not extract anything from the condensed or summary table or statement. Only from the long, fully elaborated statement or table.\nDo not derive or calculate values unless they appear explicitly in the document.\nExtract the number as it is. Don't convert its unit.\nNote: In financial tables, values shown in parentheses (e.g., (3,705)) represent negative numbers or cash outflows.\n**Only extract values from the cash flow statement or table corresponding to the current year. Do not use other sections of the PDF or unrelated financial statements (e.g., income statement, balance sheet, or footnotes).**",
 'properties': {'total_change_in_net_assets': {'anyOf': [{'type': 'number'},
    {'type': 'null'}],
   'descrip

In [347]:
# Extract the cash-flow metrics for every school folder under PDF_ROOT, write one
# Excel sheet per school, and flag schools whose cash-flow components do not add
# up to the reported change in cash.

# Path of the multi-sheet workbook produced by this cell
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_update.xlsx")

# Label of the single data column written on every school sheet
YEAR_LABEL = "2023-24"

# Relative tolerance of the reconciliation check; only floating-point rounding
# noise is forgiven, any real discrepancy still flags the school.
RECONCILE_RTOL = 1e-9

# Schools with a mismatch between calculated and reported cash change.
# (The name `test` is kept because a later cell displays it.)
test = []


def _metric(values, key):
    """Return ``values[key]`` as a float.

    NaN is returned when the key is absent, the value is None, or the value
    cannot be converted to a number, so callers never hit a KeyError.
    """
    value = values.get(key)
    if value is None:
        return float("nan")
    try:
        return float(value)
    except (TypeError, ValueError):
        return float("nan")


def _zero_if_missing(value):
    """Map NaN to 0.0 and return every other number unchanged."""
    return 0.0 if pd.isna(value) else value


# The context manager saves and closes the workbook even if a school fails mid-loop.
with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
    # Each school is a folder inside PDF_ROOT
    for school in sorted(os.listdir(PDF_ROOT)):
        school_dir = os.path.join(PDF_ROOT, school)
        if not os.path.isdir(school_dir):
            continue

        # Metric -> value, merged over all PDFs of the school. Keys keep the
        # order in which they are first seen; a later PDF overwrites an earlier
        # value only when it provides a non-empty one.
        combined = {}

        for fname in sorted(os.listdir(school_dir)):
            if not fname.lower().endswith(".pdf"):
                continue

            path = os.path.join(school_dir, fname)
            print(f"Extracting data from {school}/{fname}")

            try:
                run = agent.extract(path)
                data = run.data or {}  # fall back to empty when nothing was extracted

                for k, v in data.items():
                    # Register every key, so an empty first extraction no longer
                    # hides the results of the following PDFs.
                    combined.setdefault(k, None)
                    if v not in (None, "", []):
                        combined[k] = v
            except Exception as err:
                # Best effort: one failing PDF must not abort the whole run
                print(f"Skipped {fname}: {err}")

        if not combined:
            print(f"No data for {school}.")
            continue

        # Single-column frame: one row per metric
        df = pd.DataFrame.from_dict(combined, orient="index", columns=[YEAR_LABEL])
        df.index.name = "Metric"
        values = df[YEAR_LABEL]

        # Some statements report financing as two components instead of one
        # total. When the total is missing or zero and at least one component is
        # present, derive it. NaN is truthy, so `x or 0` would NOT neutralise a
        # missing component; it has to be mapped to zero explicitly.
        fin = _metric(values, "net_cash_from_financing_activities")
        cap = _metric(values, "cash_flows_from_capital_and_related_financing_activities")
        noncap = _metric(values, "cash_flows_from_noncapital_financing_activities")
        if (pd.isna(fin) or fin == 0) and not (pd.isna(cap) and pd.isna(noncap)):
            fin = _zero_if_missing(cap) + _zero_if_missing(noncap)
            df.loc["net_cash_from_financing_activities", YEAR_LABEL] = fin

        # Write the sheet only after the derivation, so the workbook contains the
        # same financing total that the reconciliation below uses.
        # Sheet names in Excel are limited to 31 characters.
        df.to_excel(writer, sheet_name=school[:31])

        # Operating + investing + financing should equal the reported change in
        # cash; missing values count as zero.
        op = _zero_if_missing(_metric(values, "net_cash_from_operating_activities"))
        inv = _zero_if_missing(_metric(values, "net_cash_from_investment_activities"))
        calculated = op + inv + _zero_if_missing(fin)
        reported = _zero_if_missing(_metric(values, "change_in_cash_and_equivalents"))

        # Compare with a tolerance instead of exact float/dtype equality
        allowed = RECONCILE_RTOL * max(1.0, abs(calculated), abs(reported))
        if abs(calculated - reported) > allowed:
            test.append(school)

print(f"All schools written to {OUTPUT_FILE}")


Extracting data from BRADLEY_UNIVERSITY/Annual_Financial_Information_and_Operating_Data__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__227_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.26it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
Extracting files: 100%|██████████| 1/1 [00:07<00:00,  7.62s/it]


Extracting data from BRADLEY_UNIVERSITY/Audited_Financial_Statements_or_ACFR__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__541_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  2.09it/s]
Extracting files: 100%|██████████| 1/1 [00:10<00:00, 10.06s/it]


Extracting data from CORNELL_UNIVERSITY/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__788_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.08it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.38s/it]
Extracting files: 100%|██████████| 1/1 [00:07<00:00,  7.85s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Annual_Report_-_Corrected_for_the_year_ended_05_31_2024__130_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.31it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  2.77it/s]
Extracting files: 100%|██████████| 1/1 [00:28<00:00, 28.28s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Annual_Report_for_the_year_ended_05_31_2024__129_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
Extracting files: 100%|██████████| 1/1 [00:08<00:00,  8.83s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Audited_Financial_Statements_for_the_year_ended_05_31_2024__277_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Extracting files: 100%|██████████| 1/1 [00:12<00:00, 12.52s/it]


Extracting data from GANNON_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__786_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
Extracting files: 100%|██████████| 1/1 [00:08<00:00,  8.80s/it]


Extracting data from GANNON_UNIVERSITY/Continued_Disclosures_Fall_2024_for_the_year_ended_06_30_2024_Document1__203_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  2.22it/s]
Extracting files: 100%|██████████| 1/1 [00:10<00:00, 10.09s/it]


Extracting data from LEWIS_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__430_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
Extracting files: 100%|██████████| 1/1 [00:11<00:00, 11.25s/it]


Extracting data from LEWIS_UNIVERSITY/Continuing_Disclosure_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.25it/s]
Extracting files: 100%|██████████| 1/1 [00:10<00:00, 10.04s/it]


Extracting data from MOLLOY_COLLEGE/Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document1__304_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.56s/it]
Extracting files: 100%|██████████| 1/1 [00:09<00:00,  9.47s/it]


Extracting data from MOLLOY_COLLEGE/Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document2__142_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.36it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
Extracting files: 100%|██████████| 1/1 [00:07<00:00,  7.62s/it]


Extracting data from MOUNT_ST_MARY_S_UNIVERSITY_INC/Audited_Annual_Financials_for_the_year_ended_06_30_2024_Document1__29.1_MB_.pdf


Uploading files: 100%|██████████| 1/1 [00:16<00:00, 16.13s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]
Extracting files: 100%|██████████| 1/1 [00:17<00:00, 17.36s/it]


Extracting data from MOUNT_ST_MARY_S_UNIVERSITY_INC/Audited_Annual_Financials_for_the_year_ended_06_30_2024_Document2__5_MB_.pdf


Uploading files: 100%|██████████| 1/1 [00:02<00:00,  2.34s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]
Extracting files: 100%|██████████| 1/1 [00:19<00:00, 19.66s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Certificate_of_Compliance_-_Audit_for_the_year_ended_06_30_2024__330_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Extracting files: 100%|██████████| 1/1 [00:10<00:00, 10.12s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Certificate_of_Compliance_for_the_year_ended_06_30_2024__323_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Extracting files: 100%|██████████| 1/1 [00:10<00:00, 10.11s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Consolidated_Financial_Statements_New_York_University_for_the_year_ended_06_30_2024__466_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]
Extracting files: 100%|██████████| 1/1 [00:09<00:00,  9.06s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Operating_Data_for_the_year_ended_06_30_2024__244_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
Extracting files: 100%|██████████| 1/1 [00:07<00:00,  7.96s/it]


Extracting data from PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE/Harvard_University_Audited_Financial_Information_for_the_year_ended_06_30_2024__10.6_MB_.pdf


Uploading files: 100%|██████████| 1/1 [00:07<00:00,  7.73s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.17s/it]
Extracting files: 100%|██████████| 1/1 [00:12<00:00, 12.48s/it]


Extracting data from PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE/Harvard_University_Financial_Report_for_the_year_ended_06_30_2024__10.6_MB_.pdf


Uploading files: 100%|██████████| 1/1 [00:07<00:00,  7.69s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.48s/it]
Extracting files: 100%|██████████| 1/1 [00:11<00:00, 11.22s/it]


Extracting data from PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE/Harvard_University_Student_Applications_and_Enrollment_for_the_year_ended_06_30_2024__557_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:04<00:00,  4.55s/it]
Extracting files: 100%|██████████| 1/1 [00:07<00:00,  7.71s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Annual_Compliance_Certificate__EagleBank__for_the_year_ended_06_30_2024__3_MB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.19s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  2.21it/s]
Extracting files: 100%|██████████| 1/1 [00:09<00:00,  9.98s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Annual_Compliance_Certificate_for_the_year_ended_06_30_2024__2.8_MB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.19s/it]
Extracting files: 100%|██████████| 1/1 [00:10<00:00, 10.00s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__348_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.23it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]
Extracting files: 100%|██████████| 1/1 [00:08<00:00,  8.81s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Operating_Data_for_the_year_ended_06_30_2024__196_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]
Extracting files: 100%|██████████| 1/1 [00:11<00:00, 11.61s/it]


Extracting data from STEVENS_INSTITUTE_OF_TECHNOLOGY/Annual_Report_for_the_year_ended_06_30_2024__216_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.66s/it]
Extracting files: 100%|██████████| 1/1 [00:11<00:00, 11.42s/it]


Extracting data from STEVENS_INSTITUTE_OF_TECHNOLOGY/Audit_Financial_Statement_for_the_year_ended_06_30_2024__626_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
Extracting files: 100%|██████████| 1/1 [00:09<00:00,  9.13s/it]


Extracting data from ST_LOUIS_UNIVERSITY_US/Amendment_to_Continuing_Disclosure_Undertaking_dated_01_05_2024__392_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.19it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Extracting files: 100%|██████████| 1/1 [00:05<00:00,  5.34s/it]


Extracting data from ST_LOUIS_UNIVERSITY_US/Audited_Financials_and_Operating_Data_for_the_year_ended_06_30_2024_Document1__561_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  2.30it/s]
Extracting files: 100%|██████████| 1/1 [00:10<00:00, 10.16s/it]


Extracting data from ST_LOUIS_UNIVERSITY_US/Audited_Financials_and_Operating_Data_for_the_year_ended_06_30_2024_Document2__174_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.68s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.72s/it]
Extracting files: 100%|██████████| 1/1 [00:08<00:00,  9.00s/it]

All schools written to output_cash_flow/all_update.xlsx





In [348]:
# Flatten the per-school workbook into a single sheet:
# one row per school, one column per financial metric.

# Input (one sheet per school) and output (single combined sheet). Both paths are
# derived from OUTPUT_ROOT so they follow the configured output folder instead of
# repeating its name as a literal.
file_path = os.path.join(OUTPUT_ROOT, "all_update.xlsx")
output_path = os.path.join(OUTPUT_ROOT, "all_update_combined.xlsx")

# sheet_name=None loads every sheet into a dict {sheet name: DataFrame};
# the first column of each sheet (the metric names) becomes the index.
raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

# Keep the first data column of every sheet as a Series keyed by sheet name.
# Note: sheet names were cut to 31 characters when the workbook was written,
# so long school names appear truncated here.
school_series = {
    school: df.iloc[:, 0]
    for school, df in raw.items()
}

# Series become columns, so transpose to get schools as rows and metrics as columns
df_comb = pd.DataFrame(school_series).T
df_comb.index.name = "School"

# Put a constant "Year" column first for context
df_comb.insert(0, "Year", "2024")

# Write the combined table to its own single-sheet workbook
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

# Confirm that the file is saved
print("Saved:", output_path)

Saved: output_cash_flow/all_update_combined.xlsx


In [349]:
# Schools flagged by the extraction cell because operating + investing + financing
# did not match the reported change in cash; an empty list means none were flagged.
test

[]