In [1]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract


from schemas17 import StatementOfCashFlows2024

from dotenv import load_dotenv

import schemas17
from importlib import reload

import numpy as np

# Reload the schema module so edits made to schemas17.py are picked up without
# restarting the kernel. reload() returns the same module object, so `schemas16`
# is simply another reference to `schemas17` (name kept for backward compatibility).
schemas16 = reload(schemas17)

# `from schemas17 import StatementOfCashFlows2024` above ran BEFORE the reload, so
# that name still points at the old class object. Re-bind it from the reloaded
# module; otherwise the agent would keep receiving the stale schema.
StatementOfCashFlows2024 = schemas16.StatementOfCashFlows2024

In [None]:
# ID of the existing LlamaExtract agent to reuse.
# Where to find it: open "Extract" in the left-hand toolbar, then either
#   - click "+ Create New Agent" to create one, or
#   - click "Extract" in the top toolbar and read the "Name" / "ID" columns.
# Leave this value alone unless you are switching to a newly created agent.
AGENT_ID = "304914f4-ada4-4c1e-80c9-c0327d46e9ca"

# Input folder: one sub-folder per school, each holding that school's PDFs.
PDF_ROOT = "university_pdfs_hy"

# Output folder for the generated Excel workbooks; created on first run.
OUTPUT_ROOT = "output_cash_flow"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

In [None]:
# Load variables from the local .env file into the process environment.
load_dotenv()

# The API key must never be pasted into the notebook (it ends up in version
# control and in shared copies). Put it in .env instead:
#     LLAMA_CLOUD_API_KEY=llx-...
# New users: generate your own key via "API Key" in the left-hand toolbar,
# then "+ Generate New Key".
api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
if not api_key:
    raise RuntimeError(
        "LLAMA_CLOUD_API_KEY is not set. Add it to your .env file "
        "(LLAMA_CLOUD_API_KEY=llx-...) and re-run this cell."
    )

# The project ID does not need to change between users.
extractor = LlamaExtract(api_key=api_key, project_id='8c10e62e-3810-4193-915d-d2d11105826d')

# Uncomment the line below (and comment out get_agent) only when creating the agent for the first time
# agent = extractor.create_agent(name="statement_of_cash_flows-2024-16", data_schema=StatementOfCashFlows2024)
agent = extractor.get_agent(id = AGENT_ID)

# Push the local schema to the agent. These two lines run on every execution and
# overwrite the schema stored on the agent; comment them out if the schema has
# not changed and you want to leave the stored one untouched.
agent.data_schema = StatementOfCashFlows2024

agent.save()

In [4]:
# Display the schema currently attached to the agent to confirm the assignment above took effect
agent.data_schema

{'additionalProperties': False,
 'description': "Statement of Cash Flows for the fiscal year {year}.\nOnly extract data from the {year} fiscal period (e.g. statements labeled ‘Fiscal Year {year}').\nIgnore any figures outside this period. Do not extract anything from {year-1}.\n **Only extract values from the cash flow statement or table corresponding to the current year. Do not extract from the financial statement notes or other financial sections. Do not use unrelated financial statements (e.g., income statement, balance sheet, or footnotes).**\nDo not extract anything from the condensed or summary table or statement. Only from the long, fully elaborated statement or table.\nDo not derive or calculate values unless they appear explicitly in the document.\nExtract the number as it is. Don't convert its unit.\nNote: In financial tables, values shown in parentheses (e.g., (3,705)) represent negative numbers or cash outflows.",
 'properties': {'total_change_in_net_assets': {'anyOf': [{'t

In [None]:
# Extract the cash-flow statement of every school and write one Excel sheet per school.
#
# For each school folder inside PDF_ROOT, every PDF is sent to the agent. The
# extracted fields of all PDFs of one school are merged (later non-empty values
# win), scaled to thousands of USD, and written together with the agent's
# per-field reasoning. Schools whose reported change in cash does not equal
# operating + investing + financing cash flows are collected in `test`.

# Path of the final Excel output file
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_update.xlsx")

YEAR_COLUMN = "2023-24"                              # column header of the extracted values
MULTIPLIER_KEY = "cash_flow_2024_unit_multiplier"    # schema field holding the statement's unit
EMPTY_VALUES = (None, "", [])                        # results that must not overwrite earlier values
ACTIVITY_KEYS = (
    "net_cash_from_operating_activities",
    "net_cash_from_investment_activities",
    "net_cash_from_financing_activities",
)


def _is_number(value):
    """Return True for int/float values; bool is excluded although it subclasses int."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _number_or_zero(value):
    """Return value as a float, or 0.0 when it is missing, NaN or not numeric."""
    if not _is_number(value) or pd.isna(value):
        return 0.0
    return float(value)


# Schools with a mismatch between calculated and reported change in cash
test = []

# The context manager guarantees the workbook is closed (and the sheets written
# so far are saved) even if something below raises.
with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:

    # Iterate through all schools (each school is a folder inside PDF_ROOT)
    for school in sorted(os.listdir(PDF_ROOT)):
        school_dir = os.path.join(PDF_ROOT, school)

        # Skip if not a directory
        if not os.path.isdir(school_dir):
            continue

        combined = {}        # merged extracted values for the school, in first-seen key order
        reasoning_map = {}   # per-field reasoning, taken from the run that supplied the value

        # Loop through PDF files inside the school folder
        for fname in sorted(os.listdir(school_dir)):
            if not fname.lower().endswith(".pdf"):
                continue

            path = os.path.join(school_dir, fname)
            print(f"Extracting data from {school}/{fname}")

            try:
                run = agent.extract(path)           # Use the agent to extract data
                data = run.data or {}               # Fallback to empty if no data

                # Express every numeric field in thousands of USD: scale to
                # dollars with the extracted unit multiplier, then divide by 1000.
                mult = data.get(MULTIPLIER_KEY)
                if not _is_number(mult) or mult == 0:
                    mult = 1
                for k, v in data.items():
                    if k != MULTIPLIER_KEY and _is_number(v):
                        data[k] = (v * mult) / 1000

                # 'other_changes_in_investment_activities_calculated' is
                # 'net_cash_from_investment_activities' minus 'capital_expenses'
                net_inv = data.get("net_cash_from_investment_activities")
                capex = data.get("capital_expenses")
                if _is_number(net_inv) and _is_number(capex):
                    data["other_changes_in_investment_activities_calculated"] = net_inv - capex

                field_meta = (run.extraction_metadata or {}).get("field_metadata") or {}

                # Register every key (keeps row order stable), then let only
                # non-empty results overwrite what earlier PDFs provided.
                for k, v in data.items():
                    combined.setdefault(k, None)
                    if v not in EMPTY_VALUES:
                        combined[k] = v

                # Keep the reasoning aligned with the value stored in `combined`:
                # take it from this run if this run supplied the value, or if no
                # reasoning has been recorded for the field yet.
                for k, meta in field_meta.items():
                    if not isinstance(meta, dict):
                        continue
                    if k not in reasoning_map or data.get(k) not in EMPTY_VALUES:
                        reasoning_map[k] = meta.get("reasoning")

            except Exception as err:
                print(f"Skipped {fname}: {err}")  # Handle and log extraction failures

        # Proceed only if at least one extraction returned fields
        if not combined:
            print(f"No data for {school}.")
            continue

        # Convert combined dictionary to a single-column DataFrame
        df_values = pd.DataFrame.from_dict(combined, orient="index", columns=[YEAR_COLUMN])
        df_values.index.name = "Metric"

        # Add the agent's reasoning for each field next to its value
        df_reasoning = pd.DataFrame.from_dict(reasoning_map, orient="index", columns=["reasoning"])
        df = df_values.join(df_reasoning)

        # Sheet names in Excel are limited to 31 characters
        sheet_name = school[:31]
        df.to_excel(writer, sheet_name=sheet_name)

        # Note for public universities (not handled here, as the current focus is
        # private universities): 'net_cash_from_financing_activities' would be the
        # sum of 'cash_flows_from_capital_and_related_financing_activities' and
        # 'cash_flows_from_noncapital_financing_activities' when not reported.

        # Recompute the net change in cash from the three activity totals;
        # missing or non-numeric figures count as 0.
        comb_val = sum(_number_or_zero(combined.get(key)) for key in ACTIVITY_KEYS)

        # Compare with the 'change_in_cash_and_equivalents' reported in the statement
        orig_val = _number_or_zero(combined.get("change_in_cash_and_equivalents"))

        # If calculated total doesn't match extracted value, flag the school
        if not np.isclose(orig_val, comb_val):
            test.append(school)

# The workbook has been saved on leaving the `with` block
print(f"All schools written to {OUTPUT_FILE}")


Extracting data from AUGSBURG_UNIVERSITY/FY24_Augsburg_University_Financial_Statements_for_the_year_ended_05_31_2024__704_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.68s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:07<00:00,  7.47s/it]
Extracting files: 100%|██████████| 1/1 [01:38<00:00, 98.42s/it]


Extracting data from ELIZABETHTOWN_COLLEGE/2024_Audited_Financial_Statements_-_Elizabethtown_College_for_the_year_ended_06_30_2024__889_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.47s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:04<00:00,  4.48s/it]
Extracting files: 100%|██████████| 1/1 [01:51<00:00, 111.65s/it]


Extracting data from ITHACA_COLLEGE/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__838_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.83s/it]
Extracting files: 100%|██████████| 1/1 [01:38<00:00, 98.01s/it]


Extracting data from WIDENER_UNIVERSITY/Financial_Statement_for_the_year_ended_06_30_2024__500_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.16s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.13s/it]
Extracting files: 100%|██████████| 1/1 [01:36<00:00, 96.96s/it]

All schools written to output_cash_flow\all_update.xlsx





In [6]:
# Input workbook (one sheet per school) and output workbook (one combined sheet)
file_path   = "output_cash_flow/all_update.xlsx"
output_path = "output_cash_flow/all_update_combined.xlsx"


def _label_by_school(frame):
    """Name the row index "School" and put a constant "Year" column first."""
    frame.index.name = "School"
    frame.insert(0, "Year", "2024")
    return frame


# sheet_name=None loads every sheet: {sheet name (school): DataFrame indexed by metric}
raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

# One Series of metric values per school; reasoning only for sheets that have that column
school_values = {name: sheet["2023-24"] for name, sheet in raw.items()}
school_reasoning = {
    name: sheet["reasoning"]
    for name, sheet in raw.items()
    if "reasoning" in sheet.columns
}

# Transpose so that each row is a school and each column is a metric
df_values_comb = _label_by_school(pd.DataFrame(school_values).T)
df_reasoning_comb = _label_by_school(pd.DataFrame(school_reasoning).T)

# Place values and reasoning side by side under two top-level column groups
column_groups = {"values": df_values_comb, "reasoning": df_reasoning_comb}
df_comb = pd.concat(column_groups, axis=1)

# Write the combined table to a single sheet
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

# Confirm that the file is saved
print("Saved:", output_path)

Saved: output_cash_flow/all_update_combined.xlsx


In [7]:
# Schools flagged by the extraction loop: the reported 'change_in_cash_and_equivalents'
# did not match the sum of operating, investing and financing cash flows.
# An empty list means no mismatch was flagged.
test

[]