In [None]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from financial_schemas_endowment_final import generate_endowment_schema

In [None]:
FISCAL_YEAR = 2024 #Change the year if you want different years
EndowmentSchema = generate_endowment_schema(FISCAL_YEAR)

PDF_ROOT = "private_universities/university_pdfs" # Change this to the point to the directory where you are storing the pdfs after scraping
OUTPUT_ROOT = "output_endowment_final" # Make this point to the directory/folder where you want to store the excel files with information extracted
os.makedirs(OUTPUT_ROOT, exist_ok=True)  
AGENT_ID = "56843d2c-7e9b-445d-b634-9833dd1cb4db" #Different based on your LLamaCloud account
api_key = os.getenv("LLAMACLOUD_API_KEY") 

In [None]:
extractor = LlamaExtract(
    api_key="llx-63CU3PdyDo0d230ureocmy9JOHgnPwYgE2HETi55DqzYCIpy",  # Add your Llamacloud API Key 
    project_id="8c10e62e-3810-4193-915d-d2d11105826d"  #Change the project ID only if Luis has asked you. This is dependent on the llamacloud account
)

#agent = extractor.create_agent(name = "endowment-parser-2024", data_schema=EndowmentAndInvestmentLevels_2024_25)

agent = extractor.get_agent(id = AGENT_ID)

#uncomment the following lines if you updated the schema
agent.data_schema = EndowmentSchema.schema()
agent.save()



/var/folders/m1/8j3nd3m95y1brb0j52fx43fr0000gn/T/ipykernel_11736/1010406420.py:11: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  agent.data_schema = EndowmentSchema.schema()


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:02<00:00,  2.56s/it]
Extracting files: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it]
Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.16s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.95it/s]
Extracting files: 100%|██████████| 1/1 [00:11<00:00, 11.87s/it]
Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  2.41it/s]
Extracting files: 100%|██████████| 1/1 [00:11<00:00, 11.57s/it]
Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  2.41it/s]
Extracting files: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]
Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:02<00:00,  2.46s/it]
Extracting files: 100%|██████████| 1/1 [00:12<00:00, 12.08s/it]
Uploa

In [None]:
# This is to check if the schema is ok
import json
print(json.dumps(agent.data_schema, indent=2))


{
  "additionalProperties": false,
  "properties": {
    "endowment_net_assets_eoy_total": {
      "anyOf": [
        {
          "type": "integer"
        },
        {
          "type": "null"
        }
      ],
      "description": "Total endowment net assets for the 2024 fiscal year (in thousands). Only extract from a table titled 'Changes in Endowment Net Assets' located in the Notes section. Only use data explicitly labeled as '2024', 'FY2024', or 'as of June 30, 2024'. Do not extract from general balance sheets, rollforwards, or systemwide summaries. Standardize all values to $000s using table metadata or heuristics."
    },
    "endowment_net_assets_eoy_with_donor_restrictions": {
      "anyOf": [
        {
          "type": "integer"
        },
        {
          "type": "null"
        }
      ],
      "description": "Total donor-restricted endowment net assets as of June 30, 2024 (in thousands). Must be extracted from a 'Changes in Endowment Net Assets' table in the Notes sec

The following two cell blocks extract all schools' info into one excel file per school.

In [None]:
def process_school(school_name, school_dir):
    combined   = {}
    first_keys = None

    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=[f"{FISCAL_YEAR - 1}-{str(FISCAL_YEAR)[-2:]}"])
        df.index.name = "Metric"
        outfile = os.path.join(OUTPUT_ROOT, f"{school_name}.xlsx")
        df.to_excel(outfile)
        print(f"Saved output to {outfile}")
    else:
        print(f"No PDF data found for {school_name}")

In [11]:
# Loop over schools
for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue
    print(f"Processing school: {school}")
    process_school(school, school_dir)

print("Extraction complete.")

Processing school: BRADLEY_UNIVERSITY
Extracting data from Annual_Financial_Information_and_Operating_Data__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__227_KB_.pdf
Extracting data from Audited_Financial_Statements_or_ACFR__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__541_KB_.pdf
Saved output to output_endowment_final/BRADLEY_UNIVERSITY.xlsx
Processing school: CORNELL_UNIVERSITY
Extracting data from 2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__788_KB_.pdf
Extracting data from 2024_Operating_Data_for_the_year_ended_06_30_2024__109_KB_.pdf
Extracting data from Incorporate_OS_by_Reference_as_of_04_25_2024__2.4_MB_.pdf
Saved output to output_endowment_final/CORNELL_UNIVERSITY.xlsx
Processing school: CULINARY_INSTITUTE_OF_AMERICA_THE
Extracting data from 2024_Annual_Report_-_Corrected_for_the_year_ended_05_31_2024__130_KB_.pdf
Extracting data from 2024_Annual_Report_for_the_year_ended_05_31_2024__129_KB_.pdf
Extracting data from 2024_Audited_Financial_S

Creating extraction jobs:   0%|          | 0/1 [04:15<?, ?it/s]


Skipped Harvard_University_Student_Applications_and_Enrollment_for_the_year_ended_06_30_2024__557_KB_.pdf: Request timed out: 
Saved output to output_endowment_final/PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE.xlsx
Processing school: STEVENSON_UNIVERSITY_INC
Extracting data from 2024_Annual_Compliance_Certificate__EagleBank__for_the_year_ended_06_30_2024__3_MB_.pdf
Extracting data from 2024_Annual_Compliance_Certificate_for_the_year_ended_06_30_2024__2.8_MB_.pdf
Extracting data from 2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__348_KB_.pdf
Extracting data from 2024_Operating_Data_for_the_year_ended_06_30_2024__196_KB_.pdf
Saved output to output_endowment_final/STEVENSON_UNIVERSITY_INC.xlsx
Processing school: STEVENS_INSTITUTE_OF_TECHNOLOGY
Extracting data from Annual_Report_for_the_year_ended_06_30_2024__216_KB_.pdf
Extracting data from Audit_Financial_Statement_for_the_year_ended_06_30_2024__626_KB_.pdf
Saved output to output_endowment_final/STEVENS_INSTITUTE_OF_TECHNOLO

The following cell block extracts all the schools' info into one excel sheet but in different tabs.

In [12]:
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")

writer = pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl")

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue

    combined   = {}
    first_keys = None
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=[f"{FISCAL_YEAR - 1}-{str(FISCAL_YEAR)[-2:]}"])
        df.index.name = "Metric"
        sheet_name = school[:31]
        df.to_excel(writer, sheet_name=sheet_name)
    else:
        print(f"No data for {school}.")

writer.close()
print(f"All schools written to {OUTPUT_FILE}")

Extracting data from BRADLEY_UNIVERSITY/Annual_Financial_Information_and_Operating_Data__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__227_KB_.pdf
Extracting data from BRADLEY_UNIVERSITY/Audited_Financial_Statements_or_ACFR__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__541_KB_.pdf
Extracting data from CORNELL_UNIVERSITY/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__788_KB_.pdf
Extracting data from CORNELL_UNIVERSITY/2024_Operating_Data_for_the_year_ended_06_30_2024__109_KB_.pdf
Extracting data from CORNELL_UNIVERSITY/Incorporate_OS_by_Reference_as_of_04_25_2024__2.4_MB_.pdf
Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Annual_Report_-_Corrected_for_the_year_ended_05_31_2024__130_KB_.pdf
Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Annual_Report_for_the_year_ended_05_31_2024__129_KB_.pdf
Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Audited_Financial_Statements_for_the_year_ended_05_31_2024__277_KB_.pdf
Extra

In [None]:
#Combine all the tabs into one sheet if wanted
file_path   = "output_endowment_final/all_schools.xlsx"  #Change this if need be
output_path = "output_endowment_final/all_schools_combined.xlsx" #Change this if need be

raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

school_series = {
    school: df.iloc[:, 0]                      # first (only) value column
    for school, df in raw.items()
}

df_comb = pd.DataFrame(school_series).T
df_comb.index.name = "School"                 
df_comb.insert(0, "Year", f"{FISCAL_YEAR - 1}–{FISCAL_YEAR}")

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

print("Saved:", output_path)

Saved: output_endowment_final/all_schools_combined.xlsx
