In [147]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from dotenv import load_dotenv
from typing import Optional
from pydantic import ValidationError
from BS_Schema import make_StatementOfFinancialPosition_model

In [149]:
## Import Class
SFP = make_StatementOfFinancialPosition_model(2024)
# 4.2) Inspect its name and fields
print(SFP.__name__)           # → "StatementOfFinancialPosition_2024"
print(list(SFP.__fields__.keys()))

StatementOfFinancialPosition_2024
['year', 'cash_and_short_term_investments_unrestricted_and_restricted', 'accumulated_depreciation', 'net_fixed_assets', 'long_term_investments_unrestricted_and_restricted', 'total_assets', 'short_term_debt', 'long_term_debt', 'pension_and_opeb_liability', 'total_liabilities', 'net_assets_without_donor_restrictions', 'expendable_net_assets_with_donor_restrictions', 'perpetual_net_assets_with_donor_restrictions', 'net_assets_with_donor_restrictions', 'total_net_assets', 'total_liabilities_and_net_assets']


In [151]:
LlamaExtract(api_key='llx-zKFsBQYVXG6DqlTVV2yi3xpkS8nePwYvgOeYoMzJ04RBpgYw')

No project_id provided, fetching default project.


LlamaExtract(api_key='llx-zKFsBQYVXG6DqlTVV2yi3xpkS8nePwYvgOeYoMzJ04RBpgYw', base_url='https://api.cloud.llamaindex.ai', check_interval=1, max_timeout=2000, num_workers=4, show_progress=True, verbose=False, verify=True, httpx_timeout=60)

In [None]:
ee231c03c0cf6607315be41f48f2ca75503002282bb13f37ff28a6197360edcd

llx-zKFsBQYVXG6DqlTVV2yi3xpkS8nePwYvgOeYoMzJ04RBpgYw

In [157]:
PDF_ROOT = "scrapping/university_pdfs/"
OUTPUT_ROOT = "output_Balance_Sheet_DS"
os.makedirs(OUTPUT_ROOT, exist_ok=True)  
AGENT_ID = "83849ea7-429e-48b6-b538-7a8425da77b3" #Different based on your LLamaCloud account
load_dotenv() #make sure the API key is in the .env file
Year = 2024

extractor = LlamaExtract(api_key='llx-zKFsBQYVXG6DqlTVV2yi3xpkS8nePwYvgOeYoMzJ04RBpgYw')# LlamaExtract(project_id = '8c10e62e-3810-4193-915d-d2d11105826d')

# #uncomment the below line if you are creating the agent for the first time
# agent = extractor.create_agent(name = "balance-sheet-parser", data_schema=SFP)

agent = extractor.get_agent(id = AGENT_ID)

# #uncomment the following lines if you updated the schema
# agent.data_schema = SFP
# agent.save()
# agent = extractor.get_agent(id = AGENT_ID)

No project_id provided, fetching default project.


In [155]:
agent.data_schema

{'additionalProperties': False,
 'properties': {'year': {'description': "The fiscal year to which all other line‐items refer. E.g. '2024' means every balance (Cash, Liabilities, Net Assets, etc.) is 'as of June 30, 2024'.",
   'type': 'integer'},
  'cash_and_short_term_investments_unrestricted_and_restricted': {'anyOf': [{'type': 'integer'},
    {'type': 'null'}],
   'description': "Total of Cash & Short-Term Investments (both unrestricted and restricted) as of June 30, 2024. May be labeled 'Cash & Short-Term Investments (Unrestricted & Restricted)', 'Cash and investments', or similar. Only extract if this amount clearly refers to year-end 2024. Do not include petty cash or illiquid equivalents."},
  'accumulated_depreciation': {'anyOf': [{'type': 'integer'},
    {'type': 'null'}],
   'description': "Accumulated Depreciation as of June 30, 2024. May be labeled 'Accumulated Depreciation', 'Allowance for Depreciation', or similar. Only extract if explicitly shown on the Statement of Fina

In [159]:
def process_school(school_name, school_dir):
    combined   = {}
    first_keys = None

    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        outfile = os.path.join(OUTPUT_ROOT, f"{school_name}.xlsx")
        df.to_excel(outfile)
        print(f"Saved output to {outfile}")
    else:
        print(f"No PDF data found for {school_name}")

In [161]:
# os.listdir('/Users/danyalsoomro/Desktop/Python_Projects/Industry_projects/Franklin_Templton_Industry_Project/scrapping/university_pdfs')

In [163]:
# Loop over schools
for school in sorted(os.listdir(PDF_ROOT))[0:4]:
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue
    print(f"Processing school: {school}")
    process_school(school, school_dir)

print("Extraction complete.")

Processing school: ARIZONA_STATE_UNIVERSITY
Extracting data from FY_2024_Arizona_State_University_Annual_Comprehensive_Financial_Report_for_the_year_ended_06_30_2024__4.1_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:05<00:00,  5.80s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.71it/s]
Extracting files: 100%|██████████████████████████| 1/1 [02:06<00:00, 126.42s/it]


Extracting data from FY_2024_Arizona_State_University_Continuing_Disclosure_Undertaking_for_the_year_ended_06_30_2024__160_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.28s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:04<00:00,  4.01s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.25s/it]


Saved output to output_Balance_Sheet_DS/ARIZONA_STATE_UNIVERSITY.xlsx
Processing school: BRADLEY_UNIVERSITY
Extracting data from Annual_Financial_Information_and_Operating_Data__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__227_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.03it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.38s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:06<00:00,  6.81s/it]


Extracting data from Audited_Financial_Statements_or_ACFR__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__541_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.18s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.18s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.84s/it]


Saved output to output_Balance_Sheet_DS/BRADLEY_UNIVERSITY.xlsx
Processing school: CALIFORNIA_STATE_UNIVERSITY
Extracting data from Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__15.6_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:12<00:00, 12.77s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.08s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:16<00:00, 16.99s/it]


Extracting data from Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__308_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.18s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.96it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.40s/it]


Extracting data from Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__1.2_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.27s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.81it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:16<00:00, 16.22s/it]


Extracting data from Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.86s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.33s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.33s/it]

Saved output to output_Balance_Sheet_DS/CALIFORNIA_STATE_UNIVERSITY.xlsx
Extraction complete.





In [165]:
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")

writer = pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl")

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue

    combined   = {}
    first_keys = None
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        sheet_name = school[:31]
        df.to_excel(writer, sheet_name=sheet_name)
    else:
        print(f"No data for {school}.")

writer.close()
print(f"All schools written to {OUTPUT_FILE}")

Extracting data from ARIZONA_STATE_UNIVERSITY/FY_2024_Arizona_State_University_Annual_Comprehensive_Financial_Report_for_the_year_ended_06_30_2024__4.1_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:05<00:00,  5.25s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.98it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:43<00:00, 43.27s/it]


Extracting data from ARIZONA_STATE_UNIVERSITY/FY_2024_Arizona_State_University_Continuing_Disclosure_Undertaking_for_the_year_ended_06_30_2024__160_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.55s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:06<00:00,  6.66s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:02<00:00,  2.53s/it]


Extracting data from BRADLEY_UNIVERSITY/Annual_Financial_Information_and_Operating_Data__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__227_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.30s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.01s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.77s/it]


Extracting data from BRADLEY_UNIVERSITY/Audited_Financial_Statements_or_ACFR__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__541_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.17s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:03<00:00,  3.77s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:04<00:00,  4.70s/it]


No data for BRADLEY_UNIVERSITY.
Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__15.6_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:08<00:00,  8.76s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.77it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:02<00:00,  2.03s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__308_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.35s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.23it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.64s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__1.2_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.10s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.24s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:02<00:00,  2.04s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:09<00:00,  9.02s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.23it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.89s/it]


No data for CALIFORNIA_STATE_UNIVERSITY.
Extracting data from CORNELL_UNIVERSITY/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__788_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.39s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.12s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:02<00:00,  2.35s/it]


Extracting data from CORNELL_UNIVERSITY/2024_Operating_Data_for_the_year_ended_06_30_2024__109_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.74s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:07<00:00,  7.75s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.75s/it]


Extracting data from CORNELL_UNIVERSITY/Incorporate_OS_by_Reference_as_of_04_25_2024__2.4_MB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.29s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:01<00:00,  1.11s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.79s/it]


No data for CORNELL_UNIVERSITY.
Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Annual_Report_-_Corrected_for_the_year_ended_05_31_2024__130_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.26it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.80it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.72s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Annual_Report_for_the_year_ended_05_31_2024__129_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.02it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.99it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.81s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Audited_Financial_Statements_for_the_year_ended_05_31_2024__277_KB_.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.02s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.34it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.65s/it]


No data for CULINARY_INSTITUTE_OF_AMERICA_THE.
Extracting data from GANNON_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__786_KB_.pdf


Uploading files:   0%|                                    | 0/1 [00:01<?, ?it/s]


Skipped Audited_Financial_Statements_for_the_year_ended_06_30_2024__786_KB_.pdf: status_code: 429, body: {'detail': 'Rate limit exceeded. Please try again later.'}
Extracting data from GANNON_UNIVERSITY/Continued_Disclosures_Fall_2024_for_the_year_ended_06_30_2024_Document1__203_KB_.pdf


Uploading files:   0%|                                    | 0/1 [00:00<?, ?it/s]


Skipped Continued_Disclosures_Fall_2024_for_the_year_ended_06_30_2024_Document1__203_KB_.pdf: status_code: 429, body: {'detail': 'Rate limit exceeded. Please try again later.'}
No data for GANNON_UNIVERSITY.
Extracting data from LEWIS_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__430_KB_.pdf


Uploading files:   0%|                                    | 0/1 [00:01<?, ?it/s]


Skipped Audited_Financial_Statements_for_the_year_ended_06_30_2024__430_KB_.pdf: status_code: 429, body: {'detail': 'Rate limit exceeded. Please try again later.'}
Extracting data from LEWIS_UNIVERSITY/Continuing_Disclosure_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files:   0%|                                    | 0/1 [00:00<?, ?it/s]


Skipped Continuing_Disclosure_for_the_year_ended_06_30_2024__298_KB_.pdf: status_code: 429, body: {'detail': 'Rate limit exceeded. Please try again later.'}
No data for LEWIS_UNIVERSITY.
Extracting data from MICHIGAN_STATE_UNIVERSITY/Updates_of_Tables_of_Operating_Information_for_the_year_ended_06_30_2024_Document1__143_KB_.pdf


Uploading files:   0%|                                    | 0/1 [00:00<?, ?it/s]


Skipped Updates_of_Tables_of_Operating_Information_for_the_year_ended_06_30_2024_Document1__143_KB_.pdf: status_code: 429, body: {'detail': 'Rate limit exceeded. Please try again later.'}
Extracting data from MICHIGAN_STATE_UNIVERSITY/Updates_of_Tables_of_Operating_Information_for_the_year_ended_06_30_2024_Document2__8.4_MB_.pdf


Uploading files:   0%|                                    | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 