In [2]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from schemas import Enrollment2024_25  #This could be adjusted through schemas.py
from dotenv import load_dotenv

In [3]:
# --- Notebook configuration -------------------------------------------------
# ID of the extraction agent; the value differs per LlamaCloud account.
AGENT_ID = "ca221e4c-b3b2-4bf1-8862-d26016c9943a"

# Input folder: the loops below expect one sub-folder of PDFs per school.
PDF_ROOT = "university_pdfs"

# Output folder for the generated Excel files; created here if it is missing.
OUTPUT_ROOT = "output_scrapping"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# Load environment variables from the .env file, which must contain the API key.
load_dotenv()

True

In [15]:
# Extraction client for a specific LlamaCloud project.
extractor = LlamaExtract(project_id = '8c10e62e-3810-4193-915d-d2d11105826d')

# Uncomment the line below only when creating the agent for the first time.
# agent = extractor.create_agent(name = "enrollment-parser-2024", data_schema=Enrollment2024_25)

# Controls whether the local schema is pushed to the stored agent on this run.
# Leave True after editing schemas.py; set to False to use the agent as stored
# without overwriting its schema.
UPDATE_SCHEMA = True

agent = extractor.get_agent(id = AGENT_ID)

if UPDATE_SCHEMA:
    # Replace the agent's schema with the local one, save it, then fetch the
    # agent again so later cells work with the saved version.
    agent.data_schema = Enrollment2024_25
    agent.save()
    agent = extractor.get_agent(id = AGENT_ID)


In [17]:
# Display the schema currently attached to the agent (rendered as the cell output).
agent.data_schema

{'additionalProperties': False,
 'properties': {'Undergraduate_Headcount': {'anyOf': [{'type': 'integer'},
    {'type': 'null'}],
   'description': "Total undergraduate headcount for the 2024–2025 academic year (Different than undergraduate FTE. Sometimes you need to combine both full-time and part time).Search around the tables to locate what type of enrollment information it is.Only extract data for the 2024–2025 year or terms labeled Fall 2024, etcignore any data from other years or terms (e.g. 2023, 2023–2024, Fall 2023, Fall 2022, 2022). it's possible for a school to have multiple campuses, so combine all campuses' count or online and in-person count if applicable.If it didn't specify what kind of headcount is it, do not assume it's undergraduate headcount!!!Combine online and in-person if applicable.look around the table to see what type of data is it Do not derive or hallucinate the data unless the field is actually in the document."},
  'Undergraduate_Headcount_Full_Time': {'an

The following two cells extract each school's data and save it to a separate Excel file, one file per school.

In [None]:
def process_school(school_name, school_dir):
    """Run the extraction agent on every PDF of one school and save one Excel file.

    Each ``*.pdf`` file in ``school_dir`` (matched case-insensitively, visited
    in alphabetical order) is sent to the module-level ``agent``. The returned
    fields are merged into a single mapping: a non-empty value from a later
    file overwrites the value from an earlier one, while ``None``, ``""`` and
    ``[]`` never overwrite anything. A PDF whose extraction raises is reported
    and skipped so the remaining files are still processed.

    Args:
        school_name: Name used for the output file ``<school_name>.xlsx``.
        school_dir: Directory containing the school's PDF files.

    Returns:
        None. The merged fields are written to ``OUTPUT_ROOT`` as a
        one-column sheet ("Metric" index, "2024-25" column); a message is
        printed instead when no PDF yielded any field.
    """
    combined = {}

    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            for k, v in data.items():
                # Register every returned field so it gets a row even when no
                # PDF provides a value for it.
                combined.setdefault(k, None)
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            # Best effort: one failing PDF must not abort the whole school.
            print(f"Skipped {fname}: {err}")

    # Decide on the merged result itself rather than on the first file's keys:
    # an empty result from the first PDF must not discard data from later PDFs.
    if combined:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        outfile = os.path.join(OUTPUT_ROOT, f"{school_name}.xlsx")
        df.to_excel(outfile)
        print(f"Saved output to {outfile}")
    else:
        print(f"No PDF data found for {school_name}")

In [None]:
# Walk PDF_ROOT in alphabetical order and process every sub-folder as one
# school; plain files sitting directly in PDF_ROOT are ignored.
for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if os.path.isdir(school_dir):
        print(f"Processing school: {school}")
        process_school(school, school_dir)

print("Extraction complete.")

The following cell extracts every school's data into a single Excel workbook, with one sheet (tab) per school.

In [19]:
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")


def _extract_school_metrics(school, school_dir):
    """Merge the agent's output for every PDF in ``school_dir`` into one dict.

    PDFs are visited in alphabetical order. A non-empty value from a later
    file overwrites an earlier one; ``None``, ``""`` and ``[]`` never
    overwrite anything. A PDF whose extraction raises is reported and skipped.
    Returns an empty dict when no PDF yielded any field.
    """
    combined = {}
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            for k, v in data.items():
                # Register every returned field so it gets a row even when no
                # PDF provides a value for it.
                combined.setdefault(k, None)
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            # Best effort: one failing PDF must not abort the whole run.
            print(f"Skipped {fname}: {err}")
    return combined


def _unique_sheet_name(school, used):
    """Return a sheet name for ``school`` of at most 31 characters, unused so far.

    Excel limits sheet names to 31 characters, so long school names are
    truncated. If the truncated name is already taken (compared
    case-insensitively), a numeric suffix is added so a later school cannot
    overwrite an earlier school's sheet. ``used`` is updated in place.
    """
    base = school[:31]
    name = base
    counter = 2
    while name.lower() in used:
        suffix = f"_{counter}"
        name = base[:31 - len(suffix)] + suffix
        counter += 1
    used.add(name.lower())
    return name


# Phase 1: extraction. Results are kept in memory so that writing the workbook
# is a separate, cheap step.
school_frames = {}
for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue

    combined = _extract_school_metrics(school, school_dir)
    if combined:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        school_frames[school] = df
    else:
        print(f"No data for {school}.")

# Phase 2: write one sheet per school. The context manager closes the file even
# if a write fails, and the workbook is only created when there is at least one
# sheet (saving a workbook without sheets raises an error).
if school_frames:
    used_sheet_names = set()
    with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
        for school, df in school_frames.items():
            sheet_name = _unique_sheet_name(school, used_sheet_names)
            df.to_excel(writer, sheet_name=sheet_name)
    print(f"All schools written to {OUTPUT_FILE}")
else:
    print(f"No data extracted; {OUTPUT_FILE} was not written.")

Extracting data from ARIZONA_STATE_UNIVERSITY/FY_2024_Arizona_State_University_Annual_Comprehensive_Financial_Report_for_the_year_ended_06_30_2024__4.1_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:10<00:00, 10.14s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.05it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:24<00:00, 24.13s/it]


Extracting data from ARIZONA_STATE_UNIVERSITY/FY_2024_Arizona_State_University_Continuing_Disclosure_Undertaking_for_the_year_ended_06_30_2024__160_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.12s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:02<00:00,  2.97s/it]
Extracting files: 100%|█████████████████████████████| 1/1 [00:09<00:00,  9.90s/it]


Extracting data from BRADLEY_UNIVERSITY/Annual_Financial_Information_and_Operating_Data__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__227_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.15s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.07it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:09<00:00,  9.28s/it]


Extracting data from BRADLEY_UNIVERSITY/Audited_Financial_Statements_or_ACFR__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__541_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.60s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.36it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:16<00:00, 16.57s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__15.6_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:07<00:00,  7.54s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.55it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:17<00:00, 17.68s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_CSU_Combined_Financial_Stmts_6-30-24_for_the_year_ended_06_30_2024__308_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.26s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.54it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:11<00:00, 11.59s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__1.2_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.15s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.39it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:26<00:00, 26.26s/it]


Extracting data from CALIFORNIA_STATE_UNIVERSITY/Exhibit_1_DRAFT_UNAUDITED_CSU_Combined_Financial_Statements_6-30-24_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.79s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.49it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:08<00:00,  8.62s/it]


Extracting data from CORNELL_UNIVERSITY/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__788_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.37s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.52it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:12<00:00, 12.25s/it]


Extracting data from CORNELL_UNIVERSITY/2024_Operating_Data_for_the_year_ended_06_30_2024__109_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.11it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.38it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:09<00:00,  9.44s/it]


Extracting data from CORNELL_UNIVERSITY/Incorporate_OS_by_Reference_as_of_04_25_2024__2.4_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.79s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.13it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:14<00:00, 14.62s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Annual_Report_-_Corrected_for_the_year_ended_05_31_2024__130_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.69s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.22it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:11<00:00, 11.90s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Annual_Report_for_the_year_ended_05_31_2024__129_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.12s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.37it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:09<00:00,  9.64s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Audited_Financial_Statements_for_the_year_ended_05_31_2024__277_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.25s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.79it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.78s/it]


Extracting data from GANNON_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__786_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.39s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.19it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.67s/it]


Extracting data from GANNON_UNIVERSITY/Continued_Disclosures_Fall_2024_for_the_year_ended_06_30_2024_Document1__203_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:08<00:00,  8.55s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.60it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:08<00:00,  8.61s/it]


Extracting data from LEWIS_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__430_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.91s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.96it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:14<00:00, 14.31s/it]


Extracting data from LEWIS_UNIVERSITY/Continuing_Disclosure_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.07s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.05it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:12<00:00, 12.39s/it]


Extracting data from MICHIGAN_STATE_UNIVERSITY/Updates_of_Tables_of_Operating_Information_for_the_year_ended_06_30_2024_Document1__143_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.72s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.75it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:14<00:00, 14.53s/it]


Extracting data from MICHIGAN_STATE_UNIVERSITY/Updates_of_Tables_of_Operating_Information_for_the_year_ended_06_30_2024_Document2__8.4_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:03<00:00,  3.86s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.38it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:27<00:00, 27.67s/it]


Extracting data from MOLLOY_COLLEGE/Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document1__304_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.35s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.41it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:12<00:00, 12.42s/it]


Extracting data from MOLLOY_COLLEGE/Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document2__142_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.12s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.15it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.13s/it]


Extracting data from MOUNT_ST_MARY_S_UNIVERSITY_INC/Audited_Annual_Financials_for_the_year_ended_06_30_2024_Document1__29.1_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:19<00:00, 19.38s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.67it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:23<00:00, 23.30s/it]


Extracting data from MOUNT_ST_MARY_S_UNIVERSITY_INC/Audited_Annual_Financials_for_the_year_ended_06_30_2024_Document2__5_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.59s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.31it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:26<00:00, 26.48s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Certificate_of_Compliance_-_Audit_for_the_year_ended_06_30_2024__330_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.67s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.60it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:11<00:00, 11.47s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Certificate_of_Compliance_for_the_year_ended_06_30_2024__323_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.28s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.41it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:11<00:00, 11.98s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Consolidated_Financial_Statements_New_York_University_for_the_year_ended_06_30_2024__466_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.39s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.08it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.77s/it]


Extracting data from NEW_YORK_UNIVERSITY/2024_Operating_Data_for_the_year_ended_06_30_2024__244_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.19s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:01<00:00,  1.87s/it]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.23s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Annual_Update_to_Bondholders_for_the_year_ended_06_30_2024__577_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.21s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.77it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:12<00:00, 12.23s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_03_31_2024__577_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.37s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.40it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.72s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_06_30_2024__576_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.92s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.45it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:11<00:00, 11.17s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_09_30_2024__574_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:06<00:00,  6.90s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:01<00:00,  1.07s/it]
Extracting files: 100%|█████████████████████████████| 1/1 [00:12<00:00, 12.19s/it]


Extracting data from OHIO_STATE_UNIVERSITY_THE/The_Ohio_State_University_Quarterly_Update_to_Bondholders_for_the_quarter_ended_12_31_2024__815_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.30s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.70it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:11<00:00, 11.57s/it]


Extracting data from PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE/Harvard_University_Audited_Financial_Information_for_the_year_ended_06_30_2024__10.6_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:10<00:00, 10.88s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.53it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:16<00:00, 16.85s/it]


Extracting data from PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE/Harvard_University_Financial_Report_for_the_year_ended_06_30_2024__10.6_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:05<00:00,  5.54s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.50it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:21<00:00, 21.67s/it]


Extracting data from PRESIDENT___FELLOWS_OF_HARVARD_COLLEGE/Harvard_University_Student_Applications_and_Enrollment_for_the_year_ended_06_30_2024__557_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.52s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.34it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:09<00:00,  9.83s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Annual_Compliance_Certificate__EagleBank__for_the_year_ended_06_30_2024__3_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.36s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.51it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.80s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Annual_Compliance_Certificate_for_the_year_ended_06_30_2024__2.8_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.91s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.42it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:13<00:00, 13.22s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__348_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.15s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.00it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:11<00:00, 11.32s/it]


Extracting data from STEVENSON_UNIVERSITY_INC/2024_Operating_Data_for_the_year_ended_06_30_2024__196_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.30s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.71it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:09<00:00,  9.97s/it]


Extracting data from STEVENS_INSTITUTE_OF_TECHNOLOGY/Annual_Report_for_the_year_ended_06_30_2024__216_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.61s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.23it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:13<00:00, 13.11s/it]


Extracting data from STEVENS_INSTITUTE_OF_TECHNOLOGY/Audit_Financial_Statement_for_the_year_ended_06_30_2024__626_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.07s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.32it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:12<00:00, 12.49s/it]


Extracting data from ST_LOUIS_UNIVERSITY_US/Amendment_to_Continuing_Disclosure_Undertaking_dated_01_05_2024__392_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.75s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.35it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.13s/it]


Extracting data from ST_LOUIS_UNIVERSITY_US/Audited_Financials_and_Operating_Data_for_the_year_ended_06_30_2024_Document1__561_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.61s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.41it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.17s/it]


Extracting data from ST_LOUIS_UNIVERSITY_US/Audited_Financials_and_Operating_Data_for_the_year_ended_06_30_2024_Document2__174_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.24s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.81it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:08<00:00,  8.62s/it]


Extracting data from TEXAS_A_M_UNIVERSITY/FY_2024_Continuing_Disclosure_Annual_Report_for_the_year_ended_08_31_2024__2.3_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.97s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:01<00:00,  1.16s/it]
Extracting files: 100%|█████████████████████████████| 1/1 [00:21<00:00, 21.22s/it]


Extracting data from TEXAS_A_M_UNIVERSITY/Texas_A_M_University_System_Unaudited_Annual_Financial_Reports_for_the_year_ended_08_31_2024__2_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.68s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.87it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:18<00:00, 18.27s/it]


Extracting data from UNIVERSITY_OF_COLORADO/Financial_and_Operating_Data__Fiscal_Year_2024_for_the_year_ended_06_30_2024__278_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.50s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.62it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:13<00:00, 13.05s/it]


Extracting data from UNIVERSITY_OF_MINNESOTA/Annual_Financial_Information_and_Operating_Data_for_the_year_ended_06_30_2024_Document1__1.3_MB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.62s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.28it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:19<00:00, 19.02s/it]


Extracting data from UNIVERSITY_OF_MINNESOTA/Annual_Financial_Information_and_Operating_Data_for_the_year_ended_06_30_2024_Document2__315_KB_.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.37s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.36it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:21<00:00, 21.45s/it]

All schools written to output_scrapping/all_schools.xlsx





In [20]:
# Combine all the tabs of the per-school workbook into one sheet, if wanted.
# Both paths are derived from OUTPUT_ROOT so they follow the notebook configuration.
file_path   = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")
output_path = os.path.join(OUTPUT_ROOT, "all_schools_combined.xlsx")

# sheet_name=None loads every sheet into a dict {sheet name: DataFrame};
# index_col=0 turns the first column of each sheet into the row index.
raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

school_series = {
    school: df.iloc[:, 0]                      # first (only) value column
    for school, df in raw.items()
    if df.shape[1] > 0                         # skip sheets without a value column
}

# One row per school, one column per metric.
df_comb = pd.DataFrame(school_series).T
df_comb.index.name = "School"
# Plain ASCII hyphen: the previous value used a non-breaking hyphen (U+2011),
# which looks identical but does not match "2024-2025" when searching or filtering.
df_comb.insert(0, "Year", "2024-2025")

# Optional manual corrections (disabled).
# NOTE(review): the row labels below do not match the sheet names, which come
# from the folder names (e.g. TEXAS_A_M_UNIVERSITY); adjust them before enabling.
# df_comb.loc['Texas_A&M', ['Total_Headcount','Undergraduate_Headcount']] = \
#     df_comb.loc['Texas_A&M', ['Undergraduate_Headcount','Total_Headcount']].values

# df_comb.loc['California_state_university', 'Undergraduate_Headcount'] = None
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

print("Saved:", output_path)

Saved: output_scrapping/all_schools_combined.xlsx
