In [1]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from schemas import Enrollment2024_25  #This could be adjusted through schemas.py
from dotenv import load_dotenv

In [2]:
# --- Notebook configuration ---------------------------------------------------
# Extraction agent to reuse; this ID differs per LlamaCloud account.
AGENT_ID = "ca221e4c-b3b2-4bf1-8862-d26016c9943a"

# Input root: each sub-folder is treated as one school and holds its PDFs.
PDF_ROOT = "university_pdfs_test"

# Output root for the generated Excel files; created on first run.
OUTPUT_ROOT = "output_4"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# Read the .env file into the environment (the API key must be defined there).
load_dotenv()

True

In [3]:
# Explicit switches replace the previous "comment / uncomment these lines" workflow.
CREATE_AGENT  = False  # set True only for the very first run, when no agent exists yet
UPDATE_SCHEMA = True   # set True after editing schemas.py to push the schema to the agent

extractor = LlamaExtract(project_id = '8c10e62e-3810-4193-915d-d2d11105826d')

if CREATE_AGENT:
    # First-time setup: the agent is created with the current schema, so no
    # separate schema push is needed. It must NOT be re-fetched through
    # AGENT_ID here, because AGENT_ID still refers to a different agent.
    agent = extractor.create_agent(name = "enrollment-parser-2024", data_schema=Enrollment2024_25)
    print(f"Created agent {agent.id}; copy this ID into AGENT_ID and set CREATE_AGENT = False.")
else:
    agent = extractor.get_agent(id = AGENT_ID)

    if UPDATE_SCHEMA:
        agent.data_schema = Enrollment2024_25
        agent.save()
        # Re-fetch after saving so later cells use the agent as it is stored.
        agent = extractor.get_agent(id = AGENT_ID)


In [5]:
# Display the schema currently attached to the agent (shown as the cell output).
agent.data_schema

{'additionalProperties': False,
 'properties': {'Undergraduate_Headcount': {'anyOf': [{'type': 'integer'},
    {'type': 'null'}],
   'description': "Total undergraduate headcount for the 2024–2025 academic year (Different than undergraduate FTE. Sometimes you need to combine both full-time and part time).Search around the tables to locate what type of enrollment information it is.Only extract data for the 2024–2025 year or terms labeled Fall 2024, etcignore any data from other years or terms (e.g. 2023, 2023–2024, Fall 2023, Fall 2022, 2022). it's possible for a school to have multiple campuses, so combine all campuses' count or online and in-person count if applicable.If it didn't specify what kind of headcount is it, do not assume it's undergraduate headcount!!!Combine online and in-person if applicable.look around the table to see what type of data is it Do not derive or hallucinate the data unless the field is actually in the document."},
  'Undergraduate_Headcount_Full_Time': {'an

The following two cells extract each school's data into its own Excel file (one workbook per school).

In [10]:
def process_school(school_name, school_dir, extraction_agent=None, output_root=None):
    """Extract every PDF of one school and save the merged metrics to an Excel file.

    Each ``*.pdf`` file (case-insensitive) in ``school_dir`` is sent to the
    extraction agent in alphabetical order. The per-file results are merged
    into one mapping: every key seen in any result becomes a row, and a later
    file overrides an earlier value only when its own value is not empty
    (``None``, ``""`` or ``[]``). A file whose extraction raises is reported
    and skipped; it does not abort the remaining files.

    Args:
        school_name: Label of the school; used as the output file name
            (``<school_name>.xlsx``) and in progress messages.
        school_dir: Directory containing the school's PDF files.
        extraction_agent: Object exposing ``extract(path)`` whose result has a
            ``data`` attribute. Defaults to the notebook-level ``agent``.
        output_root: Directory receiving the Excel file. Defaults to the
            notebook-level ``OUTPUT_ROOT``.

    Returns:
        None. The result is written to disk and progress is printed.
    """
    # Resolve the notebook globals lazily so two-argument calls keep working.
    if extraction_agent is None:
        extraction_agent = agent
    if output_root is None:
        output_root = OUTPUT_ROOT

    combined = {}

    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {fname}")
        try:
            run  = extraction_agent.extract(path)
            data = run.data or {}
            for k, v in data.items():
                # Register every key, so metrics that stay empty still get a row
                # and an empty result from the first PDF cannot hide later data.
                combined.setdefault(k, None)
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            # Best effort: one failing PDF must not stop the rest of the school.
            print(f"Skipped {fname}: {err}")

    if not combined:
        print(f"No PDF data found for {school_name}")
        return

    df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
    df.index.name = "Metric"
    outfile = os.path.join(output_root, f"{school_name}.xlsx")
    df.to_excel(outfile)
    print(f"Saved output to {outfile}")

In [12]:
# Visit the entries of PDF_ROOT alphabetically; only folders count as schools.
school_dirs = [
    (entry, os.path.join(PDF_ROOT, entry))
    for entry in sorted(os.listdir(PDF_ROOT))
]

for school, school_dir in school_dirs:
    if os.path.isdir(school_dir):
        print(f"Processing school: {school}")
        process_school(school, school_dir)

print("Extraction complete.")

Processing school: ASU
Extracting data from P11817713-P11393130-P11833489.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.30s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.63it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.78s/it]


Saved output to output_2/ASU.xlsx
Processing school: Bradley University
Extracting data from P21862068-P21425154-P21869099.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.15s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.04it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.82s/it]


Saved output to output_2/Bradley University.xlsx
Processing school: California_state_university
Extracting data from P21878315-P21436983-P21882690.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.21s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.86it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.36s/it]


Saved output to output_2/California_state_university.xlsx
Processing school: Cornell_university
Extracting data from P11799657-P11380074-P11818843.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.17s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.05it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.98s/it]


Saved output to output_2/Cornell_university.xlsx
Processing school: Culinary_institute_of_America
Extracting data from P11790595-P11373821-P11811978.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.09it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.52it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.22s/it]


Saved output to output_2/Culinary_institute_of_America.xlsx
Processing school: Gannon_university
Extracting data from P21859160-P21423095-P21866877.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.04s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.93it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.41s/it]


Saved output to output_2/Gannon_university.xlsx
Processing school: Harvard_university
Extracting data from P21889042-P21444694-P21891364.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.27s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.27it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:14<00:00, 14.18s/it]


Saved output to output_2/Harvard_university.xlsx
Processing school: Lewis_univsersity
Extracting data from P11819634-P11394595-P11835096.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.07s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.44it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.58s/it]


Saved output to output_2/Lewis_univsersity.xlsx
Processing school: MT_ST_MARY
Extracting data from Mt St Mary's fall 24 continuing disclosure.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:03<00:00,  3.20s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.05it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:26<00:00, 26.54s/it]


Saved output to output_2/MT_ST_MARY.xlsx
Processing school: Michigan_state_university
Extracting data from P21870305-P21430806-P21875444.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.28s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.70it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.03s/it]


Saved output to output_2/Michigan_state_university.xlsx
Processing school: Molloy_college
Extracting data from P21874771-P21434198-P21879428.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.03it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.04it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:17<00:00, 17.10s/it]


Saved output to output_2/Molloy_college.xlsx
Processing school: New_York_University
Extracting data from P11812334-P11389223-P11829036.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.10s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.05it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:13<00:00, 13.25s/it]


Saved output to output_2/New_York_University.xlsx
Processing school: Ohio_state
Extracting data from P21875437-P21434721-P21880042.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.56s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.06it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.86s/it]


Saved output to output_2/Ohio_state.xlsx
Processing school: ST_Louis_univ
Extracting data from P21874807-P21434222-P21879459.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.31s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.04it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.46s/it]


Saved output to output_2/ST_Louis_univ.xlsx
Processing school: Stevenson
Extracting data from P21870783-P21431172-P21875933.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.03it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.27it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.97s/it]


Saved output to output_2/Stevenson.xlsx
Processing school: Texas_A&M
Extracting data from P21898799-P21452091-P21899985.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.24s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.23it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:16<00:00, 16.30s/it]


Saved output to output_2/Texas_A&M.xlsx
Processing school: The_catholic_university_of_america
Extracting data from P21849198-P11371813-P21859547.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.03it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.42it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.01s/it]


Saved output to output_2/The_catholic_university_of_america.xlsx
Processing school: University_of_colorado
Extracting data from P21915816-P21464707-P21913817.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.05s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.69it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.10s/it]


Saved output to output_2/University_of_colorado.xlsx
Processing school: University_of_minesota
Extracting data from P11814086-P21430908-P11830336.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.10s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.05it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.99s/it]


Saved output to output_2/University_of_minesota.xlsx
Processing school: stevens_institue_of_technology
Extracting data from P21870185-P21430712-P21875325.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.04s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.11it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.93s/it]

Saved output to output_2/stevens_institue_of_technology.xlsx
Extraction complete.



Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.08it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:02<00:00,  2.93s/it]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.15s/it]
Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.30s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.15it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:17<00:00, 17.84s/it]
Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.05s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.81it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.77s/it]
Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.04it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.45it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:14<00:00, 14.96s/it]
Uploading files: 100%|█████

The following cell extracts all schools' data into a single Excel workbook, with one sheet (tab) per school.

In [10]:
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")

# Excel limits worksheet names to 31 characters.
MAX_SHEET_NAME_LEN = 31

# Collect one single-column frame per school first. The workbook is only
# opened once there is something to write, because closing a workbook that
# has no sheets fails.
school_frames = {}

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue

    combined = {}
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            for k, v in data.items():
                # Register every key, so metrics that stay empty still get a row
                # and an empty result from the first PDF cannot hide later data.
                combined.setdefault(k, None)
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            # Best effort: one failing PDF must not stop the remaining files.
            print(f"Skipped {fname}: {err}")

    if not combined:
        print(f"No data for {school}.")
        continue

    df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
    df.index.name = "Metric"

    # Truncate to the Excel limit. If two folders share the same first 31
    # characters, add a numeric tag so they do not end up in the same sheet.
    sheet_name = school[:MAX_SHEET_NAME_LEN]
    suffix = 2
    while sheet_name in school_frames:
        tag = f"~{suffix}"
        sheet_name = school[:MAX_SHEET_NAME_LEN - len(tag)] + tag
        suffix += 1
    school_frames[sheet_name] = df

if school_frames:
    # The context manager saves and closes the workbook even if a write fails.
    with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
        for sheet_name, df in school_frames.items():
            df.to_excel(writer, sheet_name=sheet_name)
    print(f"All schools written to {OUTPUT_FILE}")
else:
    print(f"No data extracted; {OUTPUT_FILE} was not written.")

Extracting data from ASU/P11817713-P11393130-P11833489.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.31s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.05it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.55s/it]


Extracting data from Bradley University/P21862068-P21425154-P21869099.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.01s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.51it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.79s/it]


Extracting data from California_state_university/P21878315-P21436983-P21882690.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.19s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.04it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.52s/it]


Extracting data from Cornell_university/P11799657-P11380074-P11818843.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.21it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.63it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.93s/it]


Extracting data from Culinary_institute_of_America/P11790595-P11373821-P11811978.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.08s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.03it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.05s/it]


Extracting data from Gannon_university/P21859160-P21423095-P21866877.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.10it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.19it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.06s/it]


Extracting data from Harvard_university/P21889042-P21444694-P21891364.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.24s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.31it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.72s/it]


Extracting data from Lewis_univsersity/P11819634-P11394595-P11835096.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.15s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.04it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:13<00:00, 13.95s/it]


Extracting data from MT_ST_MARY/Mt St Mary's fall 24 continuing disclosure.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:02<00:00,  2.89s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.52it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:23<00:00, 23.98s/it]


Extracting data from Michigan_state_university/P21870305-P21430806-P21875444.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:00<00:00,  1.05it/s]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.05it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:08<00:00,  8.78s/it]


Extracting data from Molloy_college/P21874771-P21434198-P21879428.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.02s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.04it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:07<00:00,  7.75s/it]


Extracting data from New_York_University/P11812334-P11389223-P11829036.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.33s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.04it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:10<00:00, 10.10s/it]


Extracting data from Ohio_state/P21875437-P21434721-P21880042.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.09s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.09it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:11<00:00, 11.27s/it]


Extracting data from ST_Louis_univ/P21874807-P21434222-P21879459.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.19s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.04it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:12<00:00, 12.79s/it]


Extracting data from Stevenson/P21870783-P21431172-P21875933.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.31s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.05it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.02s/it]


Extracting data from Texas_A&M/P21898799-P21452091-P21899985.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.79s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.47it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:24<00:00, 24.10s/it]


Extracting data from The_catholic_university_of_america/P21849198-P11371813-P21859547.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.38s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.28it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:15<00:00, 15.43s/it]


Extracting data from University_of_colorado/P21915816-P21464707-P21913817.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.28s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.05it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:30<00:00, 30.81s/it]


Extracting data from University_of_minesota/P11814086-P21430908-P11830336.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.04s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  1.63it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:17<00:00, 17.50s/it]


Extracting data from stevens_institue_of_technology/P21870185-P21430712-P21875325.pdf


Uploading files: 100%|████████████████████████████| 1/1 [00:01<00:00,  1.05s/it]
Creating extraction jobs: 100%|███████████████████| 1/1 [00:00<00:00,  2.12it/s]
Extracting files: 100%|███████████████████████████| 1/1 [00:09<00:00,  9.86s/it]

All schools written to output_4/all_schools.xlsx





In [11]:
# Combine all the tabs into one sheet (optional step).
# Paths are derived from OUTPUT_ROOT so they follow the configuration cell.
file_path   = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")
output_path = os.path.join(OUTPUT_ROOT, "all_schools_combined.xlsx")

# Manual correction: for these schools the two headcount values are swapped.
# NOTE(review): presumably the extraction put each value in the other field
# for these schools; confirm against the source PDFs before extending the list.
HEADCOUNT_SWAP_SCHOOLS = ["Texas_A&M"]
HEADCOUNT_SWAP_COLUMNS = ["Total_Headcount", "Undergraduate_Headcount"]

# sheet_name=None loads every sheet into a dict of {sheet name: DataFrame}.
raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

school_series = {
    school: df.iloc[:, 0]                      # first (only) value column
    for school, df in raw.items()
}

# One row per school, one column per metric.
df_comb = pd.DataFrame(school_series).T
df_comb.index.name = "School"
# Plain ASCII hyphen: the previous literal used a non-breaking hyphen (U+2011),
# which looks identical but does not match "2024-2025" in filters or joins.
df_comb.insert(0, "Year", "2024-2025")

for school in HEADCOUNT_SWAP_SCHOOLS:
    # Guard the correction so a different set of PDFs does not raise KeyError.
    if school in df_comb.index and set(HEADCOUNT_SWAP_COLUMNS) <= set(df_comb.columns):
        df_comb.loc[school, HEADCOUNT_SWAP_COLUMNS] = \
            df_comb.loc[school, HEADCOUNT_SWAP_COLUMNS[::-1]].values
    else:
        print(f"Skipped headcount swap for {school}: sheet or columns not found.")

# Optional manual override, currently disabled:
# df_comb.loc['California_state_university', 'Undergraduate_Headcount'] = None
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

print("Saved:", output_path)

Saved: output_4/all_schools_combined.xlsx
