In [2]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from schemas import Enrollment2024_25  #This could be adjusted through schemas.py
from dotenv import load_dotenv

In [3]:
PDF_ROOT = "university_pdfs_test"
OUTPUT_ROOT = "output_1"
os.makedirs(OUTPUT_ROOT, exist_ok=True)  
AGENT_ID = "ca221e4c-b3b2-4bf1-8862-d26016c9943a" #Different based on your LLamaCloud account
load_dotenv() #make sure the API key is in the .env file

True

In [14]:
extractor = LlamaExtract(project_id = '8c10e62e-3810-4193-915d-d2d11105826d')

#uncomment the below line if you are creating the agent for the first time
agent = extractor.create_agent(name = "enrollment-parser-2024", data_schema=Enrollment2024_25)

agent = extractor.get_agent(id = AGENT_ID)

#uncomment the following lines if you updated the schema
agent.data_schema = Enrollment2024_25
agent.save()
agent = extractor.get_agent(id = AGENT_ID)


In [16]:
agent.data_schema

{'additionalProperties': False,
 'properties': {'Undergraduate_Headcount': {'anyOf': [{'type': 'integer'},
    {'type': 'null'}],
   'description': "Total undergraduate headcount for the 2024–2025 academic year (Different than undergraduate FTE. Sometimes you need to combine both full-time and part time).Search around the tables to locate what type of enrollment information it is.Ex: Texas A&M's first table is just says 'total enrollment headcount', but if you look around the table, it's for combination of undergraduate and graduate, so it doesn't belong to this undergraduate_headcount, it should be total headcount.DO NOT include the first table information on Texas A&M for this field, it should be for total headcount.Only extract data for the 2024–2025 year or terms labeled Fall 2024, etcignore any data from other years or terms (e.g., 2023–2024, Fall 2023). it's possible for a school to have multiple campuses, so combine all campuses' count or online and in-person count if applicable

The following two cell blocks extract all schools' info into one excel file per school.

In [31]:
def process_school(school_name, school_dir):
    combined   = {}
    first_keys = None

    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        outfile = os.path.join(OUTPUT_ROOT, f"{school_name}.xlsx")
        df.to_excel(outfile)
        print(f"Saved output to {outfile}")
    else:
        print(f"No PDF data found for {school_name}")

In [None]:
# Loop over schools
for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue
    print(f"Processing school: {school}")
    process_school(school, school_dir)

print("Extraction complete.")

The following cell block extracts all the schools' info into one excel sheet but in different tabs.

In [10]:
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")

writer = pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl")

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue

    combined   = {}
    first_keys = None
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        sheet_name = school[:31]
        df.to_excel(writer, sheet_name=sheet_name)
    else:
        print(f"No data for {school}.")

writer.close()
print(f"All schools written to {OUTPUT_FILE}")

Extracting data from ASU/P11817713-P11393130-P11833489.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.34it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.39it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:15<00:00, 15.98s/it]


Extracting data from Bradley University/P21862068-P21425154-P21869099.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.30s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.28it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:16<00:00, 16.33s/it]


Extracting data from California_state_university/P21878315-P21436983-P21882690.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.09s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.08it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:14<00:00, 14.44s/it]


Extracting data from Cornell_university/P11799657-P11380074-P11818843.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.01s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:01<00:00,  1.49s/it]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.70s/it]


Extracting data from Culinary_institute_of_America/P11790595-P11373821-P11811978.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.30it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.72it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:11<00:00, 11.31s/it]


Extracting data from Gannon_university/P21859160-P21423095-P21866877.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.04it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.67it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.38s/it]


Extracting data from Harvard_university/P21889042-P21444694-P21891364.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.08s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.11it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:10<00:00, 10.50s/it]


Extracting data from Lewis_univsersity/P11819634-P11394595-P11835096.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.08it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.44it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:11<00:00, 11.55s/it]


Extracting data from MT_ST_MARY/Mt St Mary's fall 24 continuing disclosure.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.44s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.25it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:39<00:00, 39.04s/it]


Extracting data from Michigan_state_university/P21870305-P21430806-P21875444.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.38it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.90it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:14<00:00, 14.04s/it]


Extracting data from Molloy_college/P21874771-P21434198-P21879428.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.32it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:04<00:00,  4.53s/it]
Extracting files: 100%|█████████████████████████████| 1/1 [00:07<00:00,  7.82s/it]


Extracting data from New_York_University/P11812334-P11389223-P11829036.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.08s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.05it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:09<00:00,  9.53s/it]


Extracting data from Ohio_state/P21875437-P21434721-P21880042.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.11it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.55it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.96s/it]


No data for Ohio_state.
Extracting data from ST_Louis_univ/P21874807-P21434222-P21879459.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.65s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.54it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:01<00:00,  1.74s/it]


No data for ST_Louis_univ.
Extracting data from Stevenson/P21870783-P21431172-P21875933.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.23it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.25it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:01<00:00,  1.63s/it]


No data for Stevenson.
Extracting data from Texas_A&M/P21898799-P21452091-P21899985.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.26s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.71it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.73s/it]


No data for Texas_A&M.
Extracting data from The_catholic_university_of_america/P21849198-P11371813-P21859547.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.18it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.06it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:01<00:00,  1.44s/it]


No data for The_catholic_university_of_america.
Extracting data from University_of_colorado/P21915816-P21464707-P21913817.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.19it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.88it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:01<00:00,  1.49s/it]


No data for University_of_colorado.
Extracting data from University_of_minesota/P11814086-P21430908-P11830336.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.05it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.28it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:01<00:00,  1.77s/it]


No data for University_of_minesota.
Extracting data from stevens_institue_of_technology/P21870185-P21430712-P21875325.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.14it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.34it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:04<00:00,  4.03s/it]

No data for stevens_institue_of_technology.
All schools written to output_1/all_schools.xlsx





In [11]:
#Combine all the tabs into one sheet if wanted
file_path   = "output_1/all_schools.xlsx"
output_path = "output_1/all_schools_combined.xlsx"

raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

school_series = {
    school: df.iloc[:, 0]                      # first (only) value column
    for school, df in raw.items()
}

df_comb = pd.DataFrame(school_series).T
df_comb.index.name = "School"                 
df_comb.insert(0, "Year", "2024‑2025")

df_comb.loc['Texas_A&M', ['Total_Headcount','Undergraduate_Headcount']] = \
    df_comb.loc['Texas_A&M', ['Undergraduate_Headcount','Total_Headcount']].values

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

print("Saved:", output_path)

KeyError: 'Texas_A&M'