In [2]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from schemas import Enrollment2024_25  #This could be adjusted through schemas.py
from dotenv import load_dotenv

In [10]:
# Notebook configuration.
# PDF_ROOT is scanned for one sub-folder per school; each sub-folder holds that school's PDFs.
PDF_ROOT = "university_pdfs_test"
# OUTPUT_ROOT receives every generated Excel file; it is created here if it does not exist yet.
OUTPUT_ROOT = "output_1"
os.makedirs(OUTPUT_ROOT, exist_ok=True)
# ID of the existing extraction agent fetched below; it differs per LlamaCloud account.
AGENT_ID = "6edffe95-2dac-4992-8f32-7c179c60850a"  # replace with the agent ID from your own account
# Kept as the last expression so the cell displays the call's result.
load_dotenv()  # make sure the API key is in the .env file

True

In [12]:
# Create the LlamaExtract client with no explicit arguments.
# NOTE(review): presumably it picks up credentials from the environment loaded by load_dotenv() - confirm.
extractor = LlamaExtract()

# First-time setup only: uncomment the line below to create the agent with the
# Enrollment2024_25 schema.
# agent = extractor.create_agent(name = "enrollment-parser-2024", data_schema=Enrollment2024_25)

# Fetch the already-existing agent identified by AGENT_ID.
agent = extractor.get_agent(id = AGENT_ID)

# NOTE: the two lines below are active, so every run of this cell assigns the
# local Enrollment2024_25 schema to the agent and calls save().
# Comment them out if the schema in schemas.py has not changed.
agent.data_schema = Enrollment2024_25
agent.save()

No project_id provided, fetching default project.


The following two cells extract each school's data and save it to a separate Excel file (one workbook per school).

In [15]:
def process_school(school_name, school_dir, output_root=None, extraction_agent=None):
    """Extract every PDF in one school's folder and save the merged result to Excel.

    PDFs are processed in alphabetical order. Each extraction result is merged
    into a single ``{metric: value}`` mapping: every key reported by any PDF
    gets a row, and a non-empty value from a later PDF replaces the value
    stored so far. Empty values (``None``, ``""``, ``[]``) never overwrite
    existing data.

    Args:
        school_name: Used for the output file name (``<school_name>.xlsx``)
            and in progress messages.
        school_dir: Directory containing the school's PDF files. Files whose
            name does not end in ``.pdf`` (case-insensitive) are ignored.
        output_root: Directory the workbook is written to. Defaults to the
            notebook-level ``OUTPUT_ROOT``.
        extraction_agent: Object exposing ``extract(path)`` whose result has a
            ``data`` attribute. Defaults to the notebook-level ``agent``.

    Returns:
        None. Progress is printed; the workbook is written only when at least
        one PDF produced data.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if output_root is None:
        output_root = OUTPUT_ROOT
    if extraction_agent is None:
        extraction_agent = agent

    combined = {}

    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {fname}")
        try:
            run = extraction_agent.extract(path)
            data = run.data or {}
            for k, v in data.items():
                # Register the metric even when its value is empty so it still gets a row.
                combined.setdefault(k, None)
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            # Best effort: one failing PDF must not abort the rest of the school.
            print(f"Skipped {fname}: {err}")

    # Decide on the merged mapping itself rather than on the first PDF's keys:
    # an empty first extraction used to discard data found in later PDFs.
    if combined:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        outfile = os.path.join(output_root, f"{school_name}.xlsx")
        df.to_excel(outfile)
        print(f"Saved output to {outfile}")
    else:
        print(f"No PDF data found for {school_name}")

In [None]:
# Visit every entry of PDF_ROOT in alphabetical order and hand each
# sub-directory (one per school) to process_school; plain files are ignored.
school_entries = (
    (name, os.path.join(PDF_ROOT, name)) for name in sorted(os.listdir(PDF_ROOT))
)
for school, school_dir in school_entries:
    if os.path.isdir(school_dir):
        print(f"Processing school: {school}")
        process_school(school, school_dir)

print("Extraction complete.")

The following cell extracts every school's data into a single Excel workbook, with one sheet (tab) per school.

In [17]:
# Extract every school under PDF_ROOT and write all of them to ONE workbook,
# one sheet per school. Frames are collected first and written at the end, so
# the workbook is always closed properly and is never created empty.
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")

# Excel limits sheet names to 31 characters and forbids the characters \ / ? * [ ] :
_SHEET_NAME_MAX = 31
_SHEET_NAME_BAD_CHARS = str.maketrans({ch: "_" for ch in "\\/?*[]:"})


def _unique_sheet_name(raw_name, taken):
    """Return a valid Excel sheet name for ``raw_name`` that is not yet in ``taken``.

    Forbidden characters become ``_`` and the name is cut to 31 characters.
    If the result collides (case-insensitively) with a name already handed
    out, a ``~N`` suffix is appended so that no sheet overwrites another.
    ``taken`` is a set of lower-cased names and is updated in place.
    """
    base = raw_name.translate(_SHEET_NAME_BAD_CHARS)[:_SHEET_NAME_MAX]
    candidate = base
    attempt = 1
    while candidate.lower() in taken:
        attempt += 1
        suffix = f"~{attempt}"
        candidate = base[:_SHEET_NAME_MAX - len(suffix)] + suffix
    taken.add(candidate.lower())
    return candidate


school_frames = {}   # sheet name -> one-column DataFrame for that school
taken_sheet_names = set()

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue

    combined = {}
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run = agent.extract(path)
            data = run.data or {}
            for k, v in data.items():
                # Register the metric even when its value is empty so it still gets a row.
                combined.setdefault(k, None)
                # Empty values never overwrite data found in another PDF.
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            # Best effort: one failing PDF must not abort the whole run.
            print(f"Skipped {fname}: {err}")

    # Test the merged mapping itself: an empty first extraction used to make
    # the school look empty even when later PDFs returned data.
    if combined:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        school_frames[_unique_sheet_name(school, taken_sheet_names)] = df
    else:
        print(f"No data for {school}.")

if school_frames:
    # The context manager closes (and thereby saves) the workbook even if a write fails.
    with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
        for sheet_name, df in school_frames.items():
            df.to_excel(writer, sheet_name=sheet_name)
    print(f"All schools written to {OUTPUT_FILE}")
else:
    # A workbook needs at least one sheet, so nothing is written in this case.
    print(f"No school data extracted; {OUTPUT_FILE} was not written.")

Extracting data from ASU/P11817713-P11393130-P11833489.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  1.14it/s]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.22it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [01:11<00:00, 71.69s/it]


Extracting data from Bradley University/P21862068-P21425154-P21869099.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  1.03it/s]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  1.28it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [00:38<00:00, 38.04s/it]


Extracting data from California_state_university/P21878315-P21436983-P21882690.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  1.05it/s]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  1.94it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [00:56<00:00, 56.82s/it]


Extracting data from Cornell_university/P11799657-P11380074-P11818843.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  1.04it/s]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.87it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [00:54<00:00, 55.00s/it]


Extracting data from Culinary_institute_of_America/P11790595-P11373821-P11811978.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  1.19it/s]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.52it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [01:16<00:00, 76.59s/it]


Extracting data from Gannon_university/P21859160-P21423095-P21866877.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  1.10it/s]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.76it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [00:38<00:00, 38.40s/it]


Extracting data from Harvard_university/P21889042-P21444694-P21891364.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:01<00:00,  1.18s/it]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  1.63it/s]
Extracting files: 100%|██████████████████████████████████| 1/1 [02:21<00:00, 141.14s/it]


Extracting data from Lewis_univsersity/P11819634-P11394595-P11835096.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  1.04it/s]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.06it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [01:25<00:00, 85.86s/it]


Extracting data from MT_ST_MARY/Mt St Mary's fall 24 continuing disclosure.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:02<00:00,  2.25s/it]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.97it/s]
Extracting files: 100%|██████████████████████████████████| 1/1 [02:19<00:00, 139.24s/it]


Extracting data from Michigan_state_university/P21870305-P21430806-P21875444.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:01<00:00,  1.15s/it]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.20it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [00:40<00:00, 40.23s/it]


Extracting data from Molloy_college/P21874771-P21434198-P21879428.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  1.06it/s]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  1.96it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [01:06<00:00, 66.87s/it]


Extracting data from New_York_University/P11812334-P11389223-P11829036.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:01<00:00,  1.07s/it]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  1.47it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [01:05<00:00, 65.26s/it]


Extracting data from Ohio_state/P21875437-P21434721-P21880042.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:01<00:00,  1.48s/it]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  1.63it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [01:23<00:00, 83.48s/it]


Extracting data from ST_Louis_univ/P21874807-P21434222-P21879459.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  1.00it/s]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.62it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [00:38<00:00, 38.29s/it]


Extracting data from Stevenson/P21870783-P21431172-P21875933.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:01<00:00,  1.05s/it]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.71it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [01:25<00:00, 85.44s/it]


Extracting data from Texas_A&M/P21898799-P21452091-P21899985.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:02<00:00,  2.15s/it]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  1.33it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [01:25<00:00, 85.33s/it]


Extracting data from The_catholic_university_of_america/P21849198-P11371813-P21859547.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:01<00:00,  1.18s/it]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  1.63it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [00:55<00:00, 55.19s/it]


Extracting data from University_of_colorado/P21915816-P21464707-P21913817.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:01<00:00,  1.09s/it]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.50it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [00:55<00:00, 55.81s/it]


Extracting data from University_of_minesota/P11814086-P21430908-P11830336.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  1.03it/s]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.40it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [00:49<00:00, 49.55s/it]


Extracting data from stevens_institue_of_technology/P21870185-P21430712-P21875325.pdf


Uploading files: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  1.03it/s]
Creating extraction jobs: 100%|███████████████████████████| 1/1 [00:00<00:00,  2.46it/s]
Extracting files: 100%|███████████████████████████████████| 1/1 [00:31<00:00, 31.78s/it]

All schools written to output_1/all_schools.xlsx





In [21]:
# Combine all the tabs of all_schools.xlsx into one sheet (one row per school), if wanted.
# Paths are derived from OUTPUT_ROOT so this cell follows the configured output folder.
file_path   = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")
output_path = os.path.join(OUTPUT_ROOT, "all_schools_combined.xlsx")

# sheet_name=None loads every sheet into a dict {sheet name: DataFrame};
# the first column of each sheet (the metric names) becomes the index.
raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

school_series = {
    school: df.iloc[:, 0]                      # first (only) value column
    for school, df in raw.items()
    if df.shape[1] > 0                         # skip sheets that have no value column
}

# Series become columns; transposing gives one row per school, one column per metric.
df_comb = pd.DataFrame(school_series).T
df_comb.index.name = "School"
# Plain ASCII hyphen: the previous literal used a non-breaking hyphen (U+2011),
# which looks identical but does not compare equal to "2024-2025".
df_comb.insert(0, "Year", "2024-2025")

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

print("Saved:", output_path)

Saved: output_1/all_schools_combined.xlsx
