In [26]:
import os
import pandas as pd
from llama_extract import LlamaExtract
from schemas import Enrollment2024_25  #This could be adjusted through schemas.py
from dotenv import load_dotenv

In [28]:
# Load .env before reading any settings so values defined there (the
# LlamaCloud API key, and the optional agent-ID override below) are visible.
load_dotenv()

# Input layout: PDF_ROOT/<school name>/<one or more PDFs>.
PDF_ROOT = "university_pdfs_test"
# Every generated workbook is written under this directory.
OUTPUT_ROOT = "output"
# Agent IDs are specific to each LlamaCloud account. Set LLAMA_EXTRACT_AGENT_ID
# in the environment/.env to use your own agent without editing the notebook;
# the literal below is only the fallback.
AGENT_ID = os.getenv("LLAMA_EXTRACT_AGENT_ID", "711c6031-269c-4c51-9f10-d92990c296fe")

True

In [30]:
# Client for the extraction service; constructed with no arguments, so it
# relies on credentials already present in the environment (see load_dotenv()).
extractor = LlamaExtract()

# First-time setup only: uncomment the next line to create the agent from the
# imported schema, then put the new agent's ID into AGENT_ID. Note that the
# get_agent() call below always rebinds `agent` to the agent named by AGENT_ID.
# agent = extractor.create_agent(name="enrollment-parser-2024", data_schema=Enrollment2024_25)

agent = extractor.get_agent(id=AGENT_ID)

# Uncomment the following lines after editing the schema in schemas.py to push
# the updated schema to the existing agent.
# agent.data_schema = Enrollment2024_25
# agent.save()

No project_id provided, fetching default project.


The following two cells extract each school's data and write one Excel file per school (`<OUTPUT_ROOT>/<school>.xlsx`).

In [31]:
def process_school(school_name, school_dir):
    """Extract every PDF in ``school_dir`` and save the merged result to Excel.

    Each ``*.pdf`` file (case-insensitive, processed in sorted order) is sent
    to the module-level ``agent``. The extraction results are merged into a
    single mapping: the row set is seeded from the first non-empty result, and
    a later file overrides a metric only when it supplies a non-blank value
    (anything other than ``None``, ``""`` or ``[]``).

    The merged mapping is written to ``OUTPUT_ROOT/<school_name>.xlsx`` as a
    one-column table (index ``Metric``, column ``2024-25``). ``OUTPUT_ROOT`` is
    created if it does not exist.

    Args:
        school_name: Used for the output file name and log messages.
        school_dir: Directory containing the school's PDF files.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    combined = {}

    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {fname}")
        try:
            run = agent.extract(path)
            data = run.data or {}
            # Seed the rows from the first result that actually has keys. An
            # empty first result must not prevent later PDFs from counting.
            if not combined:
                combined = dict.fromkeys(data)
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            # Deliberately best-effort: one bad PDF must not abort the school.
            print(f"Skipped {fname}: {err}")

    if not combined:
        print(f"No PDF data found for {school_name}")
        return

    df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
    df.index.name = "Metric"
    # to_excel does not create missing parent directories.
    os.makedirs(OUTPUT_ROOT, exist_ok=True)
    outfile = os.path.join(OUTPUT_ROOT, f"{school_name}.xlsx")
    df.to_excel(outfile)
    print(f"Saved output to {outfile}")

In [16]:
# Walk PDF_ROOT in alphabetical order; every sub-directory is treated as one
# school and handed to process_school(). Entries that are not directories
# (stray files directly under PDF_ROOT) are ignored.
school_names = os.listdir(PDF_ROOT)
school_names.sort()

for school in school_names:
    school_dir = os.path.join(PDF_ROOT, school)
    if os.path.isdir(school_dir):
        print(f"Processing school: {school}")
        process_school(school, school_dir)

print("Extraction complete.")

Processing school: ASU
Extracting data from P11817713-P11393130-P11833489.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.09it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.14it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.75s/it]


Saved output to output/ASU.xlsx
Processing school: Bradley University
Extracting data from P21862068-P21425154-P21869099.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.20it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  4.00it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:04<00:00,  4.08s/it]


Saved output to output/Bradley University.xlsx
Processing school: California_state_university
Extracting data from P21878315-P21436983-P21882690.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.06s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  4.30it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.76s/it]


Saved output to output/California_state_university.xlsx
Processing school: Cornell_university
Extracting data from P11799657-P11380074-P11818843.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.43it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.71it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:05<00:00,  5.67s/it]


Saved output to output/Cornell_university.xlsx
Processing school: Culinary_institute_of_America
Extracting data from P11790595-P11373821-P11811978.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.36it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.88it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.78s/it]


Saved output to output/Culinary_institute_of_America.xlsx
Processing school: Gannon_university
Extracting data from P21859160-P21423095-P21866877.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.07it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.34it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.63s/it]


Saved output to output/Gannon_university.xlsx
Processing school: Harvard_university
Extracting data from P21889042-P21444694-P21891364.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.18s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.98it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.45s/it]


Saved output to output/Harvard_university.xlsx
Processing school: Lewis_univsersity
Extracting data from P11819634-P11394595-P11835096.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.15it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.01it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.78s/it]


Saved output to output/Lewis_univsersity.xlsx
Processing school: MT_ST_MARY
Extracting data from Mt St Mary's fall 24 continuing disclosure.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.89s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.64it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:05<00:00,  5.06s/it]


Saved output to output/MT_ST_MARY.xlsx
Processing school: Michigan_state_university
Extracting data from P21870305-P21430806-P21875444.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.21it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.22it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:04<00:00,  4.02s/it]


Saved output to output/Michigan_state_university.xlsx
Processing school: Molloy_college
Extracting data from P21874771-P21434198-P21879428.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.24it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.02it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.85s/it]


Saved output to output/Molloy_college.xlsx
Processing school: New_York_University
Extracting data from P11812334-P11389223-P11829036.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.20it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.59it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.82s/it]


Saved output to output/New_York_University.xlsx
Processing school: Ohio_state
Extracting data from P21875437-P21434721-P21880042.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.05s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  4.01it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:04<00:00,  4.94s/it]


Saved output to output/Ohio_state.xlsx
Processing school: ST_Louis_univ
Extracting data from P21874807-P21434222-P21879459.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.28it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.20it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.79s/it]


Saved output to output/ST_Louis_univ.xlsx
Processing school: Stevenson
Extracting data from P21870783-P21431172-P21875933.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.15it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.99it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.74s/it]


Saved output to output/Stevenson.xlsx
Processing school: Texas_A&M
Extracting data from P21898799-P21452091-P21899985.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.25s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.77it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:05<00:00,  5.04s/it]


Saved output to output/Texas_A&M.xlsx
Processing school: The_catholic_university_of_america
Extracting data from P21849198-P11371813-P21859547.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.11it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.03it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:04<00:00,  4.89s/it]


Saved output to output/The_catholic_university_of_america.xlsx
Processing school: University_of_colorado
Extracting data from P21915816-P21464707-P21913817.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.05it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.77it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.77s/it]


Saved output to output/University_of_colorado.xlsx
Processing school: University_of_minesota
Extracting data from P11814086-P21430908-P11830336.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.08it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.43it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:05<00:00,  5.12s/it]


Saved output to output/University_of_minesota.xlsx
Processing school: stevens_institue_of_technology
Extracting data from P21870185-P21430712-P21875325.pdf...


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.16it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.48it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.34s/it]

Saved output to output/stevens_institue_of_technology.xlsx
Extraction complete.



Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.15it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.52it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.66s/it]
Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.11it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.25it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.62s/it]
Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.03it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.78it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:04<00:00,  4.03s/it]
Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.34it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.18it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.85s/it]
Upl

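The following cell extracts every school's data into a single Excel workbook (`<OUTPUT_ROOT>/all_schools.xlsx`) with one sheet (tab) per school. Note that it runs the extraction again for every PDF; it does not reuse the per-school files written above.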

In [34]:
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")

# Excel limits sheet names to 31 characters and rejects the characters below;
# they are replaced with "_" so an unusual folder name cannot abort the write.
SHEET_NAME_MAX = 31
_BAD_SHEET_CHARS = str.maketrans(dict.fromkeys("[]:*?/\\", "_"))

# Collect one DataFrame per school first and open the workbook only once all
# (slow) extraction calls are done. This keeps the file handle short-lived and
# avoids closing a workbook that has no sheets.
school_frames = {}  # sheet name -> one-column DataFrame

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue

    combined = {}
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run = agent.extract(path)
            data = run.data or {}
            # Seed the rows from the first result that actually has keys, so
            # an empty first result does not discard data from later PDFs.
            if not combined:
                combined = dict.fromkeys(data)
            for k, v in data.items():
                # Later PDFs only override a metric with a non-blank value.
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            # Deliberately best-effort: one bad PDF must not abort the run.
            print(f"Skipped {fname}: {err}")

    if not combined:
        print(f"No data for {school}.")
        continue

    df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
    df.index.name = "Metric"

    base_name = school.translate(_BAD_SHEET_CHARS)[:SHEET_NAME_MAX]
    sheet_name = base_name
    dup = 2
    # Two long folder names can truncate to the same sheet name; add a numeric
    # suffix so one school never overwrites another.
    while sheet_name in school_frames:
        suffix = f"_{dup}"
        sheet_name = base_name[: SHEET_NAME_MAX - len(suffix)] + suffix
        dup += 1
    school_frames[sheet_name] = df

if school_frames:
    # ExcelWriter does not create missing parent directories.
    os.makedirs(OUTPUT_ROOT, exist_ok=True)
    with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
        for sheet_name, df in school_frames.items():
            df.to_excel(writer, sheet_name=sheet_name)
    print(f"All schools written to {OUTPUT_FILE}")
else:
    print(f"No school data extracted; {OUTPUT_FILE} was not written.")

Extracting data from ASU/P11817713-P11393130-P11833489.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.24it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.63it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.63s/it]


Extracting data from Bradley University/P21862068-P21425154-P21869099.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.14it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.19it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:06<00:00,  6.43s/it]


Extracting data from California_state_university/P21878315-P21436983-P21882690.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.12it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.87it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.65s/it]


Extracting data from Cornell_university/P11799657-P11380074-P11818843.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.24it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.56it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.56s/it]


Extracting data from Culinary_institute_of_America/P11790595-P11373821-P11811978.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.24it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.45it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:04<00:00,  4.30s/it]


Extracting data from Gannon_university/P21859160-P21423095-P21866877.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.24it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.32it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.68s/it]


Extracting data from Harvard_university/P21889042-P21444694-P21891364.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.19s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.21it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.63s/it]


Extracting data from Lewis_univsersity/P11819634-P11394595-P11835096.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.08it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.11it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.69s/it]


Extracting data from MT_ST_MARY/Mt St Mary's fall 24 continuing disclosure.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:06<00:00,  6.50s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.74it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.89s/it]


Extracting data from Michigan_state_university/P21870305-P21430806-P21875444.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.35it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.41it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:05<00:00,  5.06s/it]


Extracting data from Molloy_college/P21874771-P21434198-P21879428.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.34it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  4.19it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.55s/it]


Extracting data from New_York_University/P11812334-P11389223-P11829036.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.08it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.98it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.86s/it]


Extracting data from Ohio_state/P21875437-P21434721-P21880042.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.10s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.47it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.93s/it]


Extracting data from ST_Louis_univ/P21874807-P21434222-P21879459.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.04it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.65it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.82s/it]


Extracting data from Stevenson/P21870783-P21431172-P21875933.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.02s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.38it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.86s/it]


Extracting data from Texas_A&M/P21898799-P21452091-P21899985.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.67s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.45it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.75s/it]


Extracting data from The_catholic_university_of_america/P21849198-P11371813-P21859547.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.00it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.88it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.75s/it]


Extracting data from University_of_colorado/P21915816-P21464707-P21913817.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.11it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.57it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.79s/it]


Extracting data from University_of_minesota/P11814086-P21430908-P11830336.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.01s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.21it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.87s/it]


Extracting data from stevens_institue_of_technology/P21870185-P21430712-P21875325.pdf


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.16it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  3.78it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.77s/it]

All schools written to output/all_schools.xlsx





In [41]:
# Optional: flatten the per-school sheets of all_schools.xlsx into one sheet
# with one column per school.
# Paths are derived from OUTPUT_ROOT (config cell) rather than hard-coded, so
# they stay correct if the output directory is changed.
file_path = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")

# sheet_name=None loads every sheet into a dict keyed by sheet name; the first
# column of each sheet is used as the index.
sheets = pd.read_excel(file_path, sheet_name=None, index_col=0)

# Concatenating the dict side by side yields two-level column labels
# (sheet name, original column). Keep only the sheet name as the header.
df_comb = pd.concat(sheets, axis=1)
df_comb.columns = df_comb.columns.get_level_values(0)

output_path = os.path.join(OUTPUT_ROOT, "all_schools_combined.xlsx")
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")