In [2]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from schemas import Enrollment2024_25  #This could be adjusted through schemas.py
from dotenv import load_dotenv

In [4]:
PDF_ROOT = "university_pdfs_test"
OUTPUT_ROOT = "output_1"
os.makedirs(OUTPUT_ROOT, exist_ok=True)  
AGENT_ID = "6edffe95-2dac-4992-8f32-7c179c60850a" #Different based on your LLamaCloud account
load_dotenv() #make sure the API key is in the .env file

True

In [6]:
extractor = LlamaExtract()

#uncomment the below line if you are creating the agent for the first time
# agent = extractor.create_agent(name = "enrollment-parser-2024", data_schema=Enrollment2024_25)

agent = extractor.get_agent(id = AGENT_ID)

#uncomment the following lines if you updated the schema
agent.data_schema = Enrollment2024_25
agent.save()

No project_id provided, fetching default project.


The following two cell blocks extract all schools' info into one excel file per school.

In [6]:
def process_school(school_name, school_dir):
    combined   = {}
    first_keys = None

    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        outfile = os.path.join(OUTPUT_ROOT, f"{school_name}.xlsx")
        df.to_excel(outfile)
        print(f"Saved output to {outfile}")
    else:
        print(f"No PDF data found for {school_name}")

In [None]:
# Loop over schools
for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue
    print(f"Processing school: {school}")
    process_school(school, school_dir)

print("Extraction complete.")

The following cell block extracts all the schools' info into one excel sheet but in different tabs.

In [15]:
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")

writer = pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl")

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue

    combined   = {}
    first_keys = None
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        sheet_name = school[:31]
        df.to_excel(writer, sheet_name=sheet_name)
    else:
        print(f"No data for {school}.")

writer.close()
print(f"All schools written to {OUTPUT_FILE}")

Extracting data from ASU/P11817713-P11393130-P11833489.pdf
No data for ASU.
Extracting data from Bradley University/P21862068-P21425154-P21869099.pdf
No data for Bradley University.
Extracting data from California_state_university/P21878315-P21436983-P21882690.pdf
No data for California_state_university.
Extracting data from Cornell_university/P11799657-P11380074-P11818843.pdf
No data for Cornell_university.
Extracting data from Culinary_institute_of_America/P11790595-P11373821-P11811978.pdf
No data for Culinary_institute_of_America.
Extracting data from Gannon_university/P21859160-P21423095-P21866877.pdf
No data for Gannon_university.
Extracting data from Harvard_university/P21889042-P21444694-P21891364.pdf
No data for Harvard_university.
Extracting data from Lewis_univsersity/P11819634-P11394595-P11835096.pdf
No data for Lewis_univsersity.
Extracting data from MT_ST_MARY/Mt St Mary's fall 24 continuing disclosure.pdf
No data for MT_ST_MARY.
Extracting data from Michigan_state_univers

IndexError: At least one sheet must be visible

In [None]:
#Combine all the tabs into one sheet if wanted
file_path   = "output/all_schools.xlsx"
output_path = "output/all_schools_combined.xlsx"

raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

school_series = {
    school: df.iloc[:, 0]                      # first (only) value column
    for school, df in raw.items()
}

df_comb = pd.DataFrame(school_series).T
df_comb.index.name = "School"                 
df_comb.insert(0, "Year", "2024‑2025")

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

print("Saved:", output_path)