In [19]:
from llama_cloud_services import LlamaParse, LlamaReport, LlamaExtract
from pydantic import BaseModel, Field
from typing import Optional
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()

True

In [21]:
class Enrollment2024_25(BaseModel):
    Undergraduate_Headcount: Optional[int] = Field(
        description=(
            "Total undergraduate headcount for the 2024–2025 academic year "
            "(Different than undergraduate FTE. Sometimes you need to combine both full-time and part time)."
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc"
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023). "
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )
    Graduate_Headcount: Optional[int] = Field(
        description=(
            "Total graduate headcount for the 2024–2025 academic year "
            "(Different than graduate FTE. Sometimes you need to combine both full-time and part time), "
            "Combine enrollment across all graduate schools (e.g. Business, Education, etc.), "
            "which may be labeled “GR”, “Grad”, or “Graduate”. "
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc"
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023). "
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicabl.e"
        )
    )
    Professional_Headcount: Optional[int] = Field(
        description=(
            "Combined professional school headcount (e.g. med, law) for 2024–2025 "
            "(Different than professional FTE. Sometimes you need to combine both full-time and part time)."
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc"
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023). "
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )
    Total_Headcount: Optional[int] = Field(
        description=(
            "Overall student headcount for 2024–2025 "
            "(if not given, sum undergrad + grad + professional headcounts)."
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc"
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023). "
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )

    Undergraduate_FTE: Optional[int] = Field(
        description=(
            "Undergraduate full-time headcount or FTE for 2024–2025 "
            "(may appear as “FT”, “Full-Time”, or “FTE”)."
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc"
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023). "
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )
    Graduate_FTE: Optional[int] = Field(
        description=(
            "Graduate full-time headcount or FTE for 2024–2025 "
            "(may appear as “FT”, “Full-Time”, or “FTE”)."
            "Combine enrollment across all graduate schools (e.g. Business, Education, etc.), "
            "which may be labeled “GR”, “Grad”, or “Graduate”. "
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc"
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023). "
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )
    Professional_FTE: Optional[int] = Field(
        description=(
            "Professional school full-time headcount or FTE for 2024–2025 "
            "(may appear as “FT”, “Full-Time”, or “FTE”)."
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc"
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023). "
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )
    Total_Full_Time_Equivalent_Students: Optional[int] = Field(
        description=(
            "Total full-time equivalent students (FTE) for the 2024–2025 academic year."
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc"
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023). "
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )

    Applications_Rcvd: Optional[int] = Field(
        description=(
            "Total applications received for the 2024–2025 cycle. "
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc.; "
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023)."
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )
    
    Acceptances: Optional[int] = Field(
        description=(
            "Total acceptances for the 2024–2025 cycle. "
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc.; "
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023)."
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )
    Matriculants: Optional[int] = Field(
        description=(
            "Number of students who matriculated in Fall 2024 / 2024–2025. "
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc.; "
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023)."
        )
    )
    Retention_Rate: Optional[float] = Field(
        description=(
            "Retention rate % for the 2024–2025 entering class. "
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc.; "
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023)."
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )

    Full_Time_Employee_Equivalents: Optional[int] = Field(
        description=(
            "Full-time employee equivalents (staff/faculty) in 2024–2025. "
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc.; "
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023)."
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )
    Tuition: Optional[int] = Field(
        description=(
            "Undergraduate tuition rate for the 2024–2025 academic year. "
            "This is different than revenue generated by tuition or any financial accounting data"
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc.; "
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023)."
            "it's possible for a school to have multiple campuses, so average the tuition per student per campus"
        )
    )
    Room_and_Board_20_meals: Optional[int] = Field(
        description=(
            "Room & board cost (20-meal plan) for the 2024–2025 year. "
            "Only extract data for the 2024–2025 year or terms labeled Fall 2024, etc.; "
            "ignore any data from other years or terms (e.g., 2023–2024, Fall 2023)."
            "it's possible for a school to have multiple campuses, so combine all campuses' count if applicable."
        )
    )

In [23]:
extractor = LlamaExtract()

# agent = extractor.create_agent(name = "enrollment-parser-2024", data_schema=Enrollment2024)
agent =extractor.get_agent(id = '711c6031-269c-4c51-9f10-d92990c296fe')

No project_id provided, fetching default project.


Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.07it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.66it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:05<00:00,  5.56s/it]
Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.77s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.76it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.91s/it]
Uploading files: 100%|██████████████████████████████| 1/1 [00:01<00:00,  1.07s/it]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  1.29it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:02<00:00,  2.63s/it]
Uploading files: 100%|██████████████████████████████| 1/1 [00:00<00:00,  1.20it/s]
Creating extraction jobs: 100%|█████████████████████| 1/1 [00:00<00:00,  2.87it/s]
Extracting files: 100%|█████████████████████████████| 1/1 [00:03<00:00,  3.87s/it]
Uplo

In [24]:
agent.data_schema = Enrollment2024_25

agent.save()

In [26]:
pdf_root = "university_pdfs_test"
output_root = "output"
for school in os.listdir(pdf_root):
    school_dir = os.path.join(pdf_root, school)
    if not os.path.isdir(school_dir):
        continue

    print(f"Processing school: {school}")

    combined = {}
    first_keys = None

    for fname in os.listdir(school_dir):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print("   •", fname)
        try:
            run = agent.extract(path)
            data = run.data

            if first_keys is None:
                first_keys = list(data.keys())
                combined = {k: None for k in first_keys}

            for k, v in data.items():
                if v is not None:
                    combined[k] = v

        except Exception as e:
            print("     skipped:", e)

    if first_keys is not None:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024 - 25"])
        df.index.name = "Metric"

        out_path = os.path.join(output_root, f"{school}.xlsx")
        df.to_excel(out_path)
        print("     saved", out_path)

    else:
        print("     no PDFs processed for", school)

Processing school: University_of_minesota
   • P11814086-P21430908-P11830336.pdf
     saved output/University_of_minesota.xlsx
Processing school: Texas_A&M
   • P21898799-P21452091-P21899985.pdf
     saved output/Texas_A&M.xlsx
Processing school: stevens_institue_of_technology
   • P21870185-P21430712-P21875325.pdf
     saved output/stevens_institue_of_technology.xlsx
Processing school: Michigan_state_university
   • P21870305-P21430806-P21875444.pdf
     saved output/Michigan_state_university.xlsx
Processing school: California_state_university
   • P21878315-P21436983-P21882690.pdf
     saved output/California_state_university.xlsx
Processing school: Lewis_univsersity
   • P11819634-P11394595-P11835096.pdf
     saved output/Lewis_univsersity.xlsx
Processing school: ST_Louis_univ
   • P21874807-P21434222-P21879459.pdf
     saved output/ST_Louis_univ.xlsx
Processing school: Bradley University
   • P21862068-P21425154-P21869099.pdf
     saved output/Bradley University.xlsx
Processing sch