In [1]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from schemas import Enrollment2024_25  #This could be adjusted through schemas.py
from dotenv import load_dotenv

In [13]:
PDF_ROOT = "university_pdfs_hy_e_1"
OUTPUT_ROOT = "output_scrapping"
os.makedirs(OUTPUT_ROOT, exist_ok=True)  
AGENT_ID = "ca221e4c-b3b2-4bf1-8862-d26016c9943a" #Different based on your LLamaCloud account
load_dotenv() #make sure the API key is in the .env file

True

In [5]:
extractor = LlamaExtract(project_id = '8c10e62e-3810-4193-915d-d2d11105826d')

#uncomment the below line if you are creating the agent for the first time
# agent = extractor.create_agent(name = "enrollment-parser-2024", data_schema=Enrollment2024_25)

agent = extractor.get_agent(id = AGENT_ID)

#uncomment the following lines if you updated the schema
agent.data_schema = Enrollment2024_25
agent.save()
agent = extractor.get_agent(id = AGENT_ID)


In [6]:
agent.data_schema

{'additionalProperties': False,
 'description': 'Statement of Cash Flows for the fiscal year 2024 or 2023–2024.\nOnly extract data from the 2023–2024 fiscal period (e.g. statements labeled ‘Fiscal Year 2024’ or date ranges covering 2023–2024).\nIgnore any figures outside this period.',
 'properties': {'Undergraduate_Headcount': {'anyOf': [{'type': 'integer'},
    {'type': 'null'}],
   'description': "Total undergraduate headcount for the 2024–2025 academic year (Different than undergraduate FTE. Sometimes you need to combine both full-time and part time).Search around the tables to locate what type of enrollment information it is.Only extract data for the 2024–2025 year or terms labeled Fall 2024, etcignore any data from other years or terms (e.g. 2023, 2023–2024, Fall 2023, Fall 2022, 2022). it's possible for a school to have multiple campuses, so combine all campuses' count or online and in-person count if applicable.If it didn't specify what kind of headcount is it, do not assume it

The following two cell blocks extract all schools' info into one excel file per school.

In [None]:
def process_school(school_name, school_dir):
    combined   = {}
    first_keys = None

    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        outfile = os.path.join(OUTPUT_ROOT, f"{school_name}.xlsx")
        df.to_excel(outfile)
        print(f"Saved output to {outfile}")
    else:
        print(f"No PDF data found for {school_name}")

In [None]:
# Loop over schools
for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue
    print(f"Processing school: {school}")
    process_school(school, school_dir)

print("Extraction complete.")

The following cell block extracts all the schools' info into one excel sheet but in different tabs.

In [18]:
OUTPUT_FILE

'output_scrapping/all_schools.xlsx'

In [20]:
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools.xlsx")

writer = pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl")

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir):
        continue

    combined   = {}
    first_keys = None
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=["2024-25"])
        df.index.name = "Metric"
        sheet_name = school[:31]
        df.to_excel(writer, sheet_name=sheet_name)
    else:
        print(f"No data for {school}.")

writer.close()
print(f"All schools written to {OUTPUT_FILE}")

Extracting data from KANSAS_CITY_UNIVERSITY_OF_MEDICINE_AND_BIOSCIENCES/FY24_Bond_Compliance_Annual_Report_for_the_year_ended_06_30_2024_Document2__316_KB_.pdf


BadZipFile: File is not a zip file

In [None]:
#Combine all the tabs into one sheet if wanted
file_path   = "output_scrapping/all_schools.xlsx"
output_path = "output_scrapping/all_schools_combined.xlsx"

raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

school_series = {
    school: df.iloc[:, 0]                      # first (only) value column
    for school, df in raw.items()
}

df_comb = pd.DataFrame(school_series).T
df_comb.index.name = "School"                 
df_comb.insert(0, "Year", "2024‑2025")

# df_comb.loc['Texas_A&M', ['Total_Headcount','Undergraduate_Headcount']] = \
#     df_comb.loc['Texas_A&M', ['Undergraduate_Headcount','Total_Headcount']].values

# df_comb.loc['California_state_university', 'Undergraduate_Headcount'] = None 
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

print("Saved:", output_path)