In [1]:
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
#from schemas import Enrollment2024_25  #This could be adjusted through schemas.py
from enrollment_latest import Enrollment2024_25
#from enrollment_optimal import Enrollment2024_25
from dotenv import load_dotenv

In [2]:
# --- Extraction agent -------------------------------------------------------
# Agent ids are specific to a LlamaCloud account. Earlier agents, kept for reference:
#AGENT_ID = "ca221e4c-b3b2-4bf1-8862-d26016c9943a"  # enrollment
#AGENT_ID = "09371b77-6cdd-4fee-89db-93b1f88544f5"  # enrollment, all variables have yrs
#AGENT_ID = "1a3fb0ee-4ec7-4d73-8ccb-81dfaa1f3e01"  # enrollment, important variables have yrs
# Active agent: grabs the latest data no matter which year it belongs to.
AGENT_ID = "99a9123b-734a-462b-a3c0-2887f5e6a634"

# --- Input / output folders -------------------------------------------------
# Alternative PDF folders used previously:
#PDF_ROOT = "scraper/university_pdfs_hy_e"
#PDF_ROOT = "sample"
PDF_ROOT = "university_pdfs_sample"

OUTPUT_ROOT = "output_scrapping"
# exist_ok=True makes re-running this cell harmless when the folder is already there
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# Read environment variables from the .env file; the API key has to be defined there
load_dotenv()

True

In [3]:
# LlamaCloud project passed to the LlamaExtract client
PROJECT_ID = '8c10e62e-3810-4193-915d-d2d11105826d'

# When True, the locally imported Enrollment2024_25 schema is written to the remote
# agent on every run of this cell. Set to False to use the agent as it is stored.
PUSH_SCHEMA = True

extractor = LlamaExtract(project_id=PROJECT_ID)

# First-time setup only: uncomment to create the agent, then copy its id into AGENT_ID.
# agent = extractor.create_agent(name = "enrollment-parser-2024", data_schema=Enrollment2024_25)

agent = extractor.get_agent(id=AGENT_ID)

if PUSH_SCHEMA:
    # Overwrites the schema stored on the remote agent with the local definition
    agent.data_schema = Enrollment2024_25
    agent.save()
    # Fetch the agent again after saving (presumably to pick up the stored state)
    agent = extractor.get_agent(id=AGENT_ID)


In [4]:
# Display the schema currently attached to the agent as a sanity check
agent.data_schema

{'additionalProperties': False,
 'properties': {'Year_Headcount': {'anyOf': [{'type': 'string'},
    {'type': 'null'}],
   'description': "Collect the academic year or term (e.g., '2024–25','Fall 2024', 'AY 2024–2025') associated with the following fields:Undergraduate Headcount, Graduate Headcount, Total Headcount, and Total full-time equivalent students (FTE).If all of them refer to a year equivalent to 'Fall 2024', such as 'Fall 2024', 'AY 2024–2025','Academic Year 2024–2025', '2024-25', or 'AY 24–25',then convert it and return only the standardized value '2024–2025'.For example, convert 'Fall 2023', 'AY 2023–2024', '2023–24' to '2023–2024'; convert 'Fall 2022' to '2022–2023'; and so on.Always convert when a clearly matching year or term is present.Do not infer or guess — only convert when the input explicitly matches a known academic year."},
  'Year_Fee': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'description': "Collect the academic year or term (e.g.,'2024–25', 'Fall 2

The following cell extracts every school's data into a single Excel workbook, with one sheet (tab) per school.

In [5]:
# Workbook that receives one sheet per school
OUTPUT_FILE = os.path.join(OUTPUT_ROOT, "all_schools_sample.xlsx")

# Header of the single value column written on every sheet
VALUE_COLUMN = "2024-25"

# Excel sheet titles are limited to 31 characters and may not contain these characters
MAX_SHEET_NAME_LEN = 31
INVALID_SHEET_CHARS = "[]:*?/\\"


def _unique_sheet_name(school, used_names):
    """Return a valid Excel sheet title for ``school`` that is not in ``used_names``.

    Forbidden characters are replaced by "_" and the name is cut to the 31-character
    limit. If the result is already taken (two long names sharing a prefix), a "~2",
    "~3", ... suffix is appended. The chosen name is added to ``used_names``
    (lowercased, because Excel compares sheet names case-insensitively).
    """
    cleaned = "".join("_" if ch in INVALID_SHEET_CHARS else ch for ch in school)
    base = cleaned[:MAX_SHEET_NAME_LEN]
    candidate = base
    attempt = 1
    while candidate.lower() in used_names:
        attempt += 1
        suffix = f"~{attempt}"
        candidate = base[:MAX_SHEET_NAME_LEN - len(suffix)] + suffix
    used_names.add(candidate.lower())
    return candidate


def _extract_school(school, school_dir):
    """Run the agent on every PDF in ``school_dir`` and merge the results into one dict.

    Every key returned by any PDF appears in the result (value None until some PDF
    supplies a non-empty value). When several PDFs supply a value for the same key,
    the last PDF in sorted filename order wins. PDFs that fail are logged and skipped.
    Returns an empty dict when nothing could be extracted.
    """
    combined = {}
    for fname in sorted(os.listdir(school_dir)):
        if not fname.lower().endswith(".pdf"):
            continue  # Skip non-PDF files

        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")
        try:
            run = agent.extract(path)
            data = run.data or {}  # Use empty dict if data is None

            for key, value in data.items():
                # Register the key even if empty so the metric still gets a row
                combined.setdefault(key, None)
                if value not in (None, "", []):
                    combined[key] = value
        except Exception as err:
            # Best effort: one bad PDF must not abort the whole run
            print(f"Skipped {fname}: {err}")
    return combined


# Extract first and write afterwards, so the workbook is only created when there is
# at least one sheet to put in it (saving a workbook without sheets fails).
school_frames = {}
used_sheet_names = set()

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)

    # Skip if not a directory (e.g., if it's a file)
    if not os.path.isdir(school_dir):
        continue

    combined = _extract_school(school, school_dir)
    if not combined:
        print(f"No data for {school}.")  # Log schools with no extractable content
        continue

    # One row per metric, one value column
    school_df = pd.DataFrame.from_dict(combined, orient="index", columns=[VALUE_COLUMN])
    school_df.index.name = "Metric"
    school_frames[_unique_sheet_name(school, used_sheet_names)] = school_df

if school_frames:
    # The context manager saves and closes the file even if writing a sheet fails
    with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
        for sheet_name, school_df in school_frames.items():
            school_df.to_excel(writer, sheet_name=sheet_name)
    print(f"All schools written to {OUTPUT_FILE}")
else:
    print(f"No data extracted for any school; {OUTPUT_FILE} was not written.")


Extracting data from BRADLEY_UNIVERSITY/Annual_Financial_Information_and_Operating_Data__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__227_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:02<00:00,  2.33s/it]
Extracting files: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Annual_Report_-_Corrected_for_the_year_ended_05_31_2024__130_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
Extracting files: 100%|██████████| 1/1 [00:11<00:00, 11.63s/it]


Extracting data from FLORIDA_INSTITUTE_OF_TECHNOLOGY_INC/2024_Annual_Report_-_Florida_Institute_of_Technology_for_the_year_ended_06_30_2024__162_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.08it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
Extracting files: 100%|██████████| 1/1 [00:06<00:00,  6.55s/it]


Extracting data from GANNON_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__203_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
Extracting files: 100%|██████████| 1/1 [00:08<00:00,  8.87s/it]


Extracting data from GWYNEDD_MERCY_UNIVERSITY/Gwynedd_Mercy_University_Summary_of_Financial_Performance_and_Operating_Data_for_the_year_ended_06_30_2024__243_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
Extracting files: 100%|██████████| 1/1 [00:08<00:00,  8.96s/it]


Extracting data from LEWIS_UNIVERSITY/Continuing_Disclosure_for_the_year_ended_06_30_2024__298_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:02<00:00,  2.02s/it]
Extracting files: 100%|██████████| 1/1 [00:10<00:00, 10.00s/it]


Extracting data from MOLLOY_COLLEGE/Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document2__142_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.22s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Extracting files: 100%|██████████| 1/1 [00:08<00:00,  8.13s/it]


Extracting data from NATIONAL_UNIVERSITY/2024_Operating_Data_Annual_Report_-_National_University_for_the_year_ended_06_30_2024__573_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.17s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.43it/s]
Extracting files: 100%|██████████| 1/1 [00:07<00:00,  8.00s/it]


Extracting data from ROSALIND_FRANKLIN_UNIVERSITY_OF_MEDICINE_AND_SCIENCE/2024_Operating_Data_for_the_year_ended_06_30_2024__172_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.68it/s]
Extracting files: 100%|██████████| 1/1 [00:06<00:00,  6.68s/it]


Extracting data from STEVENS_INSTITUTE_OF_TECHNOLOGY/Annual_Report_for_the_year_ended_06_30_2024__216_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Extracting files: 100%|██████████| 1/1 [00:07<00:00,  7.08s/it]


Extracting data from ST_LAWRENCE_UNIVERSITY/SEC_Operating_Info-_SLU_2023-2024_for_the_year_ended_06_30_2024__80_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.33it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  2.04it/s]
Extracting files: 100%|██████████| 1/1 [00:07<00:00,  7.69s/it]


Extracting data from UNIVERSITY_OF_REDLANDS/Annual_Report_for_the_year_ended_06_30_2024__329_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Extracting files: 100%|██████████| 1/1 [00:08<00:00,  8.05s/it]

All schools written to output_scrapping/all_schools_sample.xlsx





In [6]:
# Optional step: flatten the per-school workbook into a single sheet
file_path = "output_scrapping/all_schools_sample.xlsx"
output_path = "output_scrapping/all_schools_combined_sample.xlsx"

# sheet_name=None loads every sheet as {sheet name: DataFrame};
# index_col=0 turns the first column of each sheet into the index
raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

# Keep the first (only) value column of each sheet, keyed by sheet name
school_series = {}
for school_name, sheet_df in raw.items():
    school_series[school_name] = sheet_df.iloc[:, 0]

# Each Series becomes a column, so transpose to end up with one row per school
df_comb = pd.DataFrame(school_series).transpose()
df_comb.index.name = "School"

# Manual corrections used in earlier runs, kept for reference:
#df_comb.insert(0, "Year", "2024‑2025")
# df_comb.loc['Texas_A&M', ['Total_Headcount','Undergraduate_Headcount']] = \
#     df_comb.loc['Texas_A&M', ['Undergraduate_Headcount','Total_Headcount']].values
# df_comb.loc['California_state_university', 'Undergraduate_Headcount'] = None

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_comb.to_excel(writer, sheet_name="Combined")

print("Saved:", output_path)

Saved: output_scrapping/all_schools_combined_sample.xlsx


In [7]:
# Blank out values whose accompanying Year_* label is not an accepted 2024-25 label.
df = pd.read_excel("output_scrapping/all_schools_combined_sample.xlsx")

# Labels accepted as referring to the 2024-25 academic year
valid_years = {
    "Fall 2024", "AY 2024–2025", "2024–2025", "2024-25", "AY 24–25", "Fall 24", "2024"
}

# Dash look-alikes (hyphen, non-breaking hyphen, figure/en/em dash, minus sign) mapped to
# an ASCII hyphen, so that e.g. "2024–2025" and "2024-2025" compare equal
_DASH_TO_HYPHEN = str.maketrans({
    "\u2010": "-", "\u2011": "-", "\u2012": "-",
    "\u2013": "-", "\u2014": "-", "\u2212": "-",
})


def _year_key(value):
    """Return a comparison key for a year label, or None for a missing value.

    The key is the text form of ``value`` with dash variants unified to "-" and
    whitespace trimmed and collapsed.
    """
    if pd.isna(value):
        return None
    if isinstance(value, float) and value.is_integer():
        # A numeric year in a column with blanks is read back as a float (2024.0)
        value = int(value)
    return " ".join(str(value).translate(_DASH_TO_HYPHEN).split())


valid_year_keys = {_year_key(label) for label in valid_years}

for col in df.columns:
    if not (isinstance(col, str) and col.startswith("Year_")):
        continue

    # A column "Year_X" describes the value column "X"
    value_col = col.replace("Year_", "")
    if value_col not in df.columns:
        # Make the no-op visible instead of silently leaving the data unfiltered
        print(f"No value column '{value_col}' found for '{col}'; nothing filtered.")
        continue

    # Missing year labels count as invalid, so their values are blanked as well
    mask_invalid = ~df[col].map(_year_key).isin(valid_year_keys)
    df.loc[mask_invalid, value_col] = None


df.to_excel("output_scrapping/all_schools_cleaned_fall_2024.xlsx", index=False)


In [8]:
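# NOTE: this entire cell is commented out and does not run. It appears to hold earlier
# drafts of the Year_* normalization; the repeated imports and helper definitions below
# are leftovers from those drafts. Remove the cell once it is no longer needed for reference.
# import pandas as pd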

# df = pd.read_excel("output_scrapping/all_schools_combined_sample.xlsx")
# import re
# import numpy as np

# equivalence_map = {
#     r"2024-25|2024-2025|Fall 2024 (FY2024-25)|AY 2024–25|Fall 2024|Fiscal 2025|2024": "2024-2025",
#     r"2023-24|2023-2024|AY 2023–24|Fall 2023|Fiscal 2024|2023": "2023-2024"
# }

# # def standardize_year(value):
# #     if isinstance(value, str):
# #         for pattern, standard in equivalence_map.items():
# #             if re.search(pattern, value):
# #                 return standard
# #     return np.nan

# # # Apply to each column that starts with 'Year_'
# # for col in df.columns:
# #     if col.startswith("Year_"):
# #         category = col.replace("Year_", "")
# #         new_col = f"year_{'fee' if 'Tuition' in category or 'Room' in category else 'headcount'}"
# #         df[new_col] = df[col].apply(standardize_year)

# # # Combine year_headcount and year_fee into year_combine
# # def combine_years(row):
# #     if pd.notnull(row['year_headcount']):
# #         return row['year_headcount']
# #     elif pd.notnull(row['year_fee']):
# #         return row['year_fee']
# #     else:
# #         return np.nan

# # df['year_combine'] = df.apply(combine_years, axis=1)
# # import pandas as pd
# import re
# from pathlib import Path


# # Define function to standardize year format
# def normalize_year(text):
#     if pd.isna(text):
#         return None
#     text = str(text)
#     matches = re.findall(r"20\d{2}", text)
#     if len(matches) >= 2:
#         return f"{matches[0]}-{matches[1]}"
#     elif len(matches) == 1:
#         try:
#             y1 = int(matches[0])
#             return f"{y1}-{y1+1}"
#         except:
#             return None
#     else:
#         return None

# # Apply normalization for headcount, fee, and combine
# year_headcount_cols = [col for col in df.columns if "Year_" in col and any(x in col for x in ["Headcount", "Full", "Total"])]
# year_fee_cols = [col for col in df.columns if "Year_" in col and any(x in col for x in ["Tuition", "Room"])]

# df["year_headcount"] = df[year_headcount_cols].bfill(axis=1).iloc[:, 0].apply(normalize_year)
# df["year_fee"] = df[year_fee_cols].bfill(axis=1).iloc[:, 0].apply(normalize_year)

# def combine_years(row):
#     y1 = row["year_headcount"]
#     y2 = row["year_fee"]
#     return y1 if y1 else y2

# df["year_combine"] = df.apply(combine_years, axis=1)

# df.to_excel("output_scrapping/all_schools_cleaned_fall_2024.xlsx", index=False)
