In [5]:
import pydantic
from pydantic import BaseModel, Field, model_validator
import os
import pandas as pd
from llama_cloud_services import LlamaExtract
from pydantic import BaseModel, Field, model_validator
from llama_cloud.types import ExtractConfig, ExtractMode
from income import generate_income_statement_schema 
# from realized import generate_realized_schema
import os, re, pathlib, tempfile, shutil
from dotenv import load_dotenv

In [6]:
FISCAL_YEAR = 2024  #Change the year if you want different years
IncomeStatement_2024_25 = generate_income_statement_schema(FISCAL_YEAR)
PDF_ROOT = "sample" 
# PDF_ROOT = "templates/data"  # Change this to the point to the directory where you are storing the pdfs after scraping
TARGET_DIR = "templates"
os.makedirs(TARGET_DIR, exist_ok=True)
OUTPUT_FILE = os.path.join(TARGET_DIR, "all_schools_test.xlsx")

# api_key = os.getenv("LLAMACLOUD_API_KEY")

In [7]:
AGENT_ID = "49cba8ec-d3b6-4a1a-a914-32b81d3ce7ad" # all multi model
# AGENT_ID = "87c36b46-e9f0-4737-8467-355b3865bfc4" # all balance model

load_dotenv() #make sure the API key is in the .env file

extractor = LlamaExtract(project_id = '8c10e62e-3810-4193-915d-d2d11105826d')

agent = extractor.get_agent(id = AGENT_ID)

agent.data_schema = IncomeStatement_2024_25
agent.save()
agent = extractor.get_agent(id = AGENT_ID)
agent.data_schema

{'additionalProperties': False,
 'properties': {'unit_multiplier': {'anyOf': [{'type': 'number'},
    {'type': 'null'}],
   'description': "Numeric multiplier corresponding to the unit (e.g., 'in thousands', 'in millions') used in the 2024 fiscal year's Statement of Activities or Statement of Changes in Net Assets. EXTRACT ONLY from financial statement table headers, column headers, or table footnotes. DO NOT extract from narrative text or MD&A sections. Return 1000 for 'in thousands', 1000000 for 'in millions', 1 if values are reported in dollars with no multiplier. Look for unit indicators near the financial statement headers, column headers, or at the top/bottom of tables. Ensure this is strictly from the {fy_label} period only; ignore units from incorrect years or sections."},
  'gross_tuition_revenue': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
   'description': "Gross tuition and fees revenue BEFORE any deductions for 2024. Extract ONLY from the primary Statement of Activ

The following two cell blocks extract all schools' info into one excel file per school.

In [8]:
writer = pd.ExcelWriter(
    OUTPUT_FILE,
    engine="xlsxwriter",
    engine_kwargs={"options": {"use_zip64": True}}
)

def is_pdf(name: str) -> bool:
    return name.lower().endswith(".pdf")

OUTPUT_DIR = "templates/csv_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir) or school.startswith("."):
        continue

    print(f"\n=== {school} ===")
    combined = {}
    first_keys = None
    pdfs = [f for f in sorted(os.listdir(school_dir)) if is_pdf(f)]

    if not pdfs:
        print("  (no PDFs found)")
        continue

    for fname in pdfs:
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")

        try:
            run  = agent.extract(path)
            data = run.data or {}
            if first_keys is None:
                first_keys = list(data.keys())
                combined  = {k: None for k in first_keys}
            for k, v in data.items():
                if v not in (None, "", []):
                    combined[k] = v
        except Exception as err:
            print(f"Skipped {fname}: {err}")

    if first_keys:
        df = pd.DataFrame.from_dict(combined, orient="index", columns=[f"{FISCAL_YEAR - 1}-{str(FISCAL_YEAR)[-2:]}"])
        df.index.name = "Metric"
        sheet_name = school[:31]
        
        year = f"{FISCAL_YEAR - 1}-{str(FISCAL_YEAR)[-2:]}"            
        if pd.isna(df.loc['instructional_research_expense'].iloc[0]) and (not pd.isna(df.loc['instructional_expense'].iloc[0]) or not pd.isna(df.loc['research_expense'].iloc[0])):
            df.loc['instructional_research_expense'] = (df.loc['instructional_expense'].iloc[0] + df.loc['research_expense'].iloc[0])
        if (
            not pd.isna(df.loc['non_op_realized_investment_net_without_donor'].iloc[0])
            or not pd.isna(df.loc['extraordinary_gain_or_loss'].iloc[0])
            or not pd.isna(df.loc['net_assets_released_for_capital'].iloc[0])
            or not pd.isna(df.loc['change_fair_value_derivatives'].iloc[0])
            or not pd.isna(df.loc['capital_grants_gifts'].iloc[0])
        ):
            df.loc['other_non_op'] = (
                df.loc['non_op_realized_investment_net_without_donor'].iloc[0]
                - (
                    df.loc['extraordinary_gain_or_loss'].iloc[0]
                    + df.loc['net_assets_released_for_capital'].iloc[0]
                    + df.loc['change_fair_value_derivatives'].iloc[0]
                    + df.loc['capital_grants_gifts'].iloc[0]
                )
            )

        df.to_excel(writer, sheet_name=sheet_name)
        if os.path.exists("templates/final.parquet"):
            existing = pd.read_parquet("templates/final.parquet")
            df['school'] = school
            combined = pd.concat([existing, df])
        else:
            df['school'] = school
            combined = df
        combined.to_parquet("templates/final.parquet", index=True)
    if not df.empty:
        output_file = os.path.join(OUTPUT_DIR, f"{school}.csv")
        df.to_csv(output_file, index=True) # write to csv file
        print(f"Saved {school} to {output_file}")
    else:
        print(f"No data for {school}.")

    df.to_parquet("templates/final.parquet", index=True)

    # Write via temp file + ZIP64; then atomic move
    dest = pathlib.Path(OUTPUT_FILE) 
    dest.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory() as td:
        tmp = pathlib.Path(td) / dest.name
        with pd.ExcelWriter(
            tmp,
            engine="xlsxwriter",
            engine_kwargs={"options": {"use_zip64": True, "strings_to_urls": False}}
        ) as writer:
            df.to_excel(writer, sheet_name=sheet_name)
        shutil.move(str(tmp), dest)

    print(f"All schools written to {dest}")

writer.close()
print(f"All schools written to {OUTPUT_FILE}")



=== BRADLEY_UNIVERSITY ===
Extracting data from BRADLEY_UNIVERSITY/Audited_Financial_Statements_or_ACFR__Rule_15c2-12__for_FY24_for_the_year_ended_05_31_2024__541_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.13s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Extracting files: 100%|██████████| 1/1 [04:44<00:00, 284.77s/it]
Extracting files: 100%|██████████| 1/1 [06:14<00:00, 374.41s/it]


Saved BRADLEY_UNIVERSITY to templates/csv_outputs/BRADLEY_UNIVERSITY.csv
All schools written to templates/all_schools_test.xlsx

=== CORNELL_UNIVERSITY ===
Extracting data from CORNELL_UNIVERSITY/2024_Audited_Financial_Statements_for_the_year_ended_06_30_2024__788_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
Extracting files: 100%|██████████| 1/1 [02:47<00:00, 167.76s/it]


Extracting data from CORNELL_UNIVERSITY/Incorporate_OS_by_Reference_as_of_04_25_2024__2.4_MB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.44s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.95s/it]
Extracting files: 100%|██████████| 1/1 [01:51<00:00, 111.58s/it]


Saved CORNELL_UNIVERSITY to templates/csv_outputs/CORNELL_UNIVERSITY.csv
All schools written to templates/all_schools_test.xlsx

=== CULINARY_INSTITUTE_OF_AMERICA_THE ===
Extracting data from CULINARY_INSTITUTE_OF_AMERICA_THE/2024_Audited_Financial_Statements_for_the_year_ended_05_31_2024__277_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.10s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:03<00:00,  3.18s/it]
Extracting files: 100%|██████████| 1/1 [02:37<00:00, 157.94s/it]


Saved CULINARY_INSTITUTE_OF_AMERICA_THE to templates/csv_outputs/CULINARY_INSTITUTE_OF_AMERICA_THE.csv
All schools written to templates/all_schools_test.xlsx

=== GANNON_UNIVERSITY ===
Extracting data from GANNON_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__786_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Extracting files: 100%|██████████| 1/1 [02:00<00:00, 120.71s/it]


Saved GANNON_UNIVERSITY to templates/csv_outputs/GANNON_UNIVERSITY.csv
All schools written to templates/all_schools_test.xlsx

=== LEWIS_UNIVERSITY ===
Extracting data from LEWIS_UNIVERSITY/Audited_Financial_Statements_for_the_year_ended_06_30_2024__430_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.00s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
Extracting files: 100%|██████████| 1/1 [01:23<00:00, 83.64s/it]


Saved LEWIS_UNIVERSITY to templates/csv_outputs/LEWIS_UNIVERSITY.csv
All schools written to templates/all_schools_test.xlsx

=== MOLLOY_COLLEGE ===
Extracting data from MOLLOY_COLLEGE/Financial_Operating_Filing_for_the_year_ended_06_30_2024_Document1__304_KB_.pdf


Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:03<00:00,  3.32s/it]
Extracting files: 100%|██████████| 1/1 [04:39<00:00, 279.98s/it]

Saved MOLLOY_COLLEGE to templates/csv_outputs/MOLLOY_COLLEGE.csv
All schools written to templates/all_schools_test.xlsx
All schools written to templates/all_schools_test.xlsx



  warn("Calling close() on already closed file.")


In [None]:
# In case experiencing operation time out for Excel, we sort the data in csv and store it to the "all_schools_combined_test"
# Path to the folder where your CSV files are stored
CSV_DIR = "templates/csv_outputs"
OUTPUT_FILE = "templates/all_schools_combined_test.xlsx"


rows = []

for fname in sorted(os.listdir(CSV_DIR)):
    if not fname.endswith(".csv"):
        continue
    path = os.path.join(CSV_DIR, fname)
    df = pd.read_csv(path)

    # Clean up
    df.columns = [c.strip() for c in df.columns]
    df["Metric"] = df["Metric"].astype(str).str.strip()
    df = df.dropna(subset=["Metric"]).drop_duplicates(subset=["Metric"], keep="last")

    # Get university name (prefer the column; fall back to file name)
    school = (
        df.get("school").dropna().iloc[0]
        if "school" in df.columns and df["school"].notna().any()
        else fname.replace(".csv", "")
    )

    series = (
        df.set_index("Metric")["2023-24"]
          .rename(school)
          .apply(pd.to_numeric, errors="coerce")
    )
    rows.append(series)

# Combine to a DataFrame: index = schools, columns = metrics
combined = pd.DataFrame(rows)

# Sort: rows by university, columns (metrics) alphabetically
combined = combined.sort_index(axis=0)
combined = combined.reindex(sorted(combined.columns), axis=1)

combined.index.name = "school"
combined.to_excel(OUTPUT_FILE)

print(f"Saved: {OUTPUT_FILE}")



Saved: templates/all_schools_combined_test.xlsx


The following cell block extracts all the schools' info into one excel sheet but in different tabs.

In [None]:
# #Combine all the tabs into one sheet if wanted
# file_path   = "templates/all_schools_combined_test.xlsx"
# output_path = "templates/all_schools_combined.xlsx"

# raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

# school_series = {
#     school: df.iloc[:, 0]                      # first (only) value column
#     for school, df in raw.items()
# }

# # df_comb = pd.DataFrame(school_series).T
# df_comb = pd.DataFrame(school_series)
# df_comb.index.name = "School"                 
# # df = pd.DataFrame.from_dict(combined, orient="index", columns=["Value"])
# # df.index.name = "Metric"

# # write or keep using `transposed`


# with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
#     df_comb.to_excel(writer, sheet_name="Combined")

# print("Saved:", output_path)

Saved: templates/all_schools_combined.xlsx


In [None]:
# import os, pathlib
# print("ABS:", os.path.realpath(OUTPUT_FILE))

In [None]:
# from pandas import ExcelWriter
# import pandas as pd, os
# probe = os.path.join(os.path.dirname(OUTPUT_FILE), "probe.xlsx")
# with ExcelWriter(probe, engine="openpyxl") as w:
#     pd.DataFrame({"a":[1]}).to_excel(w, index=False)


In [None]:
# #Combine all the tabs into one sheet if wanted
# file_path   = "output_incomestatement_final/all_schools_sample.xlsx"  #Change this if need be
# output_path = "output_incomestatement_final/all_schools_combined.xlsx"  #Change this if need be
# raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

# raw = pd.read_excel(file_path, sheet_name=None, index_col=0)

# # school_series = {
# #     school: df.iloc[:, 0]                      # first (only) value column
# #     for school, df in raw.items()
# # }
# school_series = {}
# for school, df in raw.items():
#     # helpful debug: print shapes
#     # print(f"{school}: {df.shape}")
#     if df.shape[1] == 0:                 # only an index column or completely empty
#         print(f"Skipping '{school}': no value column found")
#         continue
#     s = df.iloc[:, 0].dropna()
#     school_series[school] = s

# df_comb = pd.DataFrame(school_series).T
# df_comb.index.name = "School"                 
# #df_comb.insert(0, "Year", "2024‑2025")
# # df_comb.loc['Texas_A&M', ['Total_Headcount','Undergraduate_Headcount']] = \
# #     df_comb.loc['Texas_A&M', ['Undergraduate_Headcount','Total_Headcount']].values

# # df_comb.loc['California_state_university', 'Undergraduate_Headcount'] = None 
# with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
#     df_comb.to_excel(writer, sheet_name="Combined")

# print("Saved:", output_path)
