In [None]:
import pydantic
from pydantic import BaseModel, Field, model_validator
import pandas as pd
from llama_cloud_services import LlamaExtract
from llama_cloud.types import ExtractConfig, ExtractMode
from income import generate_income_statement_schema 
import os, re, pathlib, tempfile, shutil
from dotenv import load_dotenv

In [None]:
# ---- Run configuration ------------------------------------------------------
# Fiscal year handed to the schema generator; the extraction cell also derives
# its column label from it (2024 -> "2023-24"). Change it to process another year.
FISCAL_YEAR = 2024

# Input directory: scanned for one sub-directory per school, each holding PDFs.
# Point it at the folder where the scraped PDFs are stored, e.g. "templates/data".
PDF_ROOT = "sample"

# Output directory and the multi-sheet workbook written into it.
TARGET_DIR = "templates"
OUTPUT_FILE = os.path.join(TARGET_DIR, "all_schools_test.xlsx")
os.makedirs(TARGET_DIR, exist_ok=True)

# Schema built for FISCAL_YEAR. NOTE: the variable name is fixed at "2024_25"
# and does not change when FISCAL_YEAR does.
IncomeStatement_2024_25 = generate_income_statement_schema(FISCAL_YEAR)

In [None]:
# Identifier of the extraction agent to use. The trailing notes are the
# author's own labels for the two agents; swap the active line to switch.
AGENT_ID = "49cba8ec-d3b6-4a1a-a914-32b81d3ce7ad" # all multi model
# AGENT_ID = "87c36b46-e9f0-4737-8467-355b3865bfc4" # all balance model

# Load variables from a .env file into the environment. The API key must be
# defined there (presumably LlamaExtract reads it from the environment, since
# no key is passed explicitly below — TODO confirm).
load_dotenv() #make sure the API key is in the .env file

# Client bound to a fixed project id.
extractor = LlamaExtract(project_id = '8c10e62e-3810-4193-915d-d2d11105826d')

# Fetch the agent selected above.
agent = extractor.get_agent(id = AGENT_ID)

# Replace the agent's schema with the one generated in the config cell and
# save the agent. NOTE(review): this appears to modify the stored agent, so
# other users of the same AGENT_ID would see the new schema — confirm.
agent.data_schema = IncomeStatement_2024_25
agent.save()
# Fetch the agent again and display its schema as the cell output, so the
# reader can check what is now attached to the agent.
agent = extractor.get_agent(id = AGENT_ID)
agent.data_schema

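The following two cells extract and consolidate the data for all schools. The first cell runs the extraction agent on every school's PDFs and writes one CSV file per school to the `csv_outputs` folder (it also writes an Excel workbook and a parquet file). The second cell combines those per-school CSV files into a single Excel workbook.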

In [None]:
def is_pdf(name: str) -> bool:
    """Return True when ``name`` ends with ".pdf", ignoring case."""
    return name.lower().endswith(".pdf")


def _as_number(value) -> float:
    """Convert an extracted value to float; missing or non-numeric values become NaN."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float("nan")


def _metric(df: pd.DataFrame, name: str) -> float:
    """Return metric ``name`` (first column of ``df``) as a float, NaN if the row is absent or empty."""
    if name not in df.index:
        return float("nan")
    return _as_number(df.loc[name].iloc[0])


def _sum_present(values) -> float:
    """Sum the non-NaN entries of ``values``; missing components count as 0."""
    return sum(v for v in values if not pd.isna(v))


def _add_derived_metrics(df: pd.DataFrame, column: str) -> None:
    """Fill / add the derived rows of a single-school frame in place.

    * ``instructional_research_expense`` is filled, only when it is missing,
      with ``instructional_expense + research_expense``.
    * ``other_non_op`` is set to ``non_op_realized_investment_net_without_donor``
      minus the sum of ``extraordinary_gain_or_loss``,
      ``net_assets_released_for_capital``, ``change_fair_value_derivatives``
      and ``capital_grants_gifts``.

    Each rule runs when at least one of its inputs is present; absent inputs are
    treated as 0 so that a single missing component neither raises nor turns the
    whole result into NaN.
    """
    instructional = _metric(df, "instructional_expense")
    research = _metric(df, "research_expense")
    has_component = not (pd.isna(instructional) and pd.isna(research))
    if pd.isna(_metric(df, "instructional_research_expense")) and has_component:
        df.loc["instructional_research_expense", column] = _sum_present([instructional, research])

    base = _metric(df, "non_op_realized_investment_net_without_donor")
    deductions = [
        _metric(df, name)
        for name in (
            "extraordinary_gain_or_loss",
            "net_assets_released_for_capital",
            "change_fair_value_derivatives",
            "capital_grants_gifts",
        )
    ]
    if not all(pd.isna(v) for v in [base, *deductions]):
        df.loc["other_non_op", column] = _sum_present([base]) - _sum_present(deductions)


def _sheet_name(school: str, taken: set) -> str:
    """Return an Excel-legal sheet name (at most 31 chars) for ``school`` that is not in ``taken``.

    For ordinary names this is simply ``school[:31]``. Characters Excel forbids
    are replaced by "_" and a numeric suffix is added when two schools would
    otherwise share a name. ``taken`` is updated with the name that is returned.
    """
    base = re.sub(r"[\[\]:*?/\\]", "_", school)[:31] or "sheet"
    name, counter = base, 1
    while name.lower() in taken:  # Excel compares sheet names case-insensitively
        counter += 1
        suffix = f"_{counter}"
        name = base[:31 - len(suffix)] + suffix
    taken.add(name.lower())
    return name


OUTPUT_DIR = "templates/csv_outputs" # set folder
os.makedirs(OUTPUT_DIR, exist_ok=True)
PARQUET_FILE = "templates/final.parquet"

# Column label for the extracted values, e.g. FISCAL_YEAR 2024 -> "2023-24".
year_label = f"{FISCAL_YEAR - 1}-{str(FISCAL_YEAR)[-2:]}"

sheets = {}              # sheet name -> frame for the workbook (no "school" column)
taken_sheet_names = set()
parquet_frames = []      # frames with a "school" column, one per school

for school in sorted(os.listdir(PDF_ROOT)):
    school_dir = os.path.join(PDF_ROOT, school)
    if not os.path.isdir(school_dir) or school.startswith("."):
        continue

    print(f"\n=== {school} ===")
    pdfs = [f for f in sorted(os.listdir(school_dir)) if is_pdf(f)]

    if not pdfs:
        print("  (no PDFs found)")
        continue

    # Merge the extraction results of all PDFs of this school: every key seen is
    # kept (first-seen order) and later non-empty values override earlier ones.
    merged = {}
    for fname in pdfs:
        path = os.path.join(school_dir, fname)
        print(f"Extracting data from {school}/{fname}")

        try:
            run = agent.extract(path)
            data = run.data or {}
            for k, v in data.items():
                merged.setdefault(k, None)
                if v not in (None, "", []):
                    merged[k] = v
        except Exception as err:
            # Best effort: one failing PDF must not abort the whole run.
            print(f"Skipped {fname}: {err}")

    if not merged:
        # Nothing was extracted; skip instead of reusing a previous school's frame.
        print(f"No data for {school}.")
        continue

    df = pd.DataFrame.from_dict(merged, orient="index", columns=[year_label])
    _add_derived_metrics(df, year_label)
    df.index.name = "Metric"

    sheets[_sheet_name(school, taken_sheet_names)] = df

    # Save the per-school CSV right away so finished schools survive a later failure.
    df_with_school = df.assign(school=school)
    output_file = os.path.join(OUTPUT_DIR, f"{school}.csv")
    df_with_school.to_csv(output_file, index=True) # write to csv file
    print(f"Saved {school} to {output_file}")
    parquet_frames.append(df_with_school)

# Save results to parquet format: rows of schools processed in this run replace
# their previous rows (so re-running does not duplicate data); rows of other
# schools already in the file are kept.
if parquet_frames:
    all_rows = pd.concat(parquet_frames)
    if os.path.exists(PARQUET_FILE):
        existing = pd.read_parquet(PARQUET_FILE)
        if "school" in existing.columns:
            existing = existing[~existing["school"].isin(all_rows["school"].unique())]
        all_rows = pd.concat([existing, all_rows])
    all_rows.to_parquet(PARQUET_FILE, index=True)

# Write the workbook (one sheet per school) to a temp file with ZIP64 enabled,
# then move it into place, so OUTPUT_FILE is only replaced by a complete file.
if sheets:
    dest = pathlib.Path(OUTPUT_FILE)
    dest.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory() as td:
        tmp = pathlib.Path(td) / dest.name
        with pd.ExcelWriter(
            tmp,
            engine="xlsxwriter",
            engine_kwargs={"options": {"use_zip64": True, "strings_to_urls": False}}
        ) as writer:
            for sheet_name, frame in sheets.items():
                frame.to_excel(writer, sheet_name=sheet_name)
        shutil.move(str(tmp), dest)

    print(f"All schools written to {OUTPUT_FILE}")
else:
    print("No school data extracted; workbook not written.")


In [None]:
# Fallback for when writing the multi-sheet workbook times out: rebuild a single
# wide table (one row per school, one column per metric) from the per-school
# CSV files and save it as one Excel sheet.

# Folder where the per-school CSV files are stored
CSV_DIR = "templates/csv_outputs"
# Separate name on purpose: reusing OUTPUT_FILE would redirect the extraction
# cell to this path if that cell were re-run afterwards.
COMBINED_OUTPUT_FILE = "templates/all_schools_combined_test.xlsx"
# Value column of the CSVs, derived from FISCAL_YEAR (2024 -> "2023-24") the same
# way the extraction cell labels it.
YEAR_COLUMN = f"{FISCAL_YEAR - 1}-{str(FISCAL_YEAR)[-2:]}"


rows = []

for fname in sorted(os.listdir(CSV_DIR)):
    if not fname.endswith(".csv"):
        continue
    school_df = pd.read_csv(os.path.join(CSV_DIR, fname))

    # Clean up
    school_df.columns = [c.strip() for c in school_df.columns]
    missing = {"Metric", YEAR_COLUMN} - set(school_df.columns)
    if missing:
        # One malformed or other-year file should not abort the whole merge.
        print(f"Skipped {fname}: missing column(s) {sorted(missing)}")
        continue

    # Drop empty metric names BEFORE casting to str; otherwise NaN becomes the
    # literal string "nan" and is never removed.
    school_df = school_df.dropna(subset=["Metric"])
    school_df = (
        school_df
        .assign(Metric=school_df["Metric"].astype(str).str.strip())
        .drop_duplicates(subset=["Metric"], keep="last")
    )

    # Get university name (prefer the column; fall back to file name)
    if "school" in school_df.columns and school_df["school"].notna().any():
        school = school_df["school"].dropna().iloc[0]
    else:
        school = os.path.splitext(fname)[0]

    # Non-numeric values are coerced to NaN.
    series = pd.to_numeric(
        school_df.set_index("Metric")[YEAR_COLUMN], errors="coerce"
    ).rename(school)
    rows.append(series)

# Combine to a DataFrame: index = schools, columns = metrics
combined = pd.DataFrame(rows)

# Sort: rows by university, columns (metrics) alphabetically
combined = combined.sort_index(axis=0)
combined = combined.reindex(sorted(combined.columns), axis=1)

combined.index.name = "school"
combined.to_excel(COMBINED_OUTPUT_FILE) # save to excel

print(f"Saved: {COMBINED_OUTPUT_FILE}")

