In [1]:
import os, re, logging
from typing import List, Tuple, Optional, Dict
import pandas as pd

# OCR 
_OCR_AVAILABLE = True
try:
    import fitz  # PyMuPDF
    from PIL import Image
    import pytesseract
except Exception:
    _OCR_AVAILABLE = False

try:
    import pdfplumber
except Exception as e:
    raise RuntimeError("Miss: pdfplumber")

In [None]:
# --- Logging: configured first so every later cell logs in the same format ---
LOG_LEVEL = logging.INFO
logging.basicConfig(format="%(levelname)s: %(message)s", level=LOG_LEVEL)

# --- Input folder (walked recursively for PDFs) and Step 1 output file ---
FOLDER_PATH = "university_pdfs_hy_e/"
OUTPUT_CSV = "university_flag_hye17.csv"

# --- Extraction window: pages read from the start / end of each PDF ---
NUM_FIRST_PAGES = 30
NUM_LAST_PAGES = 30

# --- OCR fallback switch (only effective when the optional OCR imports succeeded) ---
ENABLE_OCR = True

# Fail early when the notebook is started from the wrong working directory.
assert os.path.isdir(FOLDER_PATH), (
    f"Directory not found: {FOLDER_PATH} "
    "(please check your working directory and path)"
)

In [3]:
# Regex fragments per document category. A document is flagged for a category
# as soon as any one of its patterns matches (case-insensitively) in the
# extracted text.
KEYWORDS = {
    # Financial-statement vocabulary.
    "FS": [
        r"financial statements?",
        r"annual financial report",
        r"financial report",
        r"financial position",
        r"statement of activities",
        r"statement of cash flows?",
        r"net (revenue|revenues)",
        r"net assets?",
        r"audited financial",
        r"management discussion and analysis",
    ],
    # Enrollment / admissions vocabulary.
    "Enrollment": [
        r"\benrollment\b",
        r"\bfte\b",  # full-time equivalent
        r"student headcount",
        r"admissions? report",
        r"enrolled students?",
        r"undergraduate enrollment",
        r"graduate enrollment",
        r"full-time equivalents?",  # added later
        r"enrollments and degrees",  # added later
        r"admissions? statistics?",
    ],
}

# Same structure as KEYWORDS, with every pattern pre-compiled once.
KEYWORDS_COMPILED = {
    category: [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
    for category, patterns in KEYWORDS.items()
}

In [None]:
def parse_school_from_path(pdf_path: str) -> str:
    """Return the name of the directory that directly contains ``pdf_path``.

    The immediate parent folder name is used as the school identifier.
    An empty string is returned when the path has no directory component.
    """
    parent_dir, _file_name = os.path.split(pdf_path)
    return os.path.split(parent_dir)[1]

def _ocr_pages(pdf_path: str, page_numbers: List[int]) -> str:
    """OCR the requested pages of a PDF and return the recognised text.

    Args:
        pdf_path: Path of the PDF to rasterise.
        page_numbers: Zero-based page indices; indices outside the document
            are skipped.

    Returns:
        The non-empty page texts joined with newlines. An empty string is
        returned when OCR is unavailable or disabled, or when opening or
        rendering the document fails.
    """
    if not (_OCR_AVAILABLE and ENABLE_OCR):
        return ""

    text_parts: List[str] = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        page_count = len(doc)
        # Scale factor 300/72, i.e. roughly 300 DPI rendering; built once
        # because it does not depend on the page.
        zoom = 300 / 72.0
        mat = fitz.Matrix(zoom, zoom)
        for pno in page_numbers:
            if pno < 0 or pno >= page_count:
                continue
            pix = doc.load_page(pno).get_pixmap(matrix=mat)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            try:
                txt = pytesseract.image_to_string(img)
            except Exception:
                # Best effort: one unreadable page must not discard the others.
                txt = ""
            if txt:
                text_parts.append(txt)
    except Exception as e:
        logging.debug(f"OCR wrong: {e}")
        return ""
    finally:
        # Release the document on the failure path too; previously an error
        # while rendering left the file handle open.
        if doc is not None:
            try:
                doc.close()
            except Exception as close_err:
                logging.debug(f"OCR wrong: {close_err}")
    return "\n".join(text_parts)

def _extract_with_pdfplumber(pdf_path: str, first_n: int, last_m: int) -> Tuple[str, List[int], List[int]]:
    """Extract embedded text from the first and last pages of a PDF.

    Args:
        pdf_path: Path of the PDF to read.
        first_n: Number of leading pages to read (clamped to the page count).
        last_m: Number of trailing pages to read; shrunk so the trailing
            window never overlaps the leading one.

    Returns:
        A tuple ``(text, first_pages_idx, last_pages_idx)``: the non-empty
        page texts joined with newlines, followed by the zero-based indices
        of the leading and trailing pages that produced text. Returns
        ``("", [], [])`` for a document without pages.

    Raises:
        Whatever ``pdfplumber.open`` raises when the file cannot be opened;
        failures on individual pages are logged at DEBUG level and skipped.
    """
    text_parts: List[str] = []
    first_pages_idx: List[int] = []
    last_pages_idx: List[int] = []
    with pdfplumber.open(pdf_path) as pdf:
        total = len(pdf.pages)
        if total == 0:
            return "", [], []

        # Clamp both windows (negative sizes count as zero) and keep the
        # tail strictly after the head so no page is read twice.
        first_n = max(0, min(first_n, total))
        last_m = max(0, min(last_m, total - first_n))

        windows = (
            (range(first_n), first_pages_idx),
            (range(total - last_m, total), last_pages_idx),
        )
        for page_range, hit_indices in windows:
            for i in page_range:
                try:
                    t = (pdf.pages[i].extract_text() or "").strip()
                except Exception as exc:
                    # Best effort: skip the page but leave a trace for debugging.
                    logging.debug(f"Text extraction failed on page {i} of {pdf_path}: {exc}")
                    continue
                if t:
                    text_parts.append(t)
                    hit_indices.append(i)
    return "\n".join(text_parts), first_pages_idx, last_pages_idx

def extract_text_pages(pdf_path: str, first_n: int = 6, last_m: int = 3) -> str:
    """Return text from the first ``first_n`` and last ``last_m`` pages of a PDF.

    Embedded text is tried first via ``_extract_with_pdfplumber``. Only when
    that yields nothing, and OCR is both available and enabled, are the same
    page windows rasterised and OCR'd.

    Args:
        pdf_path: Path of the PDF to read.
        first_n: Number of leading pages to consider.
        last_m: Number of trailing pages to consider.

    Returns:
        The extracted text, or an empty string when nothing could be read.
    """
    base_text, _first_idx, _last_idx = _extract_with_pdfplumber(pdf_path, first_n, last_m)
    if base_text.strip():
        return base_text

    if not (_OCR_AVAILABLE and ENABLE_OCR):
        return ""

    try:
        doc = fitz.open(pdf_path)
        try:
            total = len(doc)
        finally:
            doc.close()
    except Exception:
        total = 0

    # Start the tail window no earlier than the end of the head window.
    # Without this, a PDF shorter than first_n + last_m pages had its
    # overlapping pages rendered and OCR'd twice.
    front_end = max(0, min(first_n, total))
    tail_start = max(front_end, total - last_m)
    pages = list(range(front_end)) + list(range(tail_start, total))
    return _ocr_pages(pdf_path, pages) or ""

def classify_flags(text: str) -> Dict[str, bool]:
    """Flag a document's text as financial statement, enrollment, or other.

    Args:
        text: Extracted document text; a falsy value is treated as "".

    Returns:
        A dict with boolean keys ``"FS"``, ``"Enrollment"`` and ``"Other"``.
        ``"Other"`` is True exactly when neither keyword family matched, so
        empty text is classified as other.
    """
    haystack = text if text else ""

    def _any_match(category: str) -> bool:
        for pattern in KEYWORDS_COMPILED[category]:
            if pattern.search(haystack):
                return True
        return False

    has_fs = _any_match("FS")
    has_enrollment = _any_match("Enrollment")
    return {
        "FS": has_fs,
        "Enrollment": has_enrollment,
        "Other": not (has_fs or has_enrollment),
    }

def build_flag_df(folder_path: str, first_n: int = NUM_FIRST_PAGES, last_m: int = NUM_LAST_PAGES) -> pd.DataFrame:
    """Classify every PDF under ``folder_path`` and collect the flags.

    The folder is walked recursively. Each file whose name ends in ".pdf"
    (case-insensitive) has text extracted from its first/last pages and is
    classified with ``classify_flags``. A file that raises during processing
    is logged as a warning and skipped.

    Args:
        folder_path: Root directory to walk.
        first_n: Number of leading pages to read from each PDF.
        last_m: Number of trailing pages to read from each PDF.

    Returns:
        A DataFrame with columns ``school``, ``document``, ``FS``,
        ``Enrollment`` and ``Other``, one row per successfully processed PDF.

    Raises:
        FileNotFoundError: If ``folder_path`` is not an existing directory.
    """
    if not os.path.isdir(folder_path):
        # Report the directory that was actually requested; the message
        # previously interpolated the global FOLDER_PATH instead.
        raise FileNotFoundError(f"Directory not found: {folder_path} (please check your working directory and path)")

    records = []
    total_files = 0
    for root, _dirs, files in os.walk(folder_path):
        for fname in files:
            if not fname.lower().endswith(".pdf"):
                continue
            total_files += 1
            fpath = os.path.join(root, fname)
            try:
                text = extract_text_pages(fpath, first_n, last_m)
                flags = classify_flags(text)
                school = parse_school_from_path(fpath)
                records.append({
                    "school": school,
                    "document": fpath,
                    **flags
                })
            except Exception as e:
                # Keep the batch running; the skipped file shows up as the
                # gap between the two counts logged below.
                logging.warning(f"Jump: {fpath}: {e}")
    logging.info(f"Number of PDF file:{total_files}, Success record:{len(records)}")
    # Explicit columns keep the schema stable even when no PDF was processed.
    df = pd.DataFrame(records, columns=["school", "document", "FS", "Enrollment", "Other"])
    return df

In [5]:
# Classify every PDF under FOLDER_PATH; `df` is reused by the Step 1 snapshot cell below.
df = build_flag_df(FOLDER_PATH)
# Persist the raw keyword flags; index=False keeps the row index out of the CSV.
df.to_csv(OUTPUT_CSV, index=False)
print(f"[OK] Saved:{os.path.abspath(OUTPUT_CSV)}")
# Temporarily widen column display so the document paths are readable in the preview.
with pd.option_context("display.max_colwidth", 120):
    display(df.head(10))

INFO: Number of PDF file:138, Success record:138


[OK] Saved:/Users/xinjile/Desktop/Franklin_Templton_Project_Phase_Two-main/university_flag_hye17.csv


Unnamed: 0,school,document,FS,Enrollment,Other
0,SETON_HALL_UNIVERSITY,university_pdfs_hy_e/SETON_HALL_UNIVERSITY/2024_Operating_Data__Seton_Hall_University__for_the_year_ended_06_30_2024...,True,True,False
1,ROANOKE_COLLEGE,university_pdfs_hy_e/ROANOKE_COLLEGE/Appendix_A_for_Series_2020_Bonds_for_Year_Ending_06.30.24_for_the_year_ended_06...,False,True,False
2,UNIVERSITY_OF_DAYTON,university_pdfs_hy_e/UNIVERSITY_OF_DAYTON/University_of_Dayton_FY_2024_Continuing_Disclosure_with_Transmittal_Letter...,False,False,True
3,LASELL_UNIVERSITY,university_pdfs_hy_e/LASELL_UNIVERSITY/Annual_Report_for_the_year_ended_06_30_2024__288_KB_.pdf,False,True,False
4,NORTHWEST_NAZARENE_UNIVERSITY_INC,university_pdfs_hy_e/NORTHWEST_NAZARENE_UNIVERSITY_INC/Audited_Financial_Statements_and_Continuing_Disclosure_for_th...,True,True,False
5,NORTHEAST_OHIO_MEDICAL_UNIVERSITY_FOUNDATION,university_pdfs_hy_e/NORTHEAST_OHIO_MEDICAL_UNIVERSITY_FOUNDATION/FY24_Annual_Filing_for_the_year_ended_06_30_2024__...,True,True,False
6,NORTHEAST_OHIO_MEDICAL_UNIVERSITY_FOUNDATION,university_pdfs_hy_e/NORTHEAST_OHIO_MEDICAL_UNIVERSITY_FOUNDATION/Enrollment_Data_As_Of_12_1_23_for_the_year_ended_0...,False,True,False
7,LOYOLA_UNIVERSITY_MARYLAND_INC,university_pdfs_hy_e/LOYOLA_UNIVERSITY_MARYLAND_INC/2024_Operating_Data_for_the_year_ended_05_31_2024__232_KB_.pdf,False,True,False
8,UNIVERSITY_OF_THE_PACIFIC,university_pdfs_hy_e/UNIVERSITY_OF_THE_PACIFIC/Continuing_Disclosure_Statement_for_the_year_ended_06_30_2024__392_KB...,True,True,False
9,KANSAS_CITY_UNIVERSITY_OF_MEDICINE_AND_BIOSCIENCES,university_pdfs_hy_e/KANSAS_CITY_UNIVERSITY_OF_MEDICINE_AND_BIOSCIENCES/FY24_Bond_Compliance_Annual_Report_for_the_y...,True,True,False



### Steps
1. **Keyword Identification** (already done above): compute the initial `FS` and `Enrollment` flags from the regex keywords; if neither is detected, both are `False` and `Other=True`.
2. **Same-School Correction**: Within the same school, if there's at least one `FS=True & Enrollment=False`, then any `FS=True & Enrollment=True` becomes `FS=False & Enrollment=True`.
3. **Short-Document Adjustment (by page count)**, where `page` is the total number of pages in the PDF:
   - If `FS=True & Enrollment=False` and `page < 5` → set `Other=True, FS=False` (Enrollment unchanged).
   - If `FS=True & Enrollment=True` and `page < 5` → set `FS=False` (keep `Enrollment` intact).

Finally, we export **one CSV** with exactly 10 columns (in order):  
`FS_1, Enrollment_1, Other_1, FS_2, Enrollment_2, Other_2, FS_3, Enrollment_3, Other_3, page`.


In [10]:

# ===== Step 1 snapshot: guarantee the *_1 flag columns exist =====
# Relies on the functions, constants, and KEYWORDS defined in the cells above.
# Reuses 'df' from the Step 1 run when it exists; otherwise rebuilds it via build_flag_df.
import pandas as pd

try:
    _df_step1_source = df
except NameError:
    # 'df' is not defined in this kernel: rerun the keyword classification.
    _df_step1_source = build_flag_df(
        FOLDER_PATH, first_n=NUM_FIRST_PAGES, last_m=NUM_LAST_PAGES
    )

df_step1 = _df_step1_source.copy()

# Fail fast, naming the first raw flag column that is absent.
for c in ("FS", "Enrollment", "Other"):
    if c not in df_step1.columns:
        raise KeyError(f"[Step 1] Missing expected column: {c}")

# Snapshot each raw flag as a strictly boolean "<flag>_1" column.
for c in ("FS", "Enrollment", "Other"):
    df_step1[c + "_1"] = df_step1[c].astype(bool)

display(df_step1.head(10))


Unnamed: 0,school,document,FS,Enrollment,Other,FS_1,Enrollment_1,Other_1
0,SETON_HALL_UNIVERSITY,university_pdfs_hy_e/SETON_HALL_UNIVERSITY/202...,True,True,False,True,True,False
1,ROANOKE_COLLEGE,university_pdfs_hy_e/ROANOKE_COLLEGE/Appendix_...,False,True,False,False,True,False
2,UNIVERSITY_OF_DAYTON,university_pdfs_hy_e/UNIVERSITY_OF_DAYTON/Univ...,False,False,True,False,False,True
3,LASELL_UNIVERSITY,university_pdfs_hy_e/LASELL_UNIVERSITY/Annual_...,False,True,False,False,True,False
4,NORTHWEST_NAZARENE_UNIVERSITY_INC,university_pdfs_hy_e/NORTHWEST_NAZARENE_UNIVER...,True,True,False,True,True,False
5,NORTHEAST_OHIO_MEDICAL_UNIVERSITY_FOUNDATION,university_pdfs_hy_e/NORTHEAST_OHIO_MEDICAL_UN...,True,True,False,True,True,False
6,NORTHEAST_OHIO_MEDICAL_UNIVERSITY_FOUNDATION,university_pdfs_hy_e/NORTHEAST_OHIO_MEDICAL_UN...,False,True,False,False,True,False
7,LOYOLA_UNIVERSITY_MARYLAND_INC,university_pdfs_hy_e/LOYOLA_UNIVERSITY_MARYLAN...,False,True,False,False,True,False
8,UNIVERSITY_OF_THE_PACIFIC,university_pdfs_hy_e/UNIVERSITY_OF_THE_PACIFIC...,True,True,False,True,True,False
9,KANSAS_CITY_UNIVERSITY_OF_MEDICINE_AND_BIOSCIE...,university_pdfs_hy_e/KANSAS_CITY_UNIVERSITY_OF...,True,True,False,True,True,False


In [11]:

# ===== Step 2: Same-School Correction (English comments) =====
# Rule: Within the same school, if there's at least one (FS_1=True & Enrollment_1=False),
# then any (FS_1=True & Enrollment_1=True) becomes (FS_2=False & Enrollment_2=True).
# Implementation details:
# - Compute a school-level boolean flag 'has_fs_only' = any(FS_1=True & Enrollment_1=False) per school.
# - Initialize FS_2, Enrollment_2 from FS_1, Enrollment_1.
# - For rows where 'has_fs_only' is True AND row is (FS_1=True & Enrollment_1=True), set FS_2=False.
# - Recompute Other_2 = NOT(FS_2) AND NOT(Enrollment_2).

def apply_same_school_correction(df_step1_view: pd.DataFrame) -> pd.DataFrame:
    """Apply the Step 2 same-school correction to the Step 1 flags.

    If a school has at least one document flagged FS-only
    (``FS_1=True & Enrollment_1=False``), every document of that school
    flagged both FS and Enrollment gets ``FS_2=False``; ``Enrollment_2``
    always equals ``Enrollment_1``.

    Args:
        df_step1_view: Frame with columns ``school``, ``document``, ``FS_1``,
            ``Enrollment_1`` and ``Other_1``. It is not modified.

    Returns:
        A copy of the input with ``FS_2``, ``Enrollment_2`` and ``Other_2``
        appended, where ``Other_2 = NOT FS_2 AND NOT Enrollment_2``.

    Raises:
        KeyError: If any required column is missing.
    """
    required = {"school", "document", "FS_1", "Enrollment_1", "Other_1"}
    missing = required - set(df_step1_view.columns)
    if missing:
        raise KeyError(f"[Step 2] Missing required columns: {missing}")

    out = df_step1_view.copy()
    out["FS_1"] = out["FS_1"].astype(bool)
    out["Enrollment_1"] = out["Enrollment_1"].astype(bool)

    # School-level switch: does the school own any FS-only document?
    fs_only_rows = out["FS_1"] & ~out["Enrollment_1"]
    school_has_fs_only = fs_only_rows.groupby(out["school"]).any()
    # Broadcast the per-school answer back onto each document row.
    row_has_fs_only = out["school"].map(school_has_fs_only)

    out["FS_2"] = out["FS_1"]
    out["Enrollment_2"] = out["Enrollment_1"]

    # Only documents flagged for both categories lose their FS flag.
    to_flip = row_has_fs_only & out["FS_1"] & out["Enrollment_1"]
    out.loc[to_flip, "FS_2"] = False

    out["Other_2"] = ~(out["FS_2"] | out["Enrollment_2"])
    return out

# Run Step 2 on the identifying columns plus the Step 1 flags only.
df_step2 = apply_same_school_correction(
    df_step1[["school", "document", "FS_1", "Enrollment_1", "Other_1"]].copy()
)

# Preview the first rows, now including FS_2 / Enrollment_2 / Other_2.
display(df_step2.head(10))


Unnamed: 0,school,document,FS_1,Enrollment_1,Other_1,FS_2,Enrollment_2,Other_2
0,SETON_HALL_UNIVERSITY,university_pdfs_hy_e/SETON_HALL_UNIVERSITY/202...,True,True,False,True,True,False
1,ROANOKE_COLLEGE,university_pdfs_hy_e/ROANOKE_COLLEGE/Appendix_...,False,True,False,False,True,False
2,UNIVERSITY_OF_DAYTON,university_pdfs_hy_e/UNIVERSITY_OF_DAYTON/Univ...,False,False,True,False,False,True
3,LASELL_UNIVERSITY,university_pdfs_hy_e/LASELL_UNIVERSITY/Annual_...,False,True,False,False,True,False
4,NORTHWEST_NAZARENE_UNIVERSITY_INC,university_pdfs_hy_e/NORTHWEST_NAZARENE_UNIVER...,True,True,False,True,True,False
5,NORTHEAST_OHIO_MEDICAL_UNIVERSITY_FOUNDATION,university_pdfs_hy_e/NORTHEAST_OHIO_MEDICAL_UN...,True,True,False,True,True,False
6,NORTHEAST_OHIO_MEDICAL_UNIVERSITY_FOUNDATION,university_pdfs_hy_e/NORTHEAST_OHIO_MEDICAL_UN...,False,True,False,False,True,False
7,LOYOLA_UNIVERSITY_MARYLAND_INC,university_pdfs_hy_e/LOYOLA_UNIVERSITY_MARYLAN...,False,True,False,False,True,False
8,UNIVERSITY_OF_THE_PACIFIC,university_pdfs_hy_e/UNIVERSITY_OF_THE_PACIFIC...,True,True,False,True,True,False
9,KANSAS_CITY_UNIVERSITY_OF_MEDICINE_AND_BIOSCIE...,university_pdfs_hy_e/KANSAS_CITY_UNIVERSITY_OF...,True,True,False,True,True,False


In [12]:

# ===== Step 3: Short-Document Adjustment (by page count, English comments) =====
# EXACT rules:
#   A) If (FS_2=True & Enrollment_2=False) and page < 5 → set Other_3=True, FS_3=False (Enrollment_3 unchanged/False).
#   B) If (FS_2=True & Enrollment_2=True) and page < 5 → set FS_3=False (Enrollment_3 remains True).
# We always recompute Other_3 = NOT(FS_3) AND NOT(Enrollment_3).
import pdfplumber

PAGE_THRESHOLD = 5  # per specification: documents with fewer pages count as "short"


def get_pdf_pages_safe(pdf_path: str) -> int:
    """Return the number of pages in ``pdf_path``, or 0 when it cannot be read.

    The value 0 is a sentinel for "page count unknown"; the failure is
    logged as a warning so unreadable files are visible in the run output.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            return len(pdf.pages)
    except Exception as exc:
        logging.warning(f"Could not read page count for {pdf_path}: {exc}")
        return 0


def apply_short_doc_adjustment(df_step2_view: pd.DataFrame, page_threshold: int = PAGE_THRESHOLD) -> pd.DataFrame:
    """Apply the Step 3 short-document adjustment to the Step 2 flags.

    For documents whose page count is known and below ``page_threshold``:
      A) ``FS_2=True & Enrollment_2=False`` -> ``FS_3=False`` (so ``Other_3=True``).
      B) ``FS_2=True & Enrollment_2=True``  -> ``FS_3=False`` (``Enrollment_3`` stays True).
    All other rows keep their Step 2 flags.

    Args:
        df_step2_view: Frame with columns ``document``, ``FS_2``,
            ``Enrollment_2`` and ``Other_2``. It is not modified.
        page_threshold: Documents with fewer pages than this are "short".

    Returns:
        A copy of the input with ``page``, ``FS_3``, ``Enrollment_3`` and
        ``Other_3`` appended, where ``Other_3 = NOT FS_3 AND NOT Enrollment_3``.

    Raises:
        KeyError: If any required column is missing.
    """
    required = {"document", "FS_2", "Enrollment_2", "Other_2"}
    missing = required - set(df_step2_view.columns)
    if missing:
        raise KeyError(f"[Step 3] Missing required columns: {missing}")

    out = df_step2_view.copy()
    out["page"] = out["document"].apply(get_pdf_pages_safe).astype(int)

    fs_2 = out["FS_2"].astype(bool)
    enrollment_2 = out["Enrollment_2"].astype(bool)
    out["FS_3"] = fs_2
    out["Enrollment_3"] = enrollment_2

    # page == 0 means the count could not be read, not that the document is
    # short. Such rows are excluded so a read failure no longer silently
    # clears the FS flag.
    page_known = out["page"] > 0
    short_mask = page_known & (out["page"] < int(page_threshold))

    # Rule A: short FS-only document -> drop FS (Other_3 becomes True below).
    mask_A = short_mask & fs_2 & ~enrollment_2
    out.loc[mask_A, "FS_3"] = False

    # Rule B: short FS+Enrollment document -> drop FS, Enrollment_3 stays True.
    mask_B = short_mask & fs_2 & enrollment_2
    out.loc[mask_B, "FS_3"] = False

    # Derived last so it is always consistent with the final FS_3 / Enrollment_3.
    out["Other_3"] = (~out["FS_3"]) & (~out["Enrollment_3"])
    return out

# Run Step 3 on the Step 2 result; this opens every PDF once to read its page count.
df_step3 = apply_short_doc_adjustment(
    df_step2[["school", "document", "FS_1", "Enrollment_1", "Other_1",
              "FS_2", "Enrollment_2", "Other_2"]].copy(),
    page_threshold=PAGE_THRESHOLD
)

# Preview the first rows, now including page / FS_3 / Enrollment_3 / Other_3.
display(df_step3.head(10))


Unnamed: 0,school,document,FS_1,Enrollment_1,Other_1,FS_2,Enrollment_2,Other_2,page,FS_3,Enrollment_3,Other_3
0,SETON_HALL_UNIVERSITY,university_pdfs_hy_e/SETON_HALL_UNIVERSITY/202...,True,True,False,True,True,False,17,True,True,False
1,ROANOKE_COLLEGE,university_pdfs_hy_e/ROANOKE_COLLEGE/Appendix_...,False,True,False,False,True,False,2,False,True,False
2,UNIVERSITY_OF_DAYTON,university_pdfs_hy_e/UNIVERSITY_OF_DAYTON/Univ...,False,False,True,False,False,True,15,False,False,True
3,LASELL_UNIVERSITY,university_pdfs_hy_e/LASELL_UNIVERSITY/Annual_...,False,True,False,False,True,False,8,False,True,False
4,NORTHWEST_NAZARENE_UNIVERSITY_INC,university_pdfs_hy_e/NORTHWEST_NAZARENE_UNIVER...,True,True,False,True,True,False,2,False,True,False
5,NORTHEAST_OHIO_MEDICAL_UNIVERSITY_FOUNDATION,university_pdfs_hy_e/NORTHEAST_OHIO_MEDICAL_UN...,True,True,False,True,True,False,13,True,True,False
6,NORTHEAST_OHIO_MEDICAL_UNIVERSITY_FOUNDATION,university_pdfs_hy_e/NORTHEAST_OHIO_MEDICAL_UN...,False,True,False,False,True,False,1,False,True,False
7,LOYOLA_UNIVERSITY_MARYLAND_INC,university_pdfs_hy_e/LOYOLA_UNIVERSITY_MARYLAN...,False,True,False,False,True,False,7,False,True,False
8,UNIVERSITY_OF_THE_PACIFIC,university_pdfs_hy_e/UNIVERSITY_OF_THE_PACIFIC...,True,True,False,True,True,False,21,True,True,False
9,KANSAS_CITY_UNIVERSITY_OF_MEDICINE_AND_BIOSCIE...,university_pdfs_hy_e/KANSAS_CITY_UNIVERSITY_OF...,True,True,False,True,True,False,6,True,True,False


In [13]:

# ===== Single CSV Export (EXACT 10 columns in order) =====
# FS_k, Enrollment_k, Other_k for steps k = 1..3, followed by the page count.
final_cols = [
    f"{flag}_{step}"
    for step in (1, 2, 3)
    for flag in ("FS", "Enrollment", "Other")
] + ["page"]
final_out = df_step3[final_cols].copy()

# Every column except the trailing "page" is a flag: force a real bool dtype.
for c in final_cols[:-1]:
    final_out[c] = final_out[c].astype(bool)

FINAL_OUTPUT_CSV = "flags_all_steps.csv"
final_out.to_csv(FINAL_OUTPUT_CSV, index=False)

print(f"[Done] Saved single unified CSV to: {FINAL_OUTPUT_CSV}")
print("Shape:", final_out.shape)
display(final_out.head(10))


[Done] Saved single unified CSV to: flags_all_steps.csv
Shape: (138, 10)


Unnamed: 0,FS_1,Enrollment_1,Other_1,FS_2,Enrollment_2,Other_2,FS_3,Enrollment_3,Other_3,page
0,True,True,False,True,True,False,True,True,False,17
1,False,True,False,False,True,False,False,True,False,2
2,False,False,True,False,False,True,False,False,True,15
3,False,True,False,False,True,False,False,True,False,8
4,True,True,False,True,True,False,False,True,False,2
5,True,True,False,True,True,False,True,True,False,13
6,False,True,False,False,True,False,False,True,False,1
7,False,True,False,False,True,False,False,True,False,7
8,True,True,False,True,True,False,True,True,False,21
9,True,True,False,True,True,False,True,True,False,6
