In [49]:
import pandas as pd

In [50]:
from pathlib import Path

# Locate the newest "Cumulative Report" workbook in the most recent year folder
# and load its "Cumulative Report" sheet into `dor`.

# Root folder; it is expected to contain one sub-folder per year, named with digits only.
base = Path(r"J:\ADMIN-eFILES\CHEN_W154867_VXC\z_Reports\Monthly Operating Statements")

# isdecimal() (rather than isdigit()) guarantees every accepted name is parseable by
# int() below; isdigit() also accepts characters such as "²" that int() rejects.
year_dirs = [p for p in base.iterdir() if p.is_dir() and p.name.isdecimal()]
if not year_dirs:
    raise FileNotFoundError(f"No year folders found under {base!s}")
latest_year_dir = max(year_dirs, key=lambda p: int(p.name))

# Prefer files that follow the report naming convention; fall back to any .xlsx.
# Office writes a "~$<name>.xlsx" owner/lock file next to a workbook that is open.
# Such a file matches "*.xlsx", is usually the most recently modified entry, and is
# not a readable workbook, so it must never be picked as the "latest report".
pattern_files = [
    p
    for p in latest_year_dir.glob("Cumulative Report - Operating Statements - *.xlsx")
    if p.is_file() and not p.name.startswith("~$")
]
xlsx_files = pattern_files or [
    p
    for p in latest_year_dir.glob("*.xlsx")
    if p.is_file() and not p.name.startswith("~$")
]
if not xlsx_files:
    raise FileNotFoundError(f"No .xlsx files found in {latest_year_dir!s}")

# "Latest" means most recently modified on disk, not the period encoded in the file name.
latest_report = max(xlsx_files, key=lambda p: p.stat().st_mtime)

report_path = latest_report
print(report_path)

# skiprows=8: the first eight rows of the sheet are skipped, so the header row is
# read from row 9 (presumably a title/banner block sits above it; confirm against the workbook).
dor = pd.read_excel(report_path, sheet_name="Cumulative Report", skiprows=8)

J:\ADMIN-eFILES\CHEN_W154867_VXC\z_Reports\Monthly Operating Statements\2025\Cumulative Report - Operating Statements - 0925 - Hard Coded.xlsx


In [None]:
# Discard the "Unnamed: 0" column when the sheet produced one; otherwise do nothing.
dor = dor.drop(columns=["Unnamed: 0"], errors="ignore")

# Columns to keep, written in single-line form. They are matched against the
# sheet's headers ignoring whitespace (including embedded newlines) and case.
desired_cols = [
    "Project ID",
    "Project Title",
    "Program Area",
    "Funder Type",
    "Principal Investigator (PI)",
    "Award Term Start Date",
    "Project Status",
    "Total Cash Receipts",
    "Total Personnel",
    "Total Contractual/ Outside Services Costs",
    "Total Non-Personnel",
    "Total Cost",
]


def normalize(s):
    """Return a comparison key for a header: all whitespace removed, lowercased.

    Non-string values are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    return "".join(s.split()).lower()


# Map each normalized header to the header exactly as it appears in the frame.
# If two headers share a key, the later column wins.
col_map = {}
for actual_name in dor.columns:
    col_map[normalize(actual_name)] = actual_name

# Split the wanted columns into those present (using the frame's own spelling,
# in `desired_cols` order) and those absent.
wanted = [(name, normalize(name)) for name in desired_cols]
selected = [col_map[key] for _, key in wanted if key in col_map]
missing = [name for name, key in wanted if key not in col_map]

if missing:
    print(
        f"Warning: these desired columns were not found and will be skipped: {missing}"
    )

# Keep only the columns that were found; headers retain the sheet's original spelling.
dor = dor[selected]

print(dor.dtypes)

# Write the trimmed frame to the snapshot data folder without the index column.
output_path = "C:\\Users\\O304312\\OneDrive - Kaiser Permanente\\Documents\\New Financial Snapshot\\Data\\DOR Data Preprocessed.xlsx"
dor.to_excel(output_path, index=False)

Project ID                                            object
Project Title                                         object
Program Area                                          object
Funder Type                                           object
Principal Investigator (PI)                           object
Award Term Start Date                         datetime64[ns]
Project Status                                        object
Total Cash Receipts                                  float64
Total Personnel                                      float64
Total Contractual/\nOutside Services Costs           float64
Total \nNon-Personnel                                float64
Total Cost                                           float64
dtype: object
