In [None]:
import pandas as pd
from pathlib import Path
import re
import json

In [None]:
# Root folder that holds one sub-folder per year of monthly operating statements.
base = Path(r"J:\ADMIN-eFILES\CHEN_W154867_VXC\z_Reports\Monthly Operating Statements")

# Year folders are the sub-directories whose names are all digits; use the
# numerically largest one.
year_dirs = [p for p in base.iterdir() if p.is_dir() and p.name.isdigit()]
if len(year_dirs) == 0:
    raise FileNotFoundError(f"No year folders found under {base!s}")
latest_year_dir = max(year_dirs, key=lambda p: int(p.name))

# Prefer workbooks that follow the cumulative-report naming; if there are none,
# fall back to every .xlsx in the folder.
# NOTE(review): the fallback glob may also pick up Excel lock files ("~$*.xlsx")
# when a workbook is open — confirm whether those need to be filtered out.
pattern_files = list(
    latest_year_dir.glob("Cumulative Report - Operating Statements - *.xlsx")
)
if pattern_files:
    xlsx_files = pattern_files
else:
    xlsx_files = list(latest_year_dir.glob("*.xlsx"))
if len(xlsx_files) == 0:
    raise FileNotFoundError(f"No .xlsx files found in {latest_year_dir!s}")

# The most recently modified candidate is treated as the current report.
latest_report = max(xlsx_files, key=lambda p: p.stat().st_mtime)
report_path = latest_report

# Read the summary sheet (skipping the first six rows) and give the title
# column the name the rest of the notebook uses.
dor = pd.read_excel(report_path, sheet_name="Summary - DC only", skiprows=6).rename(
    columns={'Short "Project" Title': "DOR Project Title"}
)

# The second-to-last " - "-separated token of the file stem is parsed as MMYY,
# converted to a monthly period and back to a month-frequency timestamp, and
# kept as a plain date.
dor_end_date = (
    pd.to_datetime(report_path.stem.split(" - ")[-2], format="%m%y", errors="raise")
    .to_period("M")
    .to_timestamp("M")
    .date()
)

print(dor_end_date)

In [None]:
def normalize(s):
    """Return a whitespace-free, lower-cased copy of ``s`` for fuzzy matching.

    All whitespace (spaces, tabs, newlines) is removed and the result is
    lower-cased. Values that are not ``str`` are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    return "".join(s.split()).lower()

In [None]:
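# NOTE: the column-selection steps below are commented out, so `dor` is exported with all of its columns.
# drop Unnamed: 0 only if it exists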
# if "Unnamed: 0" in dor.columns:
#     dor = dor.drop(columns=["Unnamed: 0"])

# # desired columns (use canonical names)
# desired_cols = [
#     "Project ID",
#     "DOR Project Title",
#     "Program Area",
#     "Funder Type",
#     "Principal Investigator (PI)",
#     "Award Term Start Date",
#     "Project Status",
#     "Total Cash Receipts",
#     "Total Personnel",
#     "Total Contractual/ Outside Services Costs",
#     "Total Non-Personnel",
#     "Total Cost",
# ]

# # normalize helper to match columns ignoring whitespace/newlines/case
# normalize = lambda s: "".join(s.split()).lower() if isinstance(s, str) else s
# col_map = {normalize(c): c for c in dor.columns}

# # build selected column list from available columns (skip missing ones)
# selected = []
# missing = []
# for c in desired_cols:
#     key = normalize(c)
#     if key in col_map:
#         selected.append(col_map[key])
#     else:
#         missing.append(c)

# if missing:
#     print(
#         f"Warning: these desired columns were not found and will be skipped: {missing}"
#     )

# # subset dataframe to the selected (available) columns
# dor = dor[selected]

# print(dor.dtypes)

# Write the DOR frame to the Tableau data folder, omitting the index column.
dor.to_excel(
    excel_writer="C:\\Users\\O304312\\OneDrive - Kaiser Permanente\\Documents\\Tableau Dashboards\\New Financial Snapshot\\Data\\DOR Data Preprocessed.xlsx",
    index=False,
)

In [None]:
# Locate the most recently modified CTP transaction-detail workbook.
txn_base = Path(r"J:\ADMIN-eFILES\CHEN_W154867_VXC\z_Reports\Transaction Detail")
pattern = "CTP Transaction Detail *.xlsx"
matches = list(txn_base.glob(pattern))

# A dated fallback file name is not needed: any such file would itself match
# `pattern`, so an empty glob result means nothing usable is present.
if not matches:
    raise FileNotFoundError(f"No files matching {pattern!s} found in {txn_base!s}")

ctp_path = max(matches, key=lambda p: p.stat().st_mtime)

# The "Hours" sheet uses four stacked header rows (rows 8-11, zero-based),
# which pandas reads as a four-level column MultiIndex.
ctp_hours = pd.read_excel(ctp_path, sheet_name="Hours", header=[8, 9, 10, 11])


def tidy(col):
    """Flatten one multi-level column label into a single underscore-joined string.

    Each level is converted to ``str``. Levels that are blank or "nan" after
    stripping, and levels whose raw text starts with "Unnamed", are skipped.
    The remaining stripped pieces are joined with "_" and any leading or
    trailing underscores are removed from the result.
    """
    kept = []
    for level in col:
        raw = str(level)
        text = raw.strip()
        if text in {"nan", ""}:
            continue
        if raw.startswith("Unnamed"):
            continue
        kept.append(text)
    return "_".join(kept).strip("_")


# Collapse the multi-level header into single underscore-joined column labels.
ctp_hours.columns = [tidy(col) for col in ctp_hours.columns]

ctp_hours = ctp_hours.rename(columns={"Project": "Project ID"})

# Keep only the rows whose Project ID ends with "total"
# (ignoring case and surrounding whitespace).
mask = ctp_hours["Project ID"].astype(str).str.strip().str.lower().str.endswith("total")
ctp_hours = ctp_hours[mask].copy().reset_index(drop=True)

# Reduce "RNG<digits> Total" labels to the bare RNG number. The match uses the
# same tolerance as the row filter above (case-insensitive, whitespace-stripped)
# so that every kept RNG total row is actually rewritten.
rng_total_pattern = r"^(RNG\d+)\s+Total$"
project_ids = ctp_hours["Project ID"].astype(str).str.strip()
rng_mask = project_ids.str.match(rng_total_pattern, flags=re.IGNORECASE, na=False)
ctp_hours.loc[rng_mask, "Project ID"] = project_ids[rng_mask].str.replace(
    rng_total_pattern, r"\1", regex=True, flags=re.IGNORECASE
)

print("Columns:", ctp_hours.columns.tolist())

print("Loaded dataframe shape:", ctp_hours.shape)

ctp_hours.to_excel(
    "C:\\Users\\O304312\\OneDrive - Kaiser Permanente\\Documents\\Tableau Dashboards\\New Financial Snapshot\\Data\\DOR Personnel.xlsx",
    index=False,
)

In [None]:
# Load the most recently modified "<YYYY-MM-DD>-SiteStudyDetails.json" export.
vp_folder = Path("J:\\VIEWPOINT\\SiteStudyDetails_Response")

pattern = re.compile(r"^\d{4}-\d{2}-\d{2}-SiteStudyDetails\.json$")

matches = [p for p in vp_folder.iterdir() if p.is_file() and pattern.match(p.name)]
if not matches:
    raise FileNotFoundError(f"No SiteStudyDetails json files found in {vp_folder!s}")

latest_json = max(matches, key=lambda p: p.stat().st_mtime)
print("Loading:", latest_json)

with latest_json.open("r", encoding="utf-8") as f:
    site_details = json.load(f)

# Let a normalisation failure surface directly: every later cell needs the
# DataFrame, so falling back to None would only fail further down with an
# unrelated AttributeError.
ss_df = pd.json_normalize(site_details)
print("Converted to DataFrame with shape:", ss_df.shape)

vp_study_details = ss_df.copy(deep=True)

In [None]:
# Load the most recently modified "<YYYY-MM-DD>-Accountables.json" export.
vp_accountables_folder = Path("J:\\VIEWPOINT\\Accountables_Response")

pattern = re.compile(r"^\d{4}-\d{2}-\d{2}-Accountables\.json$")

matches = [
    p for p in vp_accountables_folder.iterdir() if p.is_file() and pattern.match(p.name)
]
if not matches:
    # The message names the file type this cell actually searches for.
    raise FileNotFoundError(
        f"No Accountables json files found in {vp_accountables_folder!s}"
    )

latest_json = max(matches, key=lambda p: p.stat().st_mtime)
print("Loading:", latest_json)

# NOTE: `site_details` is rebound here to the accountables payload, replacing
# the study-details JSON loaded by the previous cell.
with latest_json.open("r", encoding="utf-8") as f:
    site_details = json.load(f)

# Let a normalisation failure surface directly: every later cell needs the
# DataFrame, so falling back to None would only fail further down with an
# unrelated AttributeError.
account = pd.json_normalize(site_details)
print("Converted to DataFrame with shape:", account.shape)

vp_accountables = account.copy(deep=True)

In [None]:
# Attach each accountable's service line through its network study UUID.
# The lookup is de-duplicated first: repeated (service line, UUID) rows would
# otherwise multiply the matching accountable rows in the left merge and
# inflate the amounts summed later.
# NOTE(review): a UUID mapped to several *different* service lines still fans
# out — confirm whether that is intended.
vp_accountables = vp_accountables.merge(
    vp_study_details[
        ["site_study_service_line", "network_study_uuid"]
    ].drop_duplicates(),
    on="network_study_uuid",
    how="left",
)

In [None]:
# Keep only accountables completed on or before the DOR report end date.
# Missing or unparseable completion dates become NaT and never pass the filter.
completion_dates = pd.to_datetime(vp_accountables["completion_date"], errors="coerce")

# Midnight at the start of dor_end_date.
cutoff = pd.to_datetime(dor_end_date)

# Compare against the start of the following day so that completions dated on
# dor_end_date itself are kept; a strict `< cutoff` would drop that whole day.
vp_accountables = vp_accountables[
    completion_dates < cutoff + pd.Timedelta(days=1)
].copy()

In [None]:
# Reduce to the three columns of interest and total them per service line.
# NOTE(review): sum() is applied to `site_study_id` as well as `amount`; if the
# ids are strings they are concatenated per group — confirm this is intended.
vp_accountables = (
    vp_accountables[["site_study_id", "site_study_service_line", "amount"]]
    .groupby("site_study_service_line")
    .sum()
    .reset_index()
)

In [None]:
# Control characters (everything below 0x20 except tab, LF and CR) are removed
# from the text columns before the frame is written to Excel.
ctl_re = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")


def _strip_control_chars(text):
    """Return ``text`` with every character matched by ``ctl_re`` removed."""
    return ctl_re.sub("", text)


for col in vp_accountables.select_dtypes(include=["object"]).columns:
    # Only non-null cells are converted to str and cleaned; nulls stay as they are.
    mask = vp_accountables[col].notna()
    cleaned = vp_accountables.loc[mask, col].astype(str).map(_strip_control_chars)
    vp_accountables.loc[mask, col] = cleaned

vp_accountables.to_excel(
    excel_writer=r"C:\Users\O304312\OneDrive - Kaiser Permanente\Documents\Tableau Dashboards\New Financial Snapshot\Data\Viewpoint Accountables.xlsx",
    index=False,
)

In [None]:
# Write the full site-study-details frame to the Tableau data folder, omitting
# the index column.
vp_study_details.to_excel(
    excel_writer=r"C:\Users\O304312\OneDrive - Kaiser Permanente\Documents\Tableau Dashboards\New Financial Snapshot\Data\Viewpoint Site Study Details.xlsx",
    index=False,
)

In [None]:
# Compare DOR project IDs with Viewpoint service lines (both normalised by
# removing whitespace and lower-casing) and export the entries found on only
# one side.

# Identity mapping of service line -> service line; nothing in this cell reads it.
# NOTE(review): kept in case another cell relies on it — candidate for removal.
vp_map = dict(
    zip(
        vp_accountables["site_study_service_line"],
        vp_accountables["site_study_service_line"],
    )
)
vp_set = {
    normalize(rng) for rng in vp_accountables["site_study_service_line"].astype(str)
}

dor["Project ID normalized"] = dor["Project ID"].astype(str).apply(normalize)
missing_out = dor[~dor["Project ID normalized"].isin(vp_set)].copy()
missing_out = missing_out.drop(columns=["Project ID normalized"])

# `missing_out` already carries the project title, so the two report columns
# are selected directly. Merging it back onto `dor` by Project ID added nothing
# and multiplied rows whenever a Project ID was repeated or missing.
dor_only = missing_out[["DOR Project Title", "Project ID"]].reset_index(drop=True)

# Normalised Viewpoint service lines with no matching DOR Project ID.
vp_only_rngs = sorted(vp_set - set(dor["Project ID normalized"]))

# First site_study_id seen for each normalised service line, built in a single
# pass instead of re-scanning the frame for every RNG.
first_site_id_by_rng = {}
for service_line, site_id in zip(
    vp_accountables["site_study_service_line"].astype(str).map(normalize),
    vp_accountables["site_study_id"],
):
    first_site_id_by_rng.setdefault(service_line, site_id)
vp_only_site_ids = [first_site_id_by_rng.get(rng) for rng in vp_only_rngs]

# NOTE(review): the site_study_id expression below is kept exactly as written;
# it assumes each id is a string containing "#" and fails otherwise — confirm
# the id format and the intended output.
vp_only_df = pd.DataFrame(
    {
        "RNG Number": vp_only_rngs,
        "site_study_id": [
            sid.split(sid.split("#")[1])[0] + "#" + sid.split("#")[1] if sid else None
            for sid in vp_only_site_ids
        ],
    }
)

out_path = r"C:\Users\O304312\OneDrive - Kaiser Permanente\Documents\Tableau Dashboards\New Financial Snapshot\Data\Missing RNG Numbers.xlsx"
with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
    dor_only.to_excel(writer, sheet_name="DOR Only", index=False)
    vp_only_df.to_excel(writer, sheet_name="VP Only", index=False)

print(f"{len(dor_only)} DOR Project ID(s) not in VP")
print(f"{len(vp_only_df)} VP RNG number(s) not in DOR")