In [1]:
# --- build_latest.ipynb — Cell 1: Env, paths, .env loader, tokens (original) ---
import os, json, re, time
from pathlib import Path
from datetime import date, timedelta
import pandas as pd

# Optional dotenv
try:
    from dotenv import load_dotenv, find_dotenv
except Exception:
    load_dotenv = None
    find_dotenv = None

def mask(s: str | None) -> str:
    if not s: return "<missing>"
    return (s[:4] + "…" + s[-4:]) if len(s) > 8 else "***"

# Locate the repository root: the nearest directory, starting at the working
# directory and walking up, that either is named "spending-dashboard" or holds
# a ".git" entry. Falls back to the working directory when none qualifies.
cwd = Path.cwd().resolve()
repo_root = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if candidate.name == "spending-dashboard" or (candidate / ".git").exists()
    ),
    cwd,
)

# CI override: a non-empty GITHUB_WORKSPACE always wins over the detected root.
gw = os.getenv("GITHUB_WORKSPACE")
repo_root = Path(gw).resolve() if gw else repo_root

# Load .env files when python-dotenv is importable (scripts/.env preferred)
def load_envs():
    """Populate os.environ from .env files without overriding existing values.

    Does nothing when python-dotenv could not be imported. Otherwise loads, in
    order: the file named by ENV_PATH (default: <repo_root>/scripts/.env), then
    each existing file among scripts/.env, .env and config/.env under the repo
    root plus .env in the working directory, and finally whatever
    find_dotenv(usecwd=True) discovers. Every load uses override=False, so a
    variable keeps the first value it was given.
    """
    if load_dotenv is None:
        return

    def _load(path: str) -> None:
        # Retry without `encoding` when this load_dotenv signature rejects it.
        try:
            load_dotenv(path, override=False, encoding="utf-8")
        except TypeError:
            load_dotenv(path, override=False)

    explicit = os.getenv("ENV_PATH", str(repo_root / "scripts" / ".env"))
    if explicit and Path(explicit).exists():
        _load(explicit)

    candidates = (
        repo_root / "scripts" / ".env",
        repo_root / ".env",
        repo_root / "config" / ".env",
        cwd / ".env",
    )
    for candidate in candidates:
        if Path(candidate).exists():
            _load(str(candidate))

    if find_dotenv:
        discovered = find_dotenv(usecwd=True)
        if discovered:
            _load(discovered)

load_envs()

# Plaid credentials and target environment.
# PLAID_ENV is lower-cased, short aliases are expanded, and anything that is
# still not a known environment name falls back to "production".
PLAID_CLIENT_ID = os.environ.get("PLAID_CLIENT_ID")
PLAID_SECRET    = os.environ.get("PLAID_SECRET")
alias = {"prod":"production","live":"production","dev":"development","devel":"development","sb":"sandbox"}
PLAID_ENV = (os.environ.get("PLAID_ENV") or "production").strip().lower()
PLAID_ENV = alias.get(PLAID_ENV, PLAID_ENV)
PLAID_ENV = PLAID_ENV if PLAID_ENV in {"production","development","sandbox"} else "production"

# Output/state locations; each can be overridden through the environment.
# Both directories are created up front so later cells can write into them.
OUTPUT_DIR = Path(os.environ.get("OUTPUT_DIR", str(repo_root / "data" / "raw")))
STATE_DIR  = Path(os.environ.get("STATE_DIR",  str(repo_root / ".state")))
TOKENS_PATH = Path(os.environ.get("TOKENS_PATH", str(STATE_DIR / "access_tokens.json")))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
STATE_DIR.mkdir(parents=True, exist_ok=True)

# --- Load & validate access tokens (env > file), canonical-only ---
def _strip_bom(s: str) -> str:
    return s.lstrip("\ufeff") if isinstance(s, str) else s

def _parse_pairs_blob(blob: str) -> dict:
    raw = [p.strip() for sep in ["\n",";","|",","] for p in (blob.split(sep) if sep in blob else []) if p.strip()]
    if not raw: raw = [blob.strip()]
    out = {}
    for p in raw:
        if "=" in p:
            k, v = p.split("=", 1)
        elif ":" in p:
            k, v = p.split(":", 1)
        else:
            continue
        k = k.strip().strip('"').strip("'")
        v = v.strip().strip('"').strip("'")
        if k and v:
            out[k] = v
    return out

def _normalize_tokens(obj) -> dict:
    if isinstance(obj, dict):
        return {str(k): str(v).strip() for k,v in obj.items()}
    if isinstance(obj, list):
        out = {}
        for item in obj:
            if isinstance(item, dict):
                name = item.get("issuer") or item.get("bank") or item.get("name")
                token = item.get("access_token") or item.get("token")
                if name and token:
                    out[str(name)] = str(token).strip()
        return out
    if isinstance(obj, str):
        s = _strip_bom(obj).strip()
        try:
            parsed = json.loads(s)    # JSON first
            return _normalize_tokens(parsed)
        except Exception:
            return _parse_pairs_blob(s)
    return {}

def load_access_tokens():
    """Return the {name: token} mapping, preferring the environment over the file.

    Looks at the PLAID_ACCESS_TOKENS environment variable first and, when that
    yields nothing, at the file at TOKENS_PATH (read as UTF-8, undecodable
    bytes ignored).

    Raises:
        AssertionError: neither source produced at least one token.
    """
    env_blob = os.getenv("PLAID_ACCESS_TOKENS", "").strip()
    from_env = _normalize_tokens(env_blob) if env_blob else {}
    if from_env:
        return from_env

    if TOKENS_PATH.exists():
        from_file = _normalize_tokens(TOKENS_PATH.read_text(encoding="utf-8", errors="ignore"))
        if from_file:
            return from_file

    raise AssertionError(
        f"Could not load access tokens. Provide PLAID_ACCESS_TOKENS env or a valid JSON at {TOKENS_PATH}."
    )

ACCESS_TOKENS = load_access_tokens()

# Every token must be a canonical Plaid access token for the configured
# environment: "access-<env>-" followed by lowercase letters, digits or dashes.
PAT = re.compile(r"^access-(?:production|development|sandbox)-[a-z0-9\-]+$")
expected_prefix = f"access-{PLAID_ENV}-"
bad = [k for k,v in ACCESS_TOKENS.items() if not isinstance(v, str) or not v.startswith(expected_prefix) or not PAT.match(v)]
# Raise explicitly instead of using `assert`: assert statements are removed
# under `python -O` / PYTHONOPTIMIZE, which would let bad tokens through.
# The exception type and message are unchanged. Only names are reported.
if bad:
    raise AssertionError(f"Non-canonical or wrong-env tokens for: {bad}. Ensure tokens look like '{expected_prefix}…' (no '/', '+', '=').")

# Summary line; credentials are shown masked.
print(
    "Env OK →",
    "PLAID_CLIENT_ID:", mask(PLAID_CLIENT_ID),
    "| PLAID_SECRET:", mask(PLAID_SECRET),
    "| PLAID_ENV:", PLAID_ENV,
    "| OUTPUT_DIR:", str(OUTPUT_DIR),
    "| TOKENS_PATH:", str(TOKENS_PATH),
)
print(f"Loaded {len(ACCESS_TOKENS)} token(s).")


Env OK → PLAID_CLIENT_ID: 68bb…6689 | PLAID_SECRET: a605…7df5 | PLAID_ENV: production | OUTPUT_DIR: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw | TOKENS_PATH: C:\Users\kosis\Downloads\Automation\spending-dashboard\.state\access_tokens.json
Loaded 3 token(s).


In [2]:
# --- build_latest.ipynb — Cell 2: Plaid client init (original) ---
# Build the module-level Plaid `client`, trying the modern SDK layout first and
# the legacy `plaid.Client` second. USE_PLAID_V10 records which one succeeded;
# later cells branch on it to choose between /transactions/sync and the
# windowed GET fallback.
USE_PLAID_V10 = False
client = None

try:
    # v10+ path
    from plaid.api import plaid_api
    from plaid.configuration import Configuration
    try:
        from plaid.configuration import Environment  # newer enum
        env_host = {
            "production":  Environment.Production,
            "development": Environment.Development,
            "sandbox":     Environment.Sandbox,
        }[PLAID_ENV]
        config = Configuration(host=env_host)
    except Exception:
        # fallback if Environment enum not present
        # (any failure while building env_host above lands here too, e.g. a
        # missing enum member, because the except clause is a blanket one)
        host_url = {
            "production":  "https://production.plaid.com",
            "development": "https://development.plaid.com",
            "sandbox":     "https://sandbox.plaid.com",
        }[PLAID_ENV]
        config = Configuration(host=host_url)

    from plaid.api_client import ApiClient
    # Credentials come from the environment (Cell 1); they may be None here,
    # in which case the SDK is configured without them.
    config.api_key["clientId"] = PLAID_CLIENT_ID
    config.api_key["secret"]   = PLAID_SECRET
    api_client = ApiClient(config)
    client = plaid_api.PlaidApi(api_client)
    USE_PLAID_V10 = True
    print("Plaid SDK: v10+ (plaid_api)")
except Exception as e_v10:
    # Anything that went wrong on the modern path (import or construction)
    # triggers one attempt with the legacy client.
    try:
        # legacy path
        from plaid import Client as LegacyClient
        client = LegacyClient(
            client_id=PLAID_CLIENT_ID,
            secret=PLAID_SECRET,
            environment=PLAID_ENV
        )
        USE_PLAID_V10 = False
        print("Plaid SDK: legacy Client()")
    except Exception as e_legacy:
        # Both paths failed: report both causes in a single ImportError.
        raise ImportError(
            "Could not initialize Plaid client. Ensure 'plaid-python' is installed. "
            f"v10 error: {e_v10}\nlegacy error: {e_legacy}"
        )

# --- Expanded PRECHECK: list accounts per token when PRECHECK=1 ---
# Optional diagnostic: for every configured token, call accounts_get and print
# the accounts it exposes. Only runs on the modern SDK path. API errors are
# reported per token and do not stop the loop.
if os.getenv("PRECHECK", "0") == "1" and USE_PLAID_V10:
    from plaid.model.accounts_get_request import AccountsGetRequest
    from plaid.api_client import ApiException
    print("\n[PRECHECK] Listing accounts per token:")
    for issuer, tok in ACCESS_TOKENS.items():
        try:
            acc_resp = client.accounts_get(AccountsGetRequest(access_token=tok)).to_dict()
            accounts = acc_resp.get("accounts", []) or []
            print(f"- {issuer}: {len(accounts)} account(s)")
            for a in accounts:
                name = a.get("name") or a.get("official_name") or ""
                # Deliberately not named `mask`: this loop runs at module level,
                # so binding `mask` here would replace the mask() helper defined
                # in Cell 1 with a string for the rest of the kernel session.
                acct_mask = a.get("mask") or ""
                subtype = (a.get("subtype") or "").upper()
                a_id = a.get("account_id")
                print(f"    • {name}  (subtype={subtype}, mask={acct_mask}, id={a_id})")
        except ApiException as e:
            print(f"- {issuer}: ❌ API {e.status} -> {getattr(e, 'body', e)}")


Plaid SDK: v10+ (plaid_api)


In [3]:
# --- build_latest.ipynb — Cell 3 (UPDATED): Pull & consolidate via /transactions/sync ---

import os
from pathlib import Path
import numpy as np
import pandas as pd
from datetime import date, timedelta

# Rolling window (override with DAYS_BACK env if you like).
# `or "180"` also covers DAYS_BACK being set but empty (e.g. an unset CI
# variable expanded to ""), which int() would otherwise reject. This mirrors
# how PLAID_ENV and FORCE_FULL_RESYNC are read elsewhere in this notebook.
DAYS_BACK = int((os.getenv("DAYS_BACK", "180") or "180").strip() or "180")
end_date = date.today()
start_date = end_date - timedelta(days=DAYS_BACK)

# Where the per-token /transactions/sync cursors are persisted between runs.
CURSORS_PATH = STATE_DIR / "plaid_cursors.json"

def load_cursors() -> dict:
    """Load the saved sync cursors from CURSORS_PATH.

    Best-effort: a missing, unreadable or malformed file yields an empty dict
    (which makes the next sync start without a cursor) instead of raising.
    Unlike a silent fallback, the reason is printed so an unexpected full
    resync can be traced. A JSON value that is not an object is rejected too,
    because callers use dict methods on the result.

    Returns:
        dict of saved cursors, or {} when none could be loaded.
    """
    if not CURSORS_PATH.exists():
        return {}
    try:
        loaded = json.loads(CURSORS_PATH.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"⚠️ Could not read cursors from {CURSORS_PATH.name}: {e}; starting without cursors.")
        return {}
    if not isinstance(loaded, dict):
        print(f"⚠️ Ignoring {CURSORS_PATH.name}: expected a JSON object, got {type(loaded).__name__}.")
        return {}
    return loaded

def save_cursors(cur: dict):
    """Write `cur` to CURSORS_PATH as indented UTF-8 JSON, creating parent directories as needed."""
    target = CURSORS_PATH
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(cur, ensure_ascii=False, indent=2)
    target.write_text(payload, encoding="utf-8")

def normalize_category(x):
    """Join a list/tuple category path with " > "; return any other value unchanged."""
    if not isinstance(x, (list, tuple)):
        return x
    return " > ".join(x)

# Convert raw txn dicts to our normalized schema (KEEP PFC)
def df_from_txns(txns: list[dict], bank_name: str) -> pd.DataFrame:
    """Build a normalized transactions frame from raw transaction dicts.

    Args:
        txns: transaction dicts; an empty/None value yields an empty frame
            with no columns.
        bank_name: label written to both `bank_name` and `card_name`.

    Returns:
        A copy restricted to the normalized column set, with `category`
        flattened to a string, `date` parsed to datetime and `amount` to
        numeric (unparseable values become NaT/NaN).
    """
    if not txns:
        return pd.DataFrame()

    frame = pd.DataFrame(txns)

    # Backfill columns the payload omitted (PFC included) so the steps below
    # and downstream cells can rely on them being present.
    required = (
        "name","merchant_name","payment_channel","pending",
        "account_id","transaction_id","category","date","amount",
        "personal_finance_category",
    )
    for absent in [c for c in required if c not in frame.columns]:
        frame[absent] = None

    # Type/shape normalization
    frame["category"] = frame["category"].apply(normalize_category)
    frame["date"] = pd.to_datetime(frame["date"], errors="coerce")
    frame["amount"] = pd.to_numeric(frame["amount"], errors="coerce")

    # Bank & card: the card label starts out equal to the bank label
    frame["bank_name"] = bank_name
    frame["card_name"] = bank_name  # upgrade later via accounts_dim if desired

    ordered = (
        "date","name","merchant_name","category","amount",
        "payment_channel","pending","account_id","transaction_id",
        "bank_name","card_name","personal_finance_category"
    )
    present = [c for c in ordered if c in frame.columns]
    return frame[present].copy()

# Read previous latest.csv (acts as our cache for SYNC deltas)
# NOTE(review): this path is fixed under repo_root and does not follow the
# OUTPUT_DIR override from Cell 1 — confirm that is intended.
latest_csv_path = (repo_root / "data" / "raw" / "latest.csv")
prev = pd.DataFrame()
if latest_csv_path.exists():
    try:
        prev = pd.read_csv(latest_csv_path)
        prev["date"] = pd.to_datetime(prev["date"], errors="coerce")
        if "transaction_id" not in prev.columns:
            prev["transaction_id"] = pd.Series(dtype=object)
    except Exception as e:
        # Best-effort: if the read itself failed, prev stays empty; if a later
        # step failed, prev keeps whatever was loaded up to that point.
        print(f"⚠️ Could not read previous latest.csv: {e}")

# Optional one-time full resync (set FORCE_FULL_RESYNC=1 in env)
if (os.getenv("FORCE_FULL_RESYNC", "0") or "0").strip().lower() in {"1","true","yes","on"}:
    try:
        if CURSORS_PATH.exists():
            CURSORS_PATH.unlink()
            print("⚠️ Reset: deleted .state/plaid_cursors.json to force a full resync.")
    except Exception as e:
        print("Reset warning:", e)
    cursors = {}
    prev = pd.DataFrame()
else:
    cursors = load_cursors()
    # Saved cursors only make sense together with the cached rows they were
    # computed against. With cursors but no cache, the sync would return just
    # the newest deltas and the build would silently lose all earlier history,
    # so fall back to a full sync instead.
    if cursors and prev.empty:
        print("⚠️ Cached latest.csv is missing or empty; ignoring saved cursors so the sync rebuilds from scratch.")
        cursors = {}

all_added_mod = []   # list of DataFrames of added/modified across banks
all_removed_ids = set()

if 'USE_PLAID_V10' in globals() and USE_PLAID_V10:
    # --- Sync-first path using Plaid v10 client (simple + robust) ---
    from plaid.model.transactions_sync_request import TransactionsSyncRequest

    def sync_one(bank_name: str, access_token: str, cursor: str | None):
        """
        Call /transactions/sync until has_more is False.
        Explicitly requests Personal Finance Category (PFC) on every page.

        If Plaid reports TRANSACTIONS_SYNC_MUTATION_DURING_PAGINATION, the
        pages collected so far are discarded and pagination restarts from the
        original `cursor` (at most 3 times); any other error propagates.

        Args:
            bank_name: label of the item being synced (not used by the calls).
            access_token: Plaid access token of the item.
            cursor: cursor saved by the previous run, or None for a first sync.

        Returns: (next_cursor, added, modified, removed_ids)
            next_cursor is None when no non-empty string cursor was obtained.
        """
        # Ask Plaid to include PFC (and original description for debugging).
        # The options are identical for every page, so build them once.
        sync_options = None
        try:
            from plaid.model.transactions_sync_request_options import TransactionsSyncRequestOptions
            sync_options = TransactionsSyncRequestOptions(
                include_personal_finance_category=True,
                include_original_description=True
            )
        except Exception:
            # Older SDKs may not expose the Options model; proceed without it.
            sync_options = None

        mutation_code = "TRANSACTIONS_SYNC_MUTATION_DURING_PAGINATION"
        max_restarts = 3
        restarts = 0

        added, modified, removed_ids = [], [], []
        next_cursor = cursor

        while True:
            req_kwargs = {"access_token": access_token, "count": 500}
            if isinstance(next_cursor, str) and next_cursor:
                req_kwargs["cursor"] = next_cursor
            if sync_options is not None:
                req_kwargs["options"] = sync_options

            req = TransactionsSyncRequest(**req_kwargs)
            try:
                resp = client.transactions_sync(req).to_dict()
            except Exception as exc:
                # Data changed under us mid-pagination: pages already fetched
                # are no longer consistent, so start over from the first cursor.
                body = str(getattr(exc, "body", "") or "")
                if mutation_code in body and restarts < max_restarts:
                    restarts += 1
                    added, modified, removed_ids = [], [], []
                    next_cursor = cursor
                    continue
                raise

            added.extend(resp.get("added", []) or [])
            modified.extend(resp.get("modified", []) or [])

            # removed may be list[str] or list[dict]
            rem = resp.get("removed", []) or []
            for r in rem:
                if isinstance(r, dict):
                    rid = r.get("transaction_id")
                    if rid: removed_ids.append(rid)
                elif isinstance(r, str):
                    removed_ids.append(r)

            next_cursor = resp.get("next_cursor", next_cursor)
            if not resp.get("has_more", False):
                break

        # Tiny diag: count how many returned txns actually have PFC payloads
        def _pfc_count(lst):
            c = 0
            for t in lst:
                pfc = t.get("personal_finance_category")
                if isinstance(pfc, dict) and (pfc.get("primary") or pfc.get("detailed")):
                    c += 1
            return c

        print(f"   [sync diag] PFC in added: {_pfc_count(added)} / {len(added)}, "
              f"modified: {_pfc_count(modified)} / {len(modified)}")

        return (next_cursor if isinstance(next_cursor, str) and next_cursor else None,
                added, modified, removed_ids)

    # Sync every configured item, collecting added/modified frames and removed
    # ids across all banks, and record each item's new cursor.
    # NOTE(review): `cursors` is keyed by the raw access token, so the cursor
    # file written later contains the tokens themselves as keys — confirm that
    # file is protected like the token file.
    for bank_name, token in ACCESS_TOKENS.items():
        print(f"🔄 SYNC {bank_name} (start: {'cursor-present' if cursors.get(token) else 'no-cursor'})")
        cur0 = cursors.get(token)
        next_cur, added, modified, removed_ids = sync_one(bank_name, token, cur0)

        df_add = df_from_txns(added, bank_name)
        df_mod = df_from_txns(modified, bank_name)
        all_added_mod.append(df_add)
        all_added_mod.append(df_mod)
        all_removed_ids.update(removed_ids)

        cursors[token] = next_cur
        print(f"   → added={len(df_add):,}, modified={len(df_mod):,}, removed={len(removed_ids):,}, next_cursor={'set' if next_cur else 'None'}")

    # Start from previous CSV and apply deltas
    cur = prev.copy()

    # Remove deleted transaction_ids (guard if column is present)
    # Ids are compared as strings on both sides so dtype differences between
    # the CSV cache and the API payload cannot hide a match.
    if not cur.empty and all_removed_ids and "transaction_id" in cur.columns:
        cur = cur[~cur["transaction_id"].astype(str).isin({str(x) for x in all_removed_ids})]
    elif not cur.empty and all_removed_ids:
        print("ℹ️ Skipping delete-apply: previous cache lacks 'transaction_id' column.")

    # Replace modified ids and add new ones
    # Cached rows whose id reappears in the new batch are dropped first, so the
    # fresh version of each transaction is the only one left after the concat.
    if any(len(x) for x in all_added_mod):
        new_mod = (pd.concat([df for df in all_added_mod if not df.empty], ignore_index=True)
                   if all_added_mod else pd.DataFrame())
        if not new_mod.empty and "transaction_id" not in new_mod.columns:
            new_mod["transaction_id"] = pd.Series(dtype=object)

        if not cur.empty and "transaction_id" in cur.columns and "transaction_id" in new_mod.columns:
            mod_ids = set(new_mod["transaction_id"].dropna().astype(str).tolist())
            if mod_ids:
                cur = cur[~cur["transaction_id"].astype(str).isin(mod_ids)]
        else:
            if not cur.empty and not new_mod.empty:
                print("ℹ️ Skipping modify-replace: one frame lacks 'transaction_id'; appending only.")

        combined = pd.concat([cur, new_mod], ignore_index=True) if not new_mod.empty else cur.copy()
    else:
        # No added/modified rows at all: the result is the cache minus removals.
        combined = cur.copy()

else:
    # --- Fallback: windowed GET per item (legacy client) ---
    def fetch_transactions_get(bank_name: str, access_token: str) -> pd.DataFrame:
        """Page through the legacy transactions GET for the rolling window.

        Requests pages of up to 500 rows between start_date and end_date until
        the reported total has been collected, then normalizes the rows with
        df_from_txns.

        Raises:
            RuntimeError: more than 50,000 rows were paged without finishing.
        """
        # NOTE(review): the keyword names and date types passed below are taken
        # as-is from the original call — confirm them against the installed
        # legacy client's Transactions.get signature.
        txns = []
        offset = 0
        while True:
            resp = client.Transactions.get(
                access_token=access_token,
                start_date=start_date,
                end_date=end_date,
                options={"count": 500, "offset": offset}
            )
            total = resp["total_transactions"]
            page = resp["transactions"] or []
            txns.extend(page)
            # Also stop on an empty page: if the API returns no rows while the
            # reported total is still larger than what we hold, the offset would
            # never advance and the loop would spin forever.
            if len(txns) >= total or not page:
                break
            offset = len(txns)
            if offset > 50_000:
                raise RuntimeError(f"Pagination runaway for {bank_name}")
        return df_from_txns(txns, bank_name)

    # Fetch each item's window and stack the non-empty results.
    frames = []
    for bank_name, token in ACCESS_TOKENS.items():
        print(f"🔄 GET {bank_name} ({start_date} → {end_date})…")
        frames.append(fetch_transactions_get(bank_name, token))
    # Filter first, then decide: pd.concat raises ValueError on an empty list,
    # which is exactly what happened when every bank returned an empty frame.
    non_empty_frames = [f for f in frames if f is not None and not f.empty]
    combined = pd.concat(non_empty_frames, ignore_index=True) if non_empty_frames else pd.DataFrame()

# Finalize for both paths: trim to window, sort, dedupe by best available key
if combined is None or combined.empty:
    combined = pd.DataFrame(columns=[
        "date","name","merchant_name","category","amount",
        "payment_channel","pending","account_id","transaction_id",
        "bank_name","card_name","personal_finance_category"
    ])
else:
    combined["date"] = pd.to_datetime(combined["date"], errors="coerce")
    # Rows whose date could not be parsed (NaT) fail both comparisons and drop out.
    combined = combined[
        (combined["date"] >= pd.Timestamp(start_date)) &
        (combined["date"] <= pd.Timestamp(end_date))
    ]
    combined = combined.sort_values("date", ascending=False).reset_index(drop=True)

    # Primary dedupe: transaction_id; fallback: (account_id, date, amount, name).
    # The two keys are applied to disjoint row sets. Deduplicating the whole
    # frame on transaction_id treated every missing id as the same value and
    # collapsed all id-less rows into a single row.
    if "transaction_id" in combined.columns:
        id_known = combined["transaction_id"].notna().to_numpy()
    else:
        id_known = np.zeros(len(combined), dtype=bool)
    fallback_key = [c for c in ["account_id","date","amount","name"] if c in combined.columns]

    is_dup = np.zeros(len(combined), dtype=bool)
    if id_known.any():
        is_dup[id_known] = (combined.loc[id_known]
                            .duplicated(subset=["transaction_id"], keep="first")
                            .to_numpy())
    if fallback_key and (~id_known).any():
        is_dup[~id_known] = (combined.loc[~id_known]
                             .duplicated(subset=fallback_key, keep="first")
                             .to_numpy())
    combined = combined.loc[~is_dup].reset_index(drop=True)

# Persist cursors (local/CI workspace)
# NOTE(review): cursors advance here, before this cell shows latest.csv being
# written; if a later step fails, the next run would resume past deltas that
# were never saved — confirm the write happens reliably downstream.
try:
    save_cursors(cursors if 'cursors' in locals() else {})
except Exception as e:
    print(f"⚠️ Could not save cursors: {e}")

print(f"✅ Consolidated using {'SYNC' if 'USE_PLAID_V10' in globals() and USE_PLAID_V10 else 'GET'} → rows={len(combined):,} across {len(ACCESS_TOKENS)} bank(s)")
print(f"Window: {start_date} → {end_date} | DAYS_BACK={DAYS_BACK}")


🔄 SYNC Discover (start: cursor-present)
   [sync diag] PFC in added: 0 / 0, modified: 0 / 0
   → added=0, modified=0, removed=0, next_cursor=set
🔄 SYNC Petal (start: cursor-present)
   [sync diag] PFC in added: 0 / 0, modified: 0 / 0
   → added=0, modified=0, removed=0, next_cursor=set
🔄 SYNC Silver State Schools Credit Union (start: cursor-present)
   [sync diag] PFC in added: 0 / 0, modified: 0 / 0
   → added=0, modified=0, removed=0, next_cursor=set
✅ Consolidated using SYNC → rows=156 across 3 bank(s)
Window: 2025-03-21 → 2025-09-17 | DAYS_BACK=180


In [4]:
# --- Cell 3.5: Extract PFC and map to friendly base category ---

# Guarantee the raw PFC column exists so the extraction below cannot KeyError
if "personal_finance_category" not in combined:
    combined["personal_finance_category"] = None

# Extract PFC primary/detailed safely
def _pfc_get(x, key):
    if isinstance(x, dict):
        return x.get(key)
    # Some SDKs may serialize nested objects as strings; try to parse
    if isinstance(x, str) and x.strip().startswith("{"):
        try:
            d = json.loads(x)
            if isinstance(d, dict):
                return d.get(key)
        except Exception:
            return None
    return None

combined["category_pfc_primary"]  = combined["personal_finance_category"].apply(lambda x: _pfc_get(x, "primary"))
combined["category_pfc_detailed"] = combined["personal_finance_category"].apply(lambda x: _pfc_get(x, "detailed"))

# Friendly names for PFC primary.
# MEDICAL and GENERAL_SERVICES are the primary names Plaid's PFC taxonomy
# uses for health and services; without them those rows never matched and
# always fell through to the legacy category. The older spellings are kept so
# nothing that matched before stops matching.
# NOTE(review): PERSONAL_CARE is also a Plaid primary but has no friendly name
# here yet — decide which label it should map to.
_pfc_map = {
    "FOOD_AND_DRINK":"Dining",
    "GROCERIES":"Groceries",
    "GENERAL_MERCHANDISE":"Shopping",
    "TRANSPORTATION":"Transportation",
    "TRAVEL":"Travel",
    "HEALTHCARE":"Health",
    "MEDICAL":"Health",
    "ENTERTAINMENT":"Entertainment",
    "HOME_IMPROVEMENT":"Home Improvement",
    "RENT_AND_UTILITIES":"Utilities",
    "SERVICE":"Services",
    "GENERAL_SERVICES":"Services",
    "GOVERNMENT_AND_NON_PROFIT":"Government/Non-Profit",
    "BANK_FEES":"Fees",
    "INCOME":"Income",
    "TRANSFER_OUT":"Transfers",
    "TRANSFER_IN":"Transfers",
    "LOAN_PAYMENTS":"Debt Payments",
    "SUBSCRIPTION":"Subscriptions",
    "RECURRING_SUBSCRIPTIONS":"Subscriptions",
}

# Preserve legacy for debugging
combined["category_plaid_legacy"] = combined.get("category")

# Final base 'category' that leaves build:
# friendly PFC name when the primary is known, else the legacy category,
# else "Uncategorized".
combined["category"] = (
    combined.get("category_pfc_primary")
      .map(lambda s: _pfc_map.get(str(s).upper()) if pd.notna(s) and str(s).strip() else None)
      .fillna(combined.get("category_plaid_legacy"))
      .fillna("Uncategorized")
)

print("\n[BUILD] Counters")
print("Rows:", len(combined))
print("Has PFC primary:", int(combined["category_pfc_primary"].notna().sum()))
print("\nTop 15 category (post-PFC mapping):")
print(combined["category"].fillna("<<NULL>>").value_counts().head(15))



[BUILD] Counters
Rows: 156
Has PFC primary: 0

Top 15 category (post-PFC mapping):
category
Transfers                46
Debt Payments            32
Shopping                 30
Dining                   17
Services                 14
Transportation            7
Entertainment             4
Home Improvement          2
Uncategorized             1
Fees                      1
Health                    1
Government/Non-Profit     1
Name: count, dtype: int64


In [5]:
# --- DIAG: Do we actually have PFC fields in the raw frames? ---
# Read-only diagnostics: prints the column inventory, how many rows carry a
# PFC payload, and the category distribution prior to PFC mapping.
print("[BUILD DIAG v2] Columns present:", list(combined.columns))

has_pfc_col = "personal_finance_category" in combined.columns
print("Has personal_finance_category column:", has_pfc_col)

# Peek a few raw dicts if we can (best-effort; may be empty if we already collapsed earlier)
sample_cols = [c for c in combined.columns if "personal_finance" in c.lower() or c in ("category","name","merchant_name")]
print("Sample columns for PFC probe:", sample_cols[:8])

# Try to show any non-null personal_finance_category values if the column exists
if has_pfc_col:
    nn = combined["personal_finance_category"].dropna()
    print("Non-null PFC rows:", int(nn.shape[0]))
    if not nn.empty:
        print("Example PFC dicts (up to 3):", nn.head(3).tolist())

print("\nLegacy 'category' top 10 BEFORE any mapping:")
# By the time this cell runs, 'category' has already been overwritten by the
# PFC mapping cell; the pre-mapping values live in 'category_plaid_legacy'.
# Fall back to 'category' only when that snapshot column is absent.
legacy_col = "category_plaid_legacy" if "category_plaid_legacy" in combined.columns else "category"
print(combined.get(legacy_col, pd.Series(dtype=object)).fillna("<<NULL>>").value_counts().head(10))


[BUILD DIAG v2] Columns present: ['date', 'name', 'merchant_name', 'merchant_key', 'display_name', 'category', 'subcategory', 'tags', 'amount', 'payment_channel', 'pending', 'account_id', 'transaction_id', 'bank_name', 'card_name', 'is_non_spend_flow', 'display_name_final', 'category_final', 'subcategory_final', 'tags_final', 'confidence_final', 'source_final', 'personal_finance_category', 'category_pfc_primary', 'category_pfc_detailed', 'category_plaid_legacy']
Has personal_finance_category column: True
Sample columns for PFC probe: ['name', 'merchant_name', 'category', 'personal_finance_category']
Non-null PFC rows: 0

Legacy 'category' top 10 BEFORE any mapping:
category
Transfers           46
Debt Payments       32
Shopping            30
Dining              17
Services            14
Transportation       7
Entertainment        4
Home Improvement     2
Uncategorized        1
Fees                 1
Name: count, dtype: int64


In [6]:
# --- build_latest.ipynb — Cell 4 ---
# Shape `combined` into the build's base schema: normalize types, keep the
# expected columns (PFC included), upgrade card names from accounts_dim, and
# derive the final base `category`.
if combined.empty:
    combined = pd.DataFrame(columns=[
        "date","name","merchant_name","category","amount","payment_channel","pending",
        "account_id","transaction_id","bank_name","card_name","personal_finance_category"
    ])

# Normalize category list → string
if "category" in combined.columns:
    combined["category"] = combined["category"].apply(
        lambda x: " > ".join(x) if isinstance(x, (list, tuple)) else x
    )

# Ensure date type
combined["date"] = pd.to_datetime(combined["date"], errors="coerce")

# Keep only expected columns.
# personal_finance_category must survive this filter: it was previously
# dropped here and then re-created as all-None further down, which made the
# PFC extraction in this cell a guaranteed no-op.
keep_cols = [
    "date","name","merchant_name","category","amount",
    "payment_channel","pending","account_id","transaction_id",
    "bank_name","card_name","personal_finance_category"
]
combined = combined[[c for c in keep_cols if c in combined.columns]].copy()

# 🔗 Upgrade card_name from accounts_dim.csv if available
acc_dim_path = repo_root / "config" / "accounts_dim.csv"
if acc_dim_path.exists() and "account_id" in combined.columns:
    try:
        acc = pd.read_csv(acc_dim_path)
        if {"account_id","card_name"}.issubset(set(acc.columns)):
            # bank_name is optional in the dim file; selecting it
            # unconditionally raised KeyError and skipped the whole join.
            dim_cols = [c for c in ["account_id","card_name","bank_name"] if c in acc.columns]
            acc_small = acc[dim_cols].drop_duplicates("account_id")
            merged = combined.merge(acc_small, on="account_id", how="left", suffixes=("","_dim"))
            # prefer dim's card_name when present
            if "card_name_dim" in merged.columns:
                merged["card_name"] = merged["card_name_dim"].fillna(merged["card_name"])
            # prefer dim bank_name only if bank_name missing
            if "bank_name_dim" in merged.columns:
                merged["bank_name"] = merged["bank_name"].fillna(merged["bank_name_dim"])
            combined = merged.drop(columns=[c for c in ["card_name_dim","bank_name_dim"] if c in merged.columns])
    except Exception as e:
        print(f"⚠️ accounts_dim join skipped: {e}")

# Sort newest first
combined = combined.sort_values("date", ascending=False).reset_index(drop=True)

# --- PFC extraction and friendly mapping (base category) ---
if "personal_finance_category" not in combined.columns:
    combined["personal_finance_category"] = None

# Only dict payloads are read here; any other value yields None and the row
# keeps the category it already has.
combined["category_pfc_primary"] = combined["personal_finance_category"].apply(
    lambda x: x.get("primary") if isinstance(x, dict) else None
)
combined["category_pfc_detailed"] = combined["personal_finance_category"].apply(
    lambda x: x.get("detailed") if isinstance(x, dict) else None
)

# Friendly names for PFC primary. MEDICAL and GENERAL_SERVICES are the names
# Plaid's PFC taxonomy uses for health and services; the older spellings are
# kept so nothing that matched before stops matching.
_pfc_map = {
    "FOOD_AND_DRINK":"Dining",
    "GROCERIES":"Groceries",
    "GENERAL_MERCHANDISE":"Shopping",
    "TRANSPORTATION":"Transportation",
    "TRAVEL":"Travel",
    "HEALTHCARE":"Health",
    "MEDICAL":"Health",
    "ENTERTAINMENT":"Entertainment",
    "HOME_IMPROVEMENT":"Home Improvement",
    "RENT_AND_UTILITIES":"Utilities",
    "SERVICE":"Services",
    "GENERAL_SERVICES":"Services",
    "GOVERNMENT_AND_NON_PROFIT":"Government/Non-Profit",
    "BANK_FEES":"Fees",
    "INCOME":"Income",
    "TRANSFER_OUT":"Transfers",
    "TRANSFER_IN":"Transfers",
    "LOAN_PAYMENTS":"Debt Payments",
    "SUBSCRIPTION":"Subscriptions",
    "RECURRING_SUBSCRIPTIONS":"Subscriptions",
}

# Keep legacy for debugging
combined["category_plaid"] = combined.get("category")

# Final base category: PFC (friendly) -> legacy -> Uncategorized
combined["category"] = (
    combined["category_pfc_primary"]
      .map(lambda s: _pfc_map.get(str(s).upper()) if pd.notna(s) else None)
      .fillna(combined["category_plaid"])
      .fillna("Uncategorized")
)

# Fill minimal NA for downstream friendliness (do NOT blank 'category')
for c in ["name","merchant_name","payment_channel","bank_name","card_name"]:
    if c in combined.columns:
        combined[c] = combined[c].fillna("")


In [7]:
# --- build_latest.ipynb — Cell 5 (REVISED) ---
from pathlib import Path
import re
import yaml
import pandas as pd

def merchant_key_from(name: str) -> str:
    """
    Aggressive normalization for merchant identity:
    - Canonicalize brand patterns (AMZN/AMAZON, PAYPAL, SQUARE, APPLE.COM/BILL, GOOGLE*)
    - Strip bank noise (POS/DEBIT/CHECK CRD/ACH/ZELLE/TRANSFER/etc.)
    - Remove store numbers/digits/punctuation; keep letters, &, spaces, and '/' '.' for brand URLs
    - Collapse whitespace; fallback to 'UNKNOWN'

    Args:
        name: raw transaction/merchant description. Anything that is not a
            string (None, float NaN from pandas, ...) is treated as missing.

    Returns:
        The upper-cased normalized key, or "UNKNOWN" when nothing is left.
    """
    # NaN is truthy, so `name or ""` alone would reach .upper() on a float and
    # raise AttributeError for rows with a missing description.
    if not isinstance(name, str):
        name = ""
    u = name.upper()

    # Canonical brand replacements (before stripping)
    canon = [
        (r"AMZN\s+MKTPL?C?E?|AMAZON\.?\s*COM", "AMAZON"),
        (r"APPLE\.?\s*COM/?BILL", "APPLE.COM/BILL"),
        (r"\bGOOGLE\*", "GOOGLE "),
        (r"\bSQC?\*", "SQUARE "),
        (r"\bPAYPAL\*?", "PAYPAL "),
    ]
    for pat, repl in canon:
        u = re.sub(pat, repl, u)

    # Strip common bank/payments noise tokens (each matched as a whole word)
    noise = [
        r"APPLE PAY ENDING IN \d{4}",
        r"POS(?:\s+PURCHASE)?",
        r"DEBIT(?:\s+CARD)?(?:\s+PURCHASE)?",
        r"CHECK ?CRD",
        r"VISA(?:\s+POS)?", r"MASTERCARD", r"DISCOVER", r"AMEX",
        r"ACH(?:\s+(CREDIT|DEBIT))?", r"WEB AUTHORIZED PMT", r"ONLINE PMT",
        r"ZELLE(?:\s+PAYMENT)?", r"VENMO(?:\s+PAYMENT)?",
        r"XFER", r"TRANSFER",
        r"PURCHASE", r"PENDING", r"REVERSAL", r"ADJ(?:USTMENT)?",
        r"ID[: ]?\d+",
    ]
    for pat in noise:
        u = re.sub(rf"\b{pat}\b", " ", u)

    # Remove store numbers & digits
    u = re.sub(r"#\d{2,}", " ", u)
    u = re.sub(r"\d+", " ", u)

    # Keep letters, '&', spaces, plus '/' '.' for URLish brands; collapse spaces
    u = re.sub(r"[^A-Z&\s\./]", " ", u)
    u = re.sub(r"\s+", " ", u).strip()

    # Post-canon tidy
    u = u.replace("APPLE COM BILL", "APPLE.COM/BILL").strip()
    return u or "UNKNOWN"


def _extract_mapping_dict(loaded_yaml: dict) -> dict:
    """
    Accepts multiple YAML shapes:
    - Flat: { "MICROSOFT": {...}, "ARCO": {...} }
    - Nested: { "MAPPING": { "MICROSOFT": {...}, ... }, "DEFAULTS": {...}, ... }
    - Case-insensitive key for the section name.
    Returns a dict mapping merchant_key → mapping.
    """
    if not isinstance(loaded_yaml, dict):
        return {}

    # Try well-known section names, case-insensitive
    for key in ["MAPPING", "MERCHANTS", "MERCHANT_MAP", "MAP"]:
        for k in loaded_yaml.keys():
            if str(k).strip().upper() == key and isinstance(loaded_yaml[k], dict):
                return loaded_yaml[k]

    # Fall back to flat if keys look like merchant names (values are dicts)
    values_are_dicts = all(isinstance(v, dict) for v in loaded_yaml.values())
    if values_are_dicts:
        # But exclude obvious meta sections (CATEGORIES, DEFAULTS, etc.)
        meta = {"CATEGORIES","DEFAULTS","NECESSITY_FLAGS","NON_SPEND_CATEGORIES","VERSION"}
        if not any(str(k).strip().upper() in meta for k in loaded_yaml.keys()):
            return loaded_yaml

    return {}

def _normalize_yaml_keys(mapping: dict) -> dict:
    """
    Normalize YAML keys with the same merchant_key_from() used for data,
    so exact matches succeed.
    """
    norm = {}
    for k, v in mapping.items():
        mk = merchant_key_from(str(k))
        if mk:
            norm[mk] = v if isinstance(v, dict) else {}
    return norm

def apply_yaml_mapping(df: pd.DataFrame, yobj: dict) -> pd.DataFrame:
    """
    Apply the YAML merchant mapping to a transactions frame.

    Precedence per row (keyed on the upper-cased `merchant_key` column):
    1) Exact merchant_key in the mapping      -> source_final = "yaml"
    2) Token buckets (mapping.<bucket>.tokens, substring match, first bucket
       in YAML order wins)                    -> source_final = "yaml-token"
    3) Raw passthrough: existing *_final values are kept, with
       confidence_final/source_final defaulting to "raw".

    Parameters
    ----------
    df : transactions; reads `merchant_key`, `merchant_name`, `name` and any
         pre-existing *_final columns when present. Not modified.
    yobj : parsed YAML document (see _extract_mapping_dict for accepted shapes).

    Returns
    -------
    A copy of `df` with the six *_final columns populated; all original
    columns are left intact. When a mapping is applied the result carries a
    fresh RangeIndex; when no usable mapping exists the input index is kept.
    An empty input yields an empty frame that still has every column.

    Missing values (None / NaN / blank strings) are skipped when choosing a
    display-name fallback, so a NaN merchant_name falls through to `name`.
    """
    final_cols = [
        "display_name_final", "category_final", "subcategory_final",
        "tags_final", "confidence_final", "source_final",
    ]

    def _is_blank(value) -> bool:
        """True for None, NaN/NA and empty or whitespace-only strings."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # pd.isna() is element-wise on list-likes; treat those as present.
            return False

    def _first_present(*candidates):
        """First candidate that is not blank, else None (NaN-safe `a or b or c`)."""
        for candidate in candidates:
            if not _is_blank(candidate):
                return candidate
        return None

    def _join_tags(tags):
        """Flatten list/tuple tags to a comma-separated string; pass others through."""
        if isinstance(tags, (list, tuple)):
            return ",".join(str(t) for t in tags)
        return tags

    out = df.copy()
    for c in final_cols:
        if c not in out.columns:
            out[c] = None

    # Extract 'mapping' block or flat map
    ymap = _extract_mapping_dict(yobj)
    if not isinstance(ymap, dict) or not ymap:
        out["source_final"] = out["source_final"].fillna("raw")
        out["confidence_final"] = out["confidence_final"].fillna("raw")
        return out

    # Split into exact-key map vs token buckets
    exact_map = {}
    token_buckets = []  # list of dicts: {bucket, tokens[], category, subcategory, display_name, tags}
    for k, v in ymap.items():
        if not isinstance(v, dict):
            continue
        if "tokens" in v and isinstance(v["tokens"], list):
            toks = [str(t).strip().upper() for t in v["tokens"] if str(t).strip()]
            if toks:
                token_buckets.append({
                    "bucket": str(k),
                    "tokens": toks,
                    "category": v.get("category"),
                    "subcategory": v.get("subcategory"),
                    "display_name": v.get("display_name"),
                    "tags": v.get("tags"),
                })
        else:
            exact_map[k] = v

    exact_map_norm = _normalize_yaml_keys(exact_map)

    # Pull only the columns the resolution needs; absent columns read as None.
    needed = ["merchant_key", "merchant_name", "name", *final_cols]
    cells = {
        c: (out[c].tolist() if c in out.columns else [None] * len(out))
        for c in needed
    }

    # Resolve the *_final values row by row, then assign them back as whole
    # columns. Rebuilding the frame from row dicts would drop every column on
    # empty input and re-infer dtypes for columns that are not being changed.
    resolved = {c: [] for c in final_cols}
    for i in range(len(out)):
        r = {c: cells[c][i] for c in needed}
        # A missing merchant_key must not become the literal "NAN"/"NONE".
        mk = "" if _is_blank(r["merchant_key"]) else str(r["merchant_key"]).upper()
        raw_display = _first_present(r["merchant_name"], r["name"])

        # 1) Exact match
        m = exact_map_norm.get(mk)
        if m:
            final = {
                "display_name_final": _first_present(m.get("display_name"), raw_display, mk),
                "category_final": m.get("category"),
                "subcategory_final": m.get("subcategory"),
                "tags_final": _join_tags(m.get("tags")),
                "confidence_final": m.get("confidence", "map"),
                "source_final": "yaml",
            }
        else:
            # 2) Token buckets (first hit wins)
            hit = next(
                (b for b in token_buckets if any(t in mk for t in b["tokens"])),
                None,
            )
            if hit:
                final = {
                    "display_name_final": _first_present(hit.get("display_name"), raw_display, mk),
                    "category_final": hit.get("category"),
                    "subcategory_final": hit.get("subcategory"),
                    "tags_final": _join_tags(hit.get("tags")),
                    "confidence_final": "bucket",
                    "source_final": "yaml-token",
                }
            else:
                # 3) Raw passthrough
                final = {
                    "display_name_final": _first_present(r["display_name_final"], raw_display),
                    "category_final": r["category_final"],
                    "subcategory_final": r["subcategory_final"],
                    "tags_final": r["tags_final"],
                    "confidence_final": _first_present(r["confidence_final"], "raw"),
                    "source_final": _first_present(r["source_final"], "raw"),
                }

        for c in final_cols:
            resolved[c].append(final[c])

    for c in final_cols:
        out[c] = resolved[c]

    # Keep the historical contract of the mapped path: positional RangeIndex.
    return out.reset_index(drop=True)

def mark_non_spend_flows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Flag rows that look like money movement rather than spending.

    The upper-cased concatenation of `name` and `merchant_name` (missing
    columns or values count as empty text) is searched for payment /
    transfer / refund style keywords. ACH is matched as a whole word only,
    so words such as BEACH or COACH are not flagged, and it is ignored when
    followed later in the text by "APPLE CASH".

    Returns a copy of `df` with a boolean `is_non_spend_flow` column. The
    column is always present, including for empty input.
    """
    out = df.copy()
    if df.empty:
        # Keep the output schema stable even when there is nothing to scan.
        out["is_non_spend_flow"] = pd.Series(False, index=out.index, dtype=bool)
        return out

    pats = [
        r"PAYMENT", r"TRANSFER", r"DIRECTPAY", r"CREDIT",
        r"REFUND", r"REIMBURSE", r"ADJUSTMENT", r"REVERSAL",
        # Word boundaries keep BEACH / COACH / PEACH from matching "ACH".
        r"\bACH\b(?!.*APPLE\s+CASH)",
        r"WEALTHFRONT"
    ]
    pat = re.compile("|".join(pats))

    def _text(col: str) -> pd.Series:
        """Column as strings with missing values blanked; empty text if absent."""
        if col in df.columns:
            return df[col].fillna("").astype(str)
        return pd.Series("", index=df.index)

    names = (_text("name") + " " + _text("merchant_name")).str.upper()
    out["is_non_spend_flow"] = names.str.contains(pat)
    return out


In [8]:
# --- build_latest.ipynb — Cell 6 (REVISED) ---
# Build merchant_key: prefer merchant_name, fall back to name when merchant_name
# is missing or blank. fillna("") must come before astype(str): on its own,
# astype(str) turns None/NaN into "None"/"nan", whose length is > 0, so the
# fallback to `name` would never fire for missing merchant names.
_merchant_name_text = combined["merchant_name"].fillna("").astype(str).str.strip()
combined["merchant_key"] = combined["merchant_name"].where(
    _merchant_name_text.str.len() > 0,
    combined["name"]
).map(merchant_key_from)

# Load YAML (repo_root/config/categories.yaml)
PATH_YAML = (repo_root / "config" / "categories.yaml")
ymap_obj = {}
if PATH_YAML.exists():
    try:
        with open(PATH_YAML, "r", encoding="utf-8") as f:
            ymap_obj = yaml.safe_load(f) or {}
    except Exception as e:
        print(f"⚠️ YAML parse error: {e}")

# A YAML file whose root is a list/scalar cannot hold a mapping; treat it as empty
# so the diagnostics below (which call .keys()/.items()) do not crash.
if not isinstance(ymap_obj, dict):
    print(f"⚠️ YAML root is {type(ymap_obj).__name__}, expected a mapping; ignoring file.")
    ymap_obj = {}

# === DIAG 2: Inspect YAML structure & see whether any tokens would match merchant_key ===
from collections import defaultdict

print("\n[DIAG 2] YAML top-level keys:", list((ymap_obj or {}).keys()))
# Case-insensitive lookup of the nested section, so "Mapping" is found as well
# as "mapping"/"MAPPING".
mapping_block = next(
    (v for k, v in (ymap_obj or {}).items()
     if str(k).strip().upper() == "MAPPING" and isinstance(v, dict)),
    None,
)
print("[DIAG 2] Has nested 'mapping' block:", isinstance(mapping_block, dict))
if isinstance(mapping_block, dict):
    print("[DIAG 2] mapping categories (first 20):", list(mapping_block.keys())[:20])

def _strings_in(obj):
    out=[]
    if isinstance(obj, str): out.append(obj)
    elif isinstance(obj, list):
        for x in obj: out.extend(_strings_in(x))
    elif isinstance(obj, dict):
        for v in obj.values(): out.extend(_strings_in(v))
    return out

# Candidate tokens per mapping category: every string found anywhere under the
# category node, kept only when it has at least 3 characters once stripped.
cat_tokens = {}
if isinstance(mapping_block, dict):
    cat_tokens = {
        cat: [s for s in _strings_in(node) if isinstance(s, str) and len(str(s).strip()) >= 3]
        for cat, node in mapping_block.items()
    }

print("\n[DIAG 2] token counts per category (first 10):")
for cat, toks in list(cat_tokens.items())[:10]:
    print(f" - {cat}: {len(toks)} tokens")

# Distinct upper-cased merchant keys present in the data (empty if the column is absent).
if "merchant_key" in combined.columns:
    mks = combined["merchant_key"].astype(str).str.upper().unique()
else:
    mks = []

# For each token, count how many distinct merchant keys contain it as a substring.
hits = []
for cat, toks in cat_tokens.items():
    for needle in (str(tok).upper().strip() for tok in toks):
        if not needle:
            continue
        n_matching = sum(needle in mk for mk in mks)
        if n_matching:
            hits.append((cat, needle, n_matching))

# Most-matched tokens first; ties broken by category, then token.
hits.sort(key=lambda h: (-h[2], h[0], h[1]))
print("\n[DIAG 2] Top token→merchant_key matches (first 20):")
for row in hits[:20]:
    print(" ", row)

# Persist the full hit list next to the processed data for offline inspection.
DEBUG_DIR = (repo_root / "data" / "processed")
DEBUG_DIR.mkdir(parents=True, exist_ok=True)
hits_frame = pd.DataFrame(hits, columns=["category","token","merchant_key_hits"])
hits_frame.to_csv(DEBUG_DIR / "debug_yaml_token_hits.csv", index=False)
print("Wrote →", DEBUG_DIR / "debug_yaml_token_hits.csv")

# Apply mapping + non-spend
enriched = apply_yaml_mapping(combined, ymap_obj)
enriched = mark_non_spend_flows(enriched)


def _col_or_missing(frame: pd.DataFrame, col: str) -> pd.Series:
    """Return frame[col], or an all-missing object Series on the same index if absent.

    DataFrame.get() returns None for an absent column, which would crash the
    .fillna() chains below; this keeps the coalesce well-defined instead.
    """
    if col in frame.columns:
        return frame[col]
    return pd.Series([None] * len(frame), index=frame.index, dtype="object")


# COALESCE: canonical columns for downstream visuals
enriched["display_name"] = (
    _col_or_missing(enriched, "display_name_final")
        .fillna(_col_or_missing(enriched, "merchant_name"))
        .fillna(_col_or_missing(enriched, "name"))
)
enriched["category"]    = _col_or_missing(enriched, "category_final").fillna(_col_or_missing(enriched, "category"))
enriched["subcategory"] = _col_or_missing(enriched, "subcategory_final")
enriched["tags"]        = _col_or_missing(enriched, "tags_final").fillna("")

# Final export columns (canonical first; keep finals for debug)
cols = [
    "date","name","merchant_name","merchant_key",
    "display_name","category","subcategory","tags",
    "amount","payment_channel","pending","account_id","transaction_id",
    "bank_name","card_name",
    "is_non_spend_flow",
    "display_name_final","category_final","subcategory_final","tags_final",
    "confidence_final","source_final"
]
for c in cols:
    if c not in enriched.columns:
        enriched[c] = None
enriched = enriched[cols].copy()

# Types
enriched["date"] = pd.to_datetime(enriched["date"], errors="coerce")

# Quick visibility on YAML usage. Exact-key matches are tagged "yaml" and
# token-bucket matches "yaml-token"; report both so bucket-only YAML files
# do not look like they matched nothing.
print("YAML mapping hits:", int((enriched["source_final"] == "yaml").sum()))
print("YAML token-bucket hits:", int((enriched["source_final"] == "yaml-token").sum()))
print("Examples of YAML-mapped rows:")
print(enriched.loc[enriched["source_final"].isin(["yaml", "yaml-token"]),
                   ["merchant_key","display_name","category","subcategory","tags"]].head(10).to_string(index=False))



[DIAG 2] YAML top-level keys: ['version', 'defaults', 'categories', 'non_spend_categories', 'necessity_flags', 'mapping']
[DIAG 2] Has nested 'mapping' block: True
[DIAG 2] mapping categories (first 20): ['dining', 'groceries', 'shopping', 'transportation', 'travel', 'auto_service', 'health_wellness', 'fitness_sports', 'government_fees', 'credit_card_payment', 'refunds_income']

[DIAG 2] token counts per category (first 10):
 - dining: 9 tokens
 - groceries: 6 tokens
 - shopping: 10 tokens
 - transportation: 5 tokens
 - travel: 2 tokens
 - auto_service: 3 tokens
 - health_wellness: 3 tokens
 - fitness_sports: 3 tokens
 - government_fees: 6 tokens
 - credit_card_payment: 8 tokens

[DIAG 2] Top token→merchant_key matches (first 20):
  ('shopping', 'TARGET', 2)
  ('auto_service', 'FLETCHER JONES', 1)
  ('credit_card_payment', 'CASHBACK BONUS REDEMPTION', 1)
  ('credit_card_payment', 'PAYMENT THANK', 1)
  ('fitness_sports', 'JIU JITSU', 1)
  ('fitness_sports', 'SPECTATION SPORTS', 1)
  ('

In [9]:
# --- build_latest.ipynb — Cell 7 ---
# Validate the schema BEFORE writing, so a malformed frame never overwrites
# an existing latest.csv.
assert "bank_name" in enriched.columns, "bank_name column missing."
assert "card_name" in enriched.columns, "card_name column missing."

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
latest_path = OUTPUT_DIR / "latest.csv"

# Write enriched (canonical columns included)
enriched.to_csv(latest_path, index=False)

assert latest_path.exists(), "latest.csv was not written."

print(f"✅ Latest CSV saved → {latest_path}  rows={len(enriched):,}")

# The preview is best-effort: a failure here must not fail the build, but it
# should be visible rather than silently swallowed.
# NOTE(review): the preview prints raw transaction descriptions, which can
# contain personal details — clear cell outputs before sharing or committing.
try:
    print("\nPreview (top 10):")
    print(enriched.head(10).to_string(index=False))
    print("\nValue counts — source_final:")
    print(enriched["source_final"].value_counts(dropna=False).head(10))
except Exception as e:
    print(f"⚠️ Preview skipped: {e}")


✅ Latest CSV saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw\latest.csv  rows=156

Preview (top 10):
      date                                                                                                                            name merchant_name                                                                       merchant_key                                                                                                                    display_name      category subcategory tags   amount payment_channel  pending  account_id  transaction_id                         bank_name                         card_name  is_non_spend_flow                                                                                                              display_name_final category_final subcategory_final tags_final confidence_final source_final
2025-09-16 Withdrawal ALLY / TYPE: ALLY PAYMT ID: 9833122002 CO: ALLY NAME: Kosisonna Ugochukw %% ACH ECC WEB %% ACH Trace 021000