In [13]:
# --- Cell 1: Env + paths + Plaid tokens (simple, robust) ---
import os, json, re
from pathlib import Path

# Try to load .env files if present (no hard dependency)
try:
    from dotenv import load_dotenv
    for p in [Path.cwd() / "scripts" / ".env", Path.cwd() / ".env"]:
        if p.exists():
            load_dotenv(p, override=False)
except Exception:
    pass

def mask(s: str | None) -> str:
    if not s:
        return "<missing>"
    return (s[:4] + "‚Ä¶" + s[-4:]) if len(s) > 8 else "***"

# Resolve repo root (prefer GitHub workspace in CI)
gw = os.getenv("GITHUB_WORKSPACE")
if gw:
    repo_root = Path(gw).resolve()
else:
    cwd = Path.cwd().resolve()
    repo_root = next((p for p in [cwd, *cwd.parents] if (p / ".git").exists()), cwd)

# Paths
OUTPUT_DIR  = Path(os.getenv("OUTPUT_DIR", repo_root / "data" / "raw"))
STATE_DIR   = Path(os.getenv("STATE_DIR",  repo_root / ".state"))
TOKENS_PATH = Path(os.getenv("TOKENS_PATH", STATE_DIR / "access_tokens.json"))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
STATE_DIR.mkdir(parents=True, exist_ok=True)

# Basic Plaid env
PLAID_CLIENT_ID = os.getenv("PLAID_CLIENT_ID", "")
PLAID_SECRET    = os.getenv("PLAID_SECRET", "")
PLAID_ENV       = (os.getenv("PLAID_ENV", "production") or "production").strip().lower()
PLAID_ENV = {"prod":"production","live":"production","dev":"development","devel":"development","sb":"sandbox"}.get(PLAID_ENV, PLAID_ENV)
if PLAID_ENV not in {"production","development","sandbox"}:
    PLAID_ENV = "production"

# --- Load access tokens (env first, then .state/access_tokens.json) ---
def _parse_tokens(blob) -> dict[str, str]:
    if isinstance(blob, dict):
        return {str(k): str(v).strip() for k, v in blob.items() if v}
    if isinstance(blob, list):
        out = {}
        for item in blob:
            if isinstance(item, dict):
                name  = item.get("issuer") or item.get("bank") or item.get("name")
                token = item.get("access_token") or item.get("token")
                if name and token:
                    out[str(name)] = str(token).strip()
        return out
    if isinstance(blob, str):
        s = blob.strip()
        # Try JSON first
        try:
            return _parse_tokens(json.loads(s))
        except Exception:
            # Fallback: key=value or key:value pairs separated by , ; | or newlines
            out = {}
            for part in re.split(r"[,\n;|]+", s):
                part = part.strip()
                if not part:
                    continue
                sep = "=" if "=" in part else (":" if ":" in part else None)
                if not sep:
                    continue
                k, v = part.split(sep, 1)
                k, v = k.strip().strip('"\' '), v.strip().strip('"\' ')
                if k and v:
                    out[k] = v
            return out
    return {}

def load_access_tokens() -> dict[str, str]:
    env_blob = os.getenv("PLAID_ACCESS_TOKENS", "").strip()
    if env_blob:
        t = _parse_tokens(env_blob)
        if t:
            return t
    if TOKENS_PATH.exists():
        try:
            t = _parse_tokens(TOKENS_PATH.read_text(encoding="utf-8", errors="ignore"))
            if t:
                return t
        except Exception:
            pass
    raise AssertionError(
        f"Could not load access tokens. Set PLAID_ACCESS_TOKENS or place JSON/kv pairs at {TOKENS_PATH}."
    )

ACCESS_TOKENS = load_access_tokens()

# Gentle validation (non-fatal): tokens should match the selected environment
expected_prefix = f"access-{PLAID_ENV}-"
bad = [name for name, tok in ACCESS_TOKENS.items() if not str(tok).startswith(expected_prefix)]
if bad:
    print(f"‚ö†Ô∏è Some tokens don‚Äôt look like '{expected_prefix}‚Ä¶' ‚Üí {bad} (continuing anyway)")

print(
    "Env OK ‚Üí",
    "PLAID_CLIENT_ID:", mask(PLAID_CLIENT_ID),
    "| PLAID_ENV:", PLAID_ENV,
    "| OUTPUT_DIR:", str(OUTPUT_DIR),
    "| TOKENS_PATH:", str(TOKENS_PATH),
)
print(f"Loaded {len(ACCESS_TOKENS)} token(s).")


Env OK ‚Üí PLAID_CLIENT_ID: 68bb‚Ä¶6689 | PLAID_ENV: production | OUTPUT_DIR: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw | TOKENS_PATH: C:\Users\kosis\Downloads\Automation\spending-dashboard\.state\access_tokens.json
Loaded 3 token(s).


In [14]:
# --- Cell 2: Plaid client init (v10+ preferred, legacy fallback) ---
USE_PLAID_V10 = False
client = None

try:
    # v10+ path
    from plaid.api import plaid_api
    from plaid.configuration import Configuration
    try:
        from plaid.configuration import Environment  # newer enum
        env_host = {
            "production":  Environment.Production,
            "development": Environment.Development,
            "sandbox":     Environment.Sandbox,
        }[PLAID_ENV]
        config = Configuration(host=env_host)
    except Exception:
        # fallback if Environment enum not present
        host_url = {
            "production":  "https://production.plaid.com",
            "development": "https://development.plaid.com",
            "sandbox":     "https://sandbox.plaid.com",
        }[PLAID_ENV]
        config = Configuration(host=host_url)

    from plaid.api_client import ApiClient
    config.api_key["clientId"] = PLAID_CLIENT_ID
    config.api_key["secret"]   = PLAID_SECRET
    api_client = ApiClient(config)
    client = plaid_api.PlaidApi(api_client)
    USE_PLAID_V10 = True
    print("Plaid SDK: v10+ (plaid_api)")
except Exception as e_v10:
    try:
        # legacy path
        from plaid import Client as LegacyClient
        client = LegacyClient(
            client_id=PLAID_CLIENT_ID,
            secret=PLAID_SECRET,
            environment=PLAID_ENV
        )
        USE_PLAID_V10 = False
        print("Plaid SDK: legacy Client()")
    except Exception as e_legacy:
        raise ImportError(
            "Could not initialize Plaid client. Ensure 'plaid-python' is installed. "
            f"v10 error: {e_v10}\nlegacy error: {e_legacy}"
        )

# Optional quick probe (set PRECHECK=1 to enable)
if os.getenv("PRECHECK", "0") == "1" and USE_PLAID_V10:
    from plaid.model.accounts_get_request import AccountsGetRequest
    from plaid.api_client import ApiException
    for issuer, tok in ACCESS_TOKENS.items():
        try:
            n = len(client.accounts_get(AccountsGetRequest(access_token=tok)).to_dict().get("accounts", []))
            print(f"{issuer}: ‚úÖ accounts_get OK ({n} accounts)")
        except ApiException as e:
            print(f"{issuer}: ‚ùå API {e.status} -> {getattr(e, 'body', e)}")

Plaid SDK: v10+ (plaid_api)


In [15]:
# --- Cell 3: Pull & CONSOLIDATE (growing latest.csv, no rolling window) ---
from pathlib import Path
from datetime import date, timedelta  # <-- needed
import hashlib                         # <-- needed for txn_uid/txn_key
import numpy as np
import pandas as pd

# For GET fallback only ‚Äî doesn't affect growth because we always union with prev
DAYS_BACK = int(os.getenv("DAYS_BACK", "730"))
end_date = date.today()
start_date = end_date - timedelta(days=DAYS_BACK)

CURSORS_PATH = STATE_DIR / "plaid_cursors.json"

def load_cursors() -> dict:
    if CURSORS_PATH.exists():
        try:
            return json.loads(CURSORS_PATH.read_text(encoding="utf-8"))
        except Exception:
            pass
    return {}

def save_cursors(cur: dict):
    CURSORS_PATH.parent.mkdir(parents=True, exist_ok=True)
    CURSORS_PATH.write_text(json.dumps(cur, ensure_ascii=False, indent=2), encoding="utf-8")

def normalize_category(x):
    return " > ".join(x) if isinstance(x, (list, tuple)) else x

def ensure_txn_keys(df: pd.DataFrame) -> pd.DataFrame:
    if df is None or df.empty:
        return df
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
    if "amount" in df.columns:
        df["amount"] = pd.to_numeric(df["amount"], errors="coerce")

    # Fallback UID if not present
    if "txn_uid" not in df.columns:
        def _mk(row):
            key = f"{row.get('date')}_{row.get('name')}_{row.get('merchant_name')}_{row.get('amount')}_{row.get('bank_name')}"
            return hashlib.sha1(str(key).encode("utf-8")).hexdigest()
        df["txn_uid"] = df.apply(_mk, axis=1)

    # Normalize transaction_id and build stable key
    if "transaction_id" not in df.columns:
        df["transaction_id"] = pd.Series([pd.NA] * len(df))
    else:
        df["transaction_id"] = df["transaction_id"].replace("", pd.NA)

    df["txn_key"] = df["transaction_id"].where(df["transaction_id"].notna(), df["txn_uid"]).astype(str)
    return df

# Convert raw txn dicts to our normalized schema
def df_from_txns(txns: list[dict], bank_name: str) -> pd.DataFrame:
    if not txns:
        return pd.DataFrame()
    df = pd.DataFrame(txns)

    expected_cols = [
        "name","merchant_name","payment_channel","pending",
        "account_id","transaction_id","category","date","amount"
    ]
    for col in expected_cols:
        if col not in df.columns:
            df[col] = None

    df["category"] = df["category"].apply(normalize_category)
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["amount"] = pd.to_numeric(df["amount"], errors="coerce")

    df["bank_name"] = bank_name
    df["card_name"] = bank_name

    keep_cols = [
        "date","name","merchant_name","category","amount",
        "payment_channel","pending","account_id","transaction_id",
        "bank_name","card_name"
    ]
    return df[[c for c in keep_cols if c in df.columns]].copy()

# Load previous latest.csv (the growing archive)
latest_csv_path = repo_root / "data" / "raw" / "latest.csv"
prev = pd.DataFrame()
if latest_csv_path.exists():
    try:
        prev = pd.read_csv(latest_csv_path)
        for c in ["transaction_id","txn_uid","txn_key"]:
            if c not in prev.columns:
                prev[c] = pd.Series(dtype=object)
        prev = ensure_txn_keys(prev)
    except Exception as e:
        print(f"‚ö†Ô∏è Could not read previous latest.csv: {e}")

cursors = load_cursors()
all_added_mod = []
all_removed_ids = set()

use_sync = ('USE_PLAID_V10' in globals() and USE_PLAID_V10)

if use_sync:
    # --- Sync-first path using Plaid v10 client (deltas) ---
    from plaid.model.transactions_sync_request import TransactionsSyncRequest

    def sync_one(bank_name: str, access_token: str, cursor: str | None):
        added, modified, removed_ids = [], [], []
        next_cursor = cursor
        while True:
            req_kwargs = {"access_token": access_token, "count": 500}
            if isinstance(next_cursor, str) and next_cursor:
                req_kwargs["cursor"] = next_cursor
            req = TransactionsSyncRequest(**req_kwargs)

            resp = client.transactions_sync(req).to_dict()
            added.extend(resp.get("added", []) or [])
            modified.extend(resp.get("modified", []) or [])

            rem = resp.get("removed", []) or []
            for r in rem:
                if isinstance(r, dict):
                    rid = r.get("transaction_id")
                    if rid: removed_ids.append(rid)
                elif isinstance(r, str):
                    removed_ids.append(r)

            next_cursor = resp.get("next_cursor", next_cursor)
            if not resp.get("has_more", False):
                break

        return (next_cursor if isinstance(next_cursor, str) and next_cursor else None,
                added, modified, removed_ids)

    for bank_name, token in ACCESS_TOKENS.items():
        print(f"üîÑ SYNC {bank_name} (start: {'cursor-present' if cursors.get(token) else 'no-cursor'})")
        cur0 = cursors.get(token)
        next_cur, added, modified, removed_ids = sync_one(bank_name, token, cur0)

        df_add = df_from_txns(added, bank_name)
        df_mod = df_from_txns(modified, bank_name)
        all_added_mod.extend([df_add, df_mod])
        all_removed_ids.update(removed_ids)

        cursors[token] = next_cur
        print(f"   ‚Üí added={len(df_add):,}, modified={len(df_mod):,}, removed={len(removed_ids):,}, next_cursor={'set' if next_cur else 'None'}")

    # Start from previous archive and apply deltas
    combined = prev.copy()

    # Remove deleted ids if present
    if not combined.empty and all_removed_ids and "transaction_id" in combined.columns:
        combined = combined[~combined["transaction_id"].astype(str).isin({str(x) for x in all_removed_ids})]

    # Append adds/mods, then dedupe by txn_key
    new_mod = pd.concat([df for df in all_added_mod if df is not None and not df.empty], ignore_index=True) if all_added_mod else pd.DataFrame()
    if not new_mod.empty:
        union_cols = sorted(set(combined.columns).union(new_mod.columns))
        combined = combined.reindex(columns=union_cols)
        new_mod = new_mod.reindex(columns=union_cols)
        combined = pd.concat([combined, new_mod], ignore_index=True)

else:
    # --- GET fallback (windowed fetch), then UNION with previous archive ---
    def fetch_transactions_get(bank_name: str, access_token: str) -> pd.DataFrame:
        txns = []
        offset = 0
        while True:
            resp = client.Transactions.get(
                access_token=access_token,
                start_date=start_date,
                end_date=end_date,
                options={"count": 500, "offset": offset}
            )
            total = resp["total_transactions"]
            txns.extend(resp["transactions"])
            if len(txns) >= total:
                break
            offset = len(txns)
            if offset > 200_000:
                raise RuntimeError(f"Pagination runaway for {bank_name}")
        return df_from_txns(txns, bank_name)

    frames = []
    for bank_name, token in ACCESS_TOKENS.items():
        print(f"üîÑ GET {bank_name} ({start_date} ‚Üí {end_date})‚Ä¶")
        frames.append(fetch_transactions_get(bank_name, token))
    fetched = pd.concat([f for f in frames if f is not None and not f.empty], ignore_index=True) if frames else pd.DataFrame()

    # UNION with previous archive (no trimming)
    union_cols = sorted(set(prev.columns).union(fetched.columns))
    prev_u = prev.reindex(columns=union_cols)
    fetched_u = fetched.reindex(columns=union_cols)
    combined = pd.concat([prev_u, fetched_u], ignore_index=True)

# --- Final: ensure keys, dedupe by txn_key, sort by date (DESC), ready for save in later cell ---
combined = ensure_txn_keys(combined)
if not combined.empty:
    # Keep most recent per txn_key (modified rows win)
    if "date" in combined.columns:
        combined = combined.sort_values("date").drop_duplicates(subset=["txn_key"], keep="last")
        combined = combined.sort_values("date", ascending=False).reset_index(drop=True)
    else:
        combined = combined.drop_duplicates(subset=["txn_key"], keep="last").reset_index(drop=True)

# Persist updated cursor state (safe to keep)
try:
    save_cursors(cursors)
except Exception as e:
    print(f"‚ö†Ô∏è Could not save cursors: {e}")

print(f"‚úÖ Consolidated {'SYNC' if use_sync else 'GET'} ‚Üí rows={len(combined):,} across {len(ACCESS_TOKENS)} bank(s)")
if not combined.empty and "date" in combined.columns:
    print(f"Dates in latest.csv (post-merge): {str(pd.to_datetime(combined['date']).min())[:10]} ‚Üí {str(pd.to_datetime(combined['date']).max())[:10]}")


üîÑ SYNC Discover (start: cursor-present)
   ‚Üí added=0, modified=0, removed=0, next_cursor=set
üîÑ SYNC Petal (start: cursor-present)
   ‚Üí added=0, modified=0, removed=0, next_cursor=set
üîÑ SYNC Silver State Schools Credit Union (start: cursor-present)
   ‚Üí added=0, modified=0, removed=0, next_cursor=set


ValueError: No objects to concatenate

In [None]:
# --- Cell 4: Clean -> normalize schema ---
if combined.empty:
    # Create an empty but well-typed frame to keep Power BI stable
    combined = pd.DataFrame(columns=[
        "date","name","merchant_name","category","amount","payment_channel","pending",
        "account_id","transaction_id","bank_name","card_name"
    ])

# Normalize category: Plaid sometimes returns list; make it a short string
if "category" in combined.columns:
    combined["category"] = combined["category"].apply(
        lambda x: " > ".join(x) if isinstance(x, (list, tuple)) else x
    )

# Ensure date type
combined["date"] = pd.to_datetime(combined["date"], errors="coerce")

# Keep only expected columns (but don‚Äôt error if some are missing)
keep_cols = [
    "date","name","merchant_name","category","amount",
    "payment_channel","pending","account_id","transaction_id",
    "bank_name","card_name"
]
combined = combined[[c for c in keep_cols if c in combined.columns]].copy()

# Sort newest first
combined = combined.sort_values("date", ascending=False).reset_index(drop=True)

# Fill minimal NA for downstream friendliness
for c in ["name","merchant_name","category","payment_channel","bank_name","card_name"]:
    if c in combined.columns:
        combined[c] = combined[c].fillna("")


In [None]:
# --- Cell 5: YAML helpers (merchant key, mapping, non-spend) ---
from pathlib import Path
import re
import yaml

def merchant_key_from(name: str) -> str:
    s = (name or "").upper()
    s = re.sub(r"APPLE PAY ENDING IN \d{4}", "", s)
    s = re.sub(r"#\d{2,}", "", s)              # strip store numbers like #1234
    s = re.sub(r"\d+", "", s)                  # kill stray digits
    s = re.sub(r"[^A-Z&\s]", " ", s)           # keep letters, ampersand, spaces
    s = re.sub(r"\s+", " ", s).strip()
    return s

def apply_yaml_mapping(df: pd.DataFrame, ymap: dict) -> pd.DataFrame:
    if not ymap or df.empty:
        # Still add standard columns so schema is stable
        out = df.copy()
        for c in ["display_name_final","category_final","subcategory_final","tags_final","confidence_final","source_final"]:
            if c not in out.columns:
                out[c] = None
        out["source_final"] = out["source_final"].fillna("raw")
        out["confidence_final"] = out["confidence_final"].fillna("raw")
        return out

    look = {k.upper(): v for k, v in ymap.items()}
    rows = []
    # (kept for clarity; merge-based vectorization is overkill at this size)
    for _, r in df.iterrows():
        mk = r.get("merchant_key", "")
        m = look.get(mk, {})
        rows.append({
            **r,
            "display_name_final": m.get("display_name", r.get("merchant_name") or r.get("name")),
            "category_final":     m.get("category"),
            "subcategory_final":  m.get("subcategory"),
            "tags_final":         ",".join(m.get("tags", [])) if isinstance(m.get("tags", []), (list, tuple)) else m.get("tags"),
            "confidence_final":   m.get("confidence", "map"),
            "source_final":       "yaml" if m else "raw"
        })
    return pd.DataFrame(rows)

def mark_non_spend_flows(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty: return df
    pats = [
        r"PAYMENT", r"TRANSFER", r"ACH", r"ZELLE", r"DIRECTPAY", r"CREDIT",
        r"REFUND", r"REIMBURSE", r"ADJUSTMENT", r"REVERSAL"
    ]
    pat = re.compile("|".join(pats))
    names = (df.get("name", pd.Series("", index=df.index)).fillna("") + " " +
             df.get("merchant_name", pd.Series("", index=df.index)).fillna("")).str.upper()
    df = df.copy()
    df["is_non_spend_flow"] = names.str.contains(pat)
    return df


In [None]:
# --- Cell 6: Optional YAML enrichment, then finalize columns ---
# Build a robust merchant key
combined["merchant_key"] = combined["merchant_name"].where(
    combined["merchant_name"].astype(str).str.len() > 0,
    combined["name"]
).map(merchant_key_from)

# Load YAML map if exists (use repo_root)
PATH_YAML = (repo_root / "config" / "categories.yaml")
ymap = {}
if PATH_YAML.exists():
    with open(PATH_YAML, "r", encoding="utf-8") as f:
        ymap = yaml.safe_load(f) or {}

# Apply mapping + mark non-spend flows
enriched = apply_yaml_mapping(combined, ymap)
enriched = mark_non_spend_flows(enriched)

# ‚ö†Ô∏è FIXED: missing comma in your original list between card_name and display_name_final
cols = [
    "date","name","merchant_name","merchant_key","category","amount",
    "bank_name","card_name",
    "display_name_final","category_final","subcategory_final","tags_final",
    "is_non_spend_flow","confidence_final","source_final"
]
for c in cols:
    if c not in enriched.columns:
        enriched[c] = None
enriched = enriched[cols].copy()

# Keep dates as date (or datetime) for Power BI
enriched["date"] = pd.to_datetime(enriched["date"], errors="coerce")


In [None]:
# --- Cell 7: Write latest.csv + preview ---
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
latest_path = OUTPUT_DIR / "latest.csv"

# Write enriched directly (so Power BI gets the good stuff)
enriched.to_csv(latest_path, index=False)

# Sanity
assert latest_path.exists(), "latest.csv was not written."
assert "bank_name" in enriched.columns, "bank_name column missing."
assert "card_name" in enriched.columns, "card_name column missing."

print(f"‚úÖ Latest CSV saved ‚Üí {latest_path}  rows={len(enriched):,}")
try:
    print("\nPreview (top 10):")
    print(enriched.head(10).to_string(index=False))
except Exception:
    pass

‚úÖ Latest CSV saved ‚Üí C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw\latest.csv  rows=294

Preview (top 10):
      date                                                                                                                            name merchant_name                                                                             merchant_key      category  amount                         bank_name                         card_name                                                                                                              display_name_final category_final subcategory_final tags_final  is_non_spend_flow confidence_final source_final
2025-09-10                                                                                                                 Microsoft Store     Microsoft                                                                                MICROSOFT Subscriptions    1.00                          Discover                     