In [1]:
# One-Click Runner: Plaid sync → Silver → Gold (+ cursors, summary)

import os, re, json, yaml
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

# ---------- CONFIG ----------
# Root of the local checkout; every path below is derived from it.
REPO = Path(r"C:\Users\kosis\Downloads\Automation\spending-dashboard").resolve()
ENV_EXPECTED = "sandbox"  # change to "production" when you switch
KEEP_PENDING = False      # leave False so we use posted transactions only
# ----------------------------

# Paths: working directories, created up front if they do not exist yet.
DATA = REPO.joinpath("data")
RAW = DATA.joinpath("raw")
INTERIM = DATA.joinpath("interim")
PROCESSED = DATA.joinpath("processed")
DOCS = REPO.joinpath("docs")
CONFIG = REPO.joinpath("config")
for folder in (RAW, INTERIM, PROCESSED, DOCS, CONFIG):
    folder.mkdir(parents=True, exist_ok=True)

# Files read and/or written further down in this script.
SILVER = INTERIM.joinpath("transactions_canonical.csv")
ENRICHED = PROCESSED.joinpath("transactions_enriched.csv")
UNKNOWN = DOCS.joinpath("review_unknowns.csv")
ITEMS_JSON = CONFIG.joinpath("plaid_items.json")
CURSORS_JSON = CONFIG.joinpath("plaid_cursors.json")
YAML_PATH = CONFIG.joinpath("categories.yaml")

# Load env, then warn (without stopping) when PLAID_ENV is not the expected one.
load_dotenv(REPO.joinpath(".env"))
configured_env = os.getenv("PLAID_ENV")
if (configured_env or "").lower() != ENV_EXPECTED:
    print(f"⚠️ PLAID_ENV is '{configured_env}', expected '{ENV_EXPECTED}'. Continue if intentional.")
# Plaid client
from plaid.configuration import Configuration, Environment
from plaid.api import plaid_api
from plaid import ApiClient
from plaid.model.transactions_sync_request import TransactionsSyncRequest
from plaid.model.transactions_sync_request_options import TransactionsSyncRequestOptions

# Resolve PLAID_ENV (defaulting to sandbox) to an SDK host. Only the
# environments listed in env_map are supported; anything else fails here with
# a readable message instead of a bare KeyError from the dict lookup.
env_map = {"sandbox": Environment.Sandbox, "production": Environment.Production}
plaid_env = (os.getenv("PLAID_ENV") or "sandbox").lower()
if plaid_env not in env_map:
    raise ValueError(
        f"Unsupported PLAID_ENV '{plaid_env}'; expected one of: {', '.join(env_map)}"
    )
client = plaid_api.PlaidApi(ApiClient(Configuration(
    host=env_map[plaid_env],
    api_key={"clientId": os.getenv("PLAID_CLIENT_ID"), "secret": os.getenv("PLAID_SECRET")},
)))

# Load config
# Explicit raise rather than `assert`: assert statements are removed when
# Python runs with -O, which would let a missing file slip through to read_text.
if not ITEMS_JSON.exists():
    raise FileNotFoundError("Missing config/plaid_items.json")
items = json.loads(ITEMS_JSON.read_text(encoding="utf-8")).get("items", [])
# Cursors are optional: with no cursor file every item starts without a cursor.
if CURSORS_JSON.exists():
    cursors = json.loads(CURSORS_JSON.read_text(encoding="utf-8")).get("transactions", {})
else:
    cursors = {}

# Load Silver (if any); otherwise start from an empty table with the Silver columns.
if SILVER.exists():
    silver = pd.read_csv(SILVER, dtype=str)
else:
    silver = pd.DataFrame(columns=[
        "transaction_id", "item_id", "account_id", "date",
        "name", "merchant_name", "amount", "pending",
    ])
# Keys are compared as strings by the upsert/delete helpers.
silver["transaction_id"] = silver["transaction_id"].astype(str)

def upsert_rows(df, rows):
    """Insert or replace rows of ``df`` keyed on ``transaction_id``.

    Args:
        df: Current table; must contain a ``transaction_id`` column.
        rows: List of dicts, each with a ``transaction_id`` key.

    Returns:
        ``df`` itself when ``rows`` is empty. Otherwise a new DataFrame holding
        the rows of ``df`` whose id is not in ``rows``, followed by the new
        rows, with a fresh 0..n-1 index.
    """
    if not rows:
        return df
    add = pd.DataFrame(rows)
    if add.empty:
        return df
    add["transaction_id"] = add["transaction_id"].astype(str)
    # If the same id appears more than once in one batch, keep only the last
    # occurrence so the result never carries duplicate keys.
    add = add.drop_duplicates(subset="transaction_id", keep="last")
    untouched = df[~df["transaction_id"].isin(add["transaction_id"])]
    return pd.concat([untouched, add], ignore_index=True)

def delete_rows(df, removed_ids):
    """Drop the rows of ``df`` whose ``transaction_id`` is listed in ``removed_ids``.

    Args:
        df: Table with a ``transaction_id`` column.
        removed_ids: List of mappings, each indexable by ``"transaction_id"``.

    Returns:
        ``df`` itself when ``removed_ids`` is empty; otherwise the filtered
        rows, keeping their original index labels.
    """
    if removed_ids:
        doomed = pd.Series([entry["transaction_id"] for entry in removed_ids], dtype=str)
        keep_mask = ~df["transaction_id"].isin(doomed)
        return df[keep_mask]
    return df

# ---------- STAGE 1: Sync → update Silver ----------
# Totals across all items; reported again in the run summary.
total_added = total_modified = total_removed = 0
next_cursors = {}

for it in items:
    item_id = it["item_id"]
    token = it["access_token"]
    cursor = cursors.get(item_id, "")
    latest = cursor  # advanced page by page; empty means "no cursor sent"
    n_added = n_modified = n_removed = 0

    def to_row(t):
        """Flatten one transaction into a Silver row for the current item."""
        return {
            "transaction_id": t.get("transaction_id"),
            "item_id": item_id,
            "account_id": t.get("account_id"),
            "date": t.get("date"),
            "name": t.get("name"),
            "merchant_name": t.get("merchant_name"),
            "amount": str(t.get("amount")),
            "pending": str(t.get("pending", False)),
        }

    # Page through the sync feed until the response says there is no more.
    while True:
        sync_args = {
            "access_token": token,
            "count": 500,
            "options": TransactionsSyncRequestOptions(include_personal_finance_category=False),
        }
        if latest:
            sync_args["cursor"] = latest
        page = client.transactions_sync(TransactionsSyncRequest(**sync_args))

        added, modified, removed = page["added"], page["modified"], page["removed"]

        # Unless pending rows are wanted, keep only non-pending adds/modifies.
        if not KEEP_PENDING:
            added = [t for t in added if not t.get("pending", False)]
            modified = [t for t in modified if not t.get("pending", False)]

        # Apply the page to Silver: adds, then modifications, then removals.
        silver = upsert_rows(silver, [to_row(t) for t in added])
        silver = upsert_rows(silver, [to_row(t) for t in modified])
        silver = delete_rows(silver, removed)

        latest = page["next_cursor"]
        n_added += len(added)
        n_modified += len(modified)
        n_removed += len(removed)
        if not page["has_more"]:
            break

    next_cursors[item_id] = latest
    total_added += n_added
    total_modified += n_modified
    total_removed += n_removed
    print(f"[{item_id}] +{n_added} / ~{n_modified} / -{n_removed} | cursor updated")

# Persist Silver first, then the cursors that correspond to it.
silver.to_csv(SILVER, index=False)
cursors.update(next_cursors)
CURSORS_JSON.write_text(json.dumps({"transactions": cursors}, indent=2), encoding="utf-8")
print(f"✔ Silver written ({len(silver)} rows) & cursors saved")

# ---------- STAGE 2: Silver → Gold (enrich) ----------
def normalize_merchant_key(s: str) -> str:
    """Return an upper-cased, punctuation-free, single-spaced form of ``s``.

    Anything that is not a ``str`` (e.g. ``None`` or NaN) yields ``""``.
    Characters other than A-Z, 0-9 and whitespace are replaced by a space,
    then runs of whitespace collapse to one space and the ends are trimmed.
    """
    if isinstance(s, str):
        alnum_only = re.sub(r"[^A-Z0-9\s]", " ", s.upper())
        return re.sub(r"\s+", " ", alnum_only).strip()
    return ""

# Load YAML: a missing or unparseable file leaves both rule lists empty
# (a parse problem is reported but does not stop the run).
yaml_map = {"merchants": [], "patterns": []}
if YAML_PATH.exists():
    try:
        yraw = yaml.safe_load(YAML_PATH.read_text(encoding="utf-8")) or {}
        for section in ("merchants", "patterns"):
            yaml_map[section] = yraw.get(section, []) or []
    except Exception as e:
        print("⚠️ YAML parse issue:", e)


def _rule_outcome(entry, default_name):
    """Build the enrichment fields a YAML rule assigns to a matching row."""
    return {
        "display_name_final": entry.get("display_name") or default_name,
        "category_final": entry.get("category") or "",
        "tags_final": ",".join(entry.get("tags", []) or []),
        "confidence_final": "yaml",
        "source_final": "plaid",
    }


# Exact map: normalized merchant key -> enrichment fields.
# Entries whose "match" normalizes to an empty key are ignored.
exact_map = {}
for merchant in yaml_map["merchants"]:
    key = normalize_merchant_key(merchant.get("match", ""))
    if not key:
        continue
    exact_map[key] = _rule_outcome(merchant, key)

# Regex rules: (compiled case-insensitive pattern, enrichment fields), in file order.
regex_rules = []
for pattern in yaml_map["patterns"]:
    rx = pattern.get("regex")
    if not rx:
        continue
    try:
        compiled = re.compile(rx, re.I)
    except re.error as e:
        print("⚠️ Bad regex skipped:", rx, "|", e)
        continue
    regex_rules.append((compiled, _rule_outcome(pattern, "")))

# Types: work on a copy of Silver so the Silver frame itself keeps its values.
# Unparseable amounts/dates become NaN/NaT instead of raising (errors="coerce").
df = silver.copy()
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")     # Plaid: outflows +, inflows -
df["date"]   = pd.to_datetime(df["date"], errors="coerce").dt.date

# Description & merchant_key: prefer merchant_name; when it is missing or
# blank, fall back to name. The key is the normalized form of that text.
desc = df["merchant_name"].fillna("").replace("", pd.NA).fillna(df["name"])
df["description"] = desc
df["merchant_key"] = desc.apply(normalize_merchant_key)

# Enrichment scaffold: one row per transaction, every field blank except
# source_final, which starts as "plaid".
enriched = pd.DataFrame(index=df.index)
for col in ["display_name_final","category_final","tags_final","confidence_final","source_final"]:
    enriched[col] = ""
enriched["source_final"] = "plaid"

# Exact matches: rows whose merchant_key is a key of exact_map.
# The values are assigned positionally (.values), so each mapping's key order
# has to line up with the column list on the left-hand side.
mask_exact = df["merchant_key"].isin(exact_map.keys())
if mask_exact.any():
    enriched.loc[mask_exact, ["display_name_final","category_final","tags_final","confidence_final","source_final"]] = \
        pd.DataFrame([exact_map[k] for k in df.loc[mask_exact, "merchant_key"]]).values

# Regex matches for remaining blanks: rows still without a display name are
# tested against the rules in order and the first rule that matches wins.
to_regex = enriched["display_name_final"].eq("")
if to_regex.any() and regex_rules:
    candidates = df.loc[to_regex, "description"].fillna("")
    # Exactly one entry is appended per candidate (matched or not) so the
    # positional assignment below stays aligned with the to_regex rows.
    rows = []
    for i, text in candidates.items():
        applied = False
        for rx, mapping in regex_rules:
            if rx.search(text or ""):
                rows.append({
                    # A rule without its own display name uses the row's merchant_key.
                    "display_name_final": mapping["display_name_final"] or df.at[i, "merchant_key"],
                    "category_final": mapping["category_final"],
                    "tags_final": mapping["tags_final"],
                    "confidence_final": "yaml",
                    "source_final": "plaid",
                })
                applied = True
                break
        if not applied:
            # Placeholder equal to the scaffold defaults: the row stays unenriched.
            rows.append({"display_name_final":"", "category_final":"", "tags_final":"", "confidence_final":"", "source_final":"plaid"})
    enriched.loc[to_regex, ["display_name_final","category_final","tags_final","confidence_final","source_final"]] = \
        pd.DataFrame(rows).values

# Fallback display_name: anything still blank shows its merchant_key.
still_blank = enriched["display_name_final"].eq("")
enriched.loc[still_blank, "display_name_final"] = df.loc[still_blank, "merchant_key"]

# Non-spend: a row is flagged when its name or merchant_name contains one of
# these whole-word keywords (case-insensitive), or when its amount is negative.
rx_non_spend = re.compile("|".join([
    r"\bPAYMENT\b", r"\bAUTOPAY\b", r"\bDIRECT\s?PAY\b", r"\bCREDIT\b", r"\bREFUND\b",
    r"\bTRANSFER\b", r"\bZELLE\b", r"\bVENMO\b", r"\bREVERSAL\b"
]), re.I)
is_non_spend = df["name"].fillna("").str.contains(rx_non_spend) | df["merchant_name"].fillna("").str.contains(rx_non_spend)
is_non_spend = is_non_spend | (df["amount"] < 0)

# month_start as Python date: truncate each date to month precision, then
# widen back to day precision, which lands on the first day of that month.
s  = pd.to_datetime(df["date"], errors="coerce")
ms = (s.values.astype("datetime64[M]")).astype("datetime64[D]")
month_start = pd.Series(pd.DatetimeIndex(ms).date, index=df.index)

# Gold table: one row per Silver transaction with the enrichment columns attached.
gold = pd.DataFrame({
    "date": df["date"],
    "account": df["account_id"].astype(str),
    # NOTE(review): this takes df["name"], not the merchant-preferred
    # df["description"] computed earlier — confirm that is intended.
    "description": df["name"].astype(str),
    "merchant_key": df["merchant_key"].astype(str),
    "display_name_final": enriched["display_name_final"].astype(str),
    "category_final": enriched["category_final"].astype(str),
    "tags_final": enriched["tags_final"].astype(str),
    # Rows that no YAML rule touched have a blank confidence; label them "plaid".
    "confidence_final": enriched["confidence_final"].replace("", "plaid"),
    "source_final": enriched["source_final"],
    "amount": df["amount"],
    "is_necessity": False,
    "is_non_spend_flow": is_non_spend.astype(bool),
    "month_start": month_start,
})

gold.to_csv(ENRICHED, index=False)

# Review list: distinct (merchant_key, description) pairs that have no
# category and are not flagged as non-spend, sorted by merchant_key.
unknown_mask = (gold["category_final"] == "") & (~gold["is_non_spend_flow"])
unknowns = gold.loc[unknown_mask, ["merchant_key","description"]].drop_duplicates().sort_values("merchant_key")
unknowns.to_csv(UNKNOWN, index=False)

# ---------- Summary ----------
# Row counts, output locations and the sync deltas accumulated in Stage 1.
summary_lines = [
    "\n=== RUN SUMMARY ===",
    f"Silver rows:  {len(silver):,}",
    f"Gold rows:    {len(gold):,}",
    f"Unknowns:     {len(unknowns):,}  → {UNKNOWN.name}",
    f"Wrote Gold:   {ENRICHED}",
    f"Cursors:      {CURSORS_JSON}",
    f"Deltas:       +{total_added} / ~{total_modified} / -{total_removed}",
    "✅ Done.",
]
for line in summary_lines:
    print(line)


[V6NJvaWpwgtPd56vnMAbFKxraAkqKbTWDQ5lk] +386 / ~0 / -0 | cursor updated
✔ Silver written (772 rows) & cursors saved

=== RUN SUMMARY ===
Silver rows:  772
Gold rows:    772
Unknowns:     12  → review_unknowns.csv
Wrote Gold:   C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\transactions_enriched.csv
Cursors:      C:\Users\kosis\Downloads\Automation\spending-dashboard\config\plaid_cursors.json
Deltas:       +386 / ~0 / -0
✅ Done.
