In [None]:
# --- Cell 1: Env, paths, .env loader, tokens (robust) ---
import os, json, re
from pathlib import Path
from datetime import date, timedelta
import pandas as pd

# Optional dotenv
try:
    from dotenv import load_dotenv, find_dotenv
except Exception:
    load_dotenv = None
    find_dotenv = None

def mask(s: str | None) -> str:
    if not s: return "<missing>"
    return (s[:4] + "…" + s[-4:]) if len(s) > 8 else "***"

# Resolve repo root (works from / or /scripts)
cwd = Path.cwd().resolve()
repo_root = next((p for p in [cwd, *cwd.parents] if (p / ".git").exists() or p.name == "spending-dashboard"), cwd)

# Load .env if present (scripts/.env preferred)
def load_envs():
    if load_dotenv is None:
        return
    # explicit override via ENV_PATH, else standard locations
    abs_override = os.getenv("ENV_PATH", str(repo_root / "scripts" / ".env"))
    if abs_override and Path(abs_override).exists():
        try:
            load_dotenv(abs_override, override=False, encoding="utf-8")
        except TypeError:
            load_dotenv(abs_override, override=False)
    for p in [
        repo_root / "scripts" / ".env",
        repo_root / ".env",
        repo_root / "config" / ".env",
        cwd / ".env",
    ]:
        if Path(p).exists():
            try:
                load_dotenv(str(p), override=False, encoding="utf-8")
            except TypeError:
                load_dotenv(str(p), override=False)
    if find_dotenv:
        found = find_dotenv(usecwd=True)
        if found:
            try:
                load_dotenv(found, override=False, encoding="utf-8")
            except TypeError:
                load_dotenv(found, override=False)

load_envs()

# Normalize env
PLAID_CLIENT_ID = os.getenv("PLAID_CLIENT_ID")
PLAID_SECRET    = os.getenv("PLAID_SECRET")
PLAID_ENV       = (os.getenv("PLAID_ENV", "production") or "production").strip().lower()
alias = {"prod":"production","live":"production","dev":"development","devel":"development","sb":"sandbox"}
PLAID_ENV = alias.get(PLAID_ENV, PLAID_ENV)
if PLAID_ENV not in {"production","development","sandbox"}:
    PLAID_ENV = "production"

# Paths (env-overridable)
OUTPUT_DIR = Path(os.getenv("OUTPUT_DIR", str(repo_root / "data" / "raw")))
STATE_DIR  = Path(os.getenv("STATE_DIR",  str(repo_root / ".state")))
TOKENS_PATH = Path(os.getenv("TOKENS_PATH", str(STATE_DIR / "access_tokens.json")))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
STATE_DIR.mkdir(parents=True, exist_ok=True)

# --- Load & validate access tokens (env > file), canonical-only ---
def _strip_bom(s: str) -> str:
    return s.lstrip("\ufeff") if isinstance(s, str) else s

def _parse_pairs_blob(blob: str) -> dict:
    # Accept separators ; , | or newlines, and k=v or k:v
    raw = [p.strip() for sep in ["\n",";","|",","] for p in (blob.split(sep) if sep in blob else []) if p.strip()]
    if not raw: raw = [blob.strip()]
    out = {}
    for p in raw:
        if "=" in p:
            k, v = p.split("=", 1)
        elif ":" in p:
            k, v = p.split(":", 1)
        else:
            continue
        k = k.strip().strip('"').strip("'")
        v = v.strip().strip('"').strip("'")
        if k and v:
            out[k] = v
    return out

def _normalize_tokens(obj) -> dict:
    if isinstance(obj, dict):
        return {str(k): str(v).strip() for k,v in obj.items()}
    if isinstance(obj, list):
        out = {}
        for item in obj:
            if isinstance(item, dict):
                name = item.get("issuer") or item.get("bank") or item.get("name")
                token = item.get("access_token") or item.get("token")
                if name and token:
                    out[str(name)] = str(token).strip()
        return out
    if isinstance(obj, str):
        s = _strip_bom(obj).strip()
        # JSON first
        try:
            parsed = json.loads(s)
            return _normalize_tokens(parsed)
        except Exception:
            # pairs fallback
            return _parse_pairs_blob(s)
    return {}

def load_access_tokens():
    # 1) env
    blob = os.getenv("PLAID_ACCESS_TOKENS", "").strip()
    if blob:
        tokens = _normalize_tokens(blob)
        if tokens:
            return tokens
    # 2) file
    if TOKENS_PATH.exists():
        raw = TOKENS_PATH.read_text(encoding="utf-8", errors="ignore")
        tokens = _normalize_tokens(raw)
        if tokens:
            return tokens
    raise AssertionError(
        f"Could not load access tokens. Provide PLAID_ACCESS_TOKENS env or a valid JSON at {TOKENS_PATH}."
    )

ACCESS_TOKENS = load_access_tokens()

# Canonical format guard: access-<env>-<identifier> (lowercase letters/digits/hyphens; no / + =)
PAT = re.compile(r"^access-(?:production|development|sandbox)-[a-z0-9\-]+$")
expected_prefix = f"access-{PLAID_ENV}-"
bad = [k for k,v in ACCESS_TOKENS.items() if not isinstance(v, str) or not v.startswith(expected_prefix) or not PAT.match(v)]
assert not bad, f"Non-canonical or wrong-env tokens for: {bad}. Ensure tokens look like '{expected_prefix}…' (no '/', '+', '=')."

print(
    "Env OK →",
    "PLAID_CLIENT_ID:", mask(PLAID_CLIENT_ID),
    "| PLAID_SECRET:", mask(PLAID_SECRET),
    "| PLAID_ENV:", PLAID_ENV,
    "| OUTPUT_DIR:", str(OUTPUT_DIR),
    "| TOKENS_PATH:", str(TOKENS_PATH),
)
print(f"Loaded {len(ACCESS_TOKENS)} token(s).")


Env OK → PLAID_CLIENT_ID: 68bb…6689 | PLAID_SECRET: a605…7df5 | PLAID_ENV: production | OUTPUT_DIR: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw | TOKENS_PATH: C:\Users\kosis\Downloads\Automation\spending-dashboard\.state\access_tokens.json
Loaded 3 token(s).


In [13]:
# --- Cell 2: Plaid client init (v10+ preferred, legacy fallback) ---
USE_PLAID_V10 = False
client = None

try:
    # v10+ path
    from plaid.api import plaid_api
    from plaid.configuration import Configuration
    try:
        from plaid.configuration import Environment  # newer enum
        env_host = {
            "production":  Environment.Production,
            "development": Environment.Development,
            "sandbox":     Environment.Sandbox,
        }[PLAID_ENV]
        config = Configuration(host=env_host)
    except Exception:
        # fallback if Environment enum not present
        host_url = {
            "production":  "https://production.plaid.com",
            "development": "https://development.plaid.com",
            "sandbox":     "https://sandbox.plaid.com",
        }[PLAID_ENV]
        config = Configuration(host=host_url)

    from plaid.api_client import ApiClient
    config.api_key["clientId"] = PLAID_CLIENT_ID
    config.api_key["secret"]   = PLAID_SECRET
    api_client = ApiClient(config)
    client = plaid_api.PlaidApi(api_client)
    USE_PLAID_V10 = True
    print("Plaid SDK: v10+ (plaid_api)")
except Exception as e_v10:
    try:
        # legacy path
        from plaid import Client as LegacyClient
        client = LegacyClient(
            client_id=PLAID_CLIENT_ID,
            secret=PLAID_SECRET,
            environment=PLAID_ENV
        )
        USE_PLAID_V10 = False
        print("Plaid SDK: legacy Client()")
    except Exception as e_legacy:
        raise ImportError(
            "Could not initialize Plaid client. Ensure 'plaid-python' is installed. "
            f"v10 error: {e_v10}\nlegacy error: {e_legacy}"
        )

# Optional quick probe (set PRECHECK=1 to enable)
if os.getenv("PRECHECK", "0") == "1" and USE_PLAID_V10:
    from plaid.model.accounts_get_request import AccountsGetRequest
    from plaid.api_client import ApiException
    for issuer, tok in ACCESS_TOKENS.items():
        try:
            n = len(client.accounts_get(AccountsGetRequest(access_token=tok)).to_dict().get("accounts", []))
            print(f"{issuer}: ✅ accounts_get OK ({n} accounts)")
        except ApiException as e:
            print(f"{issuer}: ❌ API {e.status} -> {getattr(e, 'body', e)}")


Plaid SDK: v10+ (plaid_api)


In [14]:
# --- Cell 3: Pull & consolidate transactions across all banks ---
DAYS_BACK = int(os.getenv("DAYS_BACK", "90"))
end_date = date.today()
start_date = end_date - timedelta(days=DAYS_BACK)

all_frames: list[pd.DataFrame] = []

if USE_PLAID_V10:
    from plaid.model.transactions_get_request import TransactionsGetRequest

    def fetch_transactions(bank_name: str, access_token: str) -> pd.DataFrame:
        txns = []
        offset = 0
        while True:
            req = TransactionsGetRequest(
                access_token=access_token,
                start_date=start_date,
                end_date=end_date,
                options={"count": 500, "offset": offset}
            )
            resp = client.transactions_get(req).to_dict()
            txns.extend(resp.get("transactions", []))
            total = resp.get("total_transactions", 0)
            if len(txns) >= total:
                break
            offset = len(txns)
            if offset > 50_000:
                raise RuntimeError(f"Pagination runaway for {bank_name}")
        df = pd.DataFrame(txns)
        if not df.empty:
            df["bank_name"] = bank_name
            df["card_name"] = bank_name  # TEMP: equals bank_name; later we can map account_id → exact card
        return df
else:
    def fetch_transactions(bank_name: str, access_token: str) -> pd.DataFrame:
        txns = []
        offset = 0
        while True:
            resp = client.Transactions.get(
                access_token=access_token,
                start_date=start_date,
                end_date=end_date,
                options={"count": 500, "offset": offset}
            )
            total = resp["total_transactions"]
            txns.extend(resp["transactions"])
            if len(txns) >= total:
                break
            offset = len(txns)
            if offset > 50_000:
                raise RuntimeError(f"Pagination runaway for {bank_name}")
        df = pd.DataFrame(txns)
        if not df.empty:
            df["bank_name"] = bank_name
            df["card_name"] = bank_name  # TEMP: equals bank_name; later we can map account_id → exact card
        return df

# Pull each bank
for bank_name, token in ACCESS_TOKENS.items():
    print(f"🔄 Fetching {bank_name} ({start_date} → {end_date})…")
    df_bank = fetch_transactions(bank_name, token)
    print(f"   → {len(df_bank):,} rows")
    all_frames.append(df_bank)

# Combine
combined = pd.concat(
    [df for df in all_frames if df is not None and not df.empty],
    ignore_index=True
) if all_frames else pd.DataFrame()

print(f"✅ Pulled total {0 if combined.empty else len(combined):,} transactions across {len(ACCESS_TOKENS)} bank(s).")
combined.head(3)


🔄 Fetching Discover (2025-06-11 → 2025-09-09)…
   → 29 rows
🔄 Fetching Petal (2025-06-11 → 2025-09-09)…
   → 27 rows
🔄 Fetching Silver State Schools Credit Union (2025-06-11 → 2025-09-09)…
   → 91 rows
✅ Pulled total 147 transactions across 3 bank(s).


  combined = pd.concat(


Unnamed: 0,account_id,account_owner,amount,authorized_date,authorized_datetime,category,category_id,check_number,counterparties,date,...,pending_transaction_id,personal_finance_category,personal_finance_category_icon_url,transaction_code,transaction_id,transaction_type,unofficial_currency_code,website,bank_name,card_name
0,MeB44vqbEwfQ5YJEbVR8UqrD3J9VKwFge99waB,,60.0,,,,,,"[{'name': 'Bullet Legal Servi', 'type': 'merch...",2025-09-09,...,,"{'confidence_level': 'LOW', 'detailed': 'GENER...",https://plaid-category-icons.plaid.com/PFC_GEN...,,ekDppAQbzvHjDZMJEaRgH9r8V8xARYTejoKpd,place,,,Discover,Discover
1,MeB44vqbEwfQ5YJEbVR8UqrD3J9VKwFge99waB,,-1.0,,,,,,[],2025-09-06,...,,"{'confidence_level': 'VERY_HIGH', 'detailed': ...",https://plaid-category-icons.plaid.com/PFC_GEN...,,YdXEEwaoQxsgvK9qePDdfbpekeq83NHprjbyJ,place,,,Discover,Discover
2,MeB44vqbEwfQ5YJEbVR8UqrD3J9VKwFge99waB,,-1.0,,,,,,[],2025-09-06,...,,"{'confidence_level': 'VERY_HIGH', 'detailed': ...",https://plaid-category-icons.plaid.com/PFC_GEN...,,mxkEEJQ8drir06dPjE5nU5A1R1aqOmC4dYPXv,place,,,Discover,Discover


In [15]:
# --- Cell 4: Clean -> write latest.csv (+ sanity) ---
# Light cleaning/schema for Power BI
if not combined.empty:
    combined["date"] = pd.to_datetime(combined["date"], errors="coerce").dt.date
    keep_cols = [
        "date",
        "name",
        "merchant_name",
        "category",
        "amount",
        "payment_channel",
        "pending",
        "account_id",
        "transaction_id",
        "bank_name",
    ]
    combined = combined[[c for c in keep_cols if c in combined.columns]]
    combined = combined.sort_values("date", ascending=False).reset_index(drop=True)

# Write output
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
latest_path = OUTPUT_DIR / "latest.csv"
combined.to_csv(latest_path, index=False)

# Sanity
assert latest_path.exists(), "latest.csv was not written."
if not combined.empty:
    assert "bank_name" in combined.columns, "bank_name column missing."

print("✅ Saved:", latest_path)
try:
    print("\nPreview (top 10):")
    print(combined.head(10).to_string(index=False))
except Exception:
    pass


✅ Saved: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw\latest.csv

Preview (top 10):
      date                                                                                                                                                        name          merchant_name category  amount payment_channel  pending                             account_id                        transaction_id                         bank_name
2025-09-09                                                                                                                                      IN *BULLET LEGAL SERVI     Bullet Legal Servi     None   60.00        in store     True MeB44vqbEwfQ5YJEbVR8UqrD3J9VKwFge99waB ekDppAQbzvHjDZMJEaRgH9r8V8xARYTejoKpd                          Discover
2025-09-09                             Withdrawal ALLY / TYPE: ALLY PAYMT ID: 9833122002 CO: ALLY NAME: Kosisonna Ugochukw %% ACH ECC WEB %% ACH Trace 021000021948953                   None     None  504.22     

In [16]:
from pathlib import Path
import re
import yaml

def merchant_key_from(name: str) -> str:
    s = (name or "").upper()
    s = re.sub(r"APPLE PAY ENDING IN \d{4}", "", s)
    s = re.sub(r"#\d{2,}", "", s)              # strip store numbers like #1234
    s = re.sub(r"\d+", "", s)                  # kill stray digits
    s = re.sub(r"[^A-Z&\s]", " ", s)           # keep letters, ampersand, spaces
    s = re.sub(r"\s+", " ", s).strip()
    return s

def apply_yaml_mapping(df: pd.DataFrame, ymap: dict) -> pd.DataFrame:
    if not ymap:
        return df
    look = {k.upper(): v for k, v in ymap.items()}
    rows = []
    for _, r in df.iterrows():
        mk = r.get("merchant_key", "")
        m = look.get(mk, {})
        rows.append({
            **r,
            "display_name_final": m.get("display_name", r.get("merchant_name") or r.get("name")),
            "category_final":     m.get("category"),
            "subcategory_final":  m.get("subcategory"),
            "tags_final":         ",".join(m.get("tags", [])) if isinstance(m.get("tags", []), (list, tuple)) else m.get("tags"),
            "confidence_final":   m.get("confidence", "map"),
            "source_final":       "yaml"
        })
    return pd.DataFrame(rows)

def mark_non_spend_flows(df: pd.DataFrame) -> pd.DataFrame:
    pats = [
        r"PAYMENT", r"TRANSFER", r"ACH", r"ZELLE", r"DIRECTPAY", r"CREDIT",
        r"REFUND", r"REIMBURSE", r"ADJUSTMENT", r"REVERSAL"
    ]
    pat = re.compile("|".join(pats))
    names = (df["name"].fillna("") + " " + df["merchant_name"].fillna("")).str.upper()
    df["is_non_spend_flow"] = names.str.contains(pat)
    return df


In [None]:
# Build a robust merchant key
combined["merchant_key"] = combined["merchant_name"].fillna(combined["name"]).map(merchant_key_from)

# Load YAML map if exists
PATH_YAML = Path("../config/categories.yaml")
ymap = {}
if PATH_YAML.exists():
    with open(PATH_YAML, "r") as f:
        ymap = yaml.safe_load(f) or {}

# Apply mapping + mark non-spend flows
enriched = apply_yaml_mapping(combined, ymap)
enriched = mark_non_spend_flows(enriched)

# Keep stable columns for Power BI
cols = [
    "date","name","merchant_name","merchant_key","category","amount","bank_name", "card_name"
    "display_name_final","category_final","subcategory_final","tags_final",
    "is_non_spend_flow","confidence_final","source_final"
]
enriched = enriched.reindex(columns=cols)


In [18]:
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
latest_path = OUTPUT_DIR / "latest.csv"
enriched.to_csv(latest_path, index=False)
print(f"✅ Latest CSV saved → {latest_path}  rows={len(enriched)}")


✅ Latest CSV saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw\latest.csv  rows=147
