In [1]:
# --- enrich_transactions.ipynb — Cell 1: Imports + robust Azure OpenAI init (original) ---
import os, re, json, math, hashlib, ast
from pathlib import Path
from datetime import datetime, timedelta, date
from collections import defaultdict

import numpy as np
import pandas as pd

# Ensure OpenAI SDK
try:
    from openai import OpenAI
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "openai"])
    from openai import OpenAI

# Optional dotenv to load local secrets
try:
    from dotenv import load_dotenv, find_dotenv
except Exception:
    load_dotenv = None
    find_dotenv = None

def mask(s: str | None) -> str:
    if not s: return "<missing>"
    s = str(s)
    return (s[:4] + "…" + s[-4:]) if len(s) > 8 else "***"

# --- Paths (robust) ---
cwd = Path.cwd().resolve()
gw = os.getenv("GITHUB_WORKSPACE")
start = Path(gw).resolve() if gw else cwd
repo_root = next((p for p in [start, *start.parents] if (p / ".git").exists()), start)
REPO = repo_root

DATA_RAW       = REPO / "data" / "raw"
DATA_PROCESSED = REPO / "data" / "processed"
CONFIG_DIR     = REPO / "config"
STATE_DIR      = REPO / ".state"
VECTOR_DIR     = REPO / "vectorstore"

MERCHANT_DIM_PATH  = CONFIG_DIR / "merchants_dim.csv"
LATEST_CSV_PATH    = DATA_RAW / "latest.csv"
ENRICHED_OUT_PATH  = DATA_RAW / "latest.csv"               # overwrite stable file for Power BI
ENRICHED_COPY_PATH = DATA_PROCESSED / "latest_enriched.csv"
DIGEST_PATH        = DATA_PROCESSED / "digest_latest.txt"
GOAL_PATH          = DATA_PROCESSED / "goal_nudges_latest.txt"
EMBEDDINGS_PATH    = VECTOR_DIR / "embeddings.parquet"

# Ensure dirs
DATA_RAW.mkdir(parents=True, exist_ok=True)
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
VECTOR_DIR.mkdir(parents=True, exist_ok=True)

# Config flags
MAP_ALL    = True
GOAL_SAVINGS = 1000.0
ANOMALY_Z  = 2.5

# --- Load .envs (mirror the build notebook behavior) ---
def load_envs():
    if load_dotenv is None:
        return
    # Explicit override
    abs_override = os.getenv("ENV_PATH", str(REPO / "scripts" / ".env"))
    if abs_override and Path(abs_override).exists():
        try:
            load_dotenv(abs_override, override=False, encoding="utf-8")
        except TypeError:
            load_dotenv(abs_override, override=False)
    # Common locations
    for p in [
        REPO / "scripts" / ".env",
        REPO / ".env",
        REPO / "config" / ".env",
        cwd / ".env",
    ]:
        if Path(p).exists():
            try:
                load_dotenv(str(p), override=False, encoding="utf-8")
            except TypeError:
                load_dotenv(str(p), override=False)
    # Last-ditch: auto-find
    if find_dotenv:
        found = find_dotenv(usecwd=True)
        if found:
            try:
                load_dotenv(found, override=False, encoding="utf-8")
            except TypeError:
                load_dotenv(found, override=False)

load_envs()

# --- Azure OpenAI env with fallbacks & normalization (AZURE CLIENT) ---
import os
from openai import AzureOpenAI

AZURE_OPENAI_ENDPOINT    = (os.getenv("AZURE_OPENAI_ENDPOINT") or "").strip().rstrip("/")
AZURE_OPENAI_API_KEY     = (os.getenv("AZURE_OPENAI_API_KEY")  or "").strip()
AZURE_OPENAI_DEPLOYMENT  = (os.getenv("AZURE_OPENAI_DEPLOYMENT") or "").strip()  # chat/completions deployment name
AZURE_OPENAI_API_VERSION = (os.getenv("AZURE_OPENAI_API_VERSION") or "2024-02-15-preview").strip()

def _mask(s: str | None) -> str:
    if not s: return "<missing>"
    s = str(s)
    return (s[:4] + "…" + s[-4:]) if len(s) > 8 else "***"

print(
    "Azure config →",
    "endpoint:", _mask(AZURE_OPENAI_ENDPOINT),
    "| key:", _mask(AZURE_OPENAI_API_KEY),
    "| chat deployment:", AZURE_OPENAI_DEPLOYMENT or "<missing>",
    "| version:", AZURE_OPENAI_API_VERSION
)

_missing = [k for k,v in {
    "AZURE_OPENAI_ENDPOINT": AZURE_OPENAI_ENDPOINT,
    "AZURE_OPENAI_API_KEY": AZURE_OPENAI_API_KEY,
    "AZURE_OPENAI_DEPLOYMENT": AZURE_OPENAI_DEPLOYMENT,
}.items() if not v]
if _missing:
    raise RuntimeError("Azure OpenAI configuration missing: " + ", ".join(_missing))

# ✅ Use AzureOpenAI so the library builds /openai/deployments/{deployment}/... paths
client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)
print("✅ AzureOpenAI client initialized (deployment-aware).")

Azure config → endpoint: http….com | key: 1kTH…msql | chat deployment: gpt-4o-mini | version: 2024-02-15-preview
✅ AzureOpenAI client initialized (deployment-aware).


In [2]:
# --- enrich_transactions.ipynb — Cell 2: Load latest.csv (original) ---
candidates = [
    LATEST_CSV_PATH,
    Path(os.getenv("OUTPUT_DIR", str(REPO / "data" / "raw"))) / "latest.csv",
    REPO / "data" / "raw" / "latest.csv",
]
src = next((p for p in candidates if p.exists()), None)
if src is None:
    raise FileNotFoundError(
        "latest.csv not found.\nChecked:\n- " + "\n- ".join(str(p) for p in candidates) +
        f"\nCWD={Path.cwd()}  REPO={REPO}"
    )

df = pd.read_csv(src)

# Ensure expected columns exist
expected = {"date","name","merchant_name","category","amount","bank_name"}
missing = expected - set(df.columns)
if missing:
    raise ValueError(f"latest.csv missing columns: {missing}")

# Ensure card_name exists (fallback to bank_name)
if "card_name" not in df.columns:
    df["card_name"] = df["bank_name"]

# Coerce types
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")

# Basic cleanups
df["merchant_name"] = df["merchant_name"].fillna("")
df["name"] = df["name"].fillna("")

# A robust unique id for each transaction (for embeddings & caching)
def make_txn_uid(row):
    key = f"{row.get('date')}_{row.get('name')}_{row.get('merchant_name')}_{row.get('amount')}_{row.get('bank_name')}"
    return hashlib.sha1(key.encode("utf-8")).hexdigest()

df["txn_uid"] = df.apply(make_txn_uid, axis=1)

# Global sign convention: True if expenses are negative numbers
EXPENSES_ARE_NEGATIVE = (df["amount"] < 0).sum() > (df["amount"] > 0).sum()
print(f"Loaded {len(df)} transactions. expenses_are_negative={EXPENSES_ARE_NEGATIVE}")


Loaded 156 transactions. expenses_are_negative=False


In [3]:
print("\n[ENRICH DIAG] category non-null count:", df["category"].notna().sum())
print(df["category"].fillna("<<NULL>>").value_counts().head(12))



[ENRICH DIAG] category non-null count: 156
category
Transfers                46
Debt Payments            32
Shopping                 30
Dining                   17
Services                 14
Transportation            7
Entertainment             4
Home Improvement          2
Uncategorized             1
Fees                      1
Health                    1
Government/Non-Profit     1
Name: count, dtype: int64


In [4]:
# --- enrich_transactions.ipynb — Cell 3: Normalize merchant_key (original) ---
import numpy as np
import re

import re

def merchant_key_from(name: str) -> str:
    """
    Aggressive normalization for merchant identity:
    - Canonicalize brand patterns (AMZN/AMAZON, PAYPAL, SQUARE, APPLE.COM/BILL, GOOGLE*)
    - Strip bank noise (POS/DEBIT/CHECK CRD/ACH/ZELLE/TRANSFER/etc.)
    - Remove store numbers/digits/punctuation; keep letters, &, spaces, and '/' '.' for brand URLs
    - Collapse whitespace; fallback to 'UNKNOWN'
    """
    u = (name or "").upper()

    # Canonical brand replacements (before stripping)
    canon = [
        (r"AMZN\s+MKTPL?C?E?|AMAZON\.?\s*COM", "AMAZON"),
        (r"APPLE\.?\s*COM/?BILL", "APPLE.COM/BILL"),
        (r"\bGOOGLE\*", "GOOGLE "),
        (r"\bSQC?\*", "SQUARE "),
        (r"\bPAYPAL\*?", "PAYPAL "),
    ]
    for pat, repl in canon:
        u = re.sub(pat, repl, u)

    # Strip common bank/payments noise tokens
    noise = [
        r"APPLE PAY ENDING IN \d{4}",
        r"POS(?:\s+PURCHASE)?",
        r"DEBIT(?:\s+CARD)?(?:\s+PURCHASE)?",
        r"CHECK ?CRD",
        r"VISA(?:\s+POS)?", r"MASTERCARD", r"DISCOVER", r"AMEX",
        r"ACH(?:\s+(CREDIT|DEBIT))?", r"WEB AUTHORIZED PMT", r"ONLINE PMT",
        r"ZELLE(?:\s+PAYMENT)?", r"VENMO(?:\s+PAYMENT)?",
        r"XFER", r"TRANSFER",
        r"PURCHASE", r"PENDING", r"REVERSAL", r"ADJ(?:USTMENT)?",
        r"ID[: ]?\d+",
    ]
    for pat in noise:
        u = re.sub(rf"\b{pat}\b", " ", u)

    # Remove store numbers & digits
    u = re.sub(r"#\d{2,}", " ", u)
    u = re.sub(r"\d+", " ", u)

    # Keep letters, '&', spaces, plus '/' '.' for URLish brands; collapse spaces
    u = re.sub(r"[^A-Z&\s\./]", " ", u)
    u = re.sub(r"\s+", " ", u).strip()

    # Post-canon tidy
    u = u.replace("APPLE COM BILL", "APPLE.COM/BILL").strip()
    return u or "UNKNOWN"

# Use 'merchant_name' when available, else 'name'
df["merchant_key"] = np.where(
    df["merchant_name"].astype(str).str.len() > 0,
    df["merchant_name"].map(merchant_key_from),
    df["name"].map(merchant_key_from)
)

print("Merchant keys normalized (consistent with build_latest).")


Merchant keys normalized (consistent with build_latest).


In [5]:
# --- enrich_transactions.ipynb — Cell 4: Load or initialize merchant dimension table (original) ---
dim_cols = [
    "merchant_key", "display_name", "category", "subcategory", "tags",
    "source", "confidence", "last_updated"
]
if MERCHANT_DIM_PATH.exists():
    dim = pd.read_csv(MERCHANT_DIM_PATH)
    # ensure columns
    for c in dim_cols:
        if c not in dim.columns:
            dim[c] = np.nan
    dim = dim[dim_cols]
else:
    dim = pd.DataFrame(columns=dim_cols)

# Left-join to see which keys are already mapped
df = df.merge(dim, on="merchant_key", how="left", suffixes=("", "_dim"))

# Identify unmapped merchants
unmapped_keys = sorted(k for k in df.loc[df["display_name"].isna(), "merchant_key"].unique() if k != "UNKNOWN")
print(f"Unmapped merchants needing AI labels: {len(unmapped_keys)}")


Unmapped merchants needing AI labels: 0


In [6]:
# --- enrich_transactions.ipynb — Cell 5: Label unmapped merchants via Azure (original) ---
new_rows = []
if len(unmapped_keys) and ('chat_client' in globals()) and (chat_client is not None) and MAP_ALL:
    print(f"Labeling {len(unmapped_keys)} merchants (single-call mode)...")
    for idx, mk in enumerate(unmapped_keys, 1):
        try:
            item = azure_label_one(mk)
        except Exception as e:
            print(f"⚠️ Label fail for '{mk}': {e}")
            continue

        now = datetime.utcnow().isoformat()
        if item:
            new_rows.append({
                "merchant_key": mk,
                "display_name": item["display_name"],
                "category": item["category"],
                "subcategory": item["subcategory"],
                "tags": ",".join(item["tags"]),
                "source": "azure",
                "confidence": 0.90,
                "last_updated": now
            })

    if new_rows:
        dim_new = pd.DataFrame(new_rows)
        dim_all = pd.concat([dim, dim_new], ignore_index=True)
        dim_all = dim_all.sort_values("last_updated").drop_duplicates(["merchant_key"], keep="last")
        MERCHANT_DIM_PATH.parent.mkdir(parents=True, exist_ok=True)
        dim_all.to_csv(MERCHANT_DIM_PATH, index=False)
        dim = dim_all
        print(f"✅ Added {len(new_rows)} merchant mappings (single-call).")
    else:
        print("No new mappings added (single-call).")
else:
    print("No new mappings needed or AI disabled.")


No new mappings needed or AI disabled.


In [7]:
# --- enrich_transactions.ipynb — Cell 6: Persist merchants_dim.csv (original) ---
# Toggle if you ever want to skip writing on runs with no changes
PERSIST_MERCHANT_DIM = True

# dim_cols defined in Cell 4; dim may be updated in Cell 6
if not isinstance(PERSIST_MERCHANT_DIM, bool):
    PERSIST_MERCHANT_DIM = True

if PERSIST_MERCHANT_DIM:
    MERCHANT_DIM_PATH.parent.mkdir(parents=True, exist_ok=True)

    if 'dim' in globals() and isinstance(dim, pd.DataFrame) and len(dim):
        # ensure expected columns/order exist before save
        for c in dim_cols:
            if c not in dim.columns:
                dim[c] = np.nan
        dim = dim[dim_cols]

        dim.to_csv(MERCHANT_DIM_PATH, index=False)
        print(f"📝 merchants_dim.csv saved ({len(dim)} rows) → {MERCHANT_DIM_PATH}")
    else:
        # either no new mappings this run or dim was empty; ensure file exists
        if not MERCHANT_DIM_PATH.exists():
            pd.DataFrame(columns=dim_cols).to_csv(MERCHANT_DIM_PATH, index=False)
            print(f"📝 Created headers-only merchants_dim.csv → {MERCHANT_DIM_PATH}")
        else:
            print("ℹ️ merchants_dim.csv already exists; no changes to sync.")
else:
    print("PERSIST_MERCHANT_DIM=False → skipping merchants_dim.csv persistence.")


📝 merchants_dim.csv saved (10 rows) → C:\Users\kosis\Downloads\Automation\spending-dashboard\config\merchants_dim.csv


In [8]:
# --- enrich_transactions.ipynb — Cell 7 (FIXED): Join labels + build display & category ---

# 0) Cache Plaid's original category BEFORE any drops/merges
if "category" in df.columns:
    plaid_category_cached = df["category"].copy()
else:
    plaid_category_cached = pd.Series(pd.NA, index=df.index, dtype="object")

# 1) Remove prior label cols from earlier merges (do NOT drop our cached var)
df = df.drop(
    columns=["display_name","category","subcategory","tags","source","confidence","last_updated"],
    errors="ignore"
)

# 2) Merge merchants_dim (right has display_name/category/subcategory/tags)
df = df.merge(dim, on="merchant_key", how="left", suffixes=("", "_dim"))

# 3) Restore Plaid category safely
df["category_plaid"] = plaid_category_cached

# 4) Ensure YAML-final columns exist (from build_latest)
for m in ["display_name_final","category_final","subcategory_final","tags_final"]:
    if m not in df.columns:
        df[m] = pd.NA

# 5) Rename dim category fields to explicit names (if present)
if "category" in df.columns and "category_dim" not in df.columns:
    df = df.rename(columns={"category": "category_dim"})
if "subcategory" in df.columns and "subcategory_dim" not in df.columns:
    df = df.rename(columns={"subcategory": "subcategory_dim"})

# Helper: pick first non-empty/non-NaN string
def pick_first_nonblank(row, cols):
    for c in cols:
        if c in row.index:
            v = row[c]
            if pd.isna(v):
                continue
            s = str(v).strip()
            if s and s.lower() not in {"nan", "none"}:
                return s
    return ""

# 6) DISPLAY NAME → dim.display_name → YAML display_name_final → merchant_key
df["display_name"] = df.apply(
    lambda r: pick_first_nonblank(r, ["display_name", "display_name_final", "merchant_key"]),
    axis=1
)
df.loc[df["display_name"].str.strip().eq(""), "display_name"] = df["merchant_key"]

# 7) CATEGORY (primary) → dim.category_dim → YAML category_final → Plaid category_plaid
df["category_display"] = df.apply(
    lambda r: pick_first_nonblank(r, ["category_dim", "category_final", "category_plaid"]),
    axis=1
).astype(str).str.strip()
df.loc[df["category_display"].eq("") | df["category_display"].str.lower().eq("none"), "category_display"] = "Uncategorized"

# Keep legacy 'category' column in sync for downstream code/Power BI
df["category"] = df["category_display"]

# 8) SUBCATEGORY/TAGS (same precedence)
df["subcategory_display"] = df.apply(
    lambda r: pick_first_nonblank(r, ["subcategory_dim", "subcategory_final"]),
    axis=1
)
df["tags_display"] = df.apply(
    lambda r: pick_first_nonblank(r, ["tags", "tags_final"]),
    axis=1
)

# Keep legacy columns populated for downstream
df["subcategory"] = df.get("subcategory_display")
df["tags"] = df.get("tags_display")

# 9) Ensure required columns exist for save step
final_cols = [
    "txn_uid", "date", "bank_name", "card_name",
    "merchant_key", "display_name",
    "category", "subcategory_display", "tags_display",
    "name", "merchant_name", "amount"
]
for c in final_cols:
    if c not in df.columns:
        df[c] = pd.NA

print("✅ Labels joined. category_display built with precedence: dim → YAML → Plaid.")
print("Category sample:", df["category_display"].value_counts(dropna=False).head(10).to_dict())


✅ Labels joined. category_display built with precedence: dim → YAML → Plaid.
Category sample: {'Transfers': 46, 'Debt Payments': 32, 'Shopping': 30, 'Dining': 17, 'Services': 14, 'Transportation': 7, 'Entertainment': 4, 'Home Improvement': 2, 'Uncategorized': 1, 'Fees': 1}


In [9]:
# Re-merge guesses to current df and rebuild display
df = df.drop(columns=["display_name","category_dim","subcategory_dim","tags","source","confidence","last_updated"],
             errors="ignore")
df = df.merge(dim, on="merchant_key", how="left", suffixes=("", "_dim"))

# Helper
def pick_first_nonblank(row, cols):
    for c in cols:
        if c in row.index:
            v = row[c]
            if pd.isna(v): 
                continue
            s = str(v).strip()
            if s and s.lower() not in {"nan","none"}:
                return s
    return ""

# --- FIX: build display_name then fill empties from merchant_key (no .replace here)
df["display_name"] = df.apply(
    lambda r: pick_first_nonblank(r, ["display_name", "display_name_final", "merchant_key"]),
    axis=1
)
empty_mask = df["display_name"].astype(str).str.strip().eq("")
df.loc[empty_mask, "display_name"] = df.loc[empty_mask, "merchant_key"]

# Explicitly name dim columns if present
if "category" in df.columns and "category_dim" not in df.columns:
    df.rename(columns={"category":"category_dim"}, inplace=True)
if "subcategory" in df.columns and "subcategory_dim" not in df.columns:
    df.rename(columns={"subcategory":"subcategory_dim"}, inplace=True)

# category_display precedence: dim → YAML → Plaid, then fallback
df["category_display"] = df.apply(
    lambda r: pick_first_nonblank(r, ["category_dim","category_final","category_plaid"]),
    axis=1
).astype(str).str.strip()
df.loc[df["category_display"].eq("") | df["category_display"].str.lower().eq("none"), "category_display"] = "Uncategorized"

# Keep legacy 'category' in sync
df["category"] = df["category_display"]

print("[AI Guess] category_display refreshed.")


[AI Guess] category_display refreshed.


In [10]:
# --- enrich_transactions.ipynb — Cell 8: Subscription detection (original) ---
def detect_subscription(group: pd.DataFrame) -> bool:
    g = group.dropna(subset=["date", "amount"]).sort_values("date")
    if len(g) < 3:
        return False

    # use absolute spend magnitudes for stability
    amounts = g["amount"].abs().to_numpy(dtype=float)
    amounts = amounts[np.isfinite(amounts)]
    if amounts.size < 3:
        return False

    # gaps in days
    ts_ns = g["date"].astype("int64").to_numpy()
    gaps_days = np.diff(ts_ns) / 86_400_000_000_000
    if gaps_days.size < 2:
        return False

    monthlyish_med = float(np.median(gaps_days))
    frac_monthly = float(np.mean((gaps_days >= 27) & (gaps_days <= 33))) if gaps_days.size else 0.0

    mu = float(np.mean(amounts))
    cv = float(np.std(amounts) / (mu + 1e-9)) if mu > 0 else 1.0

    return (27 <= monthlyish_med <= 33) and (frac_monthly >= 0.6) and (cv <= 0.2)

# Clean any leftover artifacts from previous runs (e.g., is_subscription_x from merges)
for col in [c for c in df.columns if c.startswith("is_subscription") and c != "is_subscription"]:
    df.drop(columns=col, inplace=True, errors="ignore")

# Respect your sign convention
EXPENSES_ARE_NEGATIVE = (df["amount"] < 0).sum() > (df["amount"] > 0).sum()
if EXPENSES_ARE_NEGATIVE:
    outflows = df.loc[(df["amount"] < 0) & df["date"].notna(), ["display_name", "date", "amount"]].copy()
    outflows["amount"] = outflows["amount"].abs()
else:
    outflows = df.loc[(df["amount"] > 0) & df["date"].notna(), ["display_name", "date", "amount"]].copy()

subs_map = {}
for disp, g in outflows.groupby("display_name", dropna=False):
    try:
        subs_map[disp] = bool(detect_subscription(g[["date", "amount"]]))
    except Exception:
        subs_map[disp] = False

df["is_subscription"] = df["display_name"].map(subs_map).fillna(False).astype(bool)

print(f"Subscriptions flagged: {int(df['is_subscription'].sum())} candidates.")


Subscriptions flagged: 0 candidates.


  df["is_subscription"] = df["display_name"].map(subs_map).fillna(False).astype(bool)


In [11]:
# --- enrich_transactions.ipynb — Cell 9: Anomaly detection (original) ---
def zscores(x):
    mu = np.mean(x)
    sd = np.std(x)
    if sd == 0:
        return np.zeros_like(x)
    return (x - mu) / sd

df["amount_abs"] = df["amount"].abs()
df["z_by_merchant"] = (
    df.groupby("display_name", dropna=False)["amount_abs"]
      .transform(zscores)
)
df["is_anomaly"] = (df["z_by_merchant"] >= ANOMALY_Z)

print(f"Anomalies flagged: {int(df['is_anomaly'].sum())}")


Anomalies flagged: 2


In [12]:
# --- enrich_transactions.ipynb — Cell 10: 30-day digest (original) ---
today = pd.Timestamp(date.today())
cut1 = today - pd.Timedelta(days=30)
cut2 = today - pd.Timedelta(days=60)

cur = df[(df["date"] > cut1) & (df["amount"] > 0)]
prev = df[(df["date"] > cut2) & (df["date"] <= cut1) & (df["amount"] > 0)]

cur_total = cur["amount"].sum()
prev_total = prev["amount"].sum()
delta = cur_total - prev_total

top_merchants = (
    cur.groupby("display_name", dropna=False)["amount"].sum()
       .sort_values(ascending=False)
       .head(3)
)

top_category = (
    cur.groupby("category", dropna=False)["amount"].sum()
       .sort_values(ascending=False)
       .head(1)
)
top_category_name = top_category.index[0] if len(top_category) else "N/A"
top_category_amt = float(top_category.iloc[0]) if len(top_category) else 0.0

digest = []
digest.append(f"Period: last 30 days vs prior 30")
digest.append(f"Spend: ${cur_total:,.2f} ({'+' if delta>=0 else ''}{delta:,.2f} vs prior)")
digest.append("Top 3 merchants: " + ", ".join([f"{m} (${v:,.2f})" for m, v in top_merchants.items()]))
digest.append(f"Biggest category driver: {top_category_name} (${top_category_amt:,.2f})")

DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
with open(DIGEST_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(digest))

print("\n".join(digest))
print(f"\nSaved digest → {DIGEST_PATH}")


Period: last 30 days vs prior 30
Spend: $5,794.09 (+1,953.32 vs prior)
Top 3 merchants: Petal ($1,451.92), Withdrawal AMEX EPAYMENT / TYPE: ACH PMT ID: 0005000008 DATA: ER AM CO: AMEX EPAYMENT NAME: KOSISONNA UGOCHUKWU %% ACH ECC WEB %% ACH Trace 091000011489512 ($777.78), Withdrawal ALLY / TYPE: ALLY PAYMT ID: 9833122002 CO: ALLY NAME: Kosisonna Ugochukw %% ACH ECC WEB %% ACH Trace 021000021948953 ($504.22)
Biggest category driver: Debt Payments ($3,377.79)

Saved digest → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\digest_latest.txt


In [13]:
# --- enrich_transactions.ipynb — Cell 11: Goal nudges (original) ---
cur_by_cat = (
    df[(df["date"] > cut1) & (df["amount"] > 0)]
      .groupby("category", dropna=False)["amount"].sum()
      .sort_values(ascending=False)
)

nudges = []
remaining = GOAL_SAVINGS
for cat, amt in cur_by_cat.items():
    if remaining <= 0:
        break
    # propose cutting up to 40% of this category
    max_cut = 0.40 * amt
    if max_cut <= 0:
        continue
    pct_needed = min(remaining / amt, 0.40)  # cap at 40%
    if pct_needed > 0:
        nudges.append((cat, pct_needed))
        remaining -= pct_needed * amt

lines = [f"Goal: Save ${GOAL_SAVINGS:,.0f} next 30 days"]
if nudges:
    for (cat, pct) in nudges:
        lines.append(f"- Cut {cat} by {pct*100:.0f}%")
else:
    lines.append("- Spending already low or insufficient category concentration to suggest cuts.")

with open(GOAL_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("\n".join(lines))
print(f"\nSaved goal nudges → {GOAL_PATH}")


Goal: Save $1,000 next 30 days
- Cut Debt Payments by 30%

Saved goal nudges → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\goal_nudges_latest.txt


In [14]:
# --- enrich_transactions.ipynb — Cell 12 (UPDATED): Embeddings cache (NA-safe) ---

import math
import pandas as pd

EMBED_MODEL = (os.getenv("AZURE_OPENAI_EMBEDDINGS") or os.getenv("OPENAI_EMBEDDINGS_DEPLOYMENT") or "").strip()
EMBED_ENABLED = bool(EMBED_MODEL)
print("EMBED_MODEL:", EMBED_MODEL or "<disabled>")

def safe_str(v) -> str:
    """Return '' for None/NaN/pd.NA/'nan'/'None', else a clean string."""
    try:
        if pd.isna(v):
            return ""
    except Exception:
        pass
    s = str(v)
    return "" if s.strip().lower() in {"nan", "none"} else s

def build_search_text(row: pd.Series) -> str:
    # Prefer 'category_display' if you created it; fallback to 'category'
    cat_col = "category_display" if "category_display" in row.index else "category"
    fields = ["display_name", "name", "merchant_name", cat_col, "subcategory", "tags"]
    parts = [safe_str(row.get(f)) for f in fields]
    return " | ".join(p for p in parts if p)

# Limit to recent rows for cost control
embed_df = df.sort_values("date", ascending=False).head(500).copy()
embed_df["search_text"] = embed_df.apply(build_search_text, axis=1)

# Load existing cache (parquet with list column is fine under pyarrow)
if EMBEDDINGS_PATH.exists():
    try:
        old = pd.read_parquet(EMBEDDINGS_PATH)
        if "txn_uid" not in old.columns or "embedding" not in old.columns:
            old = pd.DataFrame(columns=["txn_uid","embedding"])
    except Exception:
        old = pd.DataFrame(columns=["txn_uid","embedding"])
else:
    old = pd.DataFrame(columns=["txn_uid","embedding"])

existing = set(old["txn_uid"]) if len(old) else set()
to_embed = embed_df.loc[~embed_df["txn_uid"].isin(existing), ["txn_uid", "search_text"]]

# Azure embeddings config (enabled only if a deployment name is set)
EMBED_MODEL = (os.getenv("AZURE_OPENAI_EMBEDDINGS") or os.getenv("OPENAI_EMBEDDINGS_DEPLOYMENT") or "").strip()
EMBED_ENABLED = bool(EMBED_MODEL)

def get_embeddings(texts: list[str]):
    if not EMBED_ENABLED:
        return None
    # Use the same Azure OpenAI client; model is your embeddings deployment name
    res = client.embeddings.create(model=EMBED_MODEL, input=list(texts))
    return [d.embedding for d in res.data]

new_rows = []
if len(to_embed) and EMBED_ENABLED:
    B = 64
    for i in range(0, len(to_embed), B):
        chunk = to_embed.iloc[i:i+B]
        vecs = get_embeddings(chunk["search_text"].tolist())
        if vecs is None:
            break
        for uid, vec in zip(chunk["txn_uid"].tolist(), vecs):
            if vec is not None:
                new_rows.append({"txn_uid": uid, "embedding": vec})

if new_rows:
    add = pd.DataFrame(new_rows)
    merged = pd.concat([old, add], ignore_index=True).drop_duplicates("txn_uid", keep="last")
    merged.to_parquet(EMBEDDINGS_PATH, index=False)
    print(f"Embeddings cached: +{len(add)} → total {len(merged)}")
else:
    msg = "Embeddings disabled (no AZURE_OPENAI_EMBEDDINGS)" if not EMBED_ENABLED else "No new embeddings needed"
    print(msg)


EMBED_MODEL: text-embedding-3-large
No new embeddings needed


In [15]:
# --- enrich_transactions.ipynb — Cell 13 (REPLACE): Reorder and save ---

save_cols = [
    "txn_uid","date","bank_name","card_name",
    "display_name","merchant_key",
    # Keep BOTH the coalesced and raw category columns
    "category",               # <- coalesced (dim → yaml → plaid)
    "category_display",       # alias of coalesced for clarity in BI
    "category_plaid",         # original Plaid category
    "subcategory","tags",
    "name","merchant_name",
    "amount",
    # analytics flags
    "is_subscription","is_anomaly","z_by_merchant",
    # optional flow flag if present
    "is_non_spend_flow"
]

for c in save_cols:
    if c not in df.columns:
        df[c] = np.nan

df_out = df[save_cols].sort_values(["date", "bank_name"], ascending=[False, True])

# Write both the stable file (Power BI) and a processed copy
df_out.to_csv(ENRICHED_OUT_PATH, index=False)
df_out.to_csv(ENRICHED_COPY_PATH, index=False)

print(f"✅ Enriched CSV saved → {ENRICHED_OUT_PATH}")
print(f"📄 Copy saved → {ENRICHED_COPY_PATH}")
print("Column sanity (first 12):", list(df_out.columns)[:12])
print("Nulls check — category:", int(df_out['category'].isna().sum()),
      "| category_display:", int(df_out['category_display'].isna().sum()),
      "| category_plaid:", int(df_out['category_plaid'].isna().sum()))


✅ Enriched CSV saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw\latest.csv
📄 Copy saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\latest_enriched.csv
Column sanity (first 12): ['txn_uid', 'date', 'bank_name', 'card_name', 'display_name', 'merchant_key', 'category', 'category_display', 'category_plaid', 'subcategory', 'tags', 'name']
Nulls check — category: 0 | category_display: 0 | category_plaid: 0


In [16]:
print("\n[POST-RUN] category vs category_display (top 12):")
print(df[["category","category_display"]].fillna("<<NULL>>").value_counts().head(12))



[POST-RUN] category vs category_display (top 12):
category               category_display     
Transfers              Transfers                46
Debt Payments          Debt Payments            32
Shopping               Shopping                 30
Dining                 Dining                   17
Services               Services                 14
Transportation         Transportation            7
Entertainment          Entertainment             4
Home Improvement       Home Improvement          2
Fees                   Fees                      1
Government/Non-Profit  Government/Non-Profit     1
Health                 Health                    1
Uncategorized          Uncategorized             1
Name: count, dtype: int64


In [17]:
# --- enrich_transactions.ipynb — Cell 14 (UPDATED): Weekly Executive Digest (WoW + MoM), AI + HTML/MD/CSV ---
import os, re, json, math
from pathlib import Path
from tenacity import retry, stop_after_attempt, wait_exponential

INSIGHTS_DIR = DATA_PROCESSED / "insights"
INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)
DIGEST_JSON   = INSIGHTS_DIR / "digest_latest.json"
DIGEST_MD     = INSIGHTS_DIR / "digest_latest.md"
DIGEST_FLAT   = INSIGHTS_DIR / "digest_latest_flat.csv"
EMAIL_HTML    = INSIGHTS_DIR / "digest_latest_email.html"
EMAIL_SUBJECT = INSIGHTS_DIR / "digest_latest_subject.txt"

# -------- 1) Last COMPLETED week (Mon–Sun), compare WoW --------
try:
    now = pd.Timestamp.now(tz="America/Los_Angeles").normalize()
except Exception:
    now = pd.Timestamp.now().normalize()

wd = int(now.weekday())  # Mon=0 ... Sun=6
days_to_last_sun = 7 if wd == 6 else (wd + 1)
wk_end   = (now - pd.Timedelta(days=days_to_last_sun)).date()      # inclusive Sunday
wk_start = (pd.Timestamp(wk_end) - pd.Timedelta(days=6)).date()    # prior Monday
prev_end = (pd.Timestamp(wk_end) - pd.Timedelta(days=7)).date()
prev_start = (pd.Timestamp(prev_end) - pd.Timedelta(days=6)).date()

def _short_range(ws, we):
    try:
        ws_dt = pd.to_datetime(ws).date(); we_dt = pd.to_datetime(we).date()
        return f"{ws_dt.month}/{ws_dt.day} - {we_dt.month}/{we_dt.day}"
    except Exception:
        return f"{ws} - {we}"

# -------- 1b) Exclusions: Wealthfront moves are not spend/income; Apple Cash stays --------
base = df.copy()
for c in ("display_name","merchant_name","name"):
    if c not in base.columns:
        base[c] = ""
txt_all = (base["display_name"].astype(str) + " " +
           base["merchant_name"].astype(str) + " " +
           base["name"].astype(str)).str.upper()

wealthfront_mask = txt_all.str.contains(r"\bWEALTHFRONT\b", na=False)
applecash_mask   = txt_all.str.contains(r"\bAPPLE\s+CASH\b", na=False)
base = base.loc[~(wealthfront_mask & ~applecash_mask)].copy()

if "is_non_spend_flow" in base.columns:
    non_spend_mask = base["is_non_spend_flow"].fillna(False).astype(bool)
    keep_mask = (~non_spend_mask) | applecash_mask
    base = base.loc[keep_mask].copy()

# Normalize candidate category columns (don’t create if missing)
for col in ("category_display","category","category_final","category_plaid"):
    if col in base.columns:
        s = base[col].astype(str)
        base[col] = s.where(~s.str.strip().isin(["", "nan", "None"]), np.nan)

def _best_category_col(frame: pd.DataFrame) -> str | None:
    candidates = ["category_display","category","category_final","category_plaid"]
    for c in candidates:
        if c in frame.columns and frame[c].notna().any():
            return c
    return None

# Window slices
base["date_only"] = base["date"].dt.date
cur_w  = base[(base["date_only"] >= wk_start) & (base["date_only"] <= wk_end)]
prev_w = base[(base["date_only"] >= prev_start) & (base["date_only"] <= prev_end)]

# -------- 2) Robust sign detection --------
amt_all = base["amount"].dropna()
expenses_are_negative = (amt_all < 0).sum() > (amt_all > 0).sum()

def spend_sum(frame):
    a = frame["amount"].dropna()
    return float(a[a < 0].abs().sum()) if expenses_are_negative else float(a[a > 0].sum())

def income_sum(frame):
    a = frame["amount"].dropna()
    return float(a[a > 0].sum()) if expenses_are_negative else float(a[a < 0].abs().sum())

# WoW
cur_spend   = round(spend_sum(cur_w), 2)
prev_spend  = round(spend_sum(prev_w), 2)
cur_income  = round(income_sum(cur_w), 2)
prev_income = round(income_sum(prev_w), 2)

spend_delta     = round(cur_spend - prev_spend, 2)
spend_delta_pct = round((spend_delta / prev_spend), 4) if prev_spend else (1.0 if cur_spend else 0.0)

# -------- 3) MoM (MTD vs aligned days in prior month) --------
cur_month_start = pd.Timestamp(wk_end).to_period('M').start_time.date()
cur_mtd_end     = wk_end
prev_month = (pd.Timestamp(wk_end).to_period('M') - 1)
prev_month_start = prev_month.start_time.date()
prev_month_end   = prev_month.end_time.date()
days_into_m = (pd.Timestamp(cur_mtd_end) - pd.Timestamp(cur_month_start)).days
aligned_prev_m_end = (pd.Timestamp(prev_month_start) + pd.Timedelta(days=days_into_m)).date()
if aligned_prev_m_end > prev_month_end:
    aligned_prev_m_end = prev_month_end

cur_m = base[(base["date_only"] >= cur_month_start) & (base["date_only"] <= cur_mtd_end)]
prev_m_aligned = base[(base["date_only"] >= prev_month_start) & (base["date_only"] <= aligned_prev_m_end)]
cur_spend_m   = round(spend_sum(cur_m), 2)
prev_spend_m  = round(spend_sum(prev_m_aligned), 2)
cur_income_m  = round(income_sum(cur_m), 2)
prev_income_m = round(income_sum(prev_m_aligned), 2)
spend_delta_m     = round(cur_spend_m - prev_spend_m, 2)
spend_delta_pct_m = round((spend_delta_m / prev_spend_m), 4) if prev_spend_m else (1.0 if cur_spend_m else 0.0)

# -------- 4) Top drivers (category) --------
if expenses_are_negative:
    cur_exp = cur_w[cur_w["amount"] < 0].assign(spend=lambda x: x["amount"].abs())
else:
    cur_exp = cur_w[cur_w["amount"] > 0].assign(spend=lambda x: x["amount"])

CAT_COL = _best_category_col(cur_exp)
if CAT_COL is None:
    CAT_COL = "category_display"
    cur_exp[CAT_COL] = np.nan

top_cats_cur = (
    cur_exp.groupby(CAT_COL, dropna=False)["spend"]
           .sum().sort_values(ascending=False).head(5)
           .reset_index()
)

def _label(v):
    return "Uncategorized" if (pd.isna(v) or str(v).strip() in {"", "nan", "None"}) else str(v)

if CAT_COL in top_cats_cur.columns:
    top_cats_cur[CAT_COL] = top_cats_cur[CAT_COL].apply(_label)

# DEBUG — confirm which column was used and a peek at results
print(f"[Weekly] Category column used: {CAT_COL} | non-null in window: {int(cur_exp[CAT_COL].notna().sum())}")
print(top_cats_cur.head(5).to_string(index=False))

subs_w  = cur_w.loc[cur_w.get("is_subscription", False) == True]
anoms_w = cur_w.loc[cur_w.get("is_anomaly", False) == True]

# -------- 5) Payload for AI --------
summary_payload = {
    "as_of_date": pd.Timestamp(wk_end).isoformat(),
    "window": {
        "current": {"start": str(wk_start), "end": str(wk_end), "label": "Last completed week (Mon–Sun)"},
        "previous": {"start": str(prev_start), "end": str(prev_end)}
    },
    "totals": {
        # WoW
        "spend_current": cur_spend,
        "spend_previous": prev_spend,
        "spend_delta": spend_delta,
        "spend_delta_pct": spend_delta_pct,
        "income_current": cur_income,
        "income_previous": prev_income,
        # MoM (MTD vs aligned prior month MTD)
        "spend_mtd_current": cur_spend_m,
        "spend_mtd_previous": prev_spend_m,
        "spend_mtd_delta": spend_delta_m,
        "spend_mtd_delta_pct": spend_delta_pct_m,
        "income_mtd_current": cur_income_m,
        "income_mtd_previous": prev_income_m,
    },
    "top_categories": [
        {"category": str(r[CAT_COL]), "spend": float(r["spend"])}
        for _, r in top_cats_cur.iterrows()
    ],
    "subscriptions_count": int(subs_w["display_name"].nunique()) if len(subs_w) else 0,
    "anomalies_count": int(anoms_w.shape[0]) if len(anoms_w) else 0,
}

# -------- 6) Azure summarizer (REQUIRED) with theme + narrative --------
def _salvage_json_object(txt: str):
    t = (txt or "").strip()
    if t.startswith("```"):
        t = re.sub(r"^```(?:json)?", "", t, flags=re.IGNORECASE).strip()
        t = re.sub(r"```$", "", t).strip()
    try:
        obj = json.loads(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    s, e = t.find("{"), t.rfind("}")
    if s != -1 and e != -1 and e > s:
        cand = t[s:e+1]
        try:
            obj = json.loads(cand)
            if isinstance(obj, dict):
                return obj
        except Exception:
            pass
    try:
        import ast
        obj = ast.literal_eval(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    return None

SYSTEM_SUMMARY = (
    "You are an analytics copilot for personal finance. "
    "Using ONLY the provided aggregates (week-over-week and month-over-month), "
    "produce an executive digest in STRICT JSON. No invented numbers."
)
USER_INSTRUCTIONS = (
    "Return ONLY a JSON object with keys:\n"
    "{\n"
    '  "headline": string,\n'
    '  "theme": string,\n'
    '  "narrative": string,\n'
    '  "key_metrics": [ {"name": string, "value": number, "delta_pct": number|null} ],\n'
    '  "top_drivers": [ {"label": string, "spend": number} ],\n'
    '  "risks": [ {"type": "subscription"|"anomaly"|"trend", "note": string} ],\n'
    '  "action_items": [ {"title": string, "impact_usd": number, "rationale": string} ],\n'
    '  "email_subject": string\n'
    "}\n"
    "- Max 5 items per list.\n"
    "- Use negative delta_pct where spend improved.\n"
    "- impact_usd is a rough weekly savings estimate.\n"
)

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=6))
def _azure_digest_call(payload_json: str) -> str:
    resp = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=[
            {"role":"system","content": SYSTEM_SUMMARY},
            {"role":"user","content": USER_INSTRUCTIONS + "\n\nPAYLOAD:\n" + payload_json}
        ],
        temperature=0.15,
        max_tokens=750,
        response_format={"type":"json_object"},
    )
    return (resp.choices[0].message.content or "").strip()

raw = _azure_digest_call(json.dumps(summary_payload))
azure_digest = _salvage_json_object(raw)
if not isinstance(azure_digest, dict):
    raise RuntimeError("Azure summarizer returned no valid JSON. Check Azure env, deployment name, or quota.")

# -------- 7) Overlay (protect authoritative totals/window) + sanitize theme --------
def _overlay(base: dict, over: dict | None) -> dict:
    if not isinstance(over, dict):
        return base
    out = dict(base)
    for k, v in over.items():
        if k in {"totals", "window"}:
            continue
        if k in ("key_metrics","top_drivers","risks","action_items"):
            if isinstance(v, list) and len(v) > 0:
                out[k] = v
        elif v not in (None, "", {}):
            out[k] = v
    return out

short_rng = _short_range(wk_start, wk_end)

base_digest = {
    "insights_version": 3,
    "window": summary_payload["window"],
    "totals": summary_payload["totals"],
    "headline": f"Weekly digest {short_rng}",
    "key_metrics": [
        {"name":"Total Spend (WoW)",  "value": cur_spend,  "delta_pct": spend_delta_pct},
        {"name":"Total Income (WoW)","value": cur_income, "delta_pct": None},
        {"name":"Total Spend (MoM)",  "value": cur_spend_m,"delta_pct": spend_delta_pct_m},
        {"name":"Total Income (MoM)","value": cur_income_m,"delta_pct": None},
    ],
    "top_drivers": [{"label": _label(t["category"]), "spend": float(t["spend"])} for t in summary_payload["top_categories"]],
    "risks": (
        ([{"type":"subscription","note": f"{summary_payload['subscriptions_count']} active subs this week"}] if summary_payload["subscriptions_count"] else []) +
        ([{"type":"anomaly","note": f"{summary_payload['anomalies_count']} anomalies this week"}] if summary_payload["anomalies_count"] else [])
    ),
    "action_items": [],
    "theme": "",
    "narrative": "",
    "email_subject": ""
}

digest = _overlay(base_digest, azure_digest)

# Clamp theme to 3 words max; fallback if empty
if not isinstance(digest.get("theme",""), str) or not digest["theme"].strip():
    digest["theme"] = "Lean Week" if spend_delta < 0 else "Heavier Week"
else:
    words = [w for w in re.split(r"\s+", digest["theme"].strip()) if w]
    digest["theme"] = " ".join(words[:3])

# -------- 8) Compact summary line for tiles --------
def build_compact_summary(d: dict) -> str:
    ws = d.get("window", {}).get("current", {}).get("start", str(wk_start))
    we = d.get("window", {}).get("current", {}).get("end", str(wk_end))
    short = _short_range(ws, we)
    totals = d.get("totals", {}) or {}
    spend_val = float(totals.get("spend_current") or 0.0)
    dp = totals.get("spend_delta_pct")
    if isinstance(dp, (int,float)) and abs(dp) > 1: dp = dp / 100.0
    dp_txt = f"{dp*100:+.1f}%" if isinstance(dp, (int,float)) else "n/a"
    drivers = d.get("top_drivers") or []
    if drivers:
        label = (drivers[0].get("label") or "Uncategorized").strip()
        amt = float(drivers[0].get("spend") or 0.0)
        driver_txt = f"Top driver: {label} (${amt:,.0f})"
    else:
        driver_txt = "Top driver: n/a"
    return f"{short}: Weekly spend ${spend_val:,.0f} (WoW {dp_txt}). {driver_txt}."

digest["summary"] = build_compact_summary(digest)

# -------- 9) Persist JSON --------
with open(DIGEST_JSON, "w", encoding="utf-8") as f:
    json.dump(digest, f, ensure_ascii=False, indent=2)

# -------- 10) Markdown (employer-ready) --------
def render_md(d):
    ws = d.get("window", {}).get("current", {}).get("start", str(wk_start))
    we = d.get("window", {}).get("current", {}).get("end", str(wk_end))
    short = _short_range(ws, we)

    lines = [f"# Weekly Executive Summary ({short})"]
    if d.get("theme"):
        lines.append(f"*{d['theme']}*")
    if d.get("narrative"):
        lines.append(f"\n{d['narrative'].strip()}\n")
    else:
        lines.append(f"\n{d['summary']}\n")

    # Key Metrics
    lines.append("## Key Metrics")
    for m in (d.get("key_metrics") or [])[:8]:
        name = m.get("name","")
        val  = float(m.get("value") or 0.0)
        dp   = m.get("delta_pct")
        if isinstance(dp, (int,float)) and abs(dp) > 1: dp = dp/100.0
        dp_txt = f" ({dp*100:+.1f}%)" if isinstance(dp,(int,float)) else ""
        lines.append(f"- **{name}:** ${val:,.2f}{dp_txt}")

    # Drivers
    td = d.get("top_drivers") or []
    if td:
        lines.append("\n## Drivers")
        for t in td[:5]:
            label = (t.get("label") or "Uncategorized").strip()
            lines.append(f"- **{label}:** ${float(t.get('spend',0)):,.0f}")

    # Risks
    rk = d.get("risks") or []
    if rk:
        lines.append("\n## Risks")
        for r in rk[:5]:
            lines.append(f"- **{r.get('type','note')}:** {r.get('note','')}")

    # Recommendations
    ai = d.get("action_items") or []
    if ai:
        lines.append("\n## Recommendations")
        for a in ai[:5]:
            lines.append(f"- **{a.get('title','')}** — est. ${float(a.get('impact_usd',0)):,.0f}. {a.get('rationale','')}")
    return "\n".join(lines) + "\n"

with open(DIGEST_MD, "w", encoding="utf-8") as f:
    f.write(render_md(digest))

# -------- 11) Email subject + HTML (clean sections) --------
subject = digest.get("email_subject") or f"{digest.get('theme','Weekly Digest')} — {short_rng}"
with open(EMAIL_SUBJECT, "w", encoding="utf-8") as f:
    f.write(subject.strip())

def render_email_html(d):
    ws = d.get("window", {}).get("current", {}).get("start", str(wk_start))
    we = d.get("window", {}).get("current", {}).get("end", str(wk_end))
    short = _short_range(ws, we)
    theme = (d.get("theme") or "").strip()
    narrative = (d.get("narrative") or d.get("summary") or "").strip()

    parts = []
    parts.append("<!doctype html><meta charset='utf-8'>")
    parts.append("<div style='font-family:Segoe UI,system-ui,-apple-system;line-height:1.55;font-size:14px;color:#111827;'>")
    parts.append(f"<h1 style='margin:0 0 4px 0;font-size:18px;'>Weekly Executive Summary ({short})</h1>")
    if theme:
        parts.append(f"<div style='margin:0 0 12px 0;color:#6b7280;font-style:italic'>{theme}</div>")
    if narrative:
        parts.append(f"<p style='margin:0 0 16px 0'>{narrative}</p>")

    parts.append("<h2 style='margin:16px 0 8px 0;font-size:16px;'>Key Metrics</h2><ul style='margin:0 0 12px 18px;'>")
    for m in (d.get("key_metrics") or [])[:8]:
        name = m.get("name","")
        val  = float(m.get("value") or 0.0)
        dp   = m.get("delta_pct")
        if isinstance(dp, (int,float)) and abs(dp) > 1: dp = dp/100.0
        dp_txt = f" ({dp*100:+.1f}%)" if isinstance(dp,(int,float)) else ""
        parts.append(f"<li><b>{name}:</b> ${val:,.2f}{dp_txt}</li>")
    parts.append("</ul>")

    td = d.get("top_drivers") or []
    if td:
        parts.append("<h2 style='margin:16px 0 8px 0;font-size:16px;'>Drivers</h2><ul style='margin:0 0 12px 18px;'>")
        for t in td[:5]:
            label = (t.get("label") or "Uncategorized").strip()
            parts.append(f"<li><b>{label}:</b> ${float(t.get('spend',0)):,.0f}</li>")
        parts.append("</ul>")

    rk = d.get("risks") or []
    if rk:
        parts.append("<h2 style='margin:16px 0 8px 0;font-size:16px;'>Risks</h2><ul style='margin:0 0 12px 18px;'>")
        for r in rk[:5]:
            parts.append(f"<li><b>{r.get('type','note')}:</b> {r.get('note','')}</li>")
        parts.append("</ul>")

    ai = d.get("action_items") or []
    if ai:
        parts.append("<h2 style='margin:16px 0 8px 0;font-size:16px;'>Recommendations</h2><ul style='margin:0 0 12px 18px;'>")
        for a in ai[:5]:
            parts.append(f"<li><b>{a.get('title','')}</b> — est. ${float(a.get('impact_usd',0)):,.0f}. {a.get('rationale','')}</li>")
        parts.append("</ul>")

    parts.append("</div>")
    return "".join(parts)

with open(EMAIL_HTML, "w", encoding="utf-8") as f:
    f.write(render_email_html(digest))

# -------- 12) Flat CSV for Power BI (add MoM metrics too) --------
flat_rows = []
flat_rows.append({
    "row_type": "header",
    "as_of_end": str(wk_end),
    "cur_start": str(wk_start),
    "cur_end": str(wk_end),
    "prev_start": str(prev_start),
    "prev_end": str(prev_end),
    "headline": digest.get("headline",""),
    "summary": digest.get("summary",""),
    "name": "Total Spend (WoW)",
    "value": float(cur_spend),
    "delta_pct": float(spend_delta_pct),
    "label": "",
    "spend": None,
    "note": "",
    "impact_usd": None,
})
for m in digest.get("key_metrics", []):
    flat_rows.append({
        "row_type": "metric",
        "as_of_end": str(wk_end),
        "cur_start": str(wk_start),
        "cur_end": str(wk_end),
        "prev_start": str(prev_start),
        "prev_end": str(prev_end),
        "headline": digest.get("headline",""),
        "summary": "",
        "name": m.get("name",""),
        "value": float(m.get("value",0) or 0.0),
        "delta_pct": (float(m.get("delta_pct")) if isinstance(m.get("delta_pct"), (int,float)) else None),
        "label": "",
        "spend": None,
        "note": "",
        "impact_usd": None,
    })
for t in digest.get("top_drivers", []):
    flat_rows.append({
        "row_type": "driver",
        "as_of_end": str(wk_end),
        "cur_start": str(wk_start),
        "cur_end": str(wk_end),
        "prev_start": str(prev_start),
        "prev_end": str(prev_end),
        "headline": digest.get("headline",""),
        "summary": "",
        "name": "",
        "value": None,
        "delta_pct": None,
        "label": (t.get("label") or "Uncategorized"),
        "spend": float(t.get("spend",0) or 0.0),
        "note": "",
        "impact_usd": None,
    })
for r in digest.get("risks", []):
    flat_rows.append({
        "row_type": "risk",
        "as_of_end": str(wk_end),
        "cur_start": str(wk_start),
        "cur_end": str(wk_end),
        "prev_start": str(prev_start),
        "prev_end": str(prev_end),
        "headline": digest.get("headline",""),
        "summary": "",
        "name": "",
        "value": None,
        "delta_pct": None,
        "label": r.get("type",""),
        "spend": None,
        "note": r.get("note",""),
        "impact_usd": None,
    })
pd.DataFrame(flat_rows).to_csv(DIGEST_FLAT, index=False)

print(
    "🧠 Weekly executive digest written:\n"
    f"- JSON: {DIGEST_JSON}\n- MD:   {DIGEST_MD}\n- HTML: {EMAIL_HTML}\n- CSV:  {DIGEST_FLAT}\n- Subject: {EMAIL_SUBJECT}\n"
    f"Week Window: {wk_start} -> {wk_end} | Prev: {prev_start} -> {prev_end}\n"
    f"WoW Spend: cur={cur_spend} prev={prev_spend} delta={spend_delta} delta_pct={spend_delta_pct:+.4f}\n"
    f"MoM MTD Spend: cur={cur_spend_m} prev={prev_spend_m} delta={spend_delta_m} delta_pct={spend_delta_pct_m:+.4f}"
)


[Weekly] Category column used: category_display | non-null in window: 8
category_display  spend
       Transfers  65.74
        Services  60.00
        Shopping  11.06
Home Improvement   5.39
🧠 Weekly executive digest written:
- JSON: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest.json
- MD:   C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest.md
- HTML: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest_email.html
- CSV:  C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest_flat.csv
- Subject: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest_subject.txt
Week Window: 2025-09-08 -> 2025-09-14 | Prev: 2025-09-01 -> 2025-09-07
WoW Spend: cur=142.19 prev=526.4 delta=-384.21 delta_pct=-0.7299
MoM MTD Spend: cur=668.59 prev=890.15 delta=-221.56 delta_pct=-0.2489


In [18]:
# --- enrich_transactions.ipynb — Cell 15 (UPDATED): Monthly Executive Digest (MTD vs prior MTD), AI + HTML/MD/CSV ---
import os, re, json, calendar
from pathlib import Path
from tenacity import retry, stop_after_attempt, wait_exponential

INSIGHTS_DIR = DATA_PROCESSED / "insights"
INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)

MOM_JSON   = INSIGHTS_DIR / "digest_mom.json"
MOM_MD     = INSIGHTS_DIR / "digest_mom.md"
MOM_FLAT   = INSIGHTS_DIR / "digest_mom_flat.csv"
MOM_HTML   = INSIGHTS_DIR / "digest_mom_email.html"
MOM_SUBJ   = INSIGHTS_DIR / "digest_mom_subject.txt"

# --- 1) Date windows (align with Cell 14; use LA time) ---
try:
    now = pd.Timestamp.now(tz="America/Los_Angeles").normalize()
except Exception:
    now = pd.Timestamp.now().normalize()

wk_wd = int(now.weekday())
days_to_last_sun = 7 if wk_wd == 6 else (wk_wd + 1)
wk_end   = (now - pd.Timedelta(days=days_to_last_sun)).date()      # inclusive Sunday
cur_month_start = pd.Timestamp(wk_end).to_period('M').start_time.date()
cur_mtd_end     = wk_end

prev_month = (pd.Timestamp(wk_end).to_period('M') - 1)
prev_month_start = prev_month.start_time.date()
prev_month_end   = prev_month.end_time.date()

# Align prior-month MTD to the same number of days as current MTD
days_into_m = (pd.Timestamp(cur_mtd_end) - pd.Timestamp(cur_month_start)).days
aligned_prev_m_end = (pd.Timestamp(prev_month_start) + pd.Timedelta(days=days_into_m)).date()
if aligned_prev_m_end > prev_month_end:
    aligned_prev_m_end = prev_month_end

cur_month_name  = calendar.month_name[pd.to_datetime(cur_mtd_end).month]
prev_month_name = calendar.month_name[pd.to_datetime(prev_month_start).month]

# --- 2) Exclusions: Wealthfront out, Apple Cash in (mirror Cell 14) ---
base = df.copy()
for c in ("display_name","merchant_name","name"):
    if c not in base.columns:
        base[c] = ""
txt_all = (base["display_name"].astype(str) + " " +
           base["merchant_name"].astype(str) + " " +
           base["name"].astype(str)).str.upper()

wealthfront_mask = txt_all.str.contains(r"\bWEALTHFRONT\b", na=False)
applecash_mask   = txt_all.str.contains(r"\bAPPLE\s+CASH\b", na=False)
base = base.loc[~(wealthfront_mask & ~applecash_mask)].copy()

if "is_non_spend_flow" in base.columns:
    non_spend_mask = base["is_non_spend_flow"].fillna(False).astype(bool)
    keep_mask = (~non_spend_mask) | applecash_mask
    base = base.loc[keep_mask].copy()

# Normalize candidate category columns
for col in ("category_display","category","category_final","category_plaid"):
    if col in base.columns:
        s = base[col].astype(str)
        base[col] = s.where(~s.str.strip().isin(["", "nan", "None"]), np.nan)

def _best_category_col(frame: pd.DataFrame) -> str | None:
    candidates = ["category_display","category","category_final","category_plaid"]
    for c in candidates:
        if c in frame.columns and frame[c].notna().any():
            return c
    return None

base["date_only"] = base["date"].dt.date
cur_m  = base[(base["date_only"] >= cur_month_start) & (base["date_only"] <= cur_mtd_end)]
prev_m = base[(base["date_only"] >= prev_month_start) & (base["date_only"] <= aligned_prev_m_end)]

# --- 3) Polarity + totals ---
amt_all = base["amount"].dropna()
expenses_are_negative = (amt_all < 0).sum() > (amt_all > 0).sum()

def spend_sum(frame):
    a = frame["amount"].dropna()
    return float(a[a < 0].abs().sum()) if expenses_are_negative else float(a[a > 0].sum())

def income_sum(frame):
    a = frame["amount"].dropna()
    return float(a[a > 0].sum()) if expenses_are_negative else float(a[a < 0].abs().sum())

cur_spend_m   = round(spend_sum(cur_m), 2)
prev_spend_m  = round(spend_sum(prev_m), 2)
cur_income_m  = round(income_sum(cur_m), 2)
prev_income_m = round(income_sum(prev_m), 2)

spend_delta_m     = round(cur_spend_m - prev_spend_m, 2)
spend_delta_pct_m = round((spend_delta_m / prev_spend_m), 4) if prev_spend_m else (1.0 if cur_spend_m else 0.0)

# --- 4) Top drivers this month (category) ---
CAT_COL = _best_category_col(cur_m)
if CAT_COL is None:
    CAT_COL = "category_display"
    cur_m[CAT_COL] = np.nan

if expenses_are_negative:
    cur_exp_m = cur_m[cur_m["amount"] < 0].assign(spend=lambda x: x["amount"].abs())
else:
    cur_exp_m = cur_m[cur_m["amount"] > 0].assign(spend=lambda x: x["amount"])

top_cats_m = (
    cur_exp_m.groupby(CAT_COL, dropna=False)["spend"]
             .sum().sort_values(ascending=False).head(5)
             .reset_index()
)

def _label(v):
    return "Uncategorized" if (pd.isna(v) or str(v).strip() in {"", "nan", "None"}) else str(v)

if CAT_COL in top_cats_m.columns:
    top_cats_m[CAT_COL] = top_cats_m[CAT_COL].apply(_label)

# DEBUG — confirm which column was used and a peek at results
print(f"[Monthly] Category column used: {CAT_COL} | non-null in MTD: {int(cur_m[CAT_COL].notna().sum())}")
print(top_cats_m.head(5).to_string(index=False))

subs_m  = cur_m.loc[cur_m.get("is_subscription", False) == True]
anoms_m = cur_m.loc[cur_m.get("is_anomaly", False) == True]

# --- 5) Payload for AI ---
summary_payload_m = {
    "as_of_date": pd.Timestamp(cur_mtd_end).isoformat(),
    "window": {
        "current": {"start": str(cur_month_start), "end": str(cur_mtd_end), "label": f"{cur_month_name} MTD"},
        "previous": {"start": str(prev_month_start), "end": str(aligned_prev_m_end), "label": f"{prev_month_name} MTD (aligned)"}
    },
    "totals": {
        "spend_mtd_current": cur_spend_m,
        "spend_mtd_previous": prev_spend_m,
        "spend_mtd_delta": spend_delta_m,
        "spend_mtd_delta_pct": spend_delta_pct_m,
        "income_mtd_current": cur_income_m,
        "income_mtd_previous": prev_income_m,
    },
    "top_categories": [
        {"category": str(r[CAT_COL]), "spend": float(r["spend"])}
        for _, r in top_cats_m.iterrows()
    ],
    "subscriptions_count": int(subs_m["display_name"].nunique()) if len(subs_m) else 0,
    "anomalies_count": int(anoms_m.shape[0]) if len(anoms_m) else 0,
}

# --- 6) Azure summarizer (REQUIRED): same personality as WoW ---
def _salvage_json_object(txt: str):
    t = (txt or "").strip()
    if t.startswith("```"):
        t = re.sub(r"^```(?:json)?", "", t, flags=re.IGNORECASE).strip()
        t = re.sub(r"```$", "", t).strip()
    try:
        obj = json.loads(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    s, e = t.find("{"), t.rfind("}")
    if s != -1 and e != -1 and e > s:
        cand = t[s:e+1]
        try:
            obj = json.loads(cand)
            if isinstance(obj, dict):
                return obj
        except Exception:
            pass
    try:
        import ast
        obj = ast.literal_eval(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    return None

SYSTEM_SUMMARY_M = (
    "You are an analytics copilot for personal finance. "
    "Using ONLY the provided month-to-date aggregates (vs the same number of days in the prior month), "
    "produce an executive **monthly** digest in STRICT JSON. No invented numbers."
)
USER_INSTRUCTIONS_M = (
    "Return ONLY a JSON object with keys:\n"
    "{\n"
    '  "headline": string,\n'
    '  "theme": string,\n'
    '  "narrative": string,\n'
    '  "key_metrics": [ {"name": string, "value": number, "delta_pct": number|null} ],\n'
    '  "top_drivers": [ {"label": string, "spend": number} ],\n'
    '  "risks": [ {"type": "subscription"|"anomaly"|"trend", "note": string} ],\n'
    '  "action_items": [ {"title": string, "impact_usd": number, "rationale": string} ],\n'
    '  "email_subject": string\n'
    "}\n"
    "- Max 5 items per list.\n"
    "- Use negative delta_pct where spend improved.\n"
    "- impact_usd is a rough **weekly** savings estimate; you may still propose monthly actions.\n"
)

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=6))
def _azure_digest_call_m(payload_json: str) -> str:
    resp = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=[
            {"role":"system","content": SYSTEM_SUMMARY_M},
            {"role":"user","content": USER_INSTRUCTIONS_M + "\n\nPAYLOAD:\n" + payload_json}
        ],
        temperature=0.15,
        max_tokens=750,
        response_format={"type":"json_object"},
    )
    return (resp.choices[0].message.content or "").strip()

raw_m = _azure_digest_call_m(json.dumps(summary_payload_m))
azure_digest_m = _salvage_json_object(raw_m)
if not isinstance(azure_digest_m, dict):
    raise RuntimeError("Azure MoM summarizer returned no valid JSON. Check Azure env/deployment/quota.")

# --- 7) Overlay + defaults ---
def _overlay(base: dict, over: dict | None) -> dict:
    if not isinstance(over, dict):
        return base
    out = dict(base)
    for k, v in over.items():
        if k in {"totals", "window"}:
            continue
        if k in ("key_metrics","top_drivers","risks","action_items"):
            if isinstance(v, list) and len(v) > 0:
                out[k] = v
        elif v not in (None, "", {}):
            out[k] = v
    return out

base_m = {
    "insights_version": 3,
    "window": summary_payload_m["window"],
    "totals": summary_payload_m["totals"],
    "headline": f"Monthly digest — {cur_month_name} MTD",
    "key_metrics": [
        {"name":"Total Spend (MoM MTD)",  "value": cur_spend_m,  "delta_pct": spend_delta_pct_m},
        {"name":"Total Income (MoM MTD)","value": cur_income_m, "delta_pct": None},
    ],
    "top_drivers": [{"label": _label(t["category"]), "spend": float(t["spend"])} for t in summary_payload_m["top_categories"]],
    "risks": (
        ([{"type":"subscription","note": f"{summary_payload_m['subscriptions_count']} active subs this month-to-date"}] if summary_payload_m["subscriptions_count"] else []) +
        ([{"type":"anomaly","note": f"{summary_payload_m['anomalies_count']} anomalies this month-to-date"}] if summary_payload_m["anomalies_count"] else [])
    ),
    "action_items": [],
    "theme": "",
    "narrative": "",
    "email_subject": ""
}
digest_m = _overlay(base_m, azure_digest_m)

# Limit theme to 3 words max; fallback if empty
if not isinstance(digest_m.get("theme",""), str) or not digest_m["theme"].strip():
    digest_m["theme"] = "Steady MTD" if spend_delta_m <= 0 else "Upward MTD"
else:
    words = [w for w in re.split(r"\s+", digest_m["theme"].strip()) if w]
    digest_m["theme"] = " ".join(words[:3])

# --- 8) Save JSON ---
with open(MOM_JSON, "w", encoding="utf-8") as f:
    json.dump(digest_m, f, ensure_ascii=False, indent=2)

# --- 9) Markdown (month names; narrative first) ---
def render_md_m(d):
    heading = f"# Monthly Executive Summary ({cur_month_name} MTD)"
    lines = [heading]
    if d.get("theme"):
        lines.append(f"*{d['theme']}*")
    if d.get("narrative"):
        lines.append(f"\n{d['narrative'].strip()}\n")

    lines.append("## Key Metrics (MoM)")
    for m in (d.get("key_metrics") or [])[:8]:
        name = m.get("name","")
        val  = float(m.get("value") or 0.0)
        dp   = m.get("delta_pct")
        if isinstance(dp, (int,float)) and abs(dp) > 1: dp = dp/100.0
        dp_txt = f" ({dp*100:+.1f}%)" if isinstance(dp,(int,float)) else ""
        lines.append(f"- **{name}:** ${val:,.2f}{dp_txt}")

    td = d.get("top_drivers") or []
    if td:
        lines.append("\n## Drivers (MTD)")
        for t in td[:5]:
            label = (t.get("label") or "Uncategorized").strip()
            lines.append(f"- **{label}:** ${float(t.get('spend',0)):,.0f}")
    rk = d.get("risks") or []
    if rk:
        lines.append("\n## Risks")
        for r in rk[:5]:
            lines.append(f"- **{r.get('type','note')}:** {r.get('note','')}")
    ai = d.get("action_items") or []
    if ai:
        lines.append("\n## Recommendations")
        for a in ai[:5]:
            lines.append(f"- **{a.get('title','')}** — est. ${float(a.get('impact_usd',0)):,.0f}. {a.get('rationale','')}")
    return "\n".join(lines) + "\n"

with open(MOM_MD, "w", encoding="utf-8") as f:
    f.write(render_md_m(digest_m))

# --- 10) Email subject + HTML (clean sections; month name only) ---
subject_m = digest_m.get("email_subject") or f"{digest_m.get('theme','Monthly Digest')} — {cur_month_name} MTD"
with open(MOM_SUBJ, "w", encoding="utf-8") as f:
    f.write(subject_m.strip())

def render_email_html_m(d):
    theme = (d.get("theme") or "").strip()
    narrative = (d.get("narrative") or "").strip()
    parts = []
    parts.append("<!doctype html><meta charset='utf-8'>")
    parts.append("<div style='font-family:Segoe UI,system-ui,-apple-system;line-height:1.55;font-size:14px;color:#111827;'>")
    parts.append(f"<h1 style='margin:0 0 4px 0;font-size:18px;'>Monthly Executive Summary ({cur_month_name} MTD)</h1>")
    if theme:
        parts.append(f"<div style='margin:0 0 12px 0;color:#6b7280;font-style:italic'>{theme}</div>")
    if narrative:
        parts.append(f"<p style='margin:0 0 16px 0'>{narrative}</p>")

    parts.append("<h2 style='margin:16px 0 8px 0;font-size:16px;'>Key Metrics (MoM)</h2><ul style='margin:0 0 12px 18px;'>")
    for m in (d.get("key_metrics") or [])[:8]:
        name = m.get("name","")
        val  = float(m.get("value") or 0.0)
        dp   = m.get("delta_pct")
        if isinstance(dp, (int,float)) and abs(dp) > 1: dp = dp/100.0
        dp_txt = f" ({dp*100:+.1f}%)" if isinstance(dp,(int,float)) else ""
        parts.append(f"<li><b>{name}:</b> ${val:,.2f}{dp_txt}</li>")
    parts.append("</ul>")

    td = d.get("top_drivers") or []
    if td:
        parts.append("<h2 style='margin:16px 0 8px 0;font-size:16px;'>Drivers (MTD)</h2><ul style='margin:0 0 12px 18px;'>")
        for t in td[:5]:
            label = (t.get("label") or "Uncategorized").strip()
            parts.append(f"<li><b>{label}:</b> ${float(t.get('spend',0)):,.0f}</li>")
        parts.append("</ul>")

    rk = d.get("risks") or []
    if rk:
        parts.append("<h2 style='margin:16px 0 8px 0;font-size:16px;'>Risks</h2><ul style='margin:0 0 12px 18px;'>")
        for r in rk[:5]:
            parts.append(f"<li><b>{r.get('type','note')}:</b> {r.get('note','')}</li>")
        parts.append("</ul>")

    ai = d.get("action_items") or []
    if ai:
        parts.append("<h2 style='margin:16px 0 8px 0;font-size:16px;'>Recommendations</h2><ul style='margin:0 0 12px 18px;'>")
        for a in ai[:5]:
            parts.append(f"<li><b>{a.get('title','')}</b> — est. ${float(a.get('impact_usd',0)):,.0f}. {a.get('rationale','')}</li>")
        parts.append("</ul>")

    parts.append("</div>")
    return "".join(parts)

with open(MOM_HTML, "w", encoding="utf-8") as f:
    f.write(render_email_html_m(digest_m))

# --- 11) Flat CSV for Power BI (MoM) ---
flat_rows_m = []
flat_rows_m.append({
    "row_type": "header",
    "as_of_end": str(cur_mtd_end),
    "cur_start": str(cur_month_start),
    "cur_end": str(cur_mtd_end),
    "prev_start": str(prev_month_start),
    "prev_end": str(aligned_prev_m_end),
    "headline": digest_m.get("headline",""),
    "summary": digest_m.get("narrative",""),
    "name": "Total Spend (MoM MTD)",
    "value": float(cur_spend_m),
    "delta_pct": float(spend_delta_pct_m),
    "label": "",
    "spend": None,
    "note": "",
    "impact_usd": None,
})
for m in digest_m.get("key_metrics", []):
    flat_rows_m.append({
        "row_type": "metric",
        "as_of_end": str(cur_mtd_end),
        "cur_start": str(cur_month_start),
        "cur_end": str(cur_mtd_end),
        "prev_start": str(prev_month_start),
        "prev_end": str(aligned_prev_m_end),
        "headline": digest_m.get("headline",""),
        "summary": "",
        "name": m.get("name",""),
        "value": float(m.get("value",0) or 0.0),
        "delta_pct": (float(m.get("delta_pct")) if isinstance(m.get("delta_pct"), (int,float)) else None),
        "label": "",
        "spend": None,
        "note": "",
        "impact_usd": None,
    })
for t in digest_m.get("top_drivers", []):
    flat_rows_m.append({
        "row_type": "driver",
        "as_of_end": str(cur_mtd_end),
        "cur_start": str(cur_month_start),
        "cur_end": str(cur_mtd_end),
        "prev_start": str(prev_month_start),
        "prev_end": str(aligned_prev_m_end),
        "headline": digest_m.get("headline",""),
        "summary": "",
        "name": "",
        "value": None,
        "delta_pct": None,
        "label": (t.get("label") or "Uncategorized"),
        "spend": float(t.get("spend",0) or 0.0),
        "note": "",
        "impact_usd": None,
    })
for r in digest_m.get("risks", []):
    flat_rows_m.append({
        "row_type": "risk",
        "as_of_end": str(cur_mtd_end),
        "cur_start": str(cur_month_start),
        "cur_end": str(cur_mtd_end),
        "prev_start": str(prev_month_start),
        "prev_end": str(aligned_prev_m_end),
        "headline": digest_m.get("headline",""),
        "summary": "",
        "name": "",
        "value": None,
        "delta_pct": None,
        "label": r.get("type",""),
        "spend": None,
        "note": r.get("note",""),
        "impact_usd": None,
    })
pd.DataFrame(flat_rows_m).to_csv(MOM_FLAT, index=False)

print(
    "🧾 Monthly executive digest written (MTD):\n"
    f"- JSON: {MOM_JSON}\n- MD:   {MOM_MD}\n- HTML: {MOM_HTML}\n- CSV:  {MOM_FLAT}\n- Subject: {MOM_SUBJ}\n"
    f"MTD Window: {cur_month_name} {cur_month_start} -> {cur_mtd_end} | Prior aligned: {prev_month_name} {prev_month_start} -> {aligned_prev_m_end}\n"
    f"MoM MTD Spend: cur={cur_spend_m} prev={prev_spend_m} delta={spend_delta_m} delta_pct={spend_delta_pct_m:+.4f}"
)


[Monthly] Category column used: category_display | non-null in MTD: 17
category_display  spend
        Shopping 464.86
       Transfers  79.48
        Services  60.00
  Transportation  58.86
Home Improvement   5.39
🧾 Monthly executive digest written (MTD):
- JSON: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_mom.json
- MD:   C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_mom.md
- HTML: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_mom_email.html
- CSV:  C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_mom_flat.csv
- Subject: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_mom_subject.txt
MTD Window: September 2025-09-01 -> 2025-09-14 | Prior aligned: August 2025-08-01 -> 2025-08-14
MoM MTD Spend: cur=668.59 prev=890.15 delta=-221.56 delta_pct=-0.2489


In [19]:
# --- Cell 16: Combine Weekly + Monthly outputs for Power BI + Email bundle ---
from pathlib import Path
import pandas as pd

INSIGHTS_DIR = DATA_PROCESSED / "insights"
INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)

# Weekly artifacts from Cell 14
W_JSON = INSIGHTS_DIR / "digest_latest.json"
W_FLAT = INSIGHTS_DIR / "digest_latest_flat.csv"
W_HTML = INSIGHTS_DIR / "digest_latest_email.html"
W_SUBJ = INSIGHTS_DIR / "digest_latest_subject.txt"

# Monthly artifacts from Cell 15
M_JSON = INSIGHTS_DIR / "digest_mom.json"
M_FLAT = INSIGHTS_DIR / "digest_mom_flat.csv"
M_HTML = INSIGHTS_DIR / "digest_mom_email.html"
M_SUBJ = INSIGHTS_DIR / "digest_mom_subject.txt"

# Combined outputs
C_FLAT = INSIGHTS_DIR / "digest_combined_flat.csv"
C_HTML = INSIGHTS_DIR / "digest_combined_email.html"
C_SUBJ = INSIGHTS_DIR / "digest_combined_subject.txt"

# --- 1) Combine flat CSVs with a 'period' column ---
frames = []
if W_FLAT.exists():
    w = pd.read_csv(W_FLAT)
    w["period"] = "WoW"
    frames.append(w)
if M_FLAT.exists():
    m = pd.read_csv(M_FLAT)
    m["period"] = "MoM"
    frames.append(m)

if frames:
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(C_FLAT, index=False)
else:
    pd.DataFrame(columns=["row_type","period"]).to_csv(C_FLAT, index=False)

# --- 2) Build combined subject (uses theme if available) ---
def read_text(p: Path) -> str:
    try:
        return (p.read_text(encoding="utf-8") or "").strip()
    except Exception:
        return ""

def read_json(p: Path):
    try:
        import json
        return json.loads(p.read_text(encoding="utf-8"))
    except Exception:
        return {}

wj = read_json(W_JSON)
mj = read_json(M_JSON)

weekly_theme = (wj.get("theme") or "").strip()
month_name = ""
try:
    month_name = pd.to_datetime(mj.get("window",{}).get("current",{}).get("end","")).strftime("%B")
except Exception:
    # fallback by looking at any MTD end in flat
    try:
        mf = pd.read_csv(M_FLAT)
        if "as_of_end" in mf.columns and len(mf):
            month_name = pd.to_datetime(mf["as_of_end"].iloc[0]).strftime("%B")
    except Exception:
        month_name = ""

subject_week = read_text(W_SUBJ) or (weekly_theme and f"{weekly_theme} — Weekly") or "Weekly Summary"
subject_month = read_text(M_SUBJ) or (month_name and f"{month_name} MTD") or "Monthly MTD"
combined_subject = f"{subject_week} | {subject_month}"
Path(C_SUBJ).write_text(combined_subject, encoding="utf-8")

# --- 3) Build combined HTML (reuses generated HTML blocks if present) ---
def read_html(p: Path) -> str:
    try:
        return p.read_text(encoding="utf-8")
    except Exception:
        return ""

w_html = read_html(W_HTML)
m_html = read_html(M_HTML)

# Minimal wrapper that keeps each section's styling intact
combined_html = []
combined_html.append("<!doctype html><meta charset='utf-8'>")
combined_html.append("<div style='font-family:Segoe UI,system-ui,-apple-system;line-height:1.55;font-size:14px;color:#111827;'>")
combined_html.append("<h1 style='margin:0 0 12px 0;font-size:20px;'>Executive Summary — Weekly & Month-to-Date</h1>")

if w_html:
    # Strip outer wrappers if present to avoid nested <html> tags
    combined_html.append("<section style='margin-bottom:24px;border-bottom:1px solid #e5e7eb;padding-bottom:16px;'>")
    combined_html.append(w_html)
    combined_html.append("</section>")

if m_html:
    combined_html.append("<section style='margin-top:16px;'>")
    combined_html.append(m_html)
    combined_html.append("</section>")

combined_html.append("</div>")

Path(C_HTML).write_text("".join(combined_html), encoding="utf-8")

print(
    "📦 Combined artifacts:\n"
    f"- CSV:  {C_FLAT}\n"
    f"- HTML: {C_HTML}\n"
    f"- Subject: {C_SUBJ}\n"
    f"(Sources -> Weekly: {W_FLAT.name}, Monthly: {M_FLAT.name})"
)


📦 Combined artifacts:
- CSV:  C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_combined_flat.csv
- HTML: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_combined_email.html
- Subject: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_combined_subject.txt
(Sources -> Weekly: digest_latest_flat.csv, Monthly: digest_mom_flat.csv)


In [20]:
# --- Cell 16.5 (REPLACE): Exec-ready charts with wider figs, extra padding, percentile gradients ---

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
import numpy as np
import pandas as pd
from pathlib import Path

INSIGHTS_DIR = DATA_PROCESSED / "insights"
INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)

P_LINE     = INSIGHTS_DIR / "weekly_spend_line.png"
P_DONUT    = INSIGHTS_DIR / "weekly_top_categories_donut.png"
P_MOVEMENT = INSIGHTS_DIR / "weekly_category_movement.png"

# Fonts / styling (portable)
plt.rcParams["font.family"] = "DejaVu Sans"
plt.rcParams["font.sans-serif"] = ["Segoe UI", "DejaVu Sans", "Arial", "Liberation Sans"]

def fmt_usd(x):
    try:
        return f"${float(x):,.0f}"
    except Exception:
        return "$0"

def spend_series(frame: pd.DataFrame, expenses_are_negative: bool) -> pd.Series:
    a = frame["amount"].astype(float)
    return a.where(a < 0, 0).abs() if expenses_are_negative else a.where(a > 0, 0)

def prefer_category_column(frame: pd.DataFrame) -> str:
    for c in ["category_display","category","category_final","category_plaid"]:
        if c in frame.columns and frame[c].notna().any():
            return c
    return "category"

# --- Common filters (mirror weekly digest logic) ---
base = df.copy()
for c in ("display_name","merchant_name","name"):
    if c not in base.columns:
        base[c] = ""
txt_all = (base["display_name"].astype(str) + " " +
           base["merchant_name"].astype(str) + " " +
           base["name"].astype(str)).str.upper()

wealthfront_mask = txt_all.str.contains(r"\bWEALTHFRONT\b", na=False)
applecash_mask   = txt_all.str.contains(r"\bAPPLE\s+CASH\b", na=False)
base = base.loc[~(wealthfront_mask & ~applecash_mask)].copy()

if "is_non_spend_flow" in base.columns:
    non_spend_mask = base["is_non_spend_flow"].fillna(False).astype(bool)
    keep_mask = (~non_spend_mask) | applecash_mask
    base = base.loc[keep_mask].copy()

base["date_only"] = base["date"].dt.date

# Week window (Mon–Sun), consistent with Cell 14
try:
    _ = wk_start, wk_end
except NameError:
    try:
        now = pd.Timestamp.now(tz="America/Los_Angeles").normalize()
    except Exception:
        now = pd.Timestamp.now().normalize()
    wd = int(now.weekday())  # Mon=0..Sun=6
    days_to_last_sun = 7 if wd == 6 else (wd + 1)
    wk_end   = (now - pd.Timedelta(days=days_to_last_sun)).date()
    wk_start = (pd.Timestamp(wk_end) - pd.Timedelta(days=6)).date()

prev_wk_start = (pd.Timestamp(wk_start) - pd.Timedelta(days=7)).date()
prev_wk_end   = (pd.Timestamp(wk_start) - pd.Timedelta(days=1)).date()

# Polarity
amt_all = base["amount"].dropna()
expenses_are_negative = (amt_all < 0).sum() > (amt_all > 0).sum()

# ================= 1) Daily trend — last 28 days (classic style; wider fig & padding) =================
plot_start = (pd.Timestamp(wk_end) - pd.Timedelta(days=27)).date()
tw = base[(base["date_only"] >= plot_start) & (base["date_only"] <= wk_end)].copy()

fig, ax = plt.subplots(figsize=(11.2, 5.8), dpi=144)  # wider
if tw.empty:
    ax.text(0.5, 0.5, "No spend data (last 28 days)", ha="center", va="center", fontsize=12)
    ax.axis("off")
else:
    tw["dt"] = pd.to_datetime(tw["date_only"])
    daily = (tw.assign(spend=spend_series(tw, expenses_are_negative))
               .groupby("dt", dropna=False)["spend"].sum()
               .sort_index())
    idx = pd.date_range(start=pd.to_datetime(plot_start), end=pd.to_datetime(wk_end), freq="D")
    daily = daily.reindex(idx, fill_value=0.0)
    ma7 = daily.rolling(7, min_periods=1).mean()

    # Classic lines
    ax.plot(daily.index, daily.values, marker="o", linewidth=2, label="Total Spend")
    ax.plot(ma7.index, ma7.values, linestyle="--", linewidth=2, label="7-Day Average")

    # Currency on Y
    ax.yaxis.set_major_formatter(FuncFormatter(lambda v, p: fmt_usd(v)))

    # X-axis like "Aug-28"
    def _fmt_mmm_day(xv, pos):
        try:
            dt = mdates.num2date(xv)
            return f"{dt.strftime('%b')}-{dt.day}"
        except Exception:
            return ""
    ax.xaxis.set_major_locator(mdates.AutoDateLocator(minticks=6, maxticks=10))
    ax.xaxis.set_major_formatter(FuncFormatter(_fmt_mmm_day))

    # Week-start dashed lines in light gray
    for vline in [pd.to_datetime(prev_wk_start), pd.to_datetime(wk_start)]:
        ax.axvline(vline, linestyle="--", linewidth=1, alpha=0.8, color="#d1d5db")

    ax.grid(axis="y", linestyle=":", alpha=0.35)
    ax.set_title("Total Spending — Last 28 Days", pad=8, fontsize=12)
    ax.legend(frameon=False, loc="upper left")

    # Extra breathing room
    ax.set_xlim(daily.index.min(), daily.index.max())
    ax.set_ylim(0, max(1.0, float(daily.values.max()) * 1.18))
    ax.margins(x=0.03, y=0.14)

fig.tight_layout(rect=[0.02, 0.02, 0.98, 0.98])
fig.savefig(P_LINE, bbox_inches="tight")
plt.close(fig)

# Prep week frames
cat_col = prefer_category_column(base)

cur_w  = base[(base["date_only"] >= wk_start) & (base["date_only"] <= wk_end)].copy()
prev_w = base[(base["date_only"] >= prev_wk_start) & (base["date_only"] <= prev_wk_end)].copy()

# Spend-only frames
if expenses_are_negative:
    cur_exp  = cur_w[cur_w["amount"] < 0].assign(spend=lambda x: x["amount"].abs())
    prev_exp = prev_w[prev_w["amount"] < 0].assign(spend=lambda x: x["amount"].abs())
else:
    cur_exp  = cur_w[cur_w["amount"] > 0].assign(spend=lambda x: x["amount"])
    prev_exp = prev_w[prev_w["amount"] > 0].assign(spend=lambda x: x["amount"])

# Clean categories and exclude admin buckets
EXCLUDE_CATS = {"Transfers","Income","Debt Payments","Fees"}
def _clean_cat(s):
    s = ("" if pd.isna(s) else str(s)).strip()
    return "Uncategorized" if s == "" or s.lower() in {"none","nan"} else s

cur_exp[cat_col]  = cur_exp[cat_col].apply(_clean_cat)
prev_exp[cat_col] = prev_exp[cat_col].apply(_clean_cat)

# ================= 2) Donut — Top Categories This Week (legend bottom; wider fig & bottom margin) =================
top_cats = (cur_exp[~cur_exp[cat_col].isin(EXCLUDE_CATS)]
            .groupby(cat_col, dropna=False)["spend"].sum()
            .sort_values(ascending=False).head(6))
labels = [str(i) for i in top_cats.index]
values = top_cats.values
total  = float(top_cats.sum()) or 1.0

# Bright palette
bright_colors = ["#6366F1","#F59E0B","#10B981","#EF4444","#3B82F6","#A855F7","#F97316","#06B6D4"]

fig, ax = plt.subplots(figsize=(8.2, 6.6), dpi=144)  # wider
if len(top_cats) == 0:
    ax.text(0.5, 0.5, "No category data for this week", ha="center", va="center", fontsize=12)
    ax.axis("off")
else:
    ret = ax.pie(values, labels=None, autopct=None, startangle=90, colors=bright_colors[:len(values)])
    wedges = ret[0]

    # Donut hole
    centre_circle = plt.Circle((0,0), 0.55, fc="white")
    fig.gca().add_artist(centre_circle)

    # Legend at the BOTTOM; no legend title
    pretty_labels = [f"{lab} — {fmt_usd(val)} ({val/total:,.0%})" for lab, val in zip(labels, values)]
    ax.legend(
        wedges, pretty_labels,
        loc="lower center",
        bbox_to_anchor=(0.5, -0.05),
        ncol=min(3, len(pretty_labels)),
        frameon=False
    )

    ax.set_title("Top Categories This Week", pad=8, fontsize=12)
    ax.axis('equal')

# Leave extra bottom room for the legend
fig.tight_layout(rect=[0.02, 0.10, 0.98, 0.98])
fig.savefig(P_DONUT, bbox_inches="tight")
plt.close(fig)

# ================= 3) Category Movement WoW (This Week - Prior) =================
cur_s  = (cur_exp[~cur_exp[cat_col].isin(EXCLUDE_CATS)]
          .groupby(cat_col, dropna=False)["spend"].sum())
prev_s = (prev_exp[~prev_exp[cat_col].isin(EXCLUDE_CATS)]
          .groupby(cat_col, dropna=False)["spend"].sum())

cats = sorted(set(cur_s.index) | set(prev_s.index))
cur_s  = cur_s.reindex(cats, fill_value=0.0)
prev_s = prev_s.reindex(cats, fill_value=0.0)
delta = (cur_s - prev_s)

# Keep top movers by absolute change
delta = delta[delta != 0].sort_values(key=np.abs, ascending=False).head(10)
labels_mv = list(delta.index)
vals_mv   = delta.values

fig, ax = plt.subplots(figsize=(11.8, 6.6), dpi=144)  # wider
if len(delta) == 0:
    ax.text(0.5, 0.5, "No category movement WoW", ha="center", va="center", fontsize=12)
    ax.axis("off")
else:
    abs_mv = np.abs(vals_mv)
    max_abs = float(abs_mv.max()) if len(abs_mv) else 1.0

    # ----- Percentile-based color intensity (robust to scale) -----
    # Rank by absolute change (ascending); convert to 0..1 percentile
    if len(abs_mv) > 1:
        ranks = abs_mv.argsort().argsort() + 1  # 1..N
        pct = ranks / float(len(abs_mv))        # 0..1
    else:
        pct = np.array([1.0])

    def hex_to_rgb(h):
        h = h.lstrip("#")
        return tuple(int(h[i:i+2], 16) for i in (0, 2, 4))

    def rgb_to_hex(rgb):
        return "#{:02X}{:02X}{:02X}".format(*rgb)

    def lerp(a, b, t):
        return int(a + (b - a) * t)

    def lerp_hex(c1, c2, t):
        r1,g1,b1 = hex_to_rgb(c1); r2,g2,b2 = hex_to_rgb(c2)
        return rgb_to_hex((lerp(r1,r2,t), lerp(g1,g2,t), lerp(b1,b2,t)))

    # Tailwind-ish ramps (light -> dark)
    RED_LIGHT   = "#FECACA"  # red-300
    RED_DARK    = "#B91C1C"  # red-700
    GREEN_LIGHT = "#BBF7D0"  # green-200
    GREEN_DARK  = "#065F46"  # emerald-900

    # Keep saturation between 0.35 and 1.0 to stay vivid
    t = 0.35 + 0.65 * pct
    colors = [
        lerp_hex(GREEN_LIGHT, GREEN_DARK, ti) if v < 0 else lerp_hex(RED_LIGHT, RED_DARK, ti)
        for v, ti in zip(vals_mv, t)
    ]

    # Bars
    y = np.arange(len(labels_mv))[::-1]
    ax.barh(y, vals_mv, height=0.55, color=colors, edgecolor="none")

    # Y labels
    ax.set_yticks(y, labels_mv)

    # Extra horizontal padding so labels never crowd the axis
    pad_abs = max(30.0, (float(max_abs) if len(abs_mv) else 1.0) * 0.22)
    xmin = -max_abs - pad_abs
    xmax =  max_abs + pad_abs
    ax.set_xlim(xmin, xmax)
    ax.margins(x=0.06)

    # Annotate each bar with $ change, placed slightly outside the bar
    for yi, v in zip(y, vals_mv):
        offset = max(10.0, (float(max_abs) if len(abs_mv) else 1.0) * 0.06)
        x_text = v + (offset if v >= 0 else -offset)
        ha = "left" if v >= 0 else "right"
        ax.text(x_text, yi, f"{fmt_usd(v)}", va="center", ha=ha, fontsize=10)

    # Zero reference line + grid
    ax.axvline(0, linestyle="--", linewidth=1, alpha=0.7, color="#9ca3af")
    ax.xaxis.set_major_formatter(FuncFormatter(lambda x, p: fmt_usd(x)))
    ax.grid(axis="x", linestyle=":", alpha=0.35)

    ax.set_title("Category Movement WoW (This Week vs Prior)", pad=10, fontsize=12)

fig.tight_layout(rect=[0.03, 0.02, 0.99, 0.98])
fig.savefig(P_MOVEMENT, bbox_inches="tight")
plt.close(fig)

print("📊 Charts written →", P_LINE.name, P_DONUT.name, P_MOVEMENT.name)


📊 Charts written → weekly_spend_line.png weekly_top_categories_donut.png weekly_category_movement.png


In [21]:
# --- Cell 17: Email dispatch (inline images only, NO attachments) ---
# Gmail-ready SMTP, kill switch, inline PNG charts via CID.
# Removes all file attachments (CSVs and image fallbacks).

import os, smtplib, ssl, re
from pathlib import Path
from email.message import EmailMessage

def _mask(s):
    if not s: return "<missing>"
    s = str(s)
    return (s[:3] + "…" + s[-3:]) if len(s) > 8 else "***"

INSIGHTS_DIR = DATA_PROCESSED / "insights"
INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)

# Primary artifacts for body/subject
C_HTML = INSIGHTS_DIR / "digest_combined_email.html"
C_SUBJ = INSIGHTS_DIR / "digest_combined_subject.txt"
W_HTML = INSIGHTS_DIR / "digest_latest_email.html"
W_SUBJ = INSIGHTS_DIR / "digest_latest_subject.txt"
M_HTML = INSIGHTS_DIR / "digest_mom_email.html"
M_SUBJ = INSIGHTS_DIR / "digest_mom_subject.txt"

# Chart PNGs (from Cell 16.5)
WEEKLY_LINE_PATH = INSIGHTS_DIR / "weekly_spend_line.png"
WEEKLY_PIE_PATH  = INSIGHTS_DIR / "weekly_top_categories_pie.png"

# ---------------- Kill switch ----------------
EMAIL_ENABLED = (os.getenv("EMAIL_ENABLED", "1") or "1").strip().lower() not in {"0","false","no","off"}
EMAIL_KILL_FILE = STATE_DIR / "EMAIL_KILL"
if EMAIL_KILL_FILE.exists():
    EMAIL_ENABLED = False

EMAIL_DRY_RUN = (os.getenv("EMAIL_DRY_RUN", "0") or "0").strip().lower() in {"1","true","yes","on"}

if not EMAIL_ENABLED:
    print("✋ Email sending disabled (kill switch). Set EMAIL_ENABLED=1 and remove .state/EMAIL_KILL to re-enable.")
else:
    # ---------------- SMTP (Gmail-ready) ----------------
    SMTP_HOST = os.getenv("SMTP_HOST", "smtp.gmail.com").strip()
    SMTP_PORT = int(os.getenv("SMTP_PORT", "587"))
    SMTP_SSL_PORT = int(os.getenv("SMTP_SSL_PORT", "465"))
    SMTP_USERNAME = (os.getenv("SMTP_USERNAME", "") or "").strip()
    SMTP_PASSWORD = (os.getenv("SMTP_PASSWORD", "") or "").replace(" ", "")  # trim spaces Google shows
    SMTP_STARTTLS = (os.getenv("SMTP_STARTTLS", "1") or "1").strip().lower() not in {"0","false","no","off"}

    EMAIL_FROM = (os.getenv("EMAIL_FROM", "") or "").strip()
    EMAIL_TO   = (os.getenv("EMAIL_TO", "") or "").strip()
    EMAIL_CC   = (os.getenv("EMAIL_CC", "") or "").strip()
    EMAIL_BCC  = (os.getenv("EMAIL_BCC", "") or "").strip()

    SUBJECT_OVERRIDE = os.getenv("EMAIL_SUBJECT_OVERRIDE", "").strip()
    BODY_HTML_OVERRIDE_PATH = os.getenv("EMAIL_BODY_HTML_PATH", "").strip()

    # Minimal validation
    missing = [k for k,v in {
        "SMTP_HOST": SMTP_HOST,
        "SMTP_USERNAME": SMTP_USERNAME,
        "SMTP_PASSWORD": SMTP_PASSWORD,
        "EMAIL_FROM": EMAIL_FROM,
        "EMAIL_TO": EMAIL_TO,
    }.items() if not v]
    if missing:
        raise RuntimeError("Email config missing: " + ", ".join(missing))

    # ---------------- Subject & HTML body ----------------
    def _read_text(p: Path) -> str:
        try: return (p.read_text(encoding="utf-8") or "").strip()
        except Exception: return ""

    def _read_html(p: Path) -> str:
        try: return p.read_text(encoding="utf-8")
        except Exception: return ""

    subject = SUBJECT_OVERRIDE or _read_text(C_SUBJ) or _read_text(W_SUBJ) or _read_text(M_SUBJ) or "AI Credit Card Dashboard — Digest"

    if BODY_HTML_OVERRIDE_PATH:
        body_html = Path(BODY_HTML_OVERRIDE_PATH).read_text(encoding="utf-8")
    else:
        body_html = _read_html(C_HTML) or _read_html(W_HTML) or _read_html(M_HTML)

    if not body_html:
        body_html = (
            "<!doctype html><meta charset='utf-8'>"
            "<div style='font-family:Segoe UI,system-ui,-apple-system;line-height:1.55;font-size:14px;color:#111827;'>"
            "<h1 style='margin:0 0 8px 0;font-size:18px;'>AI Credit Card Dashboard — Digest</h1>"
            "<p>No HTML digest was found this run. Check earlier cells for generation status.</p>"
            "</div>"
        )

    # Plain-text fallback
    def _html_to_text(h: str) -> str:
        t = re.sub(r"<(br|/p|/li)>", "\n", h, flags=re.IGNORECASE)
        t = re.sub(r"<[^>]+>", "", t)
        return re.sub(r"\n{3,}", "\n\n", t).strip()
    body_text = _html_to_text(body_html)

    # ---------------- Prepare image refs & append <img> once ----------------
    img_refs = []
    if (INSIGHTS_DIR / "weekly_spend_line.png").exists():
        img_refs.append(("weekly_line", INSIGHTS_DIR / "weekly_spend_line.png"))
    if (INSIGHTS_DIR / "weekly_top_categories_donut.png").exists():
        img_refs.append(("weekly_top_categories_donut", INSIGHTS_DIR / "weekly_top_categories_donut.png"))
    if (INSIGHTS_DIR / "weekly_category_movement.png").exists():
        img_refs.append(("weekly_category_movement", INSIGHTS_DIR / "weekly_category_movement.png"))

    # ---------------- Build email ----------------
    def _split_emails(s): return [e.strip() for e in s.split(",") if e.strip()]
    rcpts = []
    seen = set()
    for e in _split_emails(EMAIL_TO) + _split_emails(EMAIL_CC) + _split_emails(EMAIL_BCC):
        if e.lower() not in seen:
            seen.add(e.lower()); rcpts.append(e)
    if not rcpts:
        raise RuntimeError("No recipients found (EMAIL_TO/CC/BCC).")

    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = EMAIL_FROM
    msg["To"] = ", ".join(_split_emails(EMAIL_TO))
    if EMAIL_CC: msg["Cc"] = ", ".join(_split_emails(EMAIL_CC))

    # Force multipart/alternative container and add HTML
    msg.set_content(body_text)
    msg.make_alternative()
    msg.add_alternative(body_html, subtype="html")
    html_part = msg.get_body(preferencelist=("html",))

    # Inline embed only (no fallback attachments)
    embedded = []
    if html_part is not None and img_refs:
        for cid, pth in img_refs:
            try:
                with open(pth, "rb") as f:
                    html_part.add_related(
                        f.read(),
                        maintype="image",
                        subtype="png",
                        cid=f"<{cid}>",
                        filename=pth.name
                    )
                embedded.append(pth.name)
            except Exception as e:
                print(f"⚠️ Inline embed failed for {pth.name}: {e}. Skipping image (no attachments by policy).")
    elif img_refs:
        print("⚠️ Could not locate HTML part; skipping inline images (no attachments by policy).")

    # ---------------- Send (STARTTLS then SSL fallback) ----------------
    context = ssl.create_default_context()

    def _try_starttls():
        with smtplib.SMTP(SMTP_HOST, SMTP_PORT, timeout=60) as server:
            server.ehlo()
            if SMTP_STARTTLS: server.starttls(context=context); server.ehlo()
            server.login(SMTP_USERNAME, SMTP_PASSWORD)
            if EMAIL_DRY_RUN:
                print("✅ STARTTLS login OK (dry-run)."); return
            server.send_message(msg, to_addrs=rcpts)

    def _try_ssl():
        with smtplib.SMTP_SSL(SMTP_HOST, SMTP_SSL_PORT, context=context, timeout=60) as server:
            server.ehlo()
            server.login(SMTP_USERNAME, SMTP_PASSWORD)
            if EMAIL_DRY_RUN:
                print("✅ SSL login OK (dry-run)."); return
            server.send_message(msg, to_addrs=rcpts)

    try:
        _try_starttls()
        print(f"📧 Email sent via STARTTLS to {', '.join(rcpts)} — subject: {subject}")
    except smtplib.SMTPAuthenticationError as e:
        print("❌ STARTTLS auth failed:", e.smtp_error.decode() if hasattr(e, "smtp_error") else str(e))
        print("…attempting SSL on port", SMTP_SSL_PORT)
        _try_ssl()
        print(f"📧 Email sent via SSL to {', '.join(rcpts)} — subject: {subject}")

    print("Inline images:", embedded)
    print("Attachments: none")


📧 Email sent via STARTTLS to kosisonna.ugo@gmail.com — subject: Weekly Financial Update: Major Spending Reduction Achieved | September Financial Overview: Spending Down, Income Up!
Inline images: ['weekly_spend_line.png', 'weekly_top_categories_donut.png', 'weekly_category_movement.png']
Attachments: none


In [22]:
import os, socket, ssl, smtplib
from pathlib import Path

def _mask(s):
    if not s: return "<missing>"
    s = str(s)
    return (s[:3] + "…" + s[-3:]) if len(s) > 8 else "***"

STATE_DIR = Path(globals().get("STATE_DIR", Path(".state")))
kill = (STATE_DIR / "EMAIL_KILL").exists()

SMTP_HOST = (os.getenv("SMTP_HOST", "smtp.gmail.com") or "").strip()
SMTP_PORT = int(os.getenv("SMTP_PORT", "587"))
SMTP_SSL_PORT = int(os.getenv("SMTP_SSL_PORT", "465"))
SMTP_USERNAME = (os.getenv("SMTP_USERNAME", "") or "").strip()
SMTP_PASSWORD = (os.getenv("SMTP_PASSWORD", "") or "").strip()
SMTP_STARTTLS = (os.getenv("SMTP_STARTTLS", "1") or "1").strip().lower() not in {"0","false","no","off"}

EMAIL_ENABLED = (os.getenv("EMAIL_ENABLED", "1") or "1").strip().lower() not in {"0","false","no","off"}
EMAIL_DRY_RUN = (os.getenv("EMAIL_DRY_RUN", "0") or "0").strip().lower() in {"1","true","yes","on"}
EMAIL_FROM = (os.getenv("EMAIL_FROM", "") or "").strip()
EMAIL_TO   = (os.getenv("EMAIL_TO", "") or "").strip()
EMAIL_CC   = (os.getenv("EMAIL_CC", "") or "").strip()
EMAIL_BCC  = (os.getenv("EMAIL_BCC", "") or "").strip()

print("=== EMAIL DIAG ===")
print("EMAIL_ENABLED:", EMAIL_ENABLED, "| kill-switch file present:", kill)
print("EMAIL_DRY_RUN:", EMAIL_DRY_RUN)
print("SMTP_HOST:", SMTP_HOST, " | STARTTLS:", SMTP_STARTTLS)
print("SMTP_PORT:", SMTP_PORT, " | SMTP_SSL_PORT:", SMTP_SSL_PORT)
print("SMTP_USERNAME:", _mask(SMTP_USERNAME))
print("SMTP_PASSWORD:", _mask(SMTP_PASSWORD))
print("EMAIL_FROM:", EMAIL_FROM)
print("EMAIL_TO:", EMAIL_TO)
print("EMAIL_CC:", EMAIL_CC)
print("EMAIL_BCC:", EMAIL_BCC)

# quick recipient sanity
rcpts = [e.strip() for e in (EMAIL_TO + "," + EMAIL_CC + "," + EMAIL_BCC).split(",") if e.strip()]
print("Recipients parsed:", rcpts)

# check ports reachability
def check_port(host, port, ssl_wrap=False):
    try:
        s = socket.create_connection((host, port), timeout=8)
        if ssl_wrap:
            ctx = ssl.create_default_context()
            s = ctx.wrap_socket(s, server_hostname=host)
        s.close()
        return True
    except Exception as e:
        print(f"Port check fail {host}:{port}{' (ssl)' if ssl_wrap else ''} ->", repr(e))
        return False

print("TCP reach 587:", check_port(SMTP_HOST, SMTP_PORT))
print("TCP reach 465 (ssl):", check_port(SMTP_HOST, SMTP_SSL_PORT, ssl_wrap=True))

# optional: auth probe without sending
try:
    with smtplib.SMTP(SMTP_HOST, SMTP_PORT, timeout=15) as server:
        server.ehlo()
        if SMTP_STARTTLS:
            import ssl as _ssl
            server.starttls(context=_ssl.create_default_context()); server.ehlo()
        server.login(SMTP_USERNAME, SMTP_PASSWORD)
        print("SMTP AUTH OK on STARTTLS")
except smtplib.SMTPAuthenticationError as e:
    print("SMTP AUTH FAIL on STARTTLS:", e.smtp_error.decode() if hasattr(e, "smtp_error") else str(e))
except Exception as e:
    print("SMTP STARTTLS path error:", repr(e))

try:
    with smtplib.SMTP_SSL(SMTP_HOST, SMTP_SSL_PORT, timeout=15) as server:
        server.ehlo()
        server.login(SMTP_USERNAME, SMTP_PASSWORD)
        print("SMTP AUTH OK on SSL")
except smtplib.SMTPAuthenticationError as e:
    print("SMTP AUTH FAIL on SSL:", e.smtp_error.decode() if hasattr(e, "smtp_error") else str(e))
except Exception as e:
    print("SMTP SSL path error:", repr(e))

print("=== END EMAIL DIAG ===")


=== EMAIL DIAG ===
EMAIL_ENABLED: True | kill-switch file present: False
EMAIL_DRY_RUN: False
SMTP_HOST: smtp.gmail.com  | STARTTLS: True
SMTP_PORT: 587  | SMTP_SSL_PORT: 465
SMTP_USERNAME: kos…com
SMTP_PASSWORD: hwq…lvo
EMAIL_FROM: kosisonna.ugo@gmail.com
EMAIL_TO: kosisonna.ugo@gmail.com
EMAIL_CC: 
EMAIL_BCC: 
Recipients parsed: ['kosisonna.ugo@gmail.com']
TCP reach 587: True
TCP reach 465 (ssl): True
SMTP AUTH OK on STARTTLS
SMTP AUTH OK on SSL
=== END EMAIL DIAG ===
