In [38]:
# --- Cell 1: Robust setup + centralized Azure client factory ---
import os, re, json, math, hashlib, ast
from pathlib import Path
from datetime import datetime, timedelta, date
from collections import defaultdict

import numpy as np
import pandas as pd

# Ensure OpenAI SDK is available (Azure OpenAI compatible)
try:
    from openai import OpenAI
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "openai"])
    from openai import OpenAI

# --- Paths (prefer GITHUB_WORKSPACE, never walk above repo) ---
cwd = Path.cwd().resolve()
gw = os.getenv("GITHUB_WORKSPACE")
start = Path(gw).resolve() if gw else cwd
repo_root = next((p for p in [start, *start.parents] if (p / ".git").exists()), start)
REPO = repo_root

DATA_RAW       = REPO / "data" / "raw"
DATA_PROCESSED = REPO / "data" / "processed"
CONFIG_DIR     = REPO / "config"
STATE_DIR      = REPO / ".state"
VECTOR_DIR     = REPO / "vectorstore"

MERCHANT_DIM_PATH  = CONFIG_DIR / "merchants_dim.csv"
LATEST_CSV_PATH    = DATA_RAW / "latest.csv"
ENRICHED_OUT_PATH  = DATA_RAW / "latest.csv"                # overwrite stable file for Power BI
ENRICHED_COPY_PATH = DATA_PROCESSED / "latest_enriched.csv"
DIGEST_PATH        = DATA_PROCESSED / "digest_latest.txt"
GOAL_PATH          = DATA_PROCESSED / "goal_nudges_latest.txt"
EMBEDDINGS_PATH    = VECTOR_DIR / "embeddings.parquet"

# Ensure dirs
DATA_RAW.mkdir(parents=True, exist_ok=True)
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
VECTOR_DIR.mkdir(parents=True, exist_ok=True)

# Config flags
MAP_ALL        = True        # label unmapped merchants via Azure (if enabled)
GOAL_SAVINGS   = 1000.0      # monthly savings target for nudges
ANOMALY_Z      = 2.5         # z-score threshold for anomalies

# --- Azure OpenAI env ---
AZURE_OPENAI_ENDPOINT   = os.getenv("AZURE_OPENAI_ENDPOINT", "").rstrip("/")
AZURE_OPENAI_API_KEY    = os.getenv("AZURE_OPENAI_API_KEY", "")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT", "")   # chat model (deployment name)
AZURE_OPENAI_EMBEDDINGS = os.getenv("AZURE_OPENAI_EMBEDDINGS", "")   # embeddings deployment name
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview")

def _have_azure(deploy: str) -> bool:
    return bool(AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY and deploy)

def make_azure_client(deployment: str) -> OpenAI | None:
    """Factory for Azure OpenAI client bound to a specific deployment."""
    if not _have_azure(deployment):
        return None
    # For Azure, base_url points at the deployment; api-version goes on every request
    return OpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        base_url=f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{deployment}",
        default_query={"api-version": AZURE_OPENAI_API_VERSION},
        default_headers={"api-key": AZURE_OPENAI_API_KEY},
    )

# Shared clients (None if not configured)
chat_client  = make_azure_client(AZURE_OPENAI_DEPLOYMENT) if AZURE_OPENAI_DEPLOYMENT else None
embed_client = make_azure_client(AZURE_OPENAI_EMBEDDINGS) if AZURE_OPENAI_EMBEDDINGS else None
azure_enabled = chat_client is not None

if not azure_enabled:
    print("⚠️ Azure OpenAI (chat) not fully set; AI summaries will fall back to deterministic base.")
if embed_client is None:
    print("⚠️ Azure OpenAI (embeddings) not set; embeddings cache will be skipped.")

print("✅ Setup complete.")


⚠️ Azure OpenAI (chat) not fully set; AI summaries will fall back to deterministic base.
⚠️ Azure OpenAI (embeddings) not set; embeddings cache will be skipped.
✅ Setup complete.


In [39]:
# Load latest.csv (from build_latest.ipynb), robust path resolution
candidates = [
    LATEST_CSV_PATH,
    Path(os.getenv("OUTPUT_DIR", str(REPO / "data" / "raw"))) / "latest.csv",
    REPO / "data" / "raw" / "latest.csv",
]
src = next((p for p in candidates if p.exists()), None)
if src is None:
    raise FileNotFoundError(
        "latest.csv not found.\nChecked:\n- " + "\n- ".join(str(p) for p in candidates) +
        f"\nCWD={Path.cwd()}  REPO={REPO}"
    )

df = pd.read_csv(src)

# Ensure expected columns exist
expected = {"date","name","merchant_name","category","amount","bank_name"}
missing = expected - set(df.columns)
if missing:
    raise ValueError(f"latest.csv missing columns: {missing}")

# Ensure card_name exists (fallback to bank_name)
if "card_name" not in df.columns:
    df["card_name"] = df["bank_name"]

# Coerce types
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")

# Basic cleanups
df["merchant_name"] = df["merchant_name"].fillna("")
df["name"] = df["name"].fillna("")

# A robust unique id for each transaction (for embeddings & caching)
def make_txn_uid(row):
    key = f"{row.get('date')}_{row.get('name')}_{row.get('merchant_name')}_{row.get('amount')}_{row.get('bank_name')}"
    return hashlib.sha1(key.encode("utf-8")).hexdigest()

df["txn_uid"] = df.apply(make_txn_uid, axis=1)

# Global sign convention: True if expenses are negative numbers
EXPENSES_ARE_NEGATIVE = (df["amount"] < 0).sum() > (df["amount"] > 0).sum()
print(f"Loaded {len(df)} transactions. expenses_are_negative={EXPENSES_ARE_NEGATIVE}")

Loaded 148 transactions. expenses_are_negative=False


In [40]:
# --- Cell 3: Normalize merchant_key consistently with build_latest ---
import numpy as np
import re

def merchant_key_from(name: str) -> str:
    s = (name or "").upper()
    s = re.sub(r"APPLE PAY ENDING IN \d{4}", "", s)
    s = re.sub(r"#\d{2,}", "", s)              # strip store numbers like #1234
    s = re.sub(r"\d+", "", s)                  # kill stray digits
    s = re.sub(r"[^A-Z&\s]", " ", s)           # keep letters, ampersand, spaces
    s = re.sub(r"\s+", " ", s).strip()
    return s or "UNKNOWN"

# Use 'merchant_name' when available, else 'name'
df["merchant_key"] = np.where(
    df["merchant_name"].astype(str).str.len() > 0,
    df["merchant_name"].map(merchant_key_from),
    df["name"].map(merchant_key_from)
)

print("Merchant keys normalized (consistent with build_latest).")


Merchant keys normalized (consistent with build_latest).


In [41]:
# Load or initialize merchant dimension table
dim_cols = [
    "merchant_key", "display_name", "category", "subcategory", "tags",
    "source", "confidence", "last_updated"
]
if MERCHANT_DIM_PATH.exists():
    dim = pd.read_csv(MERCHANT_DIM_PATH)
    # ensure columns
    for c in dim_cols:
        if c not in dim.columns:
            dim[c] = np.nan
    dim = dim[dim_cols]
else:
    dim = pd.DataFrame(columns=dim_cols)

# Left-join to see which keys are already mapped
df = df.merge(dim, on="merchant_key", how="left", suffixes=("", "_dim"))

# Identify unmapped merchants
unmapped_keys = sorted(k for k in df.loc[df["display_name"].isna(), "merchant_key"].unique() if k != "UNKNOWN")
print(f"Unmapped merchants needing AI labels: {len(unmapped_keys)}")


Unmapped merchants needing AI labels: 13


In [42]:
# --- Cell 6: Label unmapped merchants via Azure (single-call) ---
new_rows = []
if len(unmapped_keys) and ('chat_client' in globals()) and (chat_client is not None) and MAP_ALL:
    print(f"Labeling {len(unmapped_keys)} merchants (single-call mode)...")
    for idx, mk in enumerate(unmapped_keys, 1):
        try:
            item = azure_label_one(mk)
        except Exception as e:
            print(f"⚠️ Label fail for '{mk}': {e}")
            continue

        now = datetime.utcnow().isoformat()
        if item:
            new_rows.append({
                "merchant_key": mk,
                "display_name": item["display_name"],
                "category": item["category"],
                "subcategory": item["subcategory"],
                "tags": ",".join(item["tags"]),
                "source": "azure",
                "confidence": 0.90,
                "last_updated": now
            })

    if new_rows:
        dim_new = pd.DataFrame(new_rows)
        dim_all = pd.concat([dim, dim_new], ignore_index=True)
        dim_all = dim_all.sort_values("last_updated").drop_duplicates(["merchant_key"], keep="last")
        MERCHANT_DIM_PATH.parent.mkdir(parents=True, exist_ok=True)
        dim_all.to_csv(MERCHANT_DIM_PATH, index=False)
        dim = dim_all
        print(f"✅ Added {len(new_rows)} merchant mappings (single-call).")
    else:
        print("No new mappings added (single-call).")
else:
    print("No new mappings needed or AI disabled.")


No new mappings needed or AI disabled.


In [43]:
# --- Cell 6B: Persist merchants_dim.csv (idempotent) ---

# Toggle if you ever want to skip writing on runs with no changes
PERSIST_MERCHANT_DIM = True

# dim_cols defined in Cell 4; dim may be updated in Cell 6
if not isinstance(PERSIST_MERCHANT_DIM, bool):
    PERSIST_MERCHANT_DIM = True

if PERSIST_MERCHANT_DIM:
    MERCHANT_DIM_PATH.parent.mkdir(parents=True, exist_ok=True)

    if 'dim' in globals() and isinstance(dim, pd.DataFrame) and len(dim):
        # ensure expected columns/order exist before save
        for c in dim_cols:
            if c not in dim.columns:
                dim[c] = np.nan
        dim = dim[dim_cols]

        dim.to_csv(MERCHANT_DIM_PATH, index=False)
        print(f"📝 merchants_dim.csv saved ({len(dim)} rows) → {MERCHANT_DIM_PATH}")
    else:
        # either no new mappings this run or dim was empty; ensure file exists
        if not MERCHANT_DIM_PATH.exists():
            pd.DataFrame(columns=dim_cols).to_csv(MERCHANT_DIM_PATH, index=False)
            print(f"📝 Created headers-only merchants_dim.csv → {MERCHANT_DIM_PATH}")
        else:
            print("ℹ️ merchants_dim.csv already exists; no changes to sync.")
else:
    print("PERSIST_MERCHANT_DIM=False → skipping merchants_dim.csv persistence.")

📝 merchants_dim.csv saved (64 rows) → C:\Users\kosis\Downloads\Automation\spending-dashboard\config\merchants_dim.csv


In [44]:
df = df.drop(columns=["display_name","category","subcategory","tags","source","confidence","last_updated"], errors="ignore")
df = df.merge(dim, on="merchant_key", how="left", suffixes=("", "_dim"))

# Final output columns (feel free to adjust ordering)
final_cols = [
    "txn_uid", "date", "bank_name", "card_name",
    "merchant_key", "display_name",
    "category", "subcategory", "tags",
    "name", "merchant_name", "amount"
]
# Ensure existence even if null
for c in final_cols:
    if c not in df.columns:
        df[c] = np.nan

# Canonical display name fallback
df["display_name"] = df["display_name"].fillna(df["merchant_key"])

print("Labels joined.")


Labels joined.


In [45]:
# --- Cell 8: Subscription detection (sign-aware, idempotent) ---

def detect_subscription(group: pd.DataFrame) -> bool:
    g = group.dropna(subset=["date", "amount"]).sort_values("date")
    if len(g) < 3:
        return False

    # use absolute spend magnitudes for stability
    amounts = g["amount"].abs().to_numpy(dtype=float)
    amounts = amounts[np.isfinite(amounts)]
    if amounts.size < 3:
        return False

    # gaps in days
    ts_ns = g["date"].astype("int64").to_numpy()
    gaps_days = np.diff(ts_ns) / 86_400_000_000_000
    if gaps_days.size < 2:
        return False

    monthlyish_med = float(np.median(gaps_days))
    frac_monthly = float(np.mean((gaps_days >= 27) & (gaps_days <= 33))) if gaps_days.size else 0.0

    mu = float(np.mean(amounts))
    cv = float(np.std(amounts) / (mu + 1e-9)) if mu > 0 else 1.0

    return (27 <= monthlyish_med <= 33) and (frac_monthly >= 0.6) and (cv <= 0.2)

# Clean any leftover artifacts from previous runs (e.g., is_subscription_x from merges)
for col in [c for c in df.columns if c.startswith("is_subscription") and c != "is_subscription"]:
    df.drop(columns=col, inplace=True, errors="ignore")

# Respect your sign convention
EXPENSES_ARE_NEGATIVE = (df["amount"] < 0).sum() > (df["amount"] > 0).sum()
if EXPENSES_ARE_NEGATIVE:
    outflows = df.loc[(df["amount"] < 0) & df["date"].notna(), ["display_name", "date", "amount"]].copy()
    outflows["amount"] = outflows["amount"].abs()
else:
    outflows = df.loc[(df["amount"] > 0) & df["date"].notna(), ["display_name", "date", "amount"]].copy()

subs_map = {}
for disp, g in outflows.groupby("display_name", dropna=False):
    try:
        subs_map[disp] = bool(detect_subscription(g[["date", "amount"]]))
    except Exception:
        subs_map[disp] = False

df["is_subscription"] = df["display_name"].map(subs_map).fillna(False).astype(bool)

print(f"Subscriptions flagged: {int(df['is_subscription'].sum())} candidates.")


Subscriptions flagged: 0 candidates.


  df["is_subscription"] = df["display_name"].map(subs_map).fillna(False).astype(bool)


In [46]:
def zscores(x):
    mu = np.mean(x)
    sd = np.std(x)
    if sd == 0:
        return np.zeros_like(x)
    return (x - mu) / sd

df["amount_abs"] = df["amount"].abs()
df["z_by_merchant"] = (
    df.groupby("display_name", dropna=False)["amount_abs"]
      .transform(zscores)
)
df["is_anomaly"] = (df["z_by_merchant"] >= ANOMALY_Z)

print(f"Anomalies flagged: {int(df['is_anomaly'].sum())}")


Anomalies flagged: 1


In [47]:
today = pd.Timestamp(date.today())
cut1 = today - pd.Timedelta(days=30)
cut2 = today - pd.Timedelta(days=60)

cur = df[(df["date"] > cut1) & (df["amount"] > 0)]
prev = df[(df["date"] > cut2) & (df["date"] <= cut1) & (df["amount"] > 0)]

cur_total = cur["amount"].sum()
prev_total = prev["amount"].sum()
delta = cur_total - prev_total

top_merchants = (
    cur.groupby("display_name", dropna=False)["amount"].sum()
       .sort_values(ascending=False)
       .head(3)
)

top_category = (
    cur.groupby("category", dropna=False)["amount"].sum()
       .sort_values(ascending=False)
       .head(1)
)
top_category_name = top_category.index[0] if len(top_category) else "N/A"
top_category_amt = float(top_category.iloc[0]) if len(top_category) else 0.0

digest = []
digest.append(f"Period: last 30 days vs prior 30")
digest.append(f"Spend: ${cur_total:,.2f} ({'+' if delta>=0 else ''}{delta:,.2f} vs prior)")
digest.append("Top 3 merchants: " + ", ".join([f"{m} (${v:,.2f})" for m, v in top_merchants.items()]))
digest.append(f"Biggest category driver: {top_category_name} (${top_category_amt:,.2f})")

DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
with open(DIGEST_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(digest))

print("\n".join(digest))
print(f"\nSaved digest → {DIGEST_PATH}")


Period: last 30 days vs prior 30
Spend: $5,541.28 (+2,537.77 vs prior)
Top 3 merchants: WITHDRAWAL ALLY TYPE ALLY PAYMT ID CO ALLY NAME KOSISONNA UGOCHUKW ACH ECC WEB ACH TRACE ($1,494.22), WITHDRAWAL AMEX EPAYMENT TYPE ACH PMT ID DATA ER AM CO AMEX EPAYMENT NAME KOSISONNA UGOCHUKWU ACH ECC WEB ACH TRACE ($777.78), PETAL ($738.96)
Biggest category driver: nan ($2,868.17)

Saved digest → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\digest_latest.txt


In [48]:
# Suggest % cuts in top categories to reach GOAL_SAVINGS over next 30 days
cur_by_cat = (
    df[(df["date"] > cut1) & (df["amount"] > 0)]
      .groupby("category", dropna=False)["amount"].sum()
      .sort_values(ascending=False)
)

nudges = []
remaining = GOAL_SAVINGS
for cat, amt in cur_by_cat.items():
    if remaining <= 0:
        break
    # propose cutting up to 40% of this category
    max_cut = 0.40 * amt
    if max_cut <= 0:
        continue
    pct_needed = min(remaining / amt, 0.40)  # cap at 40%
    if pct_needed > 0:
        nudges.append((cat, pct_needed))
        remaining -= pct_needed * amt

lines = [f"Goal: Save ${GOAL_SAVINGS:,.0f} next 30 days"]
if nudges:
    for (cat, pct) in nudges:
        lines.append(f"- Cut {cat} by {pct*100:.0f}%")
else:
    lines.append("- Spending already low or insufficient category concentration to suggest cuts.")

with open(GOAL_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("\n".join(lines))
print(f"\nSaved goal nudges → {GOAL_PATH}")


Goal: Save $1,000 next 30 days
- Cut nan by 35%

Saved goal nudges → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\goal_nudges_latest.txt


In [49]:
# --- Cell 12: Build text and cache embeddings (reuse shared embed_client) ---
def build_search_text(row):
    parts = [
        str(row.get("display_name") or ""),
        str(row.get("name") or ""),
        str(row.get("merchant_name") or ""),
        str(row.get("category") or ""),
        str(row.get("subcategory") or ""),
        str(row.get("tags") or ""),
    ]
    return " | ".join(p for p in parts if p)

# Limit to recent rows for cost control
embed_df = df.sort_values("date", ascending=False).head(500).copy()
embed_df["search_text"] = embed_df.apply(build_search_text, axis=1)

# Load existing cache
if EMBEDDINGS_PATH.exists():
    old = pd.read_parquet(EMBEDDINGS_PATH)
else:
    old = pd.DataFrame(columns=["txn_uid","embedding"])

existing = set(old["txn_uid"]) if len(old) else set()
to_embed = embed_df[~embed_df["txn_uid"].isin(existing)][["txn_uid", "search_text"]]

def get_embeddings(texts: list[str]) -> list | None:
    if embed_client is None:
        return None
    # The model name is the deployment name on Azure
    res = embed_client.embeddings.create(model=AZURE_OPENAI_EMBEDDINGS, input=list(texts))
    # Return raw vectors (list[float]) as provided
    return [d.embedding for d in res.data]

new_rows = []
if len(to_embed) and embed_client is not None:
    B = 64
    for i in range(0, len(to_embed), B):
        chunk = to_embed.iloc[i:i+B]
        vecs = get_embeddings(chunk["search_text"].tolist())
        if vecs is None:
            break
        for uid, vec in zip(chunk["txn_uid"].tolist(), vecs):
            if vec is not None:
                new_rows.append({"txn_uid": uid, "embedding": vec})

if new_rows:
    add = pd.DataFrame(new_rows)
    merged = pd.concat([old, add], ignore_index=True).drop_duplicates("txn_uid", keep="last")
    merged.to_parquet(EMBEDDINGS_PATH, index=False)
    print(f"Embeddings cached: +{len(add)} → total {len(merged)}")
else:
    print("No new embeddings added (none missing or embeddings disabled).")


No new embeddings added (none missing or embeddings disabled).


In [50]:
# Reorder and save
save_cols = [
    "txn_uid","date","bank_name","card_name",
    "display_name","merchant_key",
    "category","subcategory","tags",
    "name","merchant_name",
    "amount","is_subscription","is_anomaly","z_by_merchant"
]

for c in save_cols:
    if c not in df.columns:
        df[c] = np.nan

df_out = df[save_cols].sort_values(["date", "bank_name"], ascending=[False, True])

# Write both the stable file (Power BI) and a processed copy
df_out.to_csv(ENRICHED_OUT_PATH, index=False)
df_out.to_csv(ENRICHED_COPY_PATH, index=False)

print(f"✅ Enriched CSV saved → {ENRICHED_OUT_PATH}")
print(f"📄 Copy saved → {ENRICHED_COPY_PATH}")


✅ Enriched CSV saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw\latest.csv
📄 Copy saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\latest_enriched.csv


In [51]:
# --- Cell 14: Weekly Executive Digest (WoW) — Wealthfront excluded, Apple Cash kept (Friends) + Azure overlay + flat CSV ---
import os, re, json
from pathlib import Path
from tenacity import retry, stop_after_attempt, wait_exponential

INSIGHTS_DIR = DATA_PROCESSED / "insights"
INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)
DIGEST_JSON = INSIGHTS_DIR / "digest_latest.json"
DIGEST_MD   = INSIGHTS_DIR / "digest_latest.md"
DIGEST_FLAT = INSIGHTS_DIR / "digest_latest_flat.csv"   # PBI-friendly

# -------- 0) Digest-only filters & helpers --------
def _upper_text_cols(frame):
    # Safe uppercase concat of display_name / merchant_name / name
    for c in ("display_name","merchant_name","name"):
        if c not in frame.columns:
            frame[c] = ""
    return (frame["display_name"].astype(str) + " " +
            frame["merchant_name"].astype(str) + " " +
            frame["name"].astype(str)).str.upper()

def apply_digest_filters(frame: pd.DataFrame) -> pd.DataFrame:
    """
    Exclude Wealthfront moves (HYSA transfers/withdrawals), but do NOT exclude Apple Cash.
    """
    txt = _upper_text_cols(frame)
    is_wealthfront = txt.str.contains(r"\bWEALTHFRONT\b", na=False)
    is_apple_cash  = txt.str.contains(r"\bAPPLE CASH\b", na=False)
    # Drop Wealthfront unless it's also Apple Cash (rare, but explicit)
    keep_mask = ~(is_wealthfront & ~is_apple_cash)
    out = frame.loc[keep_mask].copy()

    # For digest presentation only: tidy Apple Cash labeling if blank/noisy
    # - Display name -> "APPLE CASH"
    # - Category -> "Transfers: Friends" if blank/NaN (you said it's usually sending to a friend)
    ac_mask = _upper_text_cols(out).str.contains(r"\bAPPLE CASH\b", na=False)
    if "display_name" not in out.columns:
        out["display_name"] = out.get("merchant_key", "")
    out.loc[ac_mask, "display_name"] = "APPLE CASH"
    if "category" not in out.columns:
        out["category"] = ""
    cat_blank = out["category"].isna() | (out["category"].astype(str).str.strip() == "")
    out.loc[ac_mask & cat_blank, "category"] = "Transfers to Friends/Family"
    return out

# -------- 1) Last COMPLETED week (Mon–Sun), compare WoW --------
try:
    now = pd.Timestamp.now(tz="America/Los_Angeles").normalize()
except Exception:
    now = pd.Timestamp.now().normalize()

wd = int(now.weekday())  # Mon=0 ... Sun=6
days_to_last_sun = 7 if wd == 6 else (wd + 1)
wk_end   = (now - pd.Timedelta(days=days_to_last_sun)).date()      # inclusive Sunday
wk_start = (pd.Timestamp(wk_end) - pd.Timedelta(days=6)).date()    # prior Monday
prev_end = (pd.Timestamp(wk_end) - pd.Timedelta(days=7)).date()
prev_start = (pd.Timestamp(prev_end) - pd.Timedelta(days=6)).date()

# Digest view: apply Wealthfront exclusion + Apple Cash tidy BEFORE slicing weeks
df_w_all = apply_digest_filters(df.copy())
df_w_all["date_only"] = df_w_all["date"].dt.date

cur  = df_w_all[(df_w_all["date_only"] >= wk_start) & (df_w_all["date_only"] <= wk_end)]
prev = df_w_all[(df_w_all["date_only"] >= prev_start) & (df_w_all["date_only"] <= prev_end)]

# -------- 2) Robust polarity inference (current-week first, then fallback to full filtered set) --------
def infer_orientation(frame) -> str | None:
    a = frame["amount"].dropna()
    pos = int((a > 0).sum())
    neg = int((a < 0).sum())
    if pos == 0 and neg == 0:
        return None
    return "neg" if neg > pos else "pos"

orient_cur = infer_orientation(cur)
orient_all = infer_orientation(df_w_all)
orient = orient_cur or orient_all or "pos"  # default to positive-outflow if ambiguous

def spend_series(frame, orient_hint: str) -> pd.Series:
    a = frame["amount"].dropna()
    if orient_hint == "neg":
        s = a[a < 0].abs()
        if s.empty and (a > 0).any():
            s = a[a > 0]
    else:
        s = a[a > 0]
        if s.empty and (a < 0).any():
            s = a[a < 0].abs()
    return s

def income_series(frame, orient_hint: str) -> pd.Series:
    a = frame["amount"].dropna()
    if orient_hint == "neg":
        inc = a[a > 0]
        if inc.empty and (a < 0).any():
            inc = a[a < 0].abs()
    else:
        inc = a[a < 0].abs()
        if inc.empty and (a > 0).any():
            inc = a[a > 0]
    return inc

cur_spend_ser   = spend_series(cur, orient)
prev_spend_ser  = spend_series(prev, orient)
cur_income_ser  = income_series(cur, orient)
prev_income_ser = income_series(prev, orient)

cur_spend   = round(float(cur_spend_ser.sum()), 2)
prev_spend  = round(float(prev_spend_ser.sum()), 2)
cur_income  = round(float(cur_income_ser.sum()), 2)
prev_income = round(float(prev_income_ser.sum()), 2)

spend_delta     = round(cur_spend - prev_spend, 2)
spend_delta_pct = round((spend_delta / prev_spend), 4) if prev_spend else (1.0 if cur_spend else 0.0)

# Top drivers this week based on the actual spend vector we used
if not cur_spend_ser.empty:
    cur_exp = cur.loc[cur_spend_ser.index].copy()
    cur_exp = cur_exp.assign(spend=cur_spend_ser.values)
else:
    cur_exp = cur.assign(spend=0.0)

top_merchants_cur = (
    cur_exp.groupby("display_name", dropna=False)["spend"]
          .sum().sort_values(ascending=False).head(5)
          .reset_index()
)
top_cats_cur = (
    cur_exp.groupby("category", dropna=False)["spend"]
          .sum().sort_values(ascending=False).head(5)
          .reset_index()
)

subs_w  = cur.loc[cur.get("is_subscription", False) == True]
anoms_w = cur.loc[cur.get("is_anomaly", False) == True]

summary_payload = {
    "as_of_date": pd.Timestamp(wk_end).isoformat(),
    "window": {
        "current": {"start": str(wk_start), "end": str(wk_end), "label": "Last completed week (Mon-Sun)"},
        "previous": {"start": str(prev_start), "end": str(prev_end)}
    },
    "totals": {
        "spend_current": cur_spend,
        "spend_previous": prev_spend,
        "spend_delta": spend_delta,
        "spend_delta_pct": spend_delta_pct,
        "income_current": cur_income,
        "income_previous": prev_income,
    },
    "top_merchants": [
        {"display_name": str(r["display_name"]), "spend": float(r["spend"])}
        for _, r in top_merchants_cur.iterrows()
    ],
    "top_categories": [
        {"category": str(r["category"]), "spend": float(r["spend"])}
        for _, r in top_cats_cur.iterrows()
    ],
    "subscriptions_count": int(subs_w["display_name"].nunique()) if len(subs_w) else 0,
    "anomalies_count": int(anoms_w.shape[0]) if len(anoms_w) else 0,
}

# -------- 3) Azure summarizer (overlay JSON, never empty) --------
def _salvage_json_object(txt: str):
    t = (txt or "").strip()
    if t.startswith("```"):
        t = re.sub(r"^```(?:json)?", "", t, flags=re.IGNORECASE).strip()
        t = re.sub(r"```$", "", t).strip()
    try:
        obj = json.loads(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    s, e = t.find("{"), t.rfind("}")
    if s != -1 and e != -1 and e > s:
        cand = t[s:e+1]
        try:
            obj = json.loads(cand)
            if isinstance(obj, dict):
                return obj
        except Exception:
            pass
    try:
        import ast
        obj = ast.literal_eval(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    return None

SYSTEM_SUMMARY = (
    "You are an analytics copilot for personal finance. "
    "Using ONLY the provided aggregates for the last completed week and the previous week, "
    "produce an executive digest in STRICT JSON. Do not invent numbers. Keep it concise."
)
USER_INSTRUCTIONS = (
    "Compare the current week vs previous week (WoW). "
    "Return ONLY a JSON object with keys:\n"
    "{\n"
    '  "headline": string,\n'
    '  "key_metrics": [ {"name": string, "value": number, "delta_pct": number|null} ],\n'
    '  "top_drivers": [ {"label": string, "spend": number} ],\n'
    '  "risks": [ {"type": "subscription"|"anomaly"|"trend", "note": string} ],\n'
    '  "action_items": [ {"title": string, "impact_usd": number, "rationale": string} ]\n'
    "}\n"
    "- Max 5 items per list.\n"
    "- Use negative delta_pct for improvements if spend fell.\n"
    "- impact_usd is a rough weekly savings estimate.\n"
)

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=6))
def _azure_digest_call(payload_json: str) -> str:
    assert chat_client is not None
    resp = chat_client.chat.completions.create(  # Azure OpenAI
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=[
            {"role":"system","content": SYSTEM_SUMMARY},
            {"role":"user","content": USER_INSTRUCTIONS + "\n\nPAYLOAD:\n" + payload_json}
        ],
        temperature=0.1,
        max_tokens=600,
        response_format={"type":"json_object"},
    )
    return resp.choices[0].message.content

# Deterministic base digest (ASCII hyphens)
base_digest = {
    "insights_version": 2,
    "window": summary_payload["window"],
    "totals": summary_payload["totals"],
    "headline": f"Weekly digest {wk_start}-{wk_end}",
    "key_metrics": [
        {"name":"Spend (week)",  "value": cur_spend,  "delta_pct": spend_delta_pct},
        {"name":"Income (week)", "value": cur_income, "delta_pct": None},
    ],
    "top_drivers": [{"label": t["category"], "spend": float(t["spend"])} for t in summary_payload["top_categories"]],
    "risks": (
        ([{"type":"subscription","note": f"{summary_payload['subscriptions_count']} active subs this week"}] if summary_payload["subscriptions_count"] else [])
        + ([{"type":"anomaly","note": f"{summary_payload['anomalies_count']} anomalies this week"}] if summary_payload["anomalies_count"] else [])
    ),
    "action_items": []
}

azure_digest = None
if 'chat_client' in globals() and chat_client is not None:
    try:
        raw = _azure_digest_call(json.dumps(summary_payload))
        azure_digest = _salvage_json_object(raw)
    except Exception:
        azure_digest = None

def _overlay(base: dict, over: dict | None) -> dict:
    if not isinstance(over, dict):
        return base
    out = dict(base)
    for k, v in over.items():
        if k in ("key_metrics","top_drivers","risks","action_items"):
            if isinstance(v, list) and len(v) > 0:
                out[k] = v
        elif v not in (None, "", {}):
            out[k] = v
    return out

digest = _overlay(base_digest, azure_digest)

# -------- 4) Compact summary string for text tile / email subject line --------
def build_compact_summary(d: dict) -> str:
    ws = d.get("window", {}).get("current", {}).get("start", str(wk_start))
    we = d.get("window", {}).get("current", {}).get("end", str(wk_end))
    km = {m.get("name",""): m for m in d.get("key_metrics", [])}
    spend_m = km.get("Spend (week)")
    spend_val = float(spend_m.get("value",0)) if spend_m else 0.0
    dp = spend_m.get("delta_pct") if spend_m else None
    dp_txt = f"{dp*100:+.1f}%" if isinstance(dp,(int,float)) else "n/a"
    top = (d.get("top_drivers") or [])
    if top:
        top_label = top[0].get("label") or ""
        top_amt = float(top[0].get("spend",0) or 0.0)
        driver_txt = f"Top driver: {top_label} (${top_amt:,.0f})"
    else:
        driver_txt = "Top driver: n/a"
    return f"{ws}-{we}: Weekly spend ${spend_val:,.0f} (WoW {dp_txt}). {driver_txt}."

digest["summary"] = build_compact_summary(digest)

# -------- 5) Persist JSON + Markdown + flat CSV --------
with open(DIGEST_JSON, "w", encoding="utf-8") as f:
    json.dump(digest, f, ensure_ascii=False, indent=2)

def render_md(d):
    ws = d.get("window", {}).get("current", {}).get("start", str(wk_start))
    we = d.get("window", {}).get("current", {}).get("end", str(wk_end))
    lines = [f"## Weekly Digest: {ws}-{we}", f"{d.get('headline','Executive digest')}"]
    if d.get("summary"):
        lines.append(f"\n{d['summary']}\n")
    km = d.get("key_metrics", [])[:5]
    if km:
        lines.append("\n**Key metrics (WoW)**")
        for m in km:
            dp = m.get("delta_pct", None)
            dp_txt = f" ({dp*100:+.1f}%)" if isinstance(dp,(int,float)) else ""
            lines.append(f"- {m['name']}: ${m['value']:,.2f}{dp_txt}")
    td = d.get("top_drivers", [])[:5]
    if td:
        lines.append("\n**Top drivers (this week)**")
        for t in td:
            label = t.get("label") or t.get("display_name") or ""
            lines.append(f"- {label}: ${float(t.get('spend',0)):,.2f}")
    rk = d.get("risks", [])[:5]
    if rk:
        lines.append("\n**Risks**")
        for r in rk:
            lines.append(f"- {r.get('type','note')}: {r.get('note','')}")
    ai = d.get("action_items", [])[:5]
    if ai:
        lines.append("\n**Action items**")
        for a in ai:
            lines.append(f"- {a['title']} — est. weekly impact ${float(a.get('impact_usd',0)):,.0f}. {a.get('rationale','')}")
    return "\n".join(lines) + "\n"

with open(DIGEST_MD, "w", encoding="utf-8") as f:
    f.write(render_md(digest))

# Flat table for Power BI ingestion (one row per element with type)
flat_rows = []

# Header row (window + totals; includes summary string)
flat_rows.append({
    "row_type": "header",
    "as_of_end": str(wk_end),
    "cur_start": str(wk_start),
    "cur_end": str(wk_end),
    "prev_start": str(prev_start),
    "prev_end": str(prev_end),
    "headline": digest.get("headline", ""),
    "summary": digest.get("summary", ""),
    "name": "Spend (week)",
    "value": cur_spend,
    "delta_pct": spend_delta_pct,
    "label": "",
    "spend": None,
    "note": "",
    "impact_usd": None,
})

# Dedicated summary row for a text card
flat_rows.append({
    "row_type": "summary",
    "as_of_end": str(wk_end),
    "cur_start": str(wk_start),
    "cur_end": str(wk_end),
    "prev_start": str(prev_start),
    "prev_end": str(prev_end),
    "headline": digest.get("headline", ""),
    "summary": digest.get("summary", ""),
    "name": "",
    "value": None,
    "delta_pct": None,
    "label": "",
    "spend": None,
    "note": "",
    "impact_usd": None,
})

# Key metrics
for m in digest.get("key_metrics", []):
    flat_rows.append({
        "row_type": "metric",
        "as_of_end": str(wk_end),
        "cur_start": str(wk_start),
        "cur_end": str(wk_end),
        "prev_start": str(prev_start),
        "prev_end": str(prev_end),
        "headline": digest.get("headline", ""),
        "summary": "",
        "name": m.get("name",""),
        "value": float(m.get("value",0) or 0.0),
        "delta_pct": (float(m.get("delta_pct")) if isinstance(m.get("delta_pct"), (int,float)) else None),
        "label": "",
        "spend": None,
        "note": "",
        "impact_usd": None,
    })

# Top drivers
for t in digest.get("top_drivers", []):
    flat_rows.append({
        "row_type": "driver",
        "as_of_end": str(wk_end),
        "cur_start": str(wk_start),
        "cur_end": str(wk_end),
        "prev_start": str(prev_start),
        "prev_end": str(prev_end),
        "headline": digest.get("headline", ""),
        "summary": "",
        "name": "",
        "value": None,
        "delta_pct": None,
        "label": t.get("label",""),
        "spend": float(t.get("spend",0) or 0.0),
        "note": "",
        "impact_usd": None,
    })

# Risks
for r in digest.get("risks", []):
    flat_rows.append({
        "row_type": "risk",
        "as_of_end": str(wk_end),
        "cur_start": str(wk_start),
        "cur_end": str(wk_end),
        "prev_start": str(prev_start),
        "prev_end": str(prev_end),
        "headline": digest.get("headline", ""),
        "summary": "",
        "name": "",
        "value": None,
        "delta_pct": None,
        "label": r.get("type",""),
        "spend": None,
        "note": r.get("note",""),
        "impact_usd": None,
    })

# Action items
for a in digest.get("action_items", []):
    flat_rows.append({
        "row_type": "action",
        "as_of_end": str(wk_end),
        "cur_start": str(wk_start),
        "cur_end": str(wk_end),
        "prev_start": str(prev_start),
        "prev_end": str(prev_end),
        "headline": digest.get("headline", ""),
        "summary": "",
        "name": "",
        "value": None,
        "delta_pct": None,
        "label": a.get("title",""),
        "spend": None,
        "note": a.get("rationale",""),
        "impact_usd": float(a.get("impact_usd",0) or 0.0),
    })

pd.DataFrame(flat_rows).to_csv(DIGEST_FLAT, index=False)

# -------- 6) Debug footer + quick counts after filtering --------
print(
    "🧠 Weekly executive digest written (WoW):\n"
    f"- JSON: {DIGEST_JSON}\n- MD:   {DIGEST_MD}\n- CSV:  {DIGEST_FLAT}\n"
    f"Window: {wk_start} -> {wk_end} | Prev: {prev_start} -> {prev_end}\n"
    f"Polarity (cur/global): {orient_cur}/{orient_all} | cur +/− counts: "
    f"{int((cur['amount']>0).sum())}/{int((cur['amount']<0).sum())} | "
    f"cur spend total: {cur_spend:.2f} | prev spend total: {prev_spend:.2f}\n"
    f"Filtered out Wealthfront rows this week: "
    f"{int((_upper_text_cols(df).str.contains('WEALTHFRONT', na=False)).sum()) - int((_upper_text_cols(df_w_all).str.contains('WEALTHFRONT', na=False)).sum())}\n"
)


🧠 Weekly executive digest written (WoW):
- JSON: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest.json
- MD:   C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest.md
- CSV:  C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest_flat.csv
Window: 2025-09-01 -> 2025-09-07 | Prev: 2025-08-25 -> 2025-08-31
Polarity (cur/global): pos/pos | cur +/− counts: 5/4 | cur spend total: 526.40 | prev spend total: 1629.20
Filtered out Wealthfront rows this week: 4



In [58]:
# --- Cell 15: Weekly AI narrative (Theme + MD + HTML + Subject CSV, fixed structure) ---
import json, html, csv, re
from pathlib import Path

INSIGHTS_DIR = DATA_PROCESSED / "insights"
INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)
EMAIL_MD_PATH    = INSIGHTS_DIR / "digest_latest_email.md"
EMAIL_HTML_PATH  = INSIGHTS_DIR / "digest_latest_email.html"
EMAIL_SUBJ_PATH  = INSIGHTS_DIR / "digest_latest_email_subject.txt"
EMAIL_SUBJ_CSV   = INSIGHTS_DIR / "digest_latest_email_subject.csv"  # single-column CSV

def _get_chat_client():
    if 'chat_client' in globals() and chat_client is not None: return chat_client
    if 'client' in globals() and client is not None: return client
    return None

def _safe_num(x, default=0.0):
    try: return float(x)
    except Exception: return default

def _short_range_dash(ws_str, we_str):
    try:
        ws = pd.to_datetime(ws_str).date()
        we = pd.to_datetime(we_str).date()
        return f"{ws.month}-{ws.day} to {we.month}-{we.day}"
    except Exception:
        return f"{ws_str} to {we_str}"

def _strip_merge_markers(s: str) -> str:
    return re.sub(r"<<<<<<<.*?=======|>>>>>>>.*?$", "", s, flags=re.DOTALL | re.MULTILINE)

def _sanitize_subject(s: str) -> str:
    s = s.replace("\r", " ").replace("\n", " ").replace("\t", " ")
    s = _strip_merge_markers(s)
    s = re.sub(r"\s{2,}", " ", s).strip()
    return s

# Deterministic fallback subject
def _fallback_subject(d):
    ws = d.get("window", {}).get("current", {}).get("start", "")
    we = d.get("window", {}).get("current", {}).get("end", "")
    short = _short_range_dash(ws, we)
    km = {m.get("name",""): m for m in d.get("key_metrics", [])}
    spend_m = km.get("Spend (week)")
    spend_val = _safe_num(spend_m.get("value")) if spend_m else 0.0
    dp = spend_m.get("delta_pct") if spend_m else None
    dp_txt = f"{float(dp)*100:+.1f}%" if isinstance(dp,(int,float)) else "n/a"
    return f"This Week {short} — Spend ${spend_val:,.0f} (WoW {dp_txt})"

# Simple MD->HTML renderer (headings + bullets)
def _simple_html_from_md(md_text, title_color="#0f172a"):
    html_lines = []
    open_ul = False
    for raw in md_text.splitlines():
        line = raw.strip()
        if line.startswith("# "):
            if open_ul: html_lines.append("</ul>"); open_ul=False
            html_lines.append(f'<h1 style="margin:0 0 12px 0;color:{title_color};font-weight:800;">{html.escape(line[2:])}</h1>')
        elif line.startswith("## "):
            if open_ul: html_lines.append("</ul>"); open_ul=False
            html_lines.append(f'<h2 style="margin:20px 0 10px 0;color:{title_color};font-weight:700;">{html.escape(line[3:])}</h2>')
        elif line.startswith("- "):
            if not open_ul:
                html_lines.append("<ul style='margin:4px 0 12px 22px;padding:0;'>"); open_ul=True
            html_lines.append(f"<li>{html.escape(line[2:])}</li>")
        elif line == "":
            if open_ul: html_lines.append("</ul>"); open_ul=False
            html_lines.append('<div style="height:6px"></div>')
        else:
            if open_ul: html_lines.append("</ul>"); open_ul=False
            html_lines.append(f"<p style='margin:6px 0'>{html.escape(line)}</p>")
    if open_ul: html_lines.append("</ul>")
    body = "\n".join(html_lines)
    return f"<!doctype html><meta charset='utf-8'><div style='font-family:Segoe UI,system-ui,-apple-system,Arial;line-height:1.45;font-size:14px;color:#111827;'>{body}</div>"

# Persona + prompts
persona = os.getenv("AI_SUMMARY_PERSONA", "Crisp, witty, CFO-style, confident")
ws = digest.get("window", {}).get("current", {}).get("start", "")
we = digest.get("window", {}).get("current", {}).get("end", "")
short_range = _short_range_dash(ws, we)

payload_json = json.dumps({
    "window": digest.get("window", {}),
    "totals": digest.get("totals", {}),
    "key_metrics": digest.get("key_metrics", []),
    "top_drivers": digest.get("top_drivers", []),
    "risks": digest.get("risks", []),
    "action_items": digest.get("action_items", []),
    "summary_line": digest.get("summary", "")
}, ensure_ascii=False)

THEME_SYSTEM = (
    "Return ONLY a 2–3 word theme that reflects the *data* (e.g., when spend is sharply down, a frugal vibe; "
    "when up, a celebratory/alert vibe). Keep it tasteful; no emojis; no trailing punctuation."
)

SUBJECT_SYSTEM = (
    "Return ONLY an email subject, 8–12 words, including: the short date range (M-D to M-D), "
    "weekly spend and WoW delta if present, and the theme. No quotes, no newlines."
)

SUMMARY_SYSTEM = (
    f"You are an analytics copywriter. Tone: {persona}. "
    "Output **Markdown only** with this **exact structure** and exactly one top-level H1:\n"
    f"# <Theme> — Weekly Executive Summary ({short_range})\n"
    "\n"
    "Then H2 sections in this order (no extras, no repeats):\n"
    "## Snapshot\n"
    "## Drivers\n"
    "## Category Mix\n"
    "## Subscriptions & Anomalies\n"
    "## Cash Flow\n"
    "## Notables\n"
    "## Recommendations\n"
    "## Next Week Watchlist\n"
    "\n"
    "- Base the mood and wording on the numbers you are given.\n"
    "- Use very short lines or bullets under each section; do not repeat the H1 or add another H1.\n"
    "- In 'Drivers', include a short 'Top Spending Categories:' line then 3–4 bullets using the provided aggregates.\n"
)

chat = _get_chat_client()
subject_txt = _fallback_subject(digest)
md_txt = None
theme_txt = "Weekly Snapshot"

try:
    if chat is not None:
        # Theme (data-aware)
        theme_resp = chat.chat_completions.create(
            model=AZURE_OPENAI_DEPLOYMENT,
            messages=[{"role":"system","content": THEME_SYSTEM},
                      {"role":"user","content": "Aggregates:\n"+payload_json}],
            temperature=0.6, max_tokens=16,
        )
        theme_txt = (theme_resp.choices[0].message.content or "").strip() or theme_txt

        # Subject
        subj_resp = chat.chat_completions.create(
            model=AZURE_OPENAI_DEPLOYMENT,
            messages=[{"role":"system","content": SUBJECT_SYSTEM},
                      {"role":"user","content": f"Date range: {short_range}\nAggregates:\n{payload_json}\nTheme: {theme_txt}"}],
            temperature=0.3, max_tokens=64,
        )
        st = (subj_resp.choices[0].message.content or "").strip()
        subject_txt = st or subject_txt

        # Body (single H1 + exact sections)
        body_resp = chat.chat_completions.create(
            model=AZURE_OPENAI_DEPLOYMENT,
            messages=[{"role":"system","content": SUMMARY_SYSTEM},
                      {"role":"user","content": f"Theme: {theme_txt}\nAggregates:\n{payload_json}"}],
            temperature=0.45, max_tokens=1200,
        )
        mt = (body_resp.choices[0].message.content or "").strip()
        md_txt = mt if mt else None
except Exception:
    pass

# Fallback MD if AI not available
if md_txt is None:
    # Build a compact, sectioned fallback from deterministic data
    km = {m.get("name",""): m for m in digest.get("key_metrics", [])}
    spend_m = km.get("Spend (week)")
    spend_val = _safe_num(spend_m.get("value")) if spend_m else 0.0
    dp = spend_m.get("delta_pct") if spend_m else None
    dp_txt = f"{float(dp)*100:+.1f}%" if isinstance(dp,(int,float)) else "n/a"
    intro = f"Welcome to your weekly check-in. Spend ${spend_val:,.2f} (WoW {dp_txt})."
    top = digest.get("top_drivers", [])[:4]
    lines = [f"# {theme_txt} — Weekly Executive Summary ({short_range})",
             "## Snapshot",
             f"- Total Spend: ${spend_val:,.2f} (WoW {dp_txt})",
             "## Drivers",
             "Top Spending Categories:"]
    for t in top:
        lines.append(f"- {t.get('label','')}: ${_safe_num(t.get('spend')):,.0f}")
    lines += [
        "## Category Mix",
        "- Mix stable; see dashboard for breakdown.",
        "## Subscriptions & Anomalies",
        f"- Anomalies: {digest.get('anomalies_count', 0)}",
        "## Cash Flow",
        "- Net cash flow covered this week.",
        "## Notables",
        "- Largest line item listed above.",
        "## Recommendations",
        "- Keep discretionary spend in check.",
        "## Next Week Watchlist",
        "- Review anomalies; monitor categories that may rebound.",
    ]
    md_txt = "\n".join([intro, ""] + lines) + "\n"

# Final sanitize (no merge markers / duplicates)
subject_txt = _sanitize_subject(subject_txt)
md_txt = _strip_merge_markers(md_txt)
# Ensure only one H1 that starts with "<Theme> — Weekly Executive Summary"
lines = [l for l in md_txt.splitlines()]
first_h1_seen = False
clean = []
for l in lines:
    if l.startswith("# "):
        if first_h1_seen:
            # drop any later H1 lines
            continue
        first_h1_seen = True
        clean.append(l)
    else:
        clean.append(l)
md_txt = "\n".join(clean)

# Write outputs
EMAIL_MD_PATH.write_text(md_txt, encoding="utf-8")
EMAIL_HTML_PATH.write_text(_simple_html_from_md(md_txt), encoding="utf-8")
EMAIL_SUBJ_PATH.write_text(subject_txt, encoding="utf-8")
with open(EMAIL_SUBJ_CSV, "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.DictWriter(f, fieldnames=["subject"]); writer.writeheader(); writer.writerow({"subject": subject_txt})

print(
    "✉️ Weekly AI summary written:\n"
    f"- Subject (csv): {EMAIL_SUBJ_CSV}\n"
    f"- Subject (txt): {EMAIL_SUBJ_PATH}\n"
    f"- Markdown:      {EMAIL_MD_PATH}\n"
    f"- HTML:          {EMAIL_HTML_PATH}\n"
)


✉️ Weekly AI summary written:
- Subject (csv): C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest_email_subject.csv
- Subject (txt): C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest_email_subject.txt
- Markdown:      C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest_email.md
- HTML:          C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest_email.html



In [59]:
# --- Cell 16: Monthly AI narrative (Month name subject, macro tone, fixed structure) ---
import calendar, csv, html, json, re

M_INSIGHTS_DIR = DATA_PROCESSED / "insights"
M_INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)
M_JSON  = M_INSIGHTS_DIR / "digest_mom.json"
M_MD    = M_INSIGHTS_DIR / "digest_mom_email.md"
M_HTML  = M_INSIGHTS_DIR / "digest_mom_email.html"
M_SUBJ  = M_INSIGHTS_DIR / "digest_mom_email_subject.txt"
M_SUBJ_CSV = M_INSIGHTS_DIR / "digest_mom_email_subject.csv"
M_FLAT  = M_INSIGHTS_DIR / "digest_mom_flat.csv"

def _strip_merge_markers(s: str) -> str:
    return re.sub(r"<<<<<<<.*?=======|>>>>>>>.*?$", "", s, flags=re.DOTALL | re.MULTILINE)

def _sanitize_one_line(s: str) -> str:
    s = s.replace("\r"," ").replace("\n"," ").replace("\t"," ")
    s = _strip_merge_markers(s)
    s = re.sub(r"\s{2,}"," ", s).strip()
    return s

def _month_bounds(ts: pd.Timestamp):
    y, m = ts.year, ts.month
    start = pd.Timestamp(year=y, month=m, day=1).date()
    last_day = calendar.monthrange(y, m)[1]
    end = pd.Timestamp(year=y, month=m, day=last_day).date()
    return start, end

# last completed month (America/Los_Angeles)
try:
    now = pd.Timestamp.now(tz="America/Los_Angeles")
except Exception:
    now = pd.Timestamp.now()
first_of_this_month = pd.Timestamp(year=now.year, month=now.month, day=1, tz=getattr(now, 'tz', None))
prev_month_end = (first_of_this_month - pd.Timedelta(days=1)).tz_localize(None) if hasattr(first_of_this_month, 'tz') else (first_of_this_month - pd.Timedelta(days=1))
cm_start, cm_end = _month_bounds(pd.Timestamp(year=prev_month_end.year, month=prev_month_end.month, day=1))
pm_end_dt = pd.Timestamp(cm_start) - pd.Timedelta(days=1)
pm_start, pm_end = _month_bounds(pm_end_dt)

# Wealthfront filter (keep Apple Cash)
base = df.copy()
def _concat_upper(frame):
    for c in ("display_name","merchant_name","name"):
        if c not in frame.columns: frame[c] = ""
    return (frame["display_name"].astype(str)+" "+frame["merchant_name"].astype(str)+" "+frame["name"].astype(str)).str.upper()
txt_all = _concat_upper(base)
base = base.loc[~(txt_all.str.contains(r"\bWEALTHFRONT\b", na=False) & ~txt_all.str.contains(r"\bAPPLE CASH\b", na=False))].copy()

# Slices
base["date_only"] = base["date"].dt.date
cur  = base[(base["date_only"] >= cm_start) & (base["date_only"] <= cm_end)]
prev = base[(base["date_only"] >= pm_start) & (base["date_only"] <= pm_end)]

def _infer_orient(frame) -> str | None:
    a = frame["amount"].dropna()
    pos, neg = int((a > 0).sum()), int((a < 0).sum())
    if pos == 0 and neg == 0: return None
    return "neg" if neg > pos else "pos"
orient = _infer_orient(cur) or _infer_orient(base) or "pos"

def _series_spend(frame, hint: str):
    a = frame["amount"].dropna()
    s = a[a < 0].abs() if hint == "neg" else a[a > 0]
    if s.empty: s = a[a > 0] if hint == "neg" else a[a < 0].abs()
    return s
def _series_income(frame, hint: str):
    a = frame["amount"].dropna()
    inc = a[a > 0] if hint == "neg" else a[a < 0].abs()
    if inc.empty: inc = a[a < 0].abs() if hint == "neg" else a[a > 0]
    return inc

cur_spend   = float(_series_spend(cur, orient).sum())
prev_spend  = float(_series_spend(prev, orient).sum())
cur_income  = float(_series_income(cur, orient).sum())
prev_income = float(_series_income(prev, orient).sum())
delta = cur_spend - prev_spend
delta_pct = (delta / prev_spend) if prev_spend else (1.0 if cur_spend else 0.0)

# Drivers by category (ensure no NaN)
cur_cat = cur.copy()
cur_cat["category"] = cur_cat["category"].fillna("Uncategorized").replace("", "Uncategorized")
svec = _series_spend(cur_cat, orient)
cur_exp = cur_cat.loc[svec.index].assign(spend=svec.values) if not cur_cat.empty else cur_cat.assign(spend=0.0)
top_cats = (cur_exp.groupby("category", dropna=False)["spend"]
            .sum().sort_values(ascending=False).head(5).reset_index())

subs_m = cur.loc[cur.get("is_subscription", False) == True]
anoms_m = cur.loc[cur.get("is_anomaly", False) == True]

mom_payload = {
    "window": {"current": {"start": str(cm_start), "end": str(cm_end), "label": "Last completed month"},
               "previous": {"start": str(pm_start), "end": str(pm_end)}},
    "totals": {"spend_current": round(cur_spend, 2), "spend_previous": round(prev_spend, 2),
               "spend_delta": round(delta, 2), "spend_delta_pct": round(delta_pct, 4),
               "income_current": round(cur_income, 2), "income_previous": round(prev_income, 2)},
    "top_categories": [{"category": str(r["category"]), "spend": float(r["spend"])} for _, r in top_cats.iterrows()],
    "subscriptions_count": int(subs_m["display_name"].nunique()) if len(subs_m) else 0,
    "anomalies_count": int(anoms_m.shape[0]) if len(anoms_m) else 0,
}
with open(M_JSON, "w", encoding="utf-8") as f:
    json.dump(mom_payload, f, ensure_ascii=False, indent=2)

# Month-name subject (computed, macro lens)
month_name = pd.to_datetime(mom_payload["window"]["current"]["start"]).strftime("%B")
val = mom_payload["totals"]["spend_current"]
pct = mom_payload["totals"]["spend_delta_pct"] * 100.0
trend_word = "Up" if pct >= 0 else "Down"
msubj = f"{month_name} Spend: ${val:,.0f} — {trend_word} {abs(pct):.0f}% MoM"
msubj = _sanitize_one_line(msubj)
M_SUBJ.write_text(msubj, encoding="utf-8")
with open(M_SUBJ_CSV, "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.DictWriter(f, fieldnames=["subject"]); writer.writeheader(); writer.writerow({"subject": msubj})

# Ask AI for the *body* in the same style, but toned down
def _simple_html_from_md(md_text, title_color="#0f172a"):
    html_lines, open_ul = [], False
    for raw in md_text.splitlines():
        line = raw.strip()
        if line.startswith("# "):
            if open_ul: html_lines.append("</ul>"); open_ul=False
            html_lines.append(f'<h1 style="margin:0 0 12px 0;color:{title_color};font-weight:800;">{html.escape(line[2:])}</h1>')
        elif line.startswith("## "):
            if open_ul: html_lines.append("</ul>"); open_ul=False
            html_lines.append(f'<h2 style="margin:20px 0 10px 0;color:{title_color};font-weight:700;">{html.escape(line[3:])}</h2>')
        elif line.startswith("- "):
            if not open_ul:
                html_lines.append("<ul style='margin:4px 0 12px 22px;padding:0;'>"); open_ul=True
            html_lines.append(f"<li>{html.escape(line[2:])}</li>")
        elif line == "":
            if open_ul: html_lines.append("</ul>"); open_ul=False
            html_lines.append('<div style="height:6px"></div>')
        else:
            if open_ul: html_lines.append("</ul>"); open_ul=False
            html_lines.append(f"<p style='margin:6px 0'>{html.escape(line)}</p>")
    if open_ul: html_lines.append("</ul>")
    body = "\n".join(html_lines)
    return f"<!doctype html><meta charset='utf-8'><div style='font-family:Segoe UI,system-ui,-apple-system,Arial;line-height:1.45;font-size:14px;color:#111827;'>{body}</div>"

chat = None
if 'chat_client' in globals() and chat_client is not None: chat = chat_client
elif 'client' in globals() and client is not None: chat = client

short_month = month_name  # for header
payload_json_m = json.dumps(mom_payload, ensure_ascii=False)

THEME_MO_SYSTEM = (
    "Return ONLY a 2–3 word theme for a monthly personal finance report, "
    "based on the provided aggregates. No punctuation beyond hyphens."
)
BODY_MO_SYSTEM = (
    "You are an analytics copywriter. Tone: concise, macro, CFO-style. "
    "Output Markdown with exactly one H1 and these H2 sections (no extras, no repeats):\n"
    f"# <Theme> — Monthly Executive Summary ({short_month})\n"
    "## Snapshot\n"
    "## Key Metrics\n"
    "## Drivers (Top Categories)\n"
    "## Category Mix\n"
    "## Subscriptions & Anomalies\n"
    "## Cash Flow\n"
    "## Notables\n"
    "## Next Month Watchlist\n"
    "- Keep lines short (≤12 words) and macro in perspective.\n"
)

md_text = None
try:
    if chat is not None:
        # Theme
        theme_resp = chat.chat_completions.create(
            model=AZURE_OPENAI_DEPLOYMENT,
            messages=[{"role":"system","content": THEME_MO_SYSTEM},
                      {"role":"user","content": "Aggregates:\n"+payload_json_m}],
            temperature=0.6, max_tokens=16,
        )
        theme_m = (theme_resp.choices[0].message.content or "").strip() or f"{short_month} Overview"

        # Body
        body_resp = chat.chat_completions.create(
            model=AZURE_OPENAI_DEPLOYMENT,
            messages=[{"role":"system","content": BODY_MO_SYSTEM},
                      {"role":"user","content": f"Theme: {theme_m}\nAggregates:\n{payload_json_m}"}],
            temperature=0.4, max_tokens=1000,
        )
        md_text = (body_resp.choices[0].message.content or "").strip()
except Exception:
    pass

if md_text is None:
    # Fallback: simple macro summary
    md_lines = [
        f"# {short_month} — Monthly Executive Summary",
        "## Snapshot",
        f"- Spend: ${mom_payload['totals']['spend_current']:,.2f} (MoM {mom_payload['totals']['spend_delta_pct']*100:+.1f}%)",
        f"- Income: ${mom_payload['totals']['income_current']:,.2f}",
        "## Key Metrics",
        f"- Prev Spend: ${mom_payload['totals']['spend_previous']:,.2f}",
        f"- Delta: ${mom_payload['totals']['spend_delta']:,.2f}",
        "## Drivers (Top Categories)",
    ]
    for t in mom_payload["top_categories"]:
        md_lines.append(f"- {t['category']}: ${t['spend']:,.0f}")
    md_lines += [
        "## Category Mix",
        "- Mix stable; see dashboard.",
        "## Subscriptions & Anomalies",
        f"- Subscriptions: {mom_payload['subscriptions_count']} | Anomalies: {mom_payload['anomalies_count']}",
        "## Cash Flow",
        "- Macro cash position stable.",
        "## Notables",
        "- Largest categories listed above.",
        "## Next Month Watchlist",
        "- Monitor categories with rising trend.",
    ]
    md_text = "\n".join(md_lines) + "\n"

# Sanitize & write
md_text = _strip_merge_markers(md_text)
lines = md_text.splitlines()
first_h1=False
clean=[]
for l in lines:
    if l.startswith("# "):
        if first_h1: continue
        first_h1=True
    clean.append(l)
md_text = "\n".join(clean)

M_MD.write_text(md_text, encoding="utf-8")
M_HTML.write_text(_simple_html_from_md(md_text), encoding="utf-8")

# Flat CSV (unchanged from earlier)
flat = [{
    "row_type":"header",
    "cur_start": mom_payload["window"]["current"]["start"],
    "cur_end":   mom_payload["window"]["current"]["end"],
    "prev_start":mom_payload["window"]["previous"]["start"],
    "prev_end":  mom_payload["window"]["previous"]["end"],
    "name":"Spend (month)",
    "value": mom_payload["totals"]["spend_current"],
    "delta_pct": mom_payload["totals"]["spend_delta_pct"],
    "label":"", "spend":None, "note":"", "impact_usd":None
}]
for t in mom_payload["top_categories"]:
    flat.append({
        "row_type":"driver","cur_start":mom_payload["window"]["current"]["start"],"cur_end":mom_payload["window"]["current"]["end"],
        "prev_start":mom_payload["window"]["previous"]["start"],"prev_end":mom_payload["window"]["previous"]["end"],
        "name":"", "value":None, "delta_pct":None, "label":t["category"], "spend":t["spend"], "note":"", "impact_usd":None
    })
pd.DataFrame(flat).to_csv(M_FLAT, index=False)

print(
    "📆 MoM AI summary written:\n"
    f"- Subject (csv): {M_SUBJ_CSV}\n"
    f"- Subject (txt): {M_SUBJ}\n"
    f"- Markdown:      {M_MD}\n"
    f"- HTML:          {M_HTML}\n"
    f"- Flat CSV:      {M_FLAT}\n"
)


📆 MoM AI summary written:
- Subject (csv): C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_mom_email_subject.csv
- Subject (txt): C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_mom_email_subject.txt
- Markdown:      C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_mom_email.md
- HTML:          C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_mom_email.html
- Flat CSV:      C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_mom_flat.csv

