In [39]:
# --- Cell 1: Robust setup + centralized Azure client factory ---
import os, re, json, math, hashlib, ast
from pathlib import Path
from datetime import datetime, timedelta, date
from collections import defaultdict

import numpy as np
import pandas as pd

# Ensure OpenAI SDK is available (Azure OpenAI compatible)
try:
    from openai import OpenAI
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "openai"])
    from openai import OpenAI

# --- Paths (prefer GITHUB_WORKSPACE, never walk above repo) ---
cwd = Path.cwd().resolve()
gw = os.getenv("GITHUB_WORKSPACE")
start = Path(gw).resolve() if gw else cwd
repo_root = next((p for p in [start, *start.parents] if (p / ".git").exists()), start)
REPO = repo_root

DATA_RAW       = REPO / "data" / "raw"
DATA_PROCESSED = REPO / "data" / "processed"
CONFIG_DIR     = REPO / "config"
STATE_DIR      = REPO / ".state"
VECTOR_DIR     = REPO / "vectorstore"

MERCHANT_DIM_PATH  = CONFIG_DIR / "merchants_dim.csv"
LATEST_CSV_PATH    = DATA_RAW / "latest.csv"
ENRICHED_OUT_PATH  = DATA_RAW / "latest.csv"                # overwrite stable file for Power BI
ENRICHED_COPY_PATH = DATA_PROCESSED / "latest_enriched.csv"
DIGEST_PATH        = DATA_PROCESSED / "digest_latest.txt"
GOAL_PATH          = DATA_PROCESSED / "goal_nudges_latest.txt"
EMBEDDINGS_PATH    = VECTOR_DIR / "embeddings.parquet"

# Ensure dirs
DATA_RAW.mkdir(parents=True, exist_ok=True)
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
VECTOR_DIR.mkdir(parents=True, exist_ok=True)

# Config flags
MAP_ALL        = True        # label unmapped merchants via Azure (if enabled)
GOAL_SAVINGS   = 1000.0      # monthly savings target for nudges
ANOMALY_Z      = 2.5         # z-score threshold for anomalies

# --- Azure OpenAI env ---
AZURE_OPENAI_ENDPOINT   = os.getenv("AZURE_OPENAI_ENDPOINT", "").rstrip("/")
AZURE_OPENAI_API_KEY    = os.getenv("AZURE_OPENAI_API_KEY", "")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT", "")   # chat model (deployment name)
AZURE_OPENAI_EMBEDDINGS = os.getenv("AZURE_OPENAI_EMBEDDINGS", "")   # embeddings deployment name
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview")

def _have_azure(deploy: str) -> bool:
    return bool(AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY and deploy)

def make_azure_client(deployment: str) -> OpenAI | None:
    """Factory for Azure OpenAI client bound to a specific deployment."""
    if not _have_azure(deployment):
        return None
    # For Azure, base_url points at the deployment; api-version goes on every request
    return OpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        base_url=f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{deployment}",
        default_query={"api-version": AZURE_OPENAI_API_VERSION},
        default_headers={"api-key": AZURE_OPENAI_API_KEY},
    )

# Shared clients (None if not configured)
chat_client  = make_azure_client(AZURE_OPENAI_DEPLOYMENT) if AZURE_OPENAI_DEPLOYMENT else None
embed_client = make_azure_client(AZURE_OPENAI_EMBEDDINGS) if AZURE_OPENAI_EMBEDDINGS else None
azure_enabled = chat_client is not None

if not azure_enabled:
    print("⚠️ Azure OpenAI (chat) not fully set; AI summaries will fall back to deterministic base.")
if embed_client is None:
    print("⚠️ Azure OpenAI (embeddings) not set; embeddings cache will be skipped.")

print("✅ Setup complete.")


⚠️ Azure OpenAI (chat) not fully set; AI summaries will fall back to deterministic base.
⚠️ Azure OpenAI (embeddings) not set; embeddings cache will be skipped.
✅ Setup complete.


In [40]:
# Load latest.csv (from build_latest.ipynb), robust path resolution
candidates = [
    LATEST_CSV_PATH,
    Path(os.getenv("OUTPUT_DIR", str(REPO / "data" / "raw"))) / "latest.csv",
    REPO / "data" / "raw" / "latest.csv",
]
src = next((p for p in candidates if p.exists()), None)
if src is None:
    raise FileNotFoundError(
        "latest.csv not found.\nChecked:\n- " + "\n- ".join(str(p) for p in candidates) +
        f"\nCWD={Path.cwd()}  REPO={REPO}"
    )

df = pd.read_csv(src)

# Ensure expected columns exist
expected = {"date","name","merchant_name","category","amount","bank_name"}
missing = expected - set(df.columns)
if missing:
    raise ValueError(f"latest.csv missing columns: {missing}")

# Ensure card_name exists (fallback to bank_name)
if "card_name" not in df.columns:
    df["card_name"] = df["bank_name"]

# Coerce types
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")

# Basic cleanups
df["merchant_name"] = df["merchant_name"].fillna("")
df["name"] = df["name"].fillna("")

# A robust unique id for each transaction (for embeddings & caching)
def make_txn_uid(row):
    key = f"{row.get('date')}_{row.get('name')}_{row.get('merchant_name')}_{row.get('amount')}_{row.get('bank_name')}"
    return hashlib.sha1(key.encode("utf-8")).hexdigest()

df["txn_uid"] = df.apply(make_txn_uid, axis=1)

# Global sign convention: True if expenses are negative numbers
EXPENSES_ARE_NEGATIVE = (df["amount"] < 0).sum() > (df["amount"] > 0).sum()
print(f"Loaded {len(df)} transactions. expenses_are_negative={EXPENSES_ARE_NEGATIVE}")

Loaded 148 transactions. expenses_are_negative=False


In [41]:
# --- Cell 3: Normalize merchant_key consistently with build_latest ---
import numpy as np
import re

def merchant_key_from(name: str) -> str:
    s = (name or "").upper()
    s = re.sub(r"APPLE PAY ENDING IN \d{4}", "", s)
    s = re.sub(r"#\d{2,}", "", s)              # strip store numbers like #1234
    s = re.sub(r"\d+", "", s)                  # kill stray digits
    s = re.sub(r"[^A-Z&\s]", " ", s)           # keep letters, ampersand, spaces
    s = re.sub(r"\s+", " ", s).strip()
    return s or "UNKNOWN"

# Use 'merchant_name' when available, else 'name'
df["merchant_key"] = np.where(
    df["merchant_name"].astype(str).str.len() > 0,
    df["merchant_name"].map(merchant_key_from),
    df["name"].map(merchant_key_from)
)

print("Merchant keys normalized (consistent with build_latest).")


Merchant keys normalized (consistent with build_latest).


In [42]:
# Load or initialize merchant dimension table
dim_cols = [
    "merchant_key", "display_name", "category", "subcategory", "tags",
    "source", "confidence", "last_updated"
]
if MERCHANT_DIM_PATH.exists():
    dim = pd.read_csv(MERCHANT_DIM_PATH)
    # ensure columns
    for c in dim_cols:
        if c not in dim.columns:
            dim[c] = np.nan
    dim = dim[dim_cols]
else:
    dim = pd.DataFrame(columns=dim_cols)

# Left-join to see which keys are already mapped
df = df.merge(dim, on="merchant_key", how="left", suffixes=("", "_dim"))

# Identify unmapped merchants
unmapped_keys = sorted(k for k in df.loc[df["display_name"].isna(), "merchant_key"].unique() if k != "UNKNOWN")
print(f"Unmapped merchants needing AI labels: {len(unmapped_keys)}")


Unmapped merchants needing AI labels: 0


In [43]:
# SINGLE-MERCHANT LABELING (robust)
import re, json, ast
from tenacity import retry, stop_after_attempt, wait_exponential

SYSTEM = (
    "You are a financial data labeling assistant.\n"
    "Given ONE merchant_key, output a single JSON object with fields:\n"
    "merchant_key (echo EXACTLY), display_name (string), category (string), subcategory (string), tags (array of 1-5 short strings).\n"
    "Categories: Dining, Groceries, Gas, Utilities, Subscriptions, Shopping, Travel, Health, Entertainment, Education, Income, Transfers, Fees, Misc.\n"
    "display_name should be human-friendly (e.g., 'ARCO', 'APPLEBEE'S').\n"
    "Return ONLY JSON. No code fences, no commentary."
)

def _salvage_json_object(txt: str):
    """Try hard to recover a single JSON object from a messy string."""
    t = txt.strip()
    # strip code fences if present
    if t.startswith("```"):
        t = re.sub(r"^```(?:json)?", "", t, flags=re.IGNORECASE).strip()
        t = re.sub(r"```$", "", t).strip()
    # direct parse
    try:
        obj = json.loads(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    # find largest {...} block
    start = t.find("{")
    end = t.rfind("}")
    if start != -1 and end != -1 and end > start:
        candidate = t[start:end+1]
        try:
            obj = json.loads(candidate)
            if isinstance(obj, dict):
                return obj
        except Exception:
            pass
    # last resort: python-ish literal
    try:
        obj = ast.literal_eval(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    raise RuntimeError(f"Failed to parse single-object JSON:\n{t[:400]}")

@retry(stop=stop_after_attempt(5), wait=wait_exponential(min=1, max=12))
def azure_label_one(mk: str):
    """Label exactly one merchant_key with strict JSON, resilient to noise."""
    if client is None:
        return None
    user = (
        "Label this merchant_key and return ONLY a single JSON object:\n"
        '{ "merchant_key": "...", "display_name":"...", "category":"...", "subcategory":"...", "tags":[...] }\n\n'
        f'merchant_key: "{mk}"'
    )
    resp = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=[{"role":"system","content": SYSTEM}, {"role":"user","content": user}],
        temperature=0,
        max_tokens=200,
        response_format={"type": "json_object"},   # strongly nudges valid JSON
    )
    raw = resp.choices[0].message.content
    obj = _salvage_json_object(raw)
    # Coerce + fill
    out = {
        "merchant_key": mk,  # echo exactly
        "display_name": str(obj.get("display_name", mk)).upper().strip(),
        "category": str(obj.get("category", "")),
        "subcategory": str(obj.get("subcategory", "")),
        "tags": obj.get("tags", []),
    }
    # normalize tags into CSV (safe)
    if not isinstance(out["tags"], list):
        out["tags"] = []
    out["tags"] = [str(t).strip() for t in out["tags"] if str(t).strip()]
    return out


In [44]:
new_rows = []
if len(unmapped_keys) and client is not None and MAP_ALL:
    print(f"Labeling {len(unmapped_keys)} merchants (single-call mode)...")
    for idx, mk in enumerate(unmapped_keys, 1):
        try:
            item = azure_label_one(mk)
        except Exception as e:
            print(f"⚠️ Label fail for '{mk}': {e}")
            continue

        now = datetime.utcnow().isoformat()
        if item:
            new_rows.append({
                "merchant_key": mk,
                "display_name": item["display_name"],
                "category": item["category"],
                "subcategory": item["subcategory"],
                "tags": ",".join(item["tags"]),
                "source": "azure",
                "confidence": 0.90,
                "last_updated": now
            })

    if new_rows:
        dim_new = pd.DataFrame(new_rows)
        dim_all = pd.concat([dim, dim_new], ignore_index=True)
        dim_all = dim_all.sort_values("last_updated").drop_duplicates(["merchant_key"], keep="last")
        MERCHANT_DIM_PATH.parent.mkdir(parents=True, exist_ok=True)
        dim_all.to_csv(MERCHANT_DIM_PATH, index=False)
        dim = dim_all
        print(f"✅ Added {len(new_rows)} merchant mappings (single-call).")
    else:
        print("No new mappings added (single-call).")
else:
    print("No new mappings needed or AI disabled.")


No new mappings needed or AI disabled.


In [45]:
# --- Cell 6B: Persist merchants_dim.csv (idempotent) ---

# Toggle if you ever want to skip writing on runs with no changes
PERSIST_MERCHANT_DIM = True

# dim_cols defined in Cell 4; dim may be updated in Cell 6
if not isinstance(PERSIST_MERCHANT_DIM, bool):
    PERSIST_MERCHANT_DIM = True

if PERSIST_MERCHANT_DIM:
    MERCHANT_DIM_PATH.parent.mkdir(parents=True, exist_ok=True)

    if 'dim' in globals() and isinstance(dim, pd.DataFrame) and len(dim):
        # ensure expected columns/order exist before save
        for c in dim_cols:
            if c not in dim.columns:
                dim[c] = np.nan
        dim = dim[dim_cols]

        dim.to_csv(MERCHANT_DIM_PATH, index=False)
        print(f"📝 merchants_dim.csv saved ({len(dim)} rows) → {MERCHANT_DIM_PATH}")
    else:
        # either no new mappings this run or dim was empty; ensure file exists
        if not MERCHANT_DIM_PATH.exists():
            pd.DataFrame(columns=dim_cols).to_csv(MERCHANT_DIM_PATH, index=False)
            print(f"📝 Created headers-only merchants_dim.csv → {MERCHANT_DIM_PATH}")
        else:
            print("ℹ️ merchants_dim.csv already exists; no changes to sync.")
else:
    print("PERSIST_MERCHANT_DIM=False → skipping merchants_dim.csv persistence.")

📝 merchants_dim.csv saved (64 rows) → C:\Users\kosis\Downloads\Automation\spending-dashboard\config\merchants_dim.csv


In [46]:
df = df.drop(columns=["display_name","category","subcategory","tags","source","confidence","last_updated"], errors="ignore")
df = df.merge(dim, on="merchant_key", how="left", suffixes=("", "_dim"))

# Final output columns (feel free to adjust ordering)
final_cols = [
    "txn_uid", "date", "bank_name", "card_name",
    "merchant_key", "display_name",
    "category", "subcategory", "tags",
    "name", "merchant_name", "amount"
]
# Ensure existence even if null
for c in final_cols:
    if c not in df.columns:
        df[c] = np.nan

# Canonical display name fallback
df["display_name"] = df["display_name"].fillna(df["merchant_key"])

print("Labels joined.")


Labels joined.


In [47]:
# --- Cell 8: Subscription detection (sign-aware, idempotent) ---

def detect_subscription(group: pd.DataFrame) -> bool:
    g = group.dropna(subset=["date", "amount"]).sort_values("date")
    if len(g) < 3:
        return False

    # use absolute spend magnitudes for stability
    amounts = g["amount"].abs().to_numpy(dtype=float)
    amounts = amounts[np.isfinite(amounts)]
    if amounts.size < 3:
        return False

    # gaps in days
    ts_ns = g["date"].astype("int64").to_numpy()
    gaps_days = np.diff(ts_ns) / 86_400_000_000_000
    if gaps_days.size < 2:
        return False

    monthlyish_med = float(np.median(gaps_days))
    frac_monthly = float(np.mean((gaps_days >= 27) & (gaps_days <= 33))) if gaps_days.size else 0.0

    mu = float(np.mean(amounts))
    cv = float(np.std(amounts) / (mu + 1e-9)) if mu > 0 else 1.0

    return (27 <= monthlyish_med <= 33) and (frac_monthly >= 0.6) and (cv <= 0.2)

# Clean any leftover artifacts from previous runs (e.g., is_subscription_x from merges)
for col in [c for c in df.columns if c.startswith("is_subscription") and c != "is_subscription"]:
    df.drop(columns=col, inplace=True, errors="ignore")

# Respect your sign convention
EXPENSES_ARE_NEGATIVE = (df["amount"] < 0).sum() > (df["amount"] > 0).sum()
if EXPENSES_ARE_NEGATIVE:
    outflows = df.loc[(df["amount"] < 0) & df["date"].notna(), ["display_name", "date", "amount"]].copy()
    outflows["amount"] = outflows["amount"].abs()
else:
    outflows = df.loc[(df["amount"] > 0) & df["date"].notna(), ["display_name", "date", "amount"]].copy()

subs_map = {}
for disp, g in outflows.groupby("display_name", dropna=False):
    try:
        subs_map[disp] = bool(detect_subscription(g[["date", "amount"]]))
    except Exception:
        subs_map[disp] = False

df["is_subscription"] = df["display_name"].map(subs_map).fillna(False).astype(bool)

print(f"Subscriptions flagged: {int(df['is_subscription'].sum())} candidates.")


Subscriptions flagged: 0 candidates.


  df["is_subscription"] = df["display_name"].map(subs_map).fillna(False).astype(bool)


In [48]:
def zscores(x):
    mu = np.mean(x)
    sd = np.std(x)
    if sd == 0:
        return np.zeros_like(x)
    return (x - mu) / sd

df["amount_abs"] = df["amount"].abs()
df["z_by_merchant"] = (
    df.groupby("display_name", dropna=False)["amount_abs"]
      .transform(zscores)
)
df["is_anomaly"] = (df["z_by_merchant"] >= ANOMALY_Z)

print(f"Anomalies flagged: {int(df['is_anomaly'].sum())}")


Anomalies flagged: 1


In [49]:
today = pd.Timestamp(date.today())
cut1 = today - pd.Timedelta(days=30)
cut2 = today - pd.Timedelta(days=60)

cur = df[(df["date"] > cut1) & (df["amount"] > 0)]
prev = df[(df["date"] > cut2) & (df["date"] <= cut1) & (df["amount"] > 0)]

cur_total = cur["amount"].sum()
prev_total = prev["amount"].sum()
delta = cur_total - prev_total

top_merchants = (
    cur.groupby("display_name", dropna=False)["amount"].sum()
       .sort_values(ascending=False)
       .head(3)
)

top_category = (
    cur.groupby("category", dropna=False)["amount"].sum()
       .sort_values(ascending=False)
       .head(1)
)
top_category_name = top_category.index[0] if len(top_category) else "N/A"
top_category_amt = float(top_category.iloc[0]) if len(top_category) else 0.0

digest = []
digest.append(f"Period: last 30 days vs prior 30")
digest.append(f"Spend: ${cur_total:,.2f} ({'+' if delta>=0 else ''}{delta:,.2f} vs prior)")
digest.append("Top 3 merchants: " + ", ".join([f"{m} (${v:,.2f})" for m, v in top_merchants.items()]))
digest.append(f"Biggest category driver: {top_category_name} (${top_category_amt:,.2f})")

DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
with open(DIGEST_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(digest))

print("\n".join(digest))
print(f"\nSaved digest → {DIGEST_PATH}")


Period: last 30 days vs prior 30
Spend: $5,542.88 (+2,059.58 vs prior)
Top 3 merchants: WITHDRAWAL ALLY TYPE ALLY PAYMT ID CO ALLY NAME KOSISONNA UGOCHUKW ACH ECC WEB ACH TRACE ($1,494.22), WITHDRAWAL AMEX EPAYMENT TYPE ACH PMT ID DATA ER AM CO AMEX EPAYMENT NAME KOSISONNA UGOCHUKWU ACH ECC WEB ACH TRACE ($777.78), PETAL ($738.96)
Biggest category driver: nan ($2,868.17)

Saved digest → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\digest_latest.txt


In [50]:
# Suggest % cuts in top categories to reach GOAL_SAVINGS over next 30 days
cur_by_cat = (
    df[(df["date"] > cut1) & (df["amount"] > 0)]
      .groupby("category", dropna=False)["amount"].sum()
      .sort_values(ascending=False)
)

nudges = []
remaining = GOAL_SAVINGS
for cat, amt in cur_by_cat.items():
    if remaining <= 0:
        break
    # propose cutting up to 40% of this category
    max_cut = 0.40 * amt
    if max_cut <= 0:
        continue
    pct_needed = min(remaining / amt, 0.40)  # cap at 40%
    if pct_needed > 0:
        nudges.append((cat, pct_needed))
        remaining -= pct_needed * amt

lines = [f"Goal: Save ${GOAL_SAVINGS:,.0f} next 30 days"]
if nudges:
    for (cat, pct) in nudges:
        lines.append(f"- Cut {cat} by {pct*100:.0f}%")
else:
    lines.append("- Spending already low or insufficient category concentration to suggest cuts.")

with open(GOAL_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("\n".join(lines))
print(f"\nSaved goal nudges → {GOAL_PATH}")


Goal: Save $1,000 next 30 days
- Cut nan by 35%

Saved goal nudges → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\goal_nudges_latest.txt


In [51]:
# --- Cell 12: Build text and cache embeddings (reuse shared embed_client) ---
def build_search_text(row):
    parts = [
        str(row.get("display_name") or ""),
        str(row.get("name") or ""),
        str(row.get("merchant_name") or ""),
        str(row.get("category") or ""),
        str(row.get("subcategory") or ""),
        str(row.get("tags") or ""),
    ]
    return " | ".join(p for p in parts if p)

# Limit to recent rows for cost control
embed_df = df.sort_values("date", ascending=False).head(500).copy()
embed_df["search_text"] = embed_df.apply(build_search_text, axis=1)

# Load existing cache
if EMBEDDINGS_PATH.exists():
    old = pd.read_parquet(EMBEDDINGS_PATH)
else:
    old = pd.DataFrame(columns=["txn_uid","embedding"])

existing = set(old["txn_uid"]) if len(old) else set()
to_embed = embed_df[~embed_df["txn_uid"].isin(existing)][["txn_uid", "search_text"]]

def get_embeddings(texts: list[str]) -> list | None:
    if embed_client is None:
        return None
    # The model name is the deployment name on Azure
    res = embed_client.embeddings.create(model=AZURE_OPENAI_EMBEDDINGS, input=list(texts))
    # Return raw vectors (list[float]) as provided
    return [d.embedding for d in res.data]

new_rows = []
if len(to_embed) and embed_client is not None:
    B = 64
    for i in range(0, len(to_embed), B):
        chunk = to_embed.iloc[i:i+B]
        vecs = get_embeddings(chunk["search_text"].tolist())
        if vecs is None:
            break
        for uid, vec in zip(chunk["txn_uid"].tolist(), vecs):
            if vec is not None:
                new_rows.append({"txn_uid": uid, "embedding": vec})

if new_rows:
    add = pd.DataFrame(new_rows)
    merged = pd.concat([old, add], ignore_index=True).drop_duplicates("txn_uid", keep="last")
    merged.to_parquet(EMBEDDINGS_PATH, index=False)
    print(f"Embeddings cached: +{len(add)} → total {len(merged)}")
else:
    print("No new embeddings added (none missing or embeddings disabled).")


No new embeddings added (none missing or embeddings disabled).


In [52]:
# Reorder and save
save_cols = [
    "txn_uid","date","bank_name","card_name",
    "display_name","merchant_key",
    "category","subcategory","tags",
    "name","merchant_name",
    "amount","is_subscription","is_anomaly","z_by_merchant"
]

for c in save_cols:
    if c not in df.columns:
        df[c] = np.nan

df_out = df[save_cols].sort_values(["date", "bank_name"], ascending=[False, True])

# Write both the stable file (Power BI) and a processed copy
df_out.to_csv(ENRICHED_OUT_PATH, index=False)
df_out.to_csv(ENRICHED_COPY_PATH, index=False)

print(f"✅ Enriched CSV saved → {ENRICHED_OUT_PATH}")
print(f"📄 Copy saved → {ENRICHED_COPY_PATH}")


✅ Enriched CSV saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw\latest.csv
📄 Copy saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\latest_enriched.csv


In [53]:
# --- Cell 14: Weekly Executive Digest (WoW) — Azure overlay + flat CSV for Power BI ---
import os, re, json
from pathlib import Path
from tenacity import retry, stop_after_attempt, wait_exponential

INSIGHTS_DIR = DATA_PROCESSED / "insights"
INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)
DIGEST_JSON = INSIGHTS_DIR / "digest_latest.json"
DIGEST_MD   = INSIGHTS_DIR / "digest_latest.md"
DIGEST_FLAT = INSIGHTS_DIR / "digest_latest_flat.csv"   # <-- PBI-friendly

# -------- 1) Last COMPLETED week (Mon–Sun), compare WoW --------
try:
    now = pd.Timestamp.now(tz="America/Los_Angeles").normalize()
except Exception:
    now = pd.Timestamp.now().normalize()

wd = int(now.weekday())                               # Mon=0 ... Sun=6
days_to_last_sun = 7 if wd == 6 else (wd + 1)
wk_end   = (now - pd.Timedelta(days=days_to_last_sun)).date()      # inclusive Sunday
wk_start = (pd.Timestamp(wk_end) - pd.Timedelta(days=6)).date()    # prior Monday
prev_end = (pd.Timestamp(wk_end) - pd.Timedelta(days=7)).date()
prev_start = (pd.Timestamp(prev_end) - pd.Timedelta(days=6)).date()

df_w = df.copy()
df_w["date_only"] = df_w["date"].dt.date
cur  = df_w[(df_w["date_only"] >= wk_start) & (df_w["date_only"] <= wk_end)]
prev = df_w[(df_w["date_only"] >= prev_start) & (df_w["date_only"] <= prev_end)]

# -------- 2) Robust sign detection --------
amt_all = df_w["amount"].dropna()
expenses_are_negative = (amt_all < 0).sum() > (amt_all > 0).sum()

def spend_sum(frame):
    a = frame["amount"].dropna()
    return float(a[a < 0].abs().sum()) if expenses_are_negative else float(a[a > 0].sum())

def income_sum(frame):
    a = frame["amount"].dropna()
    return float(a[a > 0].sum()) if expenses_are_negative else float(a[a < 0].abs().sum())

cur_spend  = round(spend_sum(cur), 2)
prev_spend = round(spend_sum(prev), 2)
cur_income  = round(income_sum(cur), 2)
prev_income = round(income_sum(prev), 2)

spend_delta     = round(cur_spend - prev_spend, 2)
spend_delta_pct = round((spend_delta / prev_spend), 4) if prev_spend else (1.0 if cur_spend else 0.0)

# Top drivers this week based on spend direction
if expenses_are_negative:
    cur_exp = cur[cur["amount"] < 0].assign(spend=lambda x: x["amount"].abs())
else:
    cur_exp = cur[cur["amount"] > 0].assign(spend=lambda x: x["amount"])

top_merchants_cur = (
    cur_exp.groupby("display_name", dropna=False)["spend"].sum()
          .sort_values(ascending=False).head(5)
          .reset_index()
)
top_cats_cur = (
    cur_exp.groupby("category", dropna=False)["spend"].sum()
          .sort_values(ascending=False).head(5)
          .reset_index()
)

subs_w  = cur.loc[cur.get("is_subscription", False) == True]
anoms_w = cur.loc[cur.get("is_anomaly", False) == True]

summary_payload = {
    "as_of_date": pd.Timestamp(wk_end).isoformat(),
    "window": {
        "current": {"start": str(wk_start), "end": str(wk_end), "label": "Last completed week (Mon–Sun)"},
        "previous": {"start": str(prev_start), "end": str(prev_end)}
    },
    "totals": {
        "spend_current": cur_spend,
        "spend_previous": prev_spend,
        "spend_delta": spend_delta,
        "spend_delta_pct": spend_delta_pct,
        "income_current": cur_income,
        "income_previous": prev_income,
    },
    "top_merchants": [
        {"display_name": str(r["display_name"]), "spend": float(r["spend"])}
        for _, r in top_merchants_cur.iterrows()
    ],
    "top_categories": [
        {"category": str(r["category"]), "spend": float(r["spend"])}
        for _, r in top_cats_cur.iterrows()
    ],
    "subscriptions_count": int(subs_w["display_name"].nunique()) if len(subs_w) else 0,
    "anomalies_count": int(anoms_w.shape[0]) if len(anoms_w) else 0,
}

# -------- 3) Azure summarizer (overlay JSON, never empty) --------
def _salvage_json_object(txt: str):
    t = (txt or "").strip()
    if t.startswith("```"):
        t = re.sub(r"^```(?:json)?", "", t, flags=re.IGNORECASE).strip()
        t = re.sub(r"```$", "", t).strip()
    try:
        obj = json.loads(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    s, e = t.find("{"), t.rfind("}")
    if s != -1 and e != -1 and e > s:
        cand = t[s:e+1]
        try:
            obj = json.loads(cand)
            if isinstance(obj, dict):
                return obj
        except Exception:
            pass
    try:
        import ast
        obj = ast.literal_eval(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    return None

SYSTEM_SUMMARY = (
    "You are an analytics copilot for personal finance. "
    "Using ONLY the provided aggregates for the last completed week and the previous week, "
    "produce an executive digest in STRICT JSON. Do not invent numbers. Keep it concise."
)
USER_INSTRUCTIONS = (
    "Compare the current week vs previous week (WoW). "
    "Return ONLY a JSON object with keys:\n"
    "{\n"
    '  "headline": string,\n'
    '  "key_metrics": [ {"name": string, "value": number, "delta_pct": number|null} ],\n'
    '  "top_drivers": [ {"label": string, "spend": number} ],\n'
    '  "risks": [ {"type": "subscription"|"anomaly"|"trend", "note": string} ],\n'
    '  "action_items": [ {"title": string, "impact_usd": number, "rationale": string} ]\n'
    "}\n"
    "- Max 5 items per list.\n"
    "- Use negative delta_pct for improvements if spend fell.\n"
    "- impact_usd is a rough **weekly** savings estimate.\n"
)

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=6))
def _azure_digest_call(payload_json: str) -> str:
    assert chat_client is not None
    resp = chat_client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,   # deployment name as model
        messages=[
            {"role":"system","content": SYSTEM_SUMMARY},
            {"role":"user","content": USER_INSTRUCTIONS + "\n\nPAYLOAD:\n" + payload_json}
        ],
        temperature=0.1,
        max_tokens=600,
        response_format={"type":"json_object"},
    )
    return resp.choices[0].message.content

# Deterministic base digest
base_digest = {
    "insights_version": 2,
    "window": summary_payload["window"],
    "totals": summary_payload["totals"],
    "headline": f"Weekly digest {wk_start}–{wk_end}",
    "key_metrics": [
        {"name":"Spend (week)",  "value": cur_spend,  "delta_pct": spend_delta_pct},
        {"name":"Income (week)", "value": cur_income, "delta_pct": None},
    ],
    "top_drivers": [{"label": t["category"], "spend": float(t["spend"])} for t in summary_payload["top_categories"]],
    "risks": (
        ([{"type":"subscription","note": f"{summary_payload['subscriptions_count']} active subs this week"}] if summary_payload["subscriptions_count"] else [])
        + ([{"type":"anomaly","note": f"{summary_payload['anomalies_count']} anomalies this week"}] if summary_payload["anomalies_count"] else [])
    ),
    "action_items": []
}

azure_digest = None
if chat_client is not None:
    try:
        raw = _azure_digest_call(json.dumps(summary_payload))
        azure_digest = _salvage_json_object(raw)
    except Exception:
        azure_digest = None

def _overlay(base: dict, over: dict | None) -> dict:
    if not isinstance(over, dict):
        return base
    out = dict(base)
    for k, v in over.items():
        if k in ("key_metrics","top_drivers","risks","action_items"):
            if isinstance(v, list) and len(v) > 0:
                out[k] = v
        elif v not in (None, "", {}):
            out[k] = v
    return out

digest = _overlay(base_digest, azure_digest)

# -------- 4) Persist JSON + Markdown + flat CSV --------
with open(DIGEST_JSON, "w", encoding="utf-8") as f:
    json.dump(digest, f, ensure_ascii=False, indent=2)

def render_md(d):
    ws = d.get("window", {}).get("current", {}).get("start", str(wk_start))
    we = d.get("window", {}).get("current", {}).get("end", str(wk_end))
    lines = [f"## Weekly Digest: {ws}–{we}", f"{d.get('headline','Executive digest')}"]
    km = d.get("key_metrics", [])[:5]
    if km:
        lines.append("\n**Key metrics (WoW)**")
        for m in km:
            dp = m.get("delta_pct", None)
            dp_txt = f" ({dp*100:+.1f}%)" if isinstance(dp,(int,float)) else ""
            lines.append(f"- {m['name']}: ${m['value']:,.2f}{dp_txt}")
    td = d.get("top_drivers", [])[:5]
    if td:
        lines.append("\n**Top drivers (this week)**")
        for t in td:
            label = t.get("label") or t.get("display_name") or ""
            lines.append(f"- {label}: ${float(t.get('spend',0)):,.2f}")
    rk = d.get("risks", [])[:5]
    if rk:
        lines.append("\n**Risks**")
        for r in rk:
            lines.append(f"- {r.get('type','note')}: {r.get('note','')}")
    ai = d.get("action_items", [])[:5]
    if ai:
        lines.append("\n**Action items**")
        for a in ai:
            lines.append(f"- {a['title']} — est. weekly impact ${float(a.get('impact_usd',0)):,.0f}. {a.get('rationale','')}")
    return "\n".join(lines) + "\n"

with open(DIGEST_MD, "w", encoding="utf-8") as f:
    f.write(render_md(digest))

# Flat table for Power BI ingestion (one row per element with type)
flat_rows = []

# Header row (window + totals)
flat_rows.append({
    "row_type": "header",
    "as_of_end": str(wk_end),
    "cur_start": str(wk_start),
    "cur_end": str(wk_end),
    "prev_start": str(prev_start),
    "prev_end": str(prev_end),
    "headline": digest.get("headline", ""),
    "name": "Spend (week)",
    "value": cur_spend,
    "delta_pct": spend_delta_pct,
    "label": "",
    "spend": None,
    "note": "",
    "impact_usd": None,
})

# Key metrics
for m in digest.get("key_metrics", []):
    flat_rows.append({
        "row_type": "metric",
        "as_of_end": str(wk_end),
        "cur_start": str(wk_start),
        "cur_end": str(wk_end),
        "prev_start": str(prev_start),
        "prev_end": str(prev_end),
        "headline": digest.get("headline", ""),
        "name": m.get("name",""),
        "value": float(m.get("value",0) or 0.0),
        "delta_pct": (float(m.get("delta_pct")) if isinstance(m.get("delta_pct"), (int,float)) else None),
        "label": "",
        "spend": None,
        "note": "",
        "impact_usd": None,
    })

# Top drivers
for t in digest.get("top_drivers", []):
    flat_rows.append({
        "row_type": "driver",
        "as_of_end": str(wk_end),
        "cur_start": str(wk_start),
        "cur_end": str(wk_end),
        "prev_start": str(prev_start),
        "prev_end": str(prev_end),
        "headline": digest.get("headline", ""),
        "name": "",
        "value": None,
        "delta_pct": None,
        "label": t.get("label",""),
        "spend": float(t.get("spend",0) or 0.0),
        "note": "",
        "impact_usd": None,
    })

# Risks
for r in digest.get("risks", []):
    flat_rows.append({
        "row_type": "risk",
        "as_of_end": str(wk_end),
        "cur_start": str(wk_start),
        "cur_end": str(wk_end),
        "prev_start": str(prev_start),
        "prev_end": str(prev_end),
        "headline": digest.get("headline", ""),
        "name": "",
        "value": None,
        "delta_pct": None,
        "label": r.get("type",""),
        "spend": None,
        "note": r.get("note",""),
        "impact_usd": None,
    })

# Action items
for a in digest.get("action_items", []):
    flat_rows.append({
        "row_type": "action",
        "as_of_end": str(wk_end),
        "cur_start": str(wk_start),
        "cur_end": str(wk_end),
        "prev_start": str(prev_start),
        "prev_end": str(prev_end),
        "headline": digest.get("headline", ""),
        "name": "",
        "value": None,
        "delta_pct": None,
        "label": a.get("title",""),
        "spend": None,
        "note": a.get("rationale",""),
        "impact_usd": float(a.get("impact_usd",0) or 0.0),
    })

pd.DataFrame(flat_rows).to_csv(DIGEST_FLAT, index=False)

print(
    "🧠 Weekly executive digest written (WoW):\n"
    f"- JSON: {DIGEST_JSON}\n- MD:   {DIGEST_MD}\n- CSV:  {DIGEST_FLAT}\n"
    f"Window: {wk_start} → {wk_end} | Prev: {prev_start} → {prev_end}"
)


🧠 Weekly executive digest written (WoW):
- JSON: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest.json
- MD:   C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest.md
- CSV:  C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\insights\digest_latest_flat.csv
Window: 2025-09-01 → 2025-09-07 | Prev: 2025-08-25 → 2025-08-31
