In [88]:
# --- Imports ---
import os, re, json, math, hashlib, ast
from pathlib import Path
from datetime import datetime, timedelta, date
from collections import defaultdict

import numpy as np
import pandas as pd

# Try to ensure OpenAI SDK is available (for Azure OpenAI)
try:
    from openai import OpenAI
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "openai"])
    from openai import OpenAI

# --- Paths (robust: prefer GITHUB_WORKSPACE, never walk above repo) ---
cwd = Path.cwd().resolve()
gw = os.getenv("GITHUB_WORKSPACE")

start = Path(gw).resolve() if gw else cwd
# find the repo root by locating the first directory that has a .git
repo_root = next((p for p in [start, *start.parents] if (p / ".git").exists()), start)
REPO = repo_root

DATA_RAW = REPO / "data" / "raw"
DATA_PROCESSED = REPO / "data" / "processed"
CONFIG_DIR = REPO / "config"
STATE_DIR = REPO / ".state"
VECTOR_DIR = REPO / "vectorstore"

MERCHANT_DIM_PATH = CONFIG_DIR / "merchants_dim.csv"
LATEST_CSV_PATH   = DATA_RAW / "latest.csv"
ENRICHED_OUT_PATH = DATA_RAW / "latest.csv"               # overwrite stable file for Power BI
ENRICHED_COPY_PATH = DATA_PROCESSED / "latest_enriched.csv"
DIGEST_PATH = DATA_PROCESSED / "digest_latest.txt"
GOAL_PATH   = DATA_PROCESSED / "goal_nudges_latest.txt"
EMBEDDINGS_PATH = VECTOR_DIR / "embeddings.parquet"

# --- Ensure dirs ---
DATA_RAW.mkdir(parents=True, exist_ok=True)
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
VECTOR_DIR.mkdir(parents=True, exist_ok=True)

# --- Config flags ---
MAP_ALL = True              # map any merchant missing from dimension
GOAL_SAVINGS = 1000.0       # target monthly savings for "goal nudges"
ANOMALY_Z = 2.5             # z-score threshold for anomalies

# --- Azure OpenAI env ---
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT", "")  # chat model
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview")

if not (AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY and AZURE_OPENAI_DEPLOYMENT):
    print("⚠️ Azure OpenAI env not fully set. AI labeling will be skipped.")

# Build OpenAI (Azure) client if possible
client = None
if AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY and AZURE_OPENAI_DEPLOYMENT:
    client = OpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        base_url=f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{AZURE_OPENAI_DEPLOYMENT}",
        default_query={"api-version": AZURE_OPENAI_API_VERSION},
        default_headers={"api-key": AZURE_OPENAI_API_KEY},
    )

print("✅ Setup complete.")


✅ Setup complete.


In [89]:
# Load latest.csv (from build_latest.ipynb), robust path resolution
candidates = [
    LATEST_CSV_PATH,
    Path(os.getenv("OUTPUT_DIR", str(REPO / "data" / "raw"))) / "latest.csv",
    REPO / "data" / "raw" / "latest.csv",
]
src = next((p for p in candidates if p.exists()), None)
if src is None:
    raise FileNotFoundError(
        "latest.csv not found.\nChecked:\n- " + "\n- ".join(str(p) for p in candidates) +
        f"\nCWD={Path.cwd()}  REPO={REPO}"
    )

df = pd.read_csv(src)

# Ensure expected columns exist
expected = {"date","name","merchant_name","category","amount","bank_name"}
missing = expected - set(df.columns)
if missing:
    raise ValueError(f"latest.csv missing columns: {missing}")

# Ensure card_name exists (fallback to bank_name)
if "card_name" not in df.columns:
    df["card_name"] = df["bank_name"]

# Coerce types
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")

# Basic cleanups
df["merchant_name"] = df["merchant_name"].fillna("")
df["name"] = df["name"].fillna("")

# A robust unique id for each transaction (for embeddings & caching)
def make_txn_uid(row):
    key = f"{row.get('date')}_{row.get('name')}_{row.get('merchant_name')}_{row.get('amount')}_{row.get('bank_name')}"
    return hashlib.sha1(key.encode("utf-8")).hexdigest()

df["txn_uid"] = df.apply(make_txn_uid, axis=1)

print(f"Loaded {len(df)} transactions.")


Loaded 149 transactions.


In [90]:
# Normalize noisy merchant strings into a stable 'merchant_key'
# Use 'merchant_name' when available, else 'name'
def normalize_merchant_key(txt: str) -> str:
    t = (txt or "").upper().strip()
    # Remove common noise: excessive spaces, digits, #, store ids, etc.
    t = re.sub(r"\d{2,}", "", t)              # drop long digit runs
    t = re.sub(r"[-_/#*]+", " ", t)           # separators -> space
    t = re.sub(r"\s{2,}", " ", t).strip()
    # Drop locale suffixes like "NV", "CA" at the end if present
    t = re.sub(r"\b([A-Z]{2})\b$", "", t).strip()
    # Collapse APPLE PAY / GOOGLE PAY hints
    t = t.replace("APPLE PAY", "").replace("GOOGLE PAY", "").strip()
    # Fallback
    return t or "UNKNOWN"

df["merchant_key"] = np.where(
    df["merchant_name"].str.len() > 0,
    df["merchant_name"].apply(normalize_merchant_key),
    df["name"].apply(normalize_merchant_key)
)

print("Merchant keys normalized.")


Merchant keys normalized.


In [91]:
# Load or initialize merchant dimension table
dim_cols = [
    "merchant_key", "display_name", "category", "subcategory", "tags",
    "source", "confidence", "last_updated"
]
if MERCHANT_DIM_PATH.exists():
    dim = pd.read_csv(MERCHANT_DIM_PATH)
    # ensure columns
    for c in dim_cols:
        if c not in dim.columns:
            dim[c] = np.nan
    dim = dim[dim_cols]
else:
    dim = pd.DataFrame(columns=dim_cols)

# Left-join to see which keys are already mapped
df = df.merge(dim, on="merchant_key", how="left", suffixes=("", "_dim"))

# Identify unmapped merchants
unmapped_keys = sorted(k for k in df.loc[df["display_name"].isna(), "merchant_key"].unique() if k != "UNKNOWN")
print(f"Unmapped merchants needing AI labels: {len(unmapped_keys)}")


Unmapped merchants needing AI labels: 0


In [92]:
# SINGLE-MERCHANT LABELING (robust)
import re, json, ast
from tenacity import retry, stop_after_attempt, wait_exponential

SYSTEM = (
    "You are a financial data labeling assistant.\n"
    "Given ONE merchant_key, output a single JSON object with fields:\n"
    "merchant_key (echo EXACTLY), display_name (string), category (string), subcategory (string), tags (array of 1-5 short strings).\n"
    "Categories: Dining, Groceries, Gas, Utilities, Subscriptions, Shopping, Travel, Health, Entertainment, Education, Income, Transfers, Fees, Misc.\n"
    "display_name should be human-friendly (e.g., 'ARCO', 'APPLEBEE'S').\n"
    "Return ONLY JSON. No code fences, no commentary."
)

def _salvage_json_object(txt: str):
    """Try hard to recover a single JSON object from a messy string."""
    t = txt.strip()
    # strip code fences if present
    if t.startswith("```"):
        t = re.sub(r"^```(?:json)?", "", t, flags=re.IGNORECASE).strip()
        t = re.sub(r"```$", "", t).strip()
    # direct parse
    try:
        obj = json.loads(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    # find largest {...} block
    start = t.find("{")
    end = t.rfind("}")
    if start != -1 and end != -1 and end > start:
        candidate = t[start:end+1]
        try:
            obj = json.loads(candidate)
            if isinstance(obj, dict):
                return obj
        except Exception:
            pass
    # last resort: python-ish literal
    try:
        obj = ast.literal_eval(t)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass
    raise RuntimeError(f"Failed to parse single-object JSON:\n{t[:400]}")

@retry(stop=stop_after_attempt(5), wait=wait_exponential(min=1, max=12))
def azure_label_one(mk: str):
    """Label exactly one merchant_key with strict JSON, resilient to noise."""
    if client is None:
        return None
    user = (
        "Label this merchant_key and return ONLY a single JSON object:\n"
        '{ "merchant_key": "...", "display_name":"...", "category":"...", "subcategory":"...", "tags":[...] }\n\n'
        f'merchant_key: "{mk}"'
    )
    resp = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=[{"role":"system","content": SYSTEM}, {"role":"user","content": user}],
        temperature=0,
        max_tokens=200,
        response_format={"type": "json_object"},   # strongly nudges valid JSON
    )
    raw = resp.choices[0].message.content
    obj = _salvage_json_object(raw)
    # Coerce + fill
    out = {
        "merchant_key": mk,  # echo exactly
        "display_name": str(obj.get("display_name", mk)).upper().strip(),
        "category": str(obj.get("category", "")),
        "subcategory": str(obj.get("subcategory", "")),
        "tags": obj.get("tags", []),
    }
    # normalize tags into CSV (safe)
    if not isinstance(out["tags"], list):
        out["tags"] = []
    out["tags"] = [str(t).strip() for t in out["tags"] if str(t).strip()]
    return out


In [93]:
new_rows = []
if len(unmapped_keys) and client is not None and MAP_ALL:
    print(f"Labeling {len(unmapped_keys)} merchants (single-call mode)...")
    for idx, mk in enumerate(unmapped_keys, 1):
        try:
            item = azure_label_one(mk)
        except Exception as e:
            print(f"⚠️ Label fail for '{mk}': {e}")
            continue

        now = datetime.utcnow().isoformat()
        if item:
            new_rows.append({
                "merchant_key": mk,
                "display_name": item["display_name"],
                "category": item["category"],
                "subcategory": item["subcategory"],
                "tags": ",".join(item["tags"]),
                "source": "azure",
                "confidence": 0.90,
                "last_updated": now
            })

    if new_rows:
        dim_new = pd.DataFrame(new_rows)
        dim_all = pd.concat([dim, dim_new], ignore_index=True)
        dim_all = dim_all.sort_values("last_updated").drop_duplicates(["merchant_key"], keep="last")
        MERCHANT_DIM_PATH.parent.mkdir(parents=True, exist_ok=True)
        dim_all.to_csv(MERCHANT_DIM_PATH, index=False)
        dim = dim_all
        print(f"✅ Added {len(new_rows)} merchant mappings (single-call).")
    else:
        print("No new mappings added (single-call).")
else:
    print("No new mappings needed or AI disabled.")


No new mappings needed or AI disabled.


In [101]:
# --- Cell 6B: Persist merchants_dim.csv (idempotent) ---

# Toggle if you ever want to skip writing on runs with no changes
PERSIST_MERCHANT_DIM = True

# dim_cols defined in Cell 4; dim may be updated in Cell 6
if not isinstance(PERSIST_MERCHANT_DIM, bool):
    PERSIST_MERCHANT_DIM = True

if PERSIST_MERCHANT_DIM:
    MERCHANT_DIM_PATH.parent.mkdir(parents=True, exist_ok=True)

    if 'dim' in globals() and isinstance(dim, pd.DataFrame) and len(dim):
        # ensure expected columns/order exist before save
        for c in dim_cols:
            if c not in dim.columns:
                dim[c] = np.nan
        dim = dim[dim_cols]

        dim.to_csv(MERCHANT_DIM_PATH, index=False)
        print(f"📝 merchants_dim.csv saved ({len(dim)} rows) → {MERCHANT_DIM_PATH}")
    else:
        # either no new mappings this run or dim was empty; ensure file exists
        if not MERCHANT_DIM_PATH.exists():
            pd.DataFrame(columns=dim_cols).to_csv(MERCHANT_DIM_PATH, index=False)
            print(f"📝 Created headers-only merchants_dim.csv → {MERCHANT_DIM_PATH}")
        else:
            print("ℹ️ merchants_dim.csv already exists; no changes to sync.")
else:
    print("PERSIST_MERCHANT_DIM=False → skipping merchants_dim.csv persistence.")

📝 Created headers-only merchants_dim.csv → C:\Users\kosis\Downloads\Automation\spending-dashboard\config\merchants_dim.csv


In [None]:
df = df.drop(columns=["display_name","category","subcategory","tags","source","confidence","last_updated"], errors="ignore")
df = df.merge(dim, on="merchant_key", how="left", suffixes=("", "_dim"))

# Final output columns (feel free to adjust ordering)
final_cols = [
    "txn_uid", "date", "bank_name", "card_name",
    "merchant_key", "display_name",
    "category", "subcategory", "tags",
    "name", "merchant_name", "amount"
]
# Ensure existence even if null
for c in final_cols:
    if c not in df.columns:
        df[c] = np.nan

# Canonical display name fallback
df["display_name"] = df["display_name"].fillna(df["merchant_key"])

print("Labels joined.")


Labels joined.


In [95]:
# --- Cell 8 (fixed): Subscription detection (idempotent, no merge/apply warnings) ---

def detect_subscription(group: pd.DataFrame) -> bool:
    g = group.dropna(subset=["date", "amount"]).sort_values("date")
    if len(g) < 3:
        return False

    # positive outflows only
    amounts = g["amount"].to_numpy(dtype=float)
    amounts = amounts[np.isfinite(amounts)]
    if amounts.size < 3:
        return False

    # gaps in days (datetime64[ns] -> int ns -> days)
    ts_ns = g["date"].astype("int64").to_numpy()
    gaps_days = np.diff(ts_ns) / 86_400_000_000_000
    if gaps_days.size < 2:
        return False

    monthlyish_med = float(np.median(gaps_days))
    frac_monthly = float(np.mean((gaps_days >= 27) & (gaps_days <= 33))) if gaps_days.size else 0.0

    mu = float(np.mean(amounts))
    if mu <= 0:
        return False
    cv = float(np.std(amounts) / (mu + 1e-9))

    return (27 <= monthlyish_med <= 33) and (frac_monthly >= 0.6) and (cv <= 0.2)

# Clean any leftover artifacts from previous runs (e.g., is_subscription_x from merges)
for col in [c for c in df.columns if c.startswith("is_subscription") and c != "is_subscription"]:
    df.drop(columns=col, inplace=True, errors="ignore")

# Compute a per-display_name subscription flag without merge/apply warnings
subs_map = {}
pos = df.loc[(df["amount"] > 0) & df["date"].notna(), ["display_name", "date", "amount"]]
for disp, g in pos.groupby("display_name", dropna=False):
    try:
        subs_map[disp] = bool(detect_subscription(g[["date", "amount"]]))
    except Exception:
        subs_map[disp] = False

# Assign deterministically; idempotent across runs
df["is_subscription"] = df["display_name"].map(subs_map).fillna(False).astype(bool)

print(f"Subscriptions flagged: {int(df['is_subscription'].sum())} candidates.")


Subscriptions flagged: 0 candidates.


  df["is_subscription"] = df["display_name"].map(subs_map).fillna(False).astype(bool)


In [96]:
def zscores(x):
    mu = np.mean(x)
    sd = np.std(x)
    if sd == 0:
        return np.zeros_like(x)
    return (x - mu) / sd

df["amount_abs"] = df["amount"].abs()
df["z_by_merchant"] = (
    df.groupby("display_name", dropna=False)["amount_abs"]
      .transform(zscores)
)
df["is_anomaly"] = (df["z_by_merchant"] >= ANOMALY_Z)

print(f"Anomalies flagged: {int(df['is_anomaly'].sum())}")


Anomalies flagged: 1


In [97]:
today = pd.Timestamp(date.today())
cut1 = today - pd.Timedelta(days=30)
cut2 = today - pd.Timedelta(days=60)

cur = df[(df["date"] > cut1) & (df["amount"] > 0)]
prev = df[(df["date"] > cut2) & (df["date"] <= cut1) & (df["amount"] > 0)]

cur_total = cur["amount"].sum()
prev_total = prev["amount"].sum()
delta = cur_total - prev_total

top_merchants = (
    cur.groupby("display_name", dropna=False)["amount"].sum()
       .sort_values(ascending=False)
       .head(3)
)

top_category = (
    cur.groupby("category", dropna=False)["amount"].sum()
       .sort_values(ascending=False)
       .head(1)
)
top_category_name = top_category.index[0] if len(top_category) else "N/A"
top_category_amt = float(top_category.iloc[0]) if len(top_category) else 0.0

digest = []
digest.append(f"Period: last 30 days vs prior 30")
digest.append(f"Spend: ${cur_total:,.2f} ({'+' if delta>=0 else ''}{delta:,.2f} vs prior)")
digest.append("Top 3 merchants: " + ", ".join([f"{m} (${v:,.2f})" for m, v in top_merchants.items()]))
digest.append(f"Biggest category driver: {top_category_name} (${top_category_amt:,.2f})")

DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
with open(DIGEST_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(digest))

print("\n".join(digest))
print(f"\nSaved digest → {DIGEST_PATH}")


Period: last 30 days vs prior 30
Spend: $5,822.81 (+2,546.55 vs prior)
Top 3 merchants: WITHDRAWAL ALLY TYPE: ALLY PAYMT ID: CO: ALLY NAME: KOSISONNA UGOCHUKW %% ACH ECC WEB %% ACH TRACE ($1,494.22), WITHDRAWAL AMEX EPAYMENT TYPE: ACH PMT ID: DATA: ER AM CO: AMEX EPAYMENT NAME: KOSISONNA UGOCHUKWU %% ACH ECC WEB %% ACH TRACE ($777.78), PETAL ($738.96)
Biggest category driver: nan ($5,822.81)

Saved digest → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\digest_latest.txt


In [98]:
# Suggest % cuts in top categories to reach GOAL_SAVINGS over next 30 days
cur_by_cat = (
    df[(df["date"] > cut1) & (df["amount"] > 0)]
      .groupby("category", dropna=False)["amount"].sum()
      .sort_values(ascending=False)
)

nudges = []
remaining = GOAL_SAVINGS
for cat, amt in cur_by_cat.items():
    if remaining <= 0:
        break
    # propose cutting up to 40% of this category
    max_cut = 0.40 * amt
    if max_cut <= 0:
        continue
    pct_needed = min(remaining / amt, 0.40)  # cap at 40%
    if pct_needed > 0:
        nudges.append((cat, pct_needed))
        remaining -= pct_needed * amt

lines = [f"Goal: Save ${GOAL_SAVINGS:,.0f} next 30 days"]
if nudges:
    for (cat, pct) in nudges:
        lines.append(f"- Cut {cat} by {pct*100:.0f}%")
else:
    lines.append("- Spending already low or insufficient category concentration to suggest cuts.")

with open(GOAL_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("\n".join(lines))
print(f"\nSaved goal nudges → {GOAL_PATH}")


Goal: Save $1,000 next 30 days
- Cut nan by 17%

Saved goal nudges → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\goal_nudges_latest.txt


In [99]:
# Build text field and store embeddings for semantic search
def build_search_text(row):
    parts = [
        str(row.get("display_name") or ""),
        str(row.get("name") or ""),
        str(row.get("merchant_name") or ""),
        str(row.get("category") or ""),
        str(row.get("subcategory") or ""),
        str(row.get("tags") or ""),
    ]
    return " | ".join(p for p in parts if p)

# Prepare rows (limit to recent for cost-control)
embed_df = df.sort_values("date", ascending=False).head(500).copy()
embed_df["search_text"] = embed_df.apply(build_search_text, axis=1)

# Load existing cache
if EMBEDDINGS_PATH.exists():
    old = pd.read_parquet(EMBEDDINGS_PATH)
else:
    old = pd.DataFrame(columns=["txn_uid","embedding"])

existing = set(old["txn_uid"]) if len(old) else set()
to_embed = embed_df[~embed_df["txn_uid"].isin(existing)][["txn_uid", "search_text"]]

def get_embeddings(texts):
    if client is None:
        return [None for _ in texts]
    # Use the Azure embeddings deployment name from env
    emb_deploy = os.getenv("AZURE_OPENAI_EMBEDDINGS", "")
    if not emb_deploy:
        return [None for _ in texts]

    # New OpenAI client pattern for embeddings under Azure:
    # base_url should be resource; we temporarily create a fresh client pointing to embeddings deployment
    emb_client = OpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        base_url=f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{emb_deploy}",
        default_query={"api-version": AZURE_OPENAI_API_VERSION},
        default_headers={"api-key": AZURE_OPENAI_API_KEY},
    )
    res = emb_client.embeddings.create(model=emb_deploy, input=list(texts))
    return [d.embedding for d in res.data]

new_rows = []
if len(to_embed):
    B = 64
    for i in range(0, len(to_embed), B):
        chunk = to_embed.iloc[i:i+B]
        vecs = get_embeddings(chunk["search_text"].tolist())
        for uid, vec in zip(chunk["txn_uid"].tolist(), vecs):
            if vec is not None:
                new_rows.append({"txn_uid": uid, "embedding": vec})

if new_rows:
    add = pd.DataFrame(new_rows)
    merged = pd.concat([old, add], ignore_index=True).drop_duplicates("txn_uid", keep="last")
    merged.to_parquet(EMBEDDINGS_PATH, index=False)
    print(f"Embeddings cached: +{len(add)} → total {len(merged)}")
else:
    print("No new embeddings added (either none missing or AI disabled).")


Embeddings cached: +1 → total 146


In [100]:
# Reorder and save
save_cols = [
    "txn_uid","date","bank_name","card_name",
    "display_name","merchant_key",
    "category","subcategory","tags",
    "name","merchant_name",
    "amount","is_subscription","is_anomaly","z_by_merchant"
]

for c in save_cols:
    if c not in df.columns:
        df[c] = np.nan

df_out = df[save_cols].sort_values(["date", "bank_name"], ascending=[False, True])

# Write both the stable file (Power BI) and a processed copy
df_out.to_csv(ENRICHED_OUT_PATH, index=False)
df_out.to_csv(ENRICHED_COPY_PATH, index=False)

print(f"✅ Enriched CSV saved → {ENRICHED_OUT_PATH}")
print(f"📄 Copy saved → {ENRICHED_COPY_PATH}")


✅ Enriched CSV saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw\latest.csv
📄 Copy saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\latest_enriched.csv
