In [41]:
# --- Imports ---
import os, re, json, math, hashlib, ast
from pathlib import Path
from datetime import datetime, timedelta, date
from collections import defaultdict
from datetime import datetime, timezone


import numpy as np
import pandas as pd

# Try to ensure OpenAI SDK is available (for Azure OpenAI)
try:
    from openai import OpenAI
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "openai"])
    from openai import OpenAI

# dotenv for local runs
try:
    from dotenv import load_dotenv
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "python-dotenv"])
    from dotenv import load_dotenv

# --- Paths (robust: prefer GITHUB_WORKSPACE, never walk above repo) ---
cwd = Path.cwd().resolve()
gw = os.getenv("GITHUB_WORKSPACE")

# The search starts at the CI workspace when one is advertised, else at the working directory.
if gw:
    start = Path(gw).resolve()
else:
    start = cwd

# Nearest directory (start first, then its ancestors) containing a .git entry;
# when none is found the starting directory itself is used.
repo_root = start
for _ancestor in (start, *start.parents):
    if (_ancestor / ".git").exists():
        repo_root = _ancestor
        break
REPO = repo_root

# Top-level project folders.
DATA_RAW = REPO.joinpath("data", "raw")
DATA_PROCESSED = REPO.joinpath("data", "processed")
CONFIG_DIR = REPO.joinpath("config")
STATE_DIR = REPO.joinpath(".state")
VECTOR_DIR = REPO.joinpath("vectorstore")

# Individual files. ENRICHED_OUT_PATH deliberately equals LATEST_CSV_PATH:
# the enriched output overwrites the stable file for Power BI.
MERCHANT_DIM_PATH = CONFIG_DIR.joinpath("merchants_dim.csv")
LATEST_CSV_PATH = DATA_RAW.joinpath("latest.csv")
ENRICHED_OUT_PATH = DATA_RAW.joinpath("latest.csv")
ENRICHED_COPY_PATH = DATA_PROCESSED.joinpath("latest_enriched.csv")
DIGEST_PATH = DATA_PROCESSED.joinpath("digest_latest.txt")
GOAL_PATH = DATA_PROCESSED.joinpath("goal_nudges_latest.txt")
EMBEDDINGS_PATH = VECTOR_DIR.joinpath("embeddings.parquet")

# --- Ensure dirs ---
for _folder in (DATA_RAW, DATA_PROCESSED, VECTOR_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

# --- Config flags ---
MAP_ALL = True              # map any merchant missing from dimension
GOAL_SAVINGS = 1000.0       # target monthly savings for "goal nudges"
ANOMALY_Z = 2.5             # z-score threshold for anomalies

# --- Load local envs if present (for dev runs) ---
# override=True: values from these files replace variables already in the environment,
# and the repo-level .env (loaded last) wins over scripts/.env.
for p in [REPO / "scripts" / ".env", REPO / ".env"]:
    if p.exists():
        load_dotenv(p, override=True)

# --- Azure OpenAI env ---
# The endpoint is normalised (whitespace and trailing "/" removed) because URLs below are
# built as f"{endpoint}/openai/...". An endpoint ending in "/" would otherwise yield
# "//openai/..." in the request path.
AZURE_OPENAI_ENDPOINT   = os.getenv("AZURE_OPENAI_ENDPOINT", "").strip().rstrip("/")
AZURE_OPENAI_API_KEY    = os.getenv("AZURE_OPENAI_API_KEY", "")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT", "")  # chat model deployment name
AZURE_OPENAI_EMBEDDINGS = os.getenv("AZURE_OPENAI_EMBEDDINGS", "")  # embeddings deployment name
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview")

# Chat calls need endpoint + key + chat deployment; evaluated once so the warning and the
# client construction can never disagree.
_azure_chat_ready = bool(AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY and AZURE_OPENAI_DEPLOYMENT)

if not _azure_chat_ready:
    print("⚠️ Azure OpenAI env not fully set. AI labeling will be skipped.")

# Build OpenAI (Azure) client if possible (used only if this notebook calls it directly)
client = None
if _azure_chat_ready:
    client = OpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        base_url=f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{AZURE_OPENAI_DEPLOYMENT}",
        default_query={"api-version": AZURE_OPENAI_API_VERSION},
        default_headers={"api-key": AZURE_OPENAI_API_KEY},
    )

# --- Load shared utils notebook (keeps everything .ipynb) ---
UTILS_NOTEBOOK = REPO / "scripts" / "ai_utils.ipynb"

if not UTILS_NOTEBOOK.exists():
    print(f"⚠️ Utils notebook not found at {UTILS_NOTEBOOK}. Skipping.")
else:
    # Best effort: any failure (including running outside IPython, where get_ipython()
    # returns None) is reported and the setup continues.
    try:
        import IPython
        _shell = IPython.get_ipython()
        _shell.run_line_magic("run", str(UTILS_NOTEBOOK))
        print(f"📚 Loaded utils from {UTILS_NOTEBOOK}")
    except Exception as e:
        print(f"⚠️ Could not %run {UTILS_NOTEBOOK}: {e}")

print("✅ Setup complete.")

📚 Loaded utils from C:\Users\kosis\Downloads\Automation\spending-dashboard\scripts\ai_utils.ipynb
✅ Setup complete.


In [42]:
# --- Merchant labeling (robust JSON mode, stable schema, timezone-aware) ---

from datetime import datetime, timezone
import json, re, time
from tenacity import retry, wait_exponential, stop_after_attempt

# Keep the dimension file anchored to the repo-level config directory resolved by the setup
# cell. A bare Path("config/merchants_dim.csv") is resolved against the current working
# directory, so the file would be read and written in a different place depending on where
# the notebook is launched. The setup cell is already a prerequisite here (it imports Path).
MERCHANT_DIM_PATH = CONFIG_DIR / "merchants_dim.csv"
BATCH = 20       # merchant keys sent per labeling request
MAP_ALL = True   # when False, label_new_merchants() labels nothing

# Stable schema prevents concat FutureWarnings
# (column name -> pandas dtype used for both the stored table and each new chunk)
SCHEMA = {
    "merchant_key": "string",
    "display_name": "string",
    "category": "string",
    "subcategory": "string",
    "tags": "string",
    "source": "string",
    "confidence": "float64",
    "last_updated": "string",
}

def _ensure_merchants_dim():
    """Return the merchant dimension table typed according to SCHEMA.

    When MERCHANT_DIM_PATH exists it is read, any SCHEMA column it lacks is added
    (empty), and the frame is cast to the SCHEMA dtypes. Otherwise the parent
    directory is created and an empty frame with the SCHEMA columns is returned;
    the file itself is not written here.
    """
    if not MERCHANT_DIM_PATH.exists():
        MERCHANT_DIM_PATH.parent.mkdir(parents=True, exist_ok=True)
        blank_columns = {name: pd.Series(dtype=kind) for name, kind in SCHEMA.items()}
        return pd.DataFrame(blank_columns)

    table = pd.read_csv(MERCHANT_DIM_PATH)
    absent = [name for name in SCHEMA if name not in table.columns]
    for name in absent:
        table[name] = pd.Series(dtype=SCHEMA[name])
    return table.astype(SCHEMA, copy=False)

def _parse_labels_strict_or_salvage(txt: str):
    txt = txt.strip()
    # Accept {"items":[...]} or [...]
    try:
        obj = json.loads(txt)
        if isinstance(obj, dict) and "items" in obj and isinstance(obj["items"], list):
            return obj["items"]
        if isinstance(obj, list):
            return obj
    except Exception:
        pass
    # Salvage: try longest [...] block
    m = re.findall(r"\[[\s\S]*\]", txt)
    if m:
        for cand in reversed(m):
            try:
                return json.loads(cand)
            except Exception:
                continue
    raise RuntimeError(f"Failed to parse AI JSON (first 400 chars):\n{txt[:400]}")

@retry(wait=wait_exponential(multiplier=1, min=1, max=20), stop=stop_after_attempt(5))
def azure_label_batch(keys_batch):
    """Ask the chat deployment to label one batch of merchant keys in JSON mode.

    Each key is stringified and cut to 100 characters before being sent. The
    reply is parsed by ``_parse_labels_strict_or_salvage`` and returned as a
    list of items. Any exception triggers the tenacity retry policy above
    (up to 5 attempts with exponential backoff).

    NOTE(review): ``azure_chat_client`` is not defined in this notebook —
    presumably it comes from the utils notebook loaded during setup; confirm.
    NOTE(review): a later cell defines another ``azure_label_batch``; once that
    cell has run, that definition replaces this one.
    """
    # shrink to avoid truncation
    trimmed_keys = []
    for key in keys_batch:
        trimmed_keys.append(str(key)[:100])

    instructions = (
        "You are labeling merchant identifiers for a personal finance dashboard.\n"
        "For each merchant_key, produce fields: merchant_key, display_name, category, subcategory, tags.\n"
        "- display_name: short human-friendly name (e.g., 'ARCO', 'Apple Card').\n"
        "- category: one of Dining, Groceries, Gas, Shopping, Utilities, Subscriptions, Transfers, "
        "Income, Health, Travel, Entertainment, Education, Fees, Misc.\n"
        "- subcategory: specific subtype (e.g., 'Gas Station', 'Fast Casual', 'Internet Service').\n"
        "- tags: array of 1–5 lowercase keywords (e.g., ['gas','fuel']).\n"
        'Return ONLY JSON in this exact shape: {"items":[{...}]} with no extra commentary.'
    )
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": json.dumps({"merchant_keys": trimmed_keys})},
    ]

    response = azure_chat_client().chat.completions.create(
        model=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
        messages=conversation,
        temperature=0,
        max_tokens=1400,
        response_format={"type": "json_object"},  # strict JSON
    )
    reply_text = response.choices[0].message.content
    return _parse_labels_strict_or_salvage(reply_text)

def label_new_merchants(df, merchant_key_col="merchant_key"):
    """Label merchant keys missing from the dimension table and persist them.

    Keys present in ``df[merchant_key_col]`` but absent from the stored
    dimension are sent to ``azure_label_batch`` in batches of ``BATCH``. After
    each batch the table is de-duplicated on ``merchant_key`` (latest
    ``last_updated`` wins) and written to ``MERCHANT_DIM_PATH``.

    Args:
        df: Transactions frame.
        merchant_key_col: Column holding the normalized merchant key.

    Returns:
        int: Number of merchant keys that were not in the dimension before
        this call and now are. Returns 0 when the column is missing,
        ``MAP_ALL`` is false, or nothing is unmapped.

    Raises:
        Whatever ``azure_label_batch`` raises once its retries are exhausted.
    """
    md = _ensure_merchants_dim()
    if merchant_key_col not in df.columns:
        print(f"Column '{merchant_key_col}' not in dataframe; skipping labeling.")
        return 0

    known = set(md["merchant_key"].astype(str)) if len(md) > 0 else set()
    # dropna: a missing key would otherwise be stringified to "nan" and sent for labeling.
    candidates = sorted(set(df[merchant_key_col].dropna().astype(str)) - known)
    if not MAP_ALL or not candidates:
        print("No new merchants to label.")
        return 0

    def _text(value):
        # JSON null arrives as None; str(None) would store the literal "None".
        return "" if value is None else str(value).strip()

    added = 0
    for i in range(0, len(candidates), BATCH):
        batch = candidates[i:i+BATCH]
        items = azure_label_batch(batch) or []
        now = datetime.now(timezone.utc).isoformat()  # timezone-aware

        rows = []
        for it in items:
            if not isinstance(it, dict):
                # Malformed element (e.g. a bare string); skip rather than crash the batch.
                continue
            mk = _text(it.get("merchant_key"))
            if not mk:
                continue
            tags_val = it.get("tags", [])
            if isinstance(tags_val, list):
                tags_csv = ",".join(_text(t) for t in tags_val if _text(t))
            elif isinstance(tags_val, str):
                tags_csv = tags_val.strip()
            else:
                tags_csv = ""
            rows.append({
                "merchant_key": mk,
                # Fall back to the key when the name is missing, null or blank.
                "display_name": (_text(it.get("display_name")) or mk).upper(),
                "category": _text(it.get("category")),
                "subcategory": _text(it.get("subcategory")),
                "tags": tags_csv,
                "source": "azure",
                "confidence": 0.90,
                "last_updated": now
            })

        if rows:
            chunk = pd.DataFrame(rows)
            # ensure schema for both frames
            for col, dt in SCHEMA.items():
                if col not in chunk.columns:
                    chunk[col] = pd.Series(dtype=dt)
            chunk = chunk.astype(SCHEMA, copy=False)
            md = md.astype(SCHEMA, copy=False)

            md = pd.concat([md, chunk], ignore_index=True)
            md = md.sort_values("last_updated").drop_duplicates(["merchant_key"], keep="last")
            md.to_csv(MERCHANT_DIM_PATH, index=False)

            # Count only keys that are genuinely new, so duplicates in the reply
            # do not inflate the total.
            fresh = set(chunk["merchant_key"].astype(str)) - known
            known |= fresh
            added += len(fresh)
            print(f"Added {len(fresh)} merchant mappings (running total {added}).")

        time.sleep(0.1)  # brief pause between requests

    return added


In [43]:
# Load latest.csv (from build_latest.ipynb), robust path resolution
candidates = [
    LATEST_CSV_PATH,
    Path(os.getenv("OUTPUT_DIR", str(REPO / "data" / "raw"))) / "latest.csv",
    REPO / "data" / "raw" / "latest.csv",
]

# First candidate that exists on disk wins.
src = None
for _candidate in candidates:
    if _candidate.exists():
        src = _candidate
        break

if src is None:
    _checked = "\n- ".join(str(p) for p in candidates)
    raise FileNotFoundError(
        "latest.csv not found.\nChecked:\n- " + _checked +
        f"\nCWD={Path.cwd()}  REPO={REPO}"
    )

df = pd.read_csv(src)

# Fail early when a required input column is absent.
expected = {"date","name","merchant_name","category","amount","bank_name"}
missing = expected - set(df.columns)
if missing:
    raise ValueError(f"latest.csv missing columns: {missing}")

# card_name is optional in the input; default it to the bank name.
if "card_name" not in df.columns:
    df["card_name"] = df["bank_name"]

# Unparseable dates / amounts become NaT / NaN instead of raising.
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")

# Missing text fields become empty strings.
for _text_col in ("merchant_name", "name"):
    df[_text_col] = df[_text_col].fillna("")

# A robust unique id for each transaction (for embeddings & caching)
def make_txn_uid(row):
    """Return the SHA-1 hex digest identifying one transaction row.

    The digest covers the row's date, name, merchant_name, amount and bank_name
    (looked up with ``row.get``, so absent fields contribute ``None``), joined
    with underscores. Rows agreeing on all five fields share the same id.
    """
    fields = ("date", "name", "merchant_name", "amount", "bank_name")
    fingerprint = "_".join(f"{row.get(field)}" for field in fields)
    digest = hashlib.sha1(fingerprint.encode("utf-8"))
    return digest.hexdigest()

# Hash every row into a deterministic id via make_txn_uid; rows that agree on
# date, name, merchant_name, amount and bank_name receive the same txn_uid.
df["txn_uid"] = df.apply(make_txn_uid, axis=1)

print(f"Loaded {len(df)} transactions.")


Loaded 147 transactions.


In [44]:
# Normalize noisy merchant strings into a stable 'merchant_key'
# Use 'merchant_name' when available, else 'name'
def normalize_merchant_key(txt: str) -> str:
    """Reduce a noisy merchant string to a stable, upper-case key.

    Steps: upper-case; drop runs of 2+ digits; turn ``- _ / # *`` into spaces;
    collapse whitespace; drop a trailing two-letter token (state/locale suffix
    such as "NV"); remove "APPLE PAY" / "GOOGLE PAY"; collapse whitespace again.

    Args:
        txt: Raw merchant text. ``None`` and NaN are treated as empty; other
            non-string values are stringified.

    Returns:
        The normalized key, or ``"UNKNOWN"`` when nothing is left.
    """
    if txt is None or (isinstance(txt, float) and txt != txt):  # NaN != NaN
        return "UNKNOWN"

    t = str(txt).upper().strip()
    # Remove common noise: excessive spaces, digits, #, store ids, etc.
    t = re.sub(r"\d{2,}", "", t)              # drop long digit runs
    t = re.sub(r"[-_/#*]+", " ", t)           # separators -> space
    t = re.sub(r"\s{2,}", " ", t).strip()

    # Drop locale suffixes like "NV", "CA" at the end if present — but never when the
    # suffix is the entire name, otherwise merchants such as "BP" collapse to "UNKNOWN".
    without_suffix = re.sub(r"\b([A-Z]{2})\b$", "", t).strip()
    if without_suffix:
        t = without_suffix

    # Collapse APPLE PAY / GOOGLE PAY hints, then tidy the gap they may leave mid-string.
    t = t.replace("APPLE PAY", "").replace("GOOGLE PAY", "")
    t = re.sub(r"\s{2,}", " ", t).strip()

    # Fallback
    return t or "UNKNOWN"

# Normalize both source columns, then pick per row: merchant_name when it is
# non-empty, otherwise the raw transaction name.
_has_merchant_name = df["merchant_name"].str.len() > 0
_key_from_merchant = df["merchant_name"].apply(normalize_merchant_key)
_key_from_name = df["name"].apply(normalize_merchant_key)

df["merchant_key"] = np.where(_has_merchant_name, _key_from_merchant, _key_from_name)

print("Merchant keys normalized.")


Merchant keys normalized.


In [45]:
# Load or initialize merchant dimension table
dim_cols = [
    "merchant_key", "display_name", "category", "subcategory", "tags",
    "source", "confidence", "last_updated"
]

if not MERCHANT_DIM_PATH.exists():
    # No dimension file yet: start from an empty table with the expected columns.
    dim = pd.DataFrame(columns=dim_cols)
else:
    dim = pd.read_csv(MERCHANT_DIM_PATH)
    # Add any expected column the file lacks, then keep only the expected ones, in order.
    for c in [col for col in dim_cols if col not in dim.columns]:
        dim[c] = np.nan
    dim = dim[dim_cols]

# Left-join to see which keys are already mapped
# (columns already present in df keep their name; the dimension's copy gets "_dim")
df = df.merge(dim, on="merchant_key", how="left", suffixes=("", "_dim"))

# Identify unmapped merchants: rows without a display_name, ignoring the UNKNOWN placeholder
_lacks_label = df["display_name"].isna()
unmapped_keys = sorted(
    k for k in df.loc[_lacks_label, "merchant_key"].unique() if k != "UNKNOWN"
)
print(f"Unmapped merchants needing AI labels: {len(unmapped_keys)}")


Unmapped merchants needing AI labels: 0


In [46]:
import json
from tenacity import retry, stop_after_attempt, wait_exponential

# System prompt sent as the first message of every labeling request made by the
# azure_label_batch defined in this cell. It asks for display_name, category,
# subcategory and tags per merchant key; the user prompt (build_user_prompt)
# carries the keys themselves and the required JSON-array output shape.
SYSTEM = (
    "You are a financial data labeling assistant. "
    "For each merchant_key, produce concise JSON objects with fields: "
    "display_name (string), category (string), subcategory (string), tags (list of short strings). "
    "Use US personal finance categories like Dining, Groceries, Gas, Utilities, Subscriptions, Travel, Health, Shopping, Income, Transfers. "
    "Keep display_name human-friendly (e.g., 'APPLEBEE'S', 'PANDA EXPRESS'). "
    "When uncertain, make your best guess."
)

def build_user_prompt(merchant_keys):
    """Build the user message listing the merchant keys to label.

    The message states the required output (a JSON array of objects with
    merchant_key, display_name, category, subcategory, tags) followed by one
    quoted bullet per key.
    """
    # Callers batch the keys, so the list stays short.
    preamble = (
        "Label the following merchant keys. Return ONLY a valid JSON array where each item is:\n"
        "{merchant_key, display_name, category, subcategory, tags}\n\n"
    )
    bullets = [f'- "{key}"' for key in merchant_keys]
    return preamble + "MERCHANT_KEYS:\n" + "\n".join(bullets)

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=8))
def azure_label_batch(keys_batch):
    """Label one batch of merchant keys with the module-level ``client``.

    NOTE(review): this reuses the name of an ``azure_label_batch`` defined in an
    earlier cell; whichever cell ran last is the one callers get.

    Args:
        keys_batch: Iterable of merchant keys.

    Returns:
        list: Parsed label items; ``[]`` when ``client`` is None. A single
        object reply is wrapped in a list, and a ``{"items": [...]}`` envelope
        is unwrapped to its list.

    Raises:
        RuntimeError: If the reply is neither JSON nor a Python literal. Any
        exception is retried up to 3 attempts by the decorator.
    """
    if client is None:
        return []
    msg = [
        {"role":"system", "content": SYSTEM},
        {"role":"user", "content": build_user_prompt(keys_batch)}
    ]
    resp = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=msg,
        temperature=0.2,
        max_tokens=1200
    )
    # content may be None; treat that as an empty (unparseable) reply instead of
    # failing on .strip() with an AttributeError.
    txt = (resp.choices[0].message.content or "").strip()
    # Sometimes models wrap in code fences—strip them
    if txt.startswith("```"):
        txt = re.sub(r"^```(json)?", "", txt, flags=re.IGNORECASE).strip()
        txt = re.sub(r"```$", "", txt).strip()

    try:
        parsed = json.loads(txt)
    except Exception as e:
        # Last resort: the reply may be Python-literal syntax (single quotes, True/None).
        # literal_eval only evaluates literals, never arbitrary code.
        try:
            parsed = ast.literal_eval(txt)
        except Exception:
            raise RuntimeError(f"Failed to parse AI JSON:\n{txt}") from e

    if isinstance(parsed, dict):
        # Unwrap an {"items": [...]} envelope; wrapping it whole would produce one bogus
        # item without a merchant_key and silently label nothing.
        envelope = parsed.get("items")
        parsed = envelope if isinstance(envelope, list) else [parsed]
    return parsed


In [55]:
# AI steps run only when all four Azure settings are present in the environment
# and a transactions frame named df has been loaded.
_REQUIRED_AZURE_ENV = (
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_DEPLOYMENT",
    "AZURE_OPENAI_EMBEDDINGS",
)
USE_AZURE = all(os.getenv(k) for k in _REQUIRED_AZURE_ENV)

if not (USE_AZURE and 'df' in globals()):
    print("Azure not configured or df missing; skipping AI steps.")
else:
    # Each step is isolated so a failure in one does not prevent the other.
    try:
        added = label_new_merchants(df)
        print(f"AI labeling done. New merchants added: {added}")
    except Exception as e:
        print("⚠️ Labeling failed (final):", e)

    try:
        embedded = build_embeddings(df)
        print(f"Embeddings built: {embedded}")
    except Exception as e:
        print("⚠️ Embeddings failed (final):", e)

No new merchants to label.
AI labeling done. New merchants added: 0
Wrote 147 embeddings (dim 3072) → vectorstore\embeddings.parquet
Embeddings built: 147


In [48]:
# Join the merchant labels onto the transactions.
#
# The dimension table is re-read from disk first: label_new_merchants() appends new
# mappings to MERCHANT_DIM_PATH, so a `dim` frame loaded before labeling is stale and
# would leave freshly labeled merchants without display_name/category in the output.
_label_cols = ["display_name", "category", "subcategory", "tags", "source", "confidence", "last_updated"]
if MERCHANT_DIM_PATH.exists():
    dim = pd.read_csv(MERCHANT_DIM_PATH).reindex(columns=["merchant_key", *_label_cols])
# One row per key, so the left join can never multiply transaction rows.
dim = dim.drop_duplicates(subset="merchant_key", keep="last")

# Remove label columns left by an earlier join (or by the raw input), then join fresh ones.
df = df.drop(columns=_label_cols, errors="ignore")
df = df.merge(dim, on="merchant_key", how="left", suffixes=("", "_dim"))

# Final output columns (feel free to adjust ordering)
final_cols = [
    "txn_uid", "date", "bank_name", "card_name",
    "merchant_key", "display_name",
    "category", "subcategory", "tags",
    "name", "merchant_name", "amount"
]
# Ensure existence even if null
for c in final_cols:
    if c not in df.columns:
        df[c] = np.nan

# Canonical display name fallback: unmapped merchants show their normalized key
df["display_name"] = df["display_name"].fillna(df["merchant_key"])

print("Labels joined.")


Labels joined.


In [49]:
# --- Cell 8: Subscription detection (robust, version-proof) ---

def detect_subscription(group: pd.DataFrame) -> bool:
    """Decide whether one merchant's transactions look like a monthly subscription.

    Args:
        group: Frame with ``date`` and ``amount`` columns for a single merchant.
            The function does not filter by sign; callers pass expense rows only.

    Returns:
        True when, over at least 3 valid rows, the median gap between consecutive
        payments is 27-33 days, at least 60% of gaps fall in that range, the mean
        amount is positive, and the coefficient of variation of amounts is <= 0.2.
    """
    # Sort and keep only rows with valid date & amount
    g = group.dropna(subset=["date", "amount"]).sort_values("date")
    if len(g) < 3:
        return False

    amounts = g["amount"].to_numpy(dtype=float)
    amounts = amounts[np.isfinite(amounts)]  # also drops +/-inf
    if amounts.size < 3:
        return False

    # Inter-payment gaps in days. Converting to datetime64[ns] and dividing the differences
    # by one day is independent of the column's stored resolution (s/ms/us/ns) and also
    # handles object-dtype columns of timestamps; a raw astype("int64") is only correct
    # when the column happens to be nanosecond-resolution.
    stamps = pd.to_datetime(g["date"]).to_numpy(dtype="datetime64[ns]")
    gaps_days = np.diff(stamps) / np.timedelta64(1, "D")
    if gaps_days.size < 2:
        return False

    # Monthly-ish cadence and amount consistency
    monthlyish_med = float(np.median(gaps_days))
    frac_monthly = float(np.mean((gaps_days >= 27) & (gaps_days <= 33)))

    mu = float(np.mean(amounts))
    if mu <= 0:
        return False
    cv = float(np.std(amounts) / (mu + 1e-9))  # coefficient of variation

    return (27 <= monthlyish_med <= 33) and (frac_monthly >= 0.6) and (cv <= 0.2)

# Evaluate detect_subscription once per display_name, on positive-amount rows only.
# Only the needed columns are selected before grouping to avoid the pandas future warning.
_positive_rows = df.loc[df["amount"] > 0, ["display_name", "date", "amount"]]
_per_merchant = _positive_rows.groupby("display_name")[["date", "amount"]]
subs_series = _per_merchant.apply(detect_subscription).rename("is_subscription")

# Map back to the frame (no merge → no duplicate columns).
# Names without a verdict map to missing; the nullable boolean step turns those into False
# before the final cast to plain bool for CSV/Power BI.
_flag_nullable = df["display_name"].map(subs_series).astype("boolean")
df["is_subscription"] = _flag_nullable.fillna(False).astype(bool)

print(f"Subscriptions flagged: {int(df['is_subscription'].sum())} candidates.")

Subscriptions flagged: 0 candidates.


In [50]:
def zscores(x):
    """Standardize x to zero mean and unit (population) standard deviation.

    When the standard deviation is exactly 0 (e.g. a single value or all values
    equal) an all-zero array shaped like x is returned instead of dividing by zero.
    """
    center = np.mean(x)
    spread = np.std(x)
    return (x - center) / spread if spread != 0 else np.zeros_like(x)

# Score each transaction's absolute amount against the other transactions of the
# same display_name (missing names form their own group).
df["amount_abs"] = df["amount"].abs()
_abs_by_merchant = df.groupby("display_name", dropna=False)["amount_abs"]
df["z_by_merchant"] = _abs_by_merchant.transform(zscores)

# Only unusually large amounts are flagged (one-sided threshold).
df["is_anomaly"] = df["z_by_merchant"].ge(ANOMALY_Z)

print(f"Anomalies flagged: {int(df['is_anomaly'].sum())}")


Anomalies flagged: 1


In [51]:
# Compare the trailing 30 days with the 30 days before that (positive amounts only).
today = pd.Timestamp(date.today())
cut1 = today - pd.Timedelta(days=30)
cut2 = today - pd.Timedelta(days=60)

_positive = df["amount"] > 0
_in_current = df["date"] > cut1
_in_previous = (df["date"] > cut2) & (df["date"] <= cut1)

cur = df[_in_current & _positive]
prev = df[_in_previous & _positive]

cur_total = cur["amount"].sum()
prev_total = prev["amount"].sum()
delta = cur_total - prev_total

# Three largest merchants and the single largest category in the current window.
_by_merchant = cur.groupby("display_name", dropna=False)["amount"].sum()
top_merchants = _by_merchant.sort_values(ascending=False).head(3)

_by_category = cur.groupby("category", dropna=False)["amount"].sum()
top_category = _by_category.sort_values(ascending=False).head(1)

if len(top_category):
    top_category_name = top_category.index[0]
    top_category_amt = float(top_category.iloc[0])
else:
    top_category_name = "N/A"
    top_category_amt = 0.0

_sign = '+' if delta >= 0 else ''
_merchant_bits = [f"{m} (${v:,.2f})" for m, v in top_merchants.items()]

digest = [
    "Period: last 30 days vs prior 30",
    f"Spend: ${cur_total:,.2f} ({_sign}{delta:,.2f} vs prior)",
    "Top 3 merchants: " + ", ".join(_merchant_bits),
    f"Biggest category driver: {top_category_name} (${top_category_amt:,.2f})",
]

DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
with open(DIGEST_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(digest))

print("\n".join(digest))
print(f"\nSaved digest → {DIGEST_PATH}")


Period: last 30 days vs prior 30
Spend: $5,746.32 (+2,470.06 vs prior)
Top 3 merchants: ALLY PAYMENT ($1,494.22), KOSISONNA UGOCHUKWU ($777.78), PETAL CARD ($738.96)
Biggest category driver: Transfers ($2,839.17)

Saved digest → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\digest_latest.txt


In [52]:
# Suggest % cuts in top categories to reach GOAL_SAVINGS over next 30 days
_recent_spend = df[(df["date"] > cut1) & (df["amount"] > 0)]
cur_by_cat = (
    _recent_spend.groupby("category", dropna=False)["amount"]
    .sum()
    .sort_values(ascending=False)
)

# Walk categories from largest to smallest, taking at most 40% from each,
# until the savings goal is covered.
nudges = []
remaining = GOAL_SAVINGS
for cat, amt in cur_by_cat.items():
    if remaining <= 0:
        break
    max_cut = 0.40 * amt
    if max_cut <= 0:
        continue
    pct_needed = min(remaining / amt, 0.40)  # cap at 40%
    if pct_needed > 0:
        nudges.append((cat, pct_needed))
        remaining -= pct_needed * amt

lines = [f"Goal: Save ${GOAL_SAVINGS:,.0f} next 30 days"]
if not nudges:
    lines.append("- Spending already low or insufficient category concentration to suggest cuts.")
else:
    lines.extend(f"- Cut {cat} by {pct*100:.0f}%" for (cat, pct) in nudges)

with open(GOAL_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("\n".join(lines))
print(f"\nSaved goal nudges → {GOAL_PATH}")


Goal: Save $1,000 next 30 days
- Cut Transfers by 35%

Saved goal nudges → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\goal_nudges_latest.txt


In [53]:
# Build text field and store embeddings for semantic search
def build_search_text(row):
    """Join a transaction's descriptive fields into one string for embedding.

    Fields used, in order: display_name, name, merchant_name, category,
    subcategory, tags. Empty and missing values (None, NaN, pd.NA) are skipped;
    NaN is truthy, so a plain ``value or ""`` would leak the literal text "nan"
    into the search string.

    Args:
        row: Mapping-like object supporting ``.get`` (dict or pandas Series).

    Returns:
        The non-empty fields joined with " | " (empty string if none).
    """
    def _as_text(value):
        if value is None:
            return ""
        try:
            if bool(pd.isna(value)):
                return ""
        except (TypeError, ValueError):
            # Not a scalar (e.g. a list): isna is ambiguous, treat as present.
            pass
        return str(value) if value else ""

    fields = ("display_name", "name", "merchant_name", "category", "subcategory", "tags")
    parts = [_as_text(row.get(field)) for field in fields]
    return " | ".join(p for p in parts if p)

# Prepare rows (limit to recent for cost-control): the 500 most recent transactions
_newest_first = df.sort_values("date", ascending=False)
embed_df = _newest_first.head(500).copy()
embed_df["search_text"] = embed_df.apply(build_search_text, axis=1)

# Load existing cache (empty frame with the cache columns when no file exists yet)
old = (
    pd.read_parquet(EMBEDDINGS_PATH)
    if EMBEDDINGS_PATH.exists()
    else pd.DataFrame(columns=["txn_uid","embedding"])
)

# Only transactions whose uid is not cached yet need an embedding.
existing = set(old["txn_uid"]) if len(old) else set()
_already_cached = embed_df["txn_uid"].isin(existing)
to_embed = embed_df[~_already_cached][["txn_uid", "search_text"]]

def get_embeddings(texts):
    """Embed a batch of texts with the Azure embeddings deployment.

    Args:
        texts: Iterable of strings.

    Returns:
        list: One embedding per input text, in the order of the API response.
        When the chat ``client`` is None or ``AZURE_OPENAI_EMBEDDINGS`` is
        unset, a list of ``None`` of the same length is returned instead. An
        empty input returns ``[]`` without calling the service.
    """
    texts = list(texts)
    # Use the Azure embeddings deployment name from env
    emb_deploy = os.getenv("AZURE_OPENAI_EMBEDDINGS", "")
    if client is None or not emb_deploy:
        return [None for _ in texts]
    if not texts:
        # Nothing to embed; avoid sending an empty request.
        return []

    # A fresh client is pointed at the embeddings deployment (the shared client targets the
    # chat deployment). The trailing "/" is stripped so the URL never contains "//openai".
    endpoint = AZURE_OPENAI_ENDPOINT.rstrip("/")
    emb_client = OpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        base_url=f"{endpoint}/openai/deployments/{emb_deploy}",
        default_query={"api-version": AZURE_OPENAI_API_VERSION},
        default_headers={"api-key": AZURE_OPENAI_API_KEY},
    )
    res = emb_client.embeddings.create(model=emb_deploy, input=texts)
    return [d.embedding for d in res.data]

# Embed the missing rows in batches of B texts; entries that come back as None
# (AI disabled) are skipped.
new_rows = []
if len(to_embed):
    B = 64
    for _offset in range(0, len(to_embed), B):
        chunk = to_embed.iloc[_offset:_offset+B]
        vecs = get_embeddings(chunk["search_text"].tolist())
        new_rows.extend(
            {"txn_uid": uid, "embedding": vec}
            for uid, vec in zip(chunk["txn_uid"].tolist(), vecs)
            if vec is not None
        )

if not new_rows:
    print("No new embeddings added (either none missing or AI disabled).")
else:
    # Append to the cache; on a repeated txn_uid the newest vector wins.
    add = pd.DataFrame(new_rows)
    merged = pd.concat([old, add], ignore_index=True).drop_duplicates("txn_uid", keep="last")
    merged.to_parquet(EMBEDDINGS_PATH, index=False)
    print(f"Embeddings cached: +{len(add)} → total {len(merged)}")


No new embeddings added (either none missing or AI disabled).


In [54]:
# Reorder and save
save_cols = [
    "txn_uid","date","bank_name","card_name",
    "display_name","merchant_key",
    "category","subcategory","tags",
    "name","merchant_name",
    "amount","is_subscription","is_anomaly","z_by_merchant"
]

# Any output column that was never produced is written as an empty (NaN) column.
for c in [col for col in save_cols if col not in df.columns]:
    df[c] = np.nan

# Newest transactions first; bank name ascending within the same date.
df_out = df[save_cols].sort_values(["date", "bank_name"], ascending=[False, True])

# Write both the stable file (Power BI) and a processed copy
for _target in (ENRICHED_OUT_PATH, ENRICHED_COPY_PATH):
    df_out.to_csv(_target, index=False)

print(f"✅ Enriched CSV saved → {ENRICHED_OUT_PATH}")
print(f"📄 Copy saved → {ENRICHED_COPY_PATH}")


✅ Enriched CSV saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw\latest.csv
📄 Copy saved → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\latest_enriched.csv
