In [31]:
import os, time, json
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
from tenacity import retry, wait_exponential, stop_after_attempt
from openai import OpenAI

def load_azure_env():
    """
    Load Azure OpenAI settings from dotenv files and verify they are complete.

    Checks ``scripts/.env`` and then ``.env`` under the current working
    directory; each file that exists is loaded with ``override=True``, in
    that order.

    Returns:
        bool: True when every required variable is set to a non-empty value.
        Otherwise the missing names are printed and False is returned.
    """
    base_dir = Path.cwd()
    for env_file in (base_dir / "scripts" / ".env", base_dir / ".env"):
        if env_file.exists():
            load_dotenv(env_file, override=True)

    # Collect the names that are unset or empty after loading.
    missing = [
        name
        for name in (
            "AZURE_OPENAI_ENDPOINT",
            "AZURE_OPENAI_API_KEY",
            "AZURE_OPENAI_DEPLOYMENT",
            "AZURE_OPENAI_EMBEDDINGS",
        )
        if not os.getenv(name)
    ]
    if not missing:
        return True
    print("Azure not configured; missing:", ", ".join(missing))
    return False

In [32]:
def azure_chat_client():
    """
    Build an OpenAI client whose base URL targets the Azure chat deployment.

    Reads AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_DEPLOYMENT, AZURE_OPENAI_API_KEY
    and (optionally) AZURE_OPENAI_API_VERSION from the environment. The
    API version defaults to "2024-02-15-preview". No validation is done here;
    callers are expected to have checked the configuration beforehand.

    Returns:
        OpenAI: client configured with the deployment URL, the api-version
        query parameter and the api-key header.
    """
    # Endpoints are often written with a trailing slash; strip it so the
    # joined URL does not contain "//openai/...".
    endpoint = (os.getenv("AZURE_OPENAI_ENDPOINT") or "").rstrip("/")
    deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT")
    api_key = os.getenv("AZURE_OPENAI_API_KEY")
    return OpenAI(
        api_key=api_key,
        base_url=f"{endpoint}/openai/deployments/{deployment}",
        default_query={"api-version": os.getenv("AZURE_OPENAI_API_VERSION","2024-02-15-preview")},
        default_headers={"api-key": api_key},
    )

def azure_embed_client():
    """
    Build an OpenAI client whose base URL targets the Azure embeddings deployment.

    Reads AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_EMBEDDINGS, AZURE_OPENAI_API_KEY
    and (optionally) AZURE_OPENAI_API_VERSION from the environment. The
    API version defaults to "2024-02-15-preview". No validation is done here;
    callers are expected to have checked the configuration beforehand.

    Returns:
        OpenAI: client configured with the deployment URL, the api-version
        query parameter and the api-key header.
    """
    # Endpoints are often written with a trailing slash; strip it so the
    # joined URL does not contain "//openai/...".
    endpoint = (os.getenv("AZURE_OPENAI_ENDPOINT") or "").rstrip("/")
    deployment = os.getenv("AZURE_OPENAI_EMBEDDINGS")
    api_key = os.getenv("AZURE_OPENAI_API_KEY")
    return OpenAI(
        api_key=api_key,
        base_url=f"{endpoint}/openai/deployments/{deployment}",
        default_query={"api-version": os.getenv("AZURE_OPENAI_API_VERSION","2024-02-15-preview")},
        default_headers={"api-key": api_key},
    )

In [33]:
# --- Merchant labeling (per-key JSON, fence-safe, no truncation) ---

import json, re, time
from datetime import datetime
from tenacity import retry, wait_exponential, stop_after_attempt

# CSV file that holds the merchant dimension table between runs.
MERCHANT_DIM_PATH = Path("config/merchants_dim.csv")
# Switch for the labeling step: when False, label_new_merchants() returns 0 without labeling.
MAP_ALL = True
MAX_LABELS_PER_RUN = None   # None = no cap; set an int (e.g., 200) to cap costs per run

def _ensure_merchants_dim():
    if MERCHANT_DIM_PATH.exists():
        return pd.read_csv(MERCHANT_DIM_PATH)
    MERCHANT_DIM_PATH.parent.mkdir(parents=True, exist_ok=True)
    return pd.DataFrame(columns=[
        "merchant_key","display_name","category","subcategory","tags",
        "source","confidence","last_updated"
    ])

def _strip_code_fences(s: str) -> str:
    s = s.strip()
    s = re.sub(r"^```(?:json)?\s*", "", s, flags=re.IGNORECASE)
    s = re.sub(r"\s*```$", "", s)
    s = s.replace("```json","").replace("```","")
    return s.strip()

def _first_json_object(s: str):
    """
    Find the first balanced {...} JSON object in string s.
    Works even if outer array is missing or reply has extra text.
    """
    s = _strip_code_fences(s or "")
    depth = 0; start = -1; in_str=False; esc=False
    for i, ch in enumerate(s):
        if ch == '"' and not esc: in_str = not in_str
        esc = (ch == '\\' and not esc)  # toggle escape
        if in_str: continue
        if ch == '{':
            if depth == 0: start = i
            depth += 1
        elif ch == '}':
            if depth > 0:
                depth -= 1
                if depth == 0 and start != -1:
                    frag = s[start:i+1]
                    try:
                        return json.loads(frag)
                    except Exception:
                        pass
    return None

@retry(wait=wait_exponential(multiplier=1, min=1, max=20), stop=stop_after_attempt(5), reraise=True)
def azure_label_one(merchant_key: str) -> dict:
    """
    Label one merchant key through the Azure chat deployment.

    The key is stringified and cut to 180 characters before being sent. A
    reply is accepted only when it yields a dict containing a "merchant_key"
    entry: first by parsing the whole fence-stripped reply, then by salvaging
    the first JSON object embedded in it.

    Any exception raised in the body triggers a retry: up to 5 attempts with
    exponential waits bounded to 1-20 seconds, after which the last error is
    re-raised.

    Raises:
        RuntimeError: when no acceptable JSON object can be extracted.
    """
    mk = str(merchant_key)[:180]  # bound the prompt size
    sys_prompt = (
        "You label a single merchant identifier for a personal finance dashboard.\n"
        "Return a SINGLE JSON object with keys: merchant_key, display_name, category, subcategory, tags.\n"
        "- display_name: concise human-readable name (e.g., 'ARCO', 'Apple Card').\n"
        "- category: a sensible top-level (e.g., Dining, Groceries, Gas, Shopping, Utilities, Subscriptions, Transfers, Income, Health, Travel, Entertainment, Education, Fees, Misc).\n"
        "- subcategory: specific subtype (e.g., 'Gas Station', 'Internet Service').\n"
        "- tags: array of 1–5 lowercase keywords.\n"
        "Return ONLY pure JSON (no code fences), and do not include commentary."
    )

    def _acceptable(candidate):
        # A usable label is a JSON object that carries the merchant_key field.
        return isinstance(candidate, dict) and "merchant_key" in candidate

    client = azure_chat_client()
    completion = client.chat.completions.create(
        model=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
        messages=[
            {"role":"system","content": sys_prompt},
            {"role":"user","content": json.dumps({"merchant_key": mk})}
        ],
        temperature=0,
        max_tokens=300,
        response_format={"type": "json_object"}  # ask the API for a JSON object reply
    )
    txt = (completion.choices[0].message.content or "").strip()

    # Attempt 1: the whole reply is the JSON document.
    try:
        whole = json.loads(_strip_code_fences(txt))
    except Exception:
        whole = None
    if _acceptable(whole):
        return whole

    # Attempt 2: pull the first {...} out of a noisier reply.
    salvaged = _first_json_object(txt)
    if _acceptable(salvaged):
        return salvaged

    raise RuntimeError(f"Cannot parse JSON for key '{mk}'. First 200 chars:\n{txt[:200]}")

def _label_to_row(mk, it, now):
    """Turn one model label (dict) into a merchants_dim row, tolerating null or blank fields."""
    def _text(key, default=""):
        value = it.get(key)
        # JSON null arrives as None; str(None) would store the literal text "None".
        cleaned = "" if value is None else str(value).strip()
        return cleaned if cleaned else default

    tags_val = it.get("tags")
    if isinstance(tags_val, list):
        cleaned_tags = (str(t).strip() for t in tags_val if t is not None)
        tags_csv = ",".join(t for t in cleaned_tags if t)
    else:
        tags_csv = ""

    return {
        "merchant_key": mk,
        "display_name": _text("display_name", mk).upper().strip(),
        "category": _text("category"),
        "subcategory": _text("subcategory"),
        "tags": tags_csv,
        "source": "azure",
        "confidence": 0.95,
        "last_updated": now
    }


def label_new_merchants(df, merchant_key_col="merchant_key"):
    """
    Label merchant keys that are not yet in the merchant dimension table.

    Keys already present in the table are skipped, as are missing or blank
    keys. Each remaining key is labeled with azure_label_one(); keys whose
    labeling fails are reported and skipped so one bad key does not abort the
    run. New rows are merged into the table (latest row per key wins) and the
    table is written to MERCHANT_DIM_PATH once, after the loop.

    Args:
        df: frame containing the merchant keys to consider.
        merchant_key_col: name of the key column in ``df``.

    Returns:
        int: number of merchant rows added in this call (0 when the column is
        absent, labeling is switched off via MAP_ALL, or nothing is new).
    """
    # Local import: the notebook's import cell only brings in the datetime class.
    from datetime import timezone

    md = _ensure_merchants_dim()
    if merchant_key_col not in df.columns:
        print(f"Column '{merchant_key_col}' not in dataframe; skipping labeling.")
        return 0

    known = set(md["merchant_key"].astype(str)) if len(md) else set()
    # Drop missing values before stringifying: astype(str) would otherwise turn
    # them into the literal keys "nan"/"None" and send those to the model.
    seen_keys = df[merchant_key_col].dropna().astype(str).unique()
    candidates = [k for k in seen_keys if k.strip() and k not in known]
    if not MAP_ALL or not candidates:
        print("No new merchants to label."); return 0

    if isinstance(MAX_LABELS_PER_RUN, int):
        candidates = candidates[:MAX_LABELS_PER_RUN]

    # Same naive-UTC ISO format that utcnow() produced, without the deprecated call.
    now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    rows = []
    for mk in candidates:
        try:
            it = azure_label_one(mk)
        except Exception as e:
            # Skip noisy/unusual keys rather than failing the whole run
            print(f"Skip '{mk[:40]}...' → {e}")
            continue

        rows.append(_label_to_row(mk, it, now))
        # tiny pacing to be polite with rate limits
        time.sleep(0.05)

    if rows:
        chunk = pd.DataFrame(rows)
        md = pd.concat([md, chunk], ignore_index=True)
        # Sorting by timestamp first makes keep="last" retain the newest row per key.
        md = md.sort_values("last_updated").drop_duplicates(["merchant_key"], keep="last")
        md.to_csv(MERCHANT_DIM_PATH, index=False)
        print(f"Added {len(rows)} merchant mappings.")
    else:
        print("No new rows appended.")

    return len(rows)

In [34]:
# --- Embeddings builder (caches to vectorstore/embeddings.parquet) ---

import numpy as np, pyarrow as pa, pyarrow.parquet as pq

# Parquet file that build_embeddings() writes the embedding table to.
VECTOR_PATH = Path("vectorstore/embeddings.parquet")
# Only the last EMBED_ROWS rows of the input frame are embedded per build.
EMBED_ROWS = 500

def build_embeddings(df, text_cols=("display_name","description")):
    """
    Embed the most recent rows of ``df`` and write them to VECTOR_PATH.

    The text for each row is the " | "-joined string form of whichever
    ``text_cols`` exist in ``df``. Only the last EMBED_ROWS rows are embedded,
    one request per row. The output table has two columns: ``row_idx`` (the
    frame index cast to int) and ``embedding`` (fixed-size float32 list).

    Returns:
        int: number of embeddings written; 0 when no text column is present
        or no rows were embedded.
    """
    client = azure_embed_client()
    present = [name for name in text_cols if name in df.columns]
    if len(present) == 0:
        print("No text columns found; skipping embeddings."); return 0

    window = df.tail(EMBED_ROWS).copy()
    # Stringify each available column, then chain them with " | " row by row.
    pieces = [window[name].astype(str) for name in present]
    combined = pieces[0]
    for extra in pieces[1:]:
        combined = combined + " | " + extra
    texts = combined.tolist()

    embs = [
        client.embeddings.create(model=os.getenv("AZURE_OPENAI_EMBEDDINGS"), input=[text]).data[0].embedding
        for text in texts
    ]
    if len(embs) == 0:
        print("No embeddings produced."); return 0
    dim = len(embs[0])

    # Flatten to one float32 vector and let Arrow re-chunk it into rows of length dim.
    values = pa.array(np.array(embs, dtype="float32").ravel())
    vectors = pa.FixedSizeListArray.from_arrays(values, dim)
    table = pa.Table.from_pydict({
        "row_idx": pa.array(window.index.astype(int)),
        "embedding": vectors,
    })
    VECTOR_PATH.parent.mkdir(parents=True, exist_ok=True)
    pq.write_table(table, VECTOR_PATH)
    print(f"Wrote {len(embs)} embeddings (dim {dim}) → {VECTOR_PATH}")
    return len(embs)


In [35]:
def azure_ai_enabled():
    """Return True only when every required Azure OpenAI variable is set to a non-empty value."""
    for name in (
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_DEPLOYMENT",
        "AZURE_OPENAI_EMBEDDINGS",
    ):
        if not os.getenv(name):
            return False
    return True

def run_azure_ai(enriched_df):
    """
    Run the Azure-backed steps (merchant labeling, then embeddings) on ``enriched_df``.

    Both steps are skipped when the environment cannot be loaded or is
    incomplete.

    Returns:
        dict: {"labeled": <rows added>, "embedded": <vectors written>}; both
        counts are 0 when the steps were skipped.
    """
    # `and` short-circuits, so azure_ai_enabled() runs only if loading succeeded.
    configured = load_azure_env() and azure_ai_enabled()
    if not configured:
        print("Azure not configured; skipping AI steps.")
        return {"labeled":0,"embedded":0}

    summary = {}
    summary["labeled"] = label_new_merchants(enriched_df)
    summary["embedded"] = build_embeddings(enriched_df)
    return summary
