In [None]:
# ===== 1) CONFIG =====
from pathlib import Path

# >>> Update BASE if your path is different <<<
BASE = Path(r"C:\Users\kosis\Downloads\Automation\spending-dashboard\scripts")

PATH_RAW  = BASE / "data" / "raw"
PATH_PROC = BASE / "data" / "processed"
PATH_DOCS = BASE / "docs"
PATH_CONF = BASE / "config" / "categories.yaml"

# Ensure folders exist
for p in (PATH_RAW, PATH_PROC, PATH_DOCS):
    p.mkdir(parents=True, exist_ok=True)

print("BASE:", BASE)
print("RAW :", PATH_RAW)
print("PROC:", PATH_PROC)
print("DOCS:", PATH_DOCS)
print("CONF:", PATH_CONF)

BASE: C:\Users\kosis\Downloads\Automation\spending-dashboard
RAW : C:\Users\kosis\Downloads\Automation\spending-dashboard\data\raw
PROC: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed
DOCS: C:\Users\kosis\Downloads\Automation\spending-dashboard\docs
CONF: C:\Users\kosis\Downloads\Automation\spending-dashboard\config\categories.yaml


In [54]:
# ===== 2) REQUIREMENTS =====
# If PyYAML isn't installed, uncomment and run once:
# %pip install pyyaml

import pandas as pd
import numpy as np
import yaml
import json
from datetime import datetime


In [55]:
# ===== 3) HELPERS =====

def safe_number(x):
    '''Convert to float safely; handles $ and commas.'''
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float)):
        return float(x)
    s = str(x).strip().replace("$","").replace(",","")
    try:
        return float(s)
    except Exception:
        return np.nan

def normalize_date(s):
    '''Parse many date formats -> pandas.Timestamp (naive).'''
    return pd.to_datetime(s, errors="coerce")

def derive_merchant_key(desc: str) -> str:
    '''Create a stable merchant key from raw description.'''
    if pd.isna(desc):
        return ""
    s = str(desc).upper()
    # Remove long digit blocks and extra spaces
    import re
    s = re.sub(r"\d{3,}", " ", s)
    s = re.sub(r"[^A-Z\s&\-']", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s[:64]

def load_yaml_mapping(path_yaml: Path):
    if not path_yaml.exists():
        print(f"[WARN] YAML not found at {path_yaml}. Proceeding with empty mapping.")
        return {}
    with open(path_yaml, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    return data

def apply_yaml_mapping(df: pd.DataFrame, ymap: dict) -> pd.DataFrame:
    """
    Accepts flexible YAML formats:
      Simple:  APPLEBEES: dining
      Rich:    APPLEBEES: {display_name: APPLEBEES, category: dining, tags: [sitdown], is_necessity: false}
      Weird:   NUMERICKEY: 1   -> coerced to category "1"
    Keys should correspond to df['merchant_key'] (recommended).
    """
    # If someone provided a list at the top level, coerce to dict by taking entries with 'key' fields
    if isinstance(ymap, list):
        tmp = {}
        for item in ymap:
            if isinstance(item, dict):
                # try common shapes: {'key': 'APPLEBEES', 'category': 'dining', ...} or {'APPLEBEES': {...}}
                if "key" in item:
                    tmp[str(item["key"]).upper()] = {k: v for k, v in item.items() if k != "key"}
                else:
                    # take first key in nested mapping
                    for k, v in item.items():
                        tmp[str(k).upper()] = v
            # ignore non-dict list items
        ymap = tmp

    # Build normalized mapping
    direct = {}
    for k, v in (ymap or {}).items():
        k_up = str(k).upper()

        if isinstance(v, dict):
            display = str(v.get("display_name", k_up)).upper()
            category = v.get("category", None)
            tags_val = v.get("tags", [])
            if isinstance(tags_val, (list, tuple)):
                tags_text = ",".join(map(str, tags_val))
            else:
                tags_text = str(tags_val) if tags_val is not None else ""
            is_necessity = bool(v.get("is_necessity", False))
        elif isinstance(v, str):
            display = k_up
            category = v
            tags_text = ""
            is_necessity = False
        elif v is None:
            display = k_up
            category = None
            tags_text = ""
            is_necessity = False
        else:
            # numbers / other types: treat as category string
            display = k_up
            category = str(v)
            tags_text = ""
            is_necessity = False

        direct[k_up] = {
            "display_name_final": display,
            "category_final": category,
            "tags_final": tags_text,
            "is_necessity": is_necessity,
        }

    out = df.copy()

    # Ensure expected columns exist
    for col, default in [
        ("display_name_final",""),
        ("category_final",""),
        ("tags_final",""),
        ("is_necessity", False),
        ("source_final",""),
        ("confidence_final",""),
    ]:
        if col not in out.columns:
            out[col] = default

    # Apply mapping by merchant_key
    def map_row(row):
        key = str(row.get("merchant_key","")).upper().strip()
        m = direct.get(key)
        if m:
            row["display_name_final"] = m["display_name_final"]
            row["category_final"]     = m["category_final"]
            row["tags_final"]         = m["tags_final"]
            row["is_necessity"]       = m["is_necessity"]
            row["source_final"]       = "yaml"
            row["confidence_final"]   = "yaml"
        return row

    return out.apply(map_row, axis=1)


def write_unknowns(df: pd.DataFrame, path_out: Path):
    unk = df[(df["display_name_final"]=="") | (df["category_final"]=="")].copy()
    if not unk.empty:
        path_out.parent.mkdir(parents=True, exist_ok=True)
        cols = [c for c in ["date","account","description","merchant_key","amount"] if c in unk.columns]
        unk[cols].to_csv(path_out, index=False)
        print(f"[INFO] Unknown merchants written → {path_out} ({len(unk)} rows)")
    else:
        print("[INFO] No unknown merchants to review.")


In [56]:
# ===== 4) INGEST RAW FILES (robust) =====
import csv
from io import StringIO

files = sorted(PATH_RAW.glob("*.csv"))
if not files:
    print(f"[WARN] No raw CSVs found in {PATH_RAW}. Add files and re-run.")
else:
    print(f"[INFO] Found {len(files)} raw file(s):")
    for f in files: print("   -", f.name)

def sniff_delimiter_and_header(text: str):
    """
    Try to detect delimiter and header row index.
    Returns (delimiter, header_row_idx). If unsure, defaults to (',', 0).
    """
    # Try csv.Sniffer on a sample
    sample = text[:100_000]
    try:
        dialect = csv.Sniffer().sniff(sample, delimiters=[',',';','\t','|'])
        delim = dialect.delimiter
    except Exception:
        # Fallback to heuristics
        counts = {d: sample.count(d) for d in [',',';','\t','|']}
        delim = max(counts, key=counts.get)

    # Find header row (look for a row containing any of these canonical column names)
    header_candidates = ["date","transaction date","posted date","post date","description","details","memo","amount","debit","credit"]
    lines = sample.splitlines()
    header_idx = 0
    for i, line in enumerate(lines[:50]):  # scan first 50 lines for a header
        parts = [p.strip().lower() for p in line.split(delim)]
        if any(h in parts for h in header_candidates):
            header_idx = i
            break
    return delim, header_idx

def robust_read_csv(path):
    # Try a couple encodings
    for enc in ["utf-8-sig", "utf-16", "latin-1", "utf-8"]:
        try:
            txt = path.read_text(encoding=enc, errors="ignore")
            delim, header_idx = sniff_delimiter_and_header(txt)
            df = pd.read_csv(
                StringIO(txt),
                sep=delim,
                header=header_idx,
                engine="python",            # more forgiving
                on_bad_lines="skip",        # skip malformed lines
                dtype=str                   # keep raw as text; we normalize later
            )
            # Drop any leading non-data rows above header
            if header_idx > 0:
                df = df.reset_index(drop=True)
            return df, delim, enc, header_idx
        except Exception as e:
            last_err = e
            continue
    raise last_err

frames = []
for f in files:
    try:
        df, delim, enc, hdr = robust_read_csv(f)
        print(f"[READ] {f.name} | sep='{delim}' enc={enc} header_row={hdr} shape={df.shape}")
    except Exception as e:
        print(f"[SKIP] {f.name} — could not parse ({e})")
        continue

    # Normalize column names
    df.columns = [c.strip().lower() for c in df.columns]

    # Flexible column detection
    candidates = {
        "date":        ["date","transaction date","transaction_date","posted date","post date","posted_date","date posted"],
        "account":     ["account","account name","account_name","acct","card"],
        "description": ["description","details","memo","merchant","name","narrative","transaction description"],
        "amount":      ["amount","transaction amount","transaction_amount","debit","credit","amt","value"]
    }
    pick = {}
    for tgt, opts in candidates.items():
        for c in opts:
            if c in df.columns:
                pick[tgt] = c; break

    # Some exports list debit/credit separately; combine into a single signed amount if needed
    if "amount" not in pick:
        if "debit" in df.columns or "credit" in df.columns:
            amt = pd.Series([np.nan]*len(df))
            if "debit" in df.columns:
                amt = amt.fillna(df["debit"])
            if "credit" in df.columns:
                # credits as negative
                cr = pd.to_numeric(df["credit"].str.replace(",","", regex=False), errors="coerce") * -1
                amt = amt.fillna(cr)
            df["__amount_combined__"] = amt
            pick["amount"] = "__amount_combined__"

    tmp = pd.DataFrame({
        "date":        df[pick.get("date")]        if "date" in pick        else np.nan,
        "account":     df[pick.get("account")]     if "account" in pick     else "unknown",
        "description": df[pick.get("description")] if "description" in pick else df.get("description",""),
        "amount":      df[pick.get("amount")]      if "amount" in pick      else df.get("amount", np.nan),
    })
    tmp["source_file"] = f.name

    # Normalize types
    tmp["date"] = tmp["date"].apply(normalize_date)
    tmp["amount"] = tmp["amount"].apply(safe_number)

# Make charges positive, credits/refunds negative — per-file
amt = pd.to_numeric(tmp["amount"], errors="coerce")
neg_ratio = (amt < 0).mean()
median_amt = amt.median(skipna=True)

# If most rows are negative (common for Chase exports), flip
if (neg_ratio > 0.6) or (pd.notna(median_amt) and median_amt < 0):
    tmp["amount"] = -amt
else:
    tmp["amount"] = amt

    # In case the export uses "CR"/"DR" signs embedded in description or amount, try to detect
    # (optional; comment out if not needed)
    # Example: if description contains "CREDIT" and amount is positive, flip sign
    mask_credit_hint = tmp["description"].str.upper().str.contains("CREDIT|REFUND|PAYMENT", na=False)
    tmp.loc[mask_credit_hint & (tmp["amount"] > 0), "amount"] *= -1

    # Merchant key
    tmp["merchant_key"] = tmp["description"].apply(derive_merchant_key)

    frames.append(tmp)

raw_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=["date","account","description","amount","merchant_key","source_file"])
raw_all = raw_all.dropna(subset=["date","amount"])
print(f"[INFO] Raw combined rows: {len(raw_all):,}")
raw_all.head()


[INFO] Found 6 raw file(s):
   - chase_amazonprime_6m.CSV
   - chase_checking_6m.CSV
   - chase_flex_6m.CSV
   - discover_credit_ytd.csv
   - ssscu_checking_6m.CSV
   - ssscu_credit_6m.CSV
[READ] chase_amazonprime_6m.CSV | sep=',' enc=utf-8-sig header_row=0 shape=(84, 7)
[READ] chase_checking_6m.CSV | sep=',' enc=utf-8-sig header_row=0 shape=(50, 7)
[READ] chase_flex_6m.CSV | sep=',' enc=utf-8-sig header_row=0 shape=(12, 7)
[READ] discover_credit_ytd.csv | sep=',' enc=utf-8-sig header_row=0 shape=(78, 5)
[READ] ssscu_checking_6m.CSV | sep=',' enc=utf-8-sig header_row=3 shape=(230, 11)
[READ] ssscu_credit_6m.CSV | sep=',' enc=utf-8-sig header_row=3 shape=(15, 11)
[INFO] Raw combined rows: 0


Unnamed: 0,date,account,description,amount,source_file,merchant_key


In [57]:
# ===== 5) ENRICH VIA YAML =====
ymap = load_yaml_mapping(PATH_CONF)
enriched = apply_yaml_mapping(raw_all, ymap)

# Mark non-spend flows
patterns_non_spend = ["PAYMENT","DIRECTPAY","TRANSFER","CREDIT","REFUND"]
def non_spend_flag(desc, amt):
    s = str(desc).upper()
    if any(p in s for p in patterns_non_spend): return True
    if amt < 0: return True
    return False

enriched["is_non_spend_flow"] = enriched.apply(lambda r: non_spend_flag(r["description"], r["amount"]), axis=1)

# Save unknowns for mapping
write_unknowns(enriched, PATH_DOCS / "review_unknowns.csv")

# Stable schema/order
cols_order = [
    "date","account","description","merchant_key",
    "display_name_final","category_final","tags_final",
    "confidence_final","source_final","amount",
    "is_necessity","is_non_spend_flow","source_file"
]
for c in cols_order:
    if c not in enriched.columns:
        if c in ["is_necessity","is_non_spend_flow"]:
            enriched[c] = False
        else:
            enriched[c] = ""

enriched = enriched[cols_order].sort_values("date").reset_index(drop=True)

out_tx = PATH_PROC / "transactions_enriched.csv"
enriched.to_csv(out_tx, index=False)
print(f"[OK] transactions_enriched.csv written → {out_tx}  ({len(enriched):,} rows)")
enriched.tail()


[INFO] No unknown merchants to review.
[OK] transactions_enriched.csv written → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\transactions_enriched.csv  (0 rows)


Unnamed: 0,date,account,description,merchant_key,display_name_final,category_final,tags_final,confidence_final,source_final,amount,is_necessity,is_non_spend_flow,source_file


In [58]:
# ===== 6) AI KPIs (ai_insights.csv) =====
tx = pd.read_csv(PATH_PROC / "transactions_enriched.csv")
tx["date"] = pd.to_datetime(tx["date"], errors="coerce")
tx["amount"] = pd.to_numeric(tx["amount"], errors="coerce")
tx = tx.dropna(subset=["date","amount"])

spend = tx[tx["amount"] > 0].copy()
spend["month_start"] = spend["date"].values.astype("datetime64[M]")

monthly = (spend.groupby("month_start", as_index=False)
           .agg(total_outflows=("amount","sum"))
           .sort_values("month_start"))

monthly["prev"] = monthly["total_outflows"].shift(1)
monthly["mom_outflows_pct"] = ((monthly["total_outflows"] - monthly["prev"]) / monthly["prev"]).replace([np.inf, -np.inf], np.nan).fillna(0.0)

# Top category
top_cat = (spend.groupby(["month_start","category_final"], as_index=False)["amount"].sum()
           .sort_values(["month_start","amount"], ascending=[True, False])
           .drop_duplicates("month_start")
           .rename(columns={"category_final":"top_category"}))

# Top merchant
top_merch = (spend.groupby(["month_start","display_name_final"], as_index=False)["amount"].sum()
             .sort_values(["month_start","amount"], ascending=[True, False])
             .drop_duplicates("month_start")
             .rename(columns={"display_name_final":"top_merchant"}))

monthly = monthly.merge(top_cat, on="month_start", how="left").merge(top_merch, on="month_start", how="left")

# Subscription estimate = tags contain "subscription" OR recurring merchant >= 3 months
def to_text(x):
    if isinstance(x, (list, tuple, set)):
        return ",".join(map(str, x))
    if pd.isna(x): return ""
    return str(x)

if "tags_final" not in spend.columns:
    spend["tags_final"] = ""
spend["tags_text"] = spend["tags_final"].apply(to_text).str.lower()
spend["is_subscription_like"] = spend["tags_text"].str.contains("subscription", na=False)

rec_counts  = spend.groupby(["display_name_final","month_start"]).size().reset_index(name="n")
rec_months  = rec_counts.groupby("display_name_final")["month_start"].nunique()
rec_merchants = set(rec_months[rec_months >= 3].index)
spend["is_subscription_like"] = spend["is_subscription_like"] | spend["display_name_final"].isin(rec_merchants)

subs = (spend[spend["is_subscription_like"]]
        .groupby("month_start", as_index=False)["amount"].sum()
        .rename(columns={"amount":"subscriptions_estimate"}))

monthly = monthly.merge(subs, on="month_start", how="left")
monthly["subscriptions_estimate"] = monthly["subscriptions_estimate"].fillna(0.0)

out_ai = PATH_PROC / "ai_insights.csv"
monthly.to_csv(out_ai, index=False)
print(f"[OK] ai_insights.csv written → {out_ai}")
monthly.tail()


[OK] ai_insights.csv written → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\ai_insights.csv


Unnamed: 0,month_start,total_outflows,prev,mom_outflows_pct,top_category,amount_x,top_merchant,amount_y,subscriptions_estimate


In [59]:
# ===== 7) Anomalies (ai_anomalies.csv) =====
def zscore(series):
    mu = series.mean()
    sd = series.std(ddof=0)
    if pd.isna(sd) or sd == 0:
        return pd.Series([0]*len(series), index=series.index)
    return (series - mu) / sd

spend["merchant_z"] = spend.groupby("display_name_final")["amount"].transform(zscore)
anoms = spend[spend["merchant_z"].abs() >= 2.5].copy()
anoms["reason"] = anoms.apply(lambda r: f"{abs(r['merchant_z']):.1f}σ vs usual at {r['display_name_final']}", axis=1)

anoms_out = anoms[["date","display_name_final","category_final","amount","merchant_z","reason"]].rename(columns={"merchant_z":"zscore"})
out_anoms = PATH_PROC / "ai_anomalies.csv"
anoms_out.to_csv(out_anoms, index=False)
print(f"[OK] ai_anomalies.csv written → {out_anoms}  (rows: {len(anoms_out)})")
anoms_out.head()


[OK] ai_anomalies.csv written → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\ai_anomalies.csv  (rows: 0)


Unnamed: 0,date,display_name_final,category_final,amount,zscore,reason


In [60]:
# ===== 8) Forecast (ai_forecast.csv) =====
ms = (monthly.set_index("month_start")[["total_outflows"]].asfreq("MS"))
ma = ms.rolling(window=3, min_periods=1).mean().iloc[-1, 0] if not ms.empty else 0.0

projections = []
last = ms.index.max() if not ms.empty else pd.Timestamp("today").to_period("M").to_timestamp()
for i in range(1, 4):
    m = last + pd.offsets.MonthBegin(i)
    point = float(ma)
    projections.append({
        "month_start": m.date().isoformat(),
        "spend_point_est": point,
        "lower": point * 0.9,
        "upper": point * 1.1,
        "method": "3M Moving Average"
    })

fc_df = pd.DataFrame(projections)
out_fc = PATH_PROC / "ai_forecast.csv"
fc_df.to_csv(out_fc, index=False)
print(f"[OK] ai_forecast.csv written → {out_fc}")
fc_df


[OK] ai_forecast.csv written → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\ai_forecast.csv


Unnamed: 0,month_start,spend_point_est,lower,upper,method
0,2025-10-01,0.0,0.0,0.0,3M Moving Average
1,2025-11-01,0.0,0.0,0.0,3M Moving Average
2,2025-12-01,0.0,0.0,0.0,3M Moving Average


In [61]:
# ===== 9) AI Summary CSV (ai_summary.csv) =====
rows = []
for _, r in monthly.iterrows():
    ms = pd.to_datetime(r["month_start"]).date().isoformat()
    headline = f"Spending {float(r['mom_outflows_pct']):+.0%} vs last month; top category: {r.get('top_category','—')}; top merchant: {r.get('top_merchant','—')}."
    bullets = [f"Subscriptions estimated: ${float(r.get('subscriptions_estimate', 0.0)):,.0f}"]
    rows.append({"month_start": ms, "headline": headline, "bullets": " • ".join(bullets)})

summary_df = pd.DataFrame(rows)
out_sum = PATH_PROC / "ai_summary.csv"
summary_df.to_csv(out_sum, index=False)
print(f"[OK] ai_summary.csv written → {out_sum}")
summary_df.tail()


[OK] ai_summary.csv written → C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\ai_summary.csv
