In [1]:
# Step 7 — Generate AI outputs from Gold (single cell)

import pandas as pd
import numpy as np
from pathlib import Path

# Root of the local checkout; every input and output of this step lives in
# <repo>/data/processed, which is created on first run.
REPO = Path(r"C:\Users\kosis\Downloads\Automation\spending-dashboard").resolve()
PROCESSED = REPO / "data" / "processed"
PROCESSED.mkdir(parents=True, exist_ok=True)

GOLD = PROCESSED / "transactions_enriched.csv"
# Raise explicitly rather than `assert`: assertions are stripped under
# `python -O`, which would defer the failure to a less helpful read_csv error.
if not GOLD.exists():
    raise FileNotFoundError(f"Missing {GOLD}")

df = pd.read_csv(GOLD, parse_dates=["date", "month_start"])

# ----- canonical spend filter: charges only, exclude non-spend flows -----
# Normalise the flag to a plain bool Series before negating it. `~` is only a
# logical NOT on a bool-dtype column; if the CSV column contains blanks (object
# dtype) or 0/1 integers, `~` either raises or performs a bitwise inversion.
# Blank flags are treated as "not a non-spend flow".
non_spend = df["is_non_spend_flow"].astype("boolean").fillna(False).astype(bool)
spend = df[(df["amount"] > 0) & ~non_spend].copy()

# Safety: fill blanks to avoid groupby issues (groupby drops NaN keys by default)
for col in ["category_final", "display_name_final", "merchant_key"]:
    spend[col] = spend[col].fillna("").astype(str)

# =========================
# 1) ai_insights.csv
# =========================
# Per-month outflow totals plus the fractional change versus the previous row.
m = (
    spend.groupby("month_start", as_index=False)
         .agg(total_outflows=("amount", "sum"))
)
m["mom_outflows_pct"] = m["total_outflows"].pct_change()


def _monthly_leader(label_col, out_col):
    """Rank `label_col` by summed amount within each month.

    Returns a pair: the ranked per-month totals (with an "rk" column) and a
    two-column frame holding the rank-1 label per month, renamed to `out_col`.
    Ties are broken by order of appearance (rank method="first").
    """
    totals = spend.groupby(["month_start", label_col], as_index=False)["amount"].sum()
    totals["rk"] = totals.groupby("month_start")["amount"].rank(ascending=False, method="first")
    leaders = totals.loc[totals["rk"] == 1, ["month_start", label_col]]
    return totals, leaders.rename(columns={label_col: out_col})


# Biggest category and biggest merchant of each month (by total amount).
cat, top_cat = _monthly_leader("category_final", "top_category")
mer, top_mer = _monthly_leader("display_name_final", "top_merchant")

# Lightweight "subscriptions" estimate (tunable heuristic):
# a merchant counts as a likely subscription when it has charges in at least
# 3 of the 4 most recent months present in the data. The estimate is the sum of
# those merchants' charges per month, computed for the 3 most recent months
# only; every earlier month is filled with 0.
months_seen = spend["month_start"].sort_values().unique()
last4 = months_seen[-4:]
recent3 = months_seen[-3:]

sub_spend = spend[spend["month_start"].isin(last4)]
active_counts = sub_spend.groupby("display_name_final")["month_start"].nunique()
likely_subs = set(active_counts[active_counts >= 3].index)

is_likely_sub = spend["display_name_final"].isin(likely_subs)
is_recent = spend["month_start"].isin(recent3)
sub_recent = spend[is_likely_sub & is_recent]
subs_est = (
    sub_recent.groupby("month_start")["amount"].sum()
              .reindex(m["month_start"])
              .fillna(0)
              .rename("subscriptions_estimate")
)

# Left merges keep one row per month in the order of `m`, which is what lets
# the estimate (reindexed to m["month_start"]) be attached positionally.
ai_insights = m.merge(top_cat, on="month_start", how="left")
ai_insights = ai_insights.merge(top_mer, on="month_start", how="left")
ai_insights["subscriptions_estimate"] = subs_est.to_numpy()
ai_insights = ai_insights[[
    "month_start",
    "total_outflows",
    "mom_outflows_pct",
    "top_category",
    "top_merchant",
    "subscriptions_estimate",
]]
ai_insights.to_csv(PROCESSED / "ai_insights.csv", index=False)

# =========================
# 2) ai_anomalies.csv
# =========================
# Score each charge against its own merchant's history:
#   z = (amount - merchant mean) / merchant std
# A std of exactly 0 is blanked to NaN so the division yields NaN instead of
# +/-inf; NaN scores never satisfy the threshold comparison below.
g = spend.groupby("display_name_final")["amount"]
mean = g.transform("mean")
std = g.transform("std")
std = std.mask(std == 0)
z = spend["amount"].sub(mean).div(std)
spend["zscore"] = z

# Keep charges whose |z| is at least 2.5, on either side of the mean.
anoms = spend.loc[spend["zscore"].abs().ge(2.5)].copy()
# brief reason text
def reason_row(r):
    """Build the short explanation attached to one flagged transaction.

    `r` is a row-like mapping exposing "zscore" and "display_name_final".
    A strictly positive z-score is labelled "high"; anything else is "low".
    """
    if r["zscore"] > 0:
        direction = "high"
    else:
        direction = "low"
    merchant = r["display_name_final"]
    return f"{direction} outlier vs. {merchant}'s average"
# Build the reason text row by row. A list comprehension is used instead of
# DataFrame.apply(axis=1): when no transaction crosses the threshold, apply on
# an empty frame can (depending on the pandas version) return a DataFrame
# rather than a Series, which cannot be assigned to a single column. An empty
# list assigns cleanly, so the CSV is still written with just its header.
anoms["reason"] = [reason_row(row) for _, row in anoms.iterrows()]

# Chronological order; within a day, the largest positive outliers come first.
ai_anomalies = anoms[
    ["date", "display_name_final", "category_final", "amount", "zscore", "reason"]
].sort_values(["date", "zscore"], ascending=[True, False])
ai_anomalies.to_csv(PROCESSED / "ai_anomalies.csv", index=False)

# =========================
# 3) ai_forecast.csv
# =========================
# Forecast = trailing 3-month mean of monthly spend, carried one month past the
# data. The band is +/- one population std (ddof=0) of the last 6 residuals
# (actual spend minus its moving average), floored at zero on the low side.
monthly = (
    spend.groupby("month_start")["amount"].sum()
         .rename("spend")
         .reset_index()
         .sort_values("month_start")
         .reset_index(drop=True)
)
monthly["spend_ma3"] = monthly["spend"].rolling(window=3, min_periods=1).mean()

has_history = len(monthly) > 0

# Naive projection: the latest moving-average value stands in for next month.
point_est = monthly["spend_ma3"].iloc[-1] if has_history else 0.0

resid = monthly["spend"] - monthly["spend_ma3"]
sigma = resid.tail(6).std(ddof=0) if len(resid) else 0.0
# A NaN sigma collapses the band onto the point estimate.
band = 0.0 if pd.isna(sigma) else sigma

# Scalar bounds for the projected month.
lower = max(0.0, point_est - band)
upper = point_est + band

# Historical rows carry their own moving average as the "estimate".
ai_forecast = monthly[["month_start"]].copy()
ai_forecast["spend_point_est"] = monthly["spend_ma3"]
if has_history:
    # One extra row for the month following the last observed one.
    next_month = pd.to_datetime(monthly["month_start"].iloc[-1]) + pd.offsets.MonthBegin(1)
    fut = pd.DataFrame({"month_start": [next_month], "spend_point_est": [point_est]})
    ai_forecast = pd.concat([ai_forecast, fut], ignore_index=True)

# The same band width is applied to every row, historical and projected.
est = ai_forecast["spend_point_est"]
ai_forecast["lower"] = (est - band).clip(lower=0.0)
ai_forecast["upper"] = est + band
ai_forecast["method"] = "ma3±σ"
ai_forecast.to_csv(PROCESSED / "ai_forecast.csv", index=False)

# =========================
# 4) ai_summary.csv
# =========================
def fmt_usd(x):
    """Render a number as whole dollars with thousands separators.

    Example: 1234.56 -> "$1,235". The sign, if any, follows the dollar sign.
    """
    return f"${x:,.0f}"

def _or_dash(value):
    """Return `value` for display, or an em dash when it is missing or empty.

    Unlike `value or "—"`, this also catches NaN, which is truthy and would
    otherwise be rendered as the text "nan".
    """
    if pd.isna(value) or value == "":
        return "—"
    return value


# Up to three highest-spend categories per month, joined into one display
# string. The result keeps the column name "category_final".
top3 = (
    spend.groupby(["month_start", "category_final"], as_index=False)["amount"].sum()
         .sort_values(["month_start", "amount"], ascending=[True, False])
         .groupby("month_start")["category_final"]
         .apply(lambda s: ", ".join(s.head(3)))
         .reset_index()
)
ins = ai_insights.merge(top3, on="month_start", how="left")

# One human-readable sentence block per month. The loop-local names are kept
# distinct from the module-level `top_cat` / `top_mer` frames so that running
# this section does not overwrite them with strings.
rows = []
for _, r in ins.iterrows():
    ms = pd.to_datetime(r["month_start"]).date()
    total = fmt_usd(r["total_outflows"])
    mom = r["mom_outflows_pct"]
    mom_txt = "n/a" if pd.isna(mom) else f"{mom:+.1%}"  # first month has no prior
    top_cat_txt = _or_dash(r["top_category"])
    top_mer_txt = _or_dash(r["top_merchant"])
    subs = fmt_usd(r["subscriptions_estimate"]) if pd.notna(r["subscriptions_estimate"]) else "$0"
    top3cats = _or_dash(r.get("category_final", ""))
    txt = (
        f"{ms}: Spent {total} ({mom_txt} MoM). "
        f"Top category: {top_cat_txt}. Top merchant: {top_mer_txt}. "
        f"Estimated subscriptions: {subs}. "
        f"Notable categories: {top3cats}."
    )
    rows.append({"month_start": r["month_start"], "summary": txt})

# Explicit columns so the CSV keeps its header even when there are no rows.
ai_summary = pd.DataFrame(rows, columns=["month_start", "summary"])
ai_summary.to_csv(PROCESSED / "ai_summary.csv", index=False)

# Report the four files this step writes, one path per line.
written = ("ai_insights.csv", "ai_anomalies.csv", "ai_forecast.csv", "ai_summary.csv")
print("✔ Wrote:", *(f"   - {PROCESSED / name}" for name in written), sep="\n")


✔ Wrote:
   - C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\ai_insights.csv
   - C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\ai_anomalies.csv
   - C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\ai_forecast.csv
   - C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\ai_summary.csv
