In [13]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import math

# Base paths
BASE = Path(r"C:\Users\kosis\Downloads\Automation\spending-dashboard")
TX_PATH = BASE / "data/processed/transactions_enriched.csv"
OUT_DIR = BASE / "data/processed"
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("Notebook paths set up ✅")

Notebook paths set up ✅


In [14]:
# Load enriched transactions
tx = pd.read_csv(TX_PATH)

# Ensure correct types
tx["date"] = pd.to_datetime(tx["date"])
tx["amount"] = pd.to_numeric(tx["amount"], errors="coerce")
tx = tx.dropna(subset=["date", "amount"])

print(f"Loaded {len(tx):,} transactions.")
tx.head()

Loaded 174 transactions.


Unnamed: 0,date,account,description,merchant_key,display_name_final,category_final,subcategory_final,tags_final,confidence_final,source_final,amount,is_necessity,is_non_spend_flow
0,2025-01-07,discover_credit_ytd,APPLEBEES 2160017 LAS VEGAS NVAPPLE PAY ENDING...,APPLEBEES LAS VEGAS NVAPPLE PAY ENDING IN,APPLEBEES,dining,,,,yaml,22.32,False,False
1,2025-01-08,discover_credit_ytd,PANDA EXPRESS #753 LAS VEGAS NVAPPLE PAY ENDIN...,PANDA EXPRESS LAS VEGAS NVAPPLE PAY ENDING IN,PANDA EXPRESS,dining,,,,yaml,16.26,False,False
2,2025-01-09,discover_credit_ytd,APPLEBEES 2160017 LAS VEGAS NVAPPLE PAY ENDING...,APPLEBEES LAS VEGAS NVAPPLE PAY ENDING IN,APPLEBEES,dining,,,,yaml,26.86,False,False
3,2025-01-13,discover_credit_ytd,IHOP 3144 OLO 336-377-2287 NC,IHOP OLO - - NC,IHOP,dining,,,,yaml,31.06,False,False
4,2025-02-02,discover_credit_ytd,DIRECTPAY FULL BALANCESEE DETAILS OF YOUR NEXT...,DIRECTPAY BALANCESEE OF YOUR NEXT DIRECTPAY BELOW,DIRECTPAY,credit_card_payment,,,,yaml,-75.0,False,False


In [15]:
# --- Estimate subscription spend (robust to mixed types in tags_final) ---

# Ensure the column exists
if "tags_final" not in spend.columns:
    spend["tags_final"] = ""

def to_text(x):
    # Turn lists/tuples/sets into comma-joined text; handle NaN/None cleanly
    if isinstance(x, (list, tuple, set)):
        return ",".join(map(str, x))
    if pd.isna(x):
        return ""
    return str(x)

spend["tags_text"] = spend["tags_final"].apply(to_text).str.lower()

# Tag-based subscription heuristic
spend["is_subscription_like"] = spend["tags_text"].str.contains("subscription", na=False)

# (Optional, stronger) Recurring-merchant heuristic: appears in >=3 distinct months
rec_counts = spend.groupby(["display_name_final", "month_start"]).size().reset_index(name="n")
rec_months = rec_counts.groupby("display_name_final")["month_start"].nunique()
rec_merchants = set(rec_months[rec_months >= 3].index)

spend["is_subscription_like"] = spend["is_subscription_like"] | spend["display_name_final"].isin(rec_merchants)

# Aggregate monthly estimate
subs = (
    spend[spend["is_subscription_like"]]
    .groupby("month_start", as_index=False)["amount"]
    .sum()
    .rename(columns={"amount": "subscriptions_estimate"})
)

monthly = monthly.merge(subs, on="month_start", how="left").fillna({"subscriptions_estimate": 0})


MergeError: Passing 'suffixes' which cause duplicate columns {'subscriptions_estimate_x'} is not allowed.

In [None]:
def zscore(series):
    mu = series.mean()
    sigma = series.std(ddof=0)
    return (series - mu) / sigma if sigma > 0 else pd.Series([0]*len(series), index=series.index)

spend["merchant_z"] = spend.groupby("display_name_final")["amount"].transform(zscore)
anoms = spend[spend["merchant_z"].abs() >= 2.5].copy()

anoms["reason"] = anoms.apply(
    lambda r: f"{abs(r['merchant_z']):.1f}σ vs usual at {r['display_name_final']}",
    axis=1
)

anoms_out = anoms[[
    "date", "display_name_final", "category_final", "amount", "merchant_z", "reason"
]].rename(columns={"merchant_z": "zscore"})

anoms_out.to_csv(OUT_DIR / "ai_anomalies.csv", index=False)
anoms_out.head()


Unnamed: 0,date,display_name_final,category_final,amount,zscore,reason
70,2025-05-23,AMAZON MKTPLACE PMTS,shopping,135.46,2.909885,2.9σ vs usual at AMAZON MKTPLACE PMTS


In [None]:
# Use a simple moving average forecast
ms = monthly.set_index("month_start")[["total_outflows"]].asfreq("MS")
rolling_avg = ms.rolling(window=3, min_periods=1).mean().iloc[-1, 0]

forecast = []
last_date = ms.index.max()

for i in range(1, 4):
    next_month = last_date + pd.offsets.MonthBegin(i)
    forecast.append({
        "month_start": next_month.date().isoformat(),
        "spend_point_est": rolling_avg,
        "lower": rolling_avg * 0.9,
        "upper": rolling_avg * 1.1,
        "method": "3M Moving Average"
    })

forecast_df = pd.DataFrame(forecast)
forecast_df.to_csv(OUT_DIR / "ai_forecast.csv", index=False)
forecast_df


Unnamed: 0,month_start,spend_point_est,lower,upper,method
0,2025-10-01,705.736667,635.163,776.310333,3M Moving Average
1,2025-11-01,705.736667,635.163,776.310333,3M Moving Average
2,2025-12-01,705.736667,635.163,776.310333,3M Moving Average


In [None]:
# --- Cell 6: Monthly AI Summaries (robust) ---

# 0) Sanity checks / fallbacks
if "month_start" not in monthly.columns:
    raise ValueError("monthly is missing 'month_start'. Re-run Cells 2–3 to rebuild monthly KPIs.")

# If subscriptions_estimate is missing, recompute quickly from 'spend'
if "subscriptions_estimate" not in monthly.columns:
    if "spend" not in globals():
        # rebuild a minimal 'spend' if needed
        spend = tx[tx["amount"] > 0].copy()
        spend["month_start"] = spend["date"].values.astype("datetime64[M]")

    # robust tag → text
    def to_text(x):
        if isinstance(x, (list, tuple, set)):
            return ",".join(map(str, x))
        if pd.isna(x):
            return ""
        return str(x)

    if "tags_text" not in spend.columns:
        if "tags_final" not in spend.columns:
            spend["tags_final"] = ""
        spend["tags_text"] = spend["tags_final"].apply(to_text).str.lower()

    spend["is_subscription_like"] = spend["tags_text"].str.contains("subscription", na=False)

    # recurring merchant heuristic (>=3 distinct months)
    rec_counts = spend.groupby(["display_name_final", "month_start"]).size().reset_index(name="n")
    rec_months = rec_counts.groupby("display_name_final")["month_start"].nunique()
    rec_merchants = set(rec_months[rec_months >= 3].index)

    spend["is_subscription_like"] = spend["is_subscription_like"] | spend["display_name_final"].isin(rec_merchants)

    subs = (
        spend[spend["is_subscription_like"]]
        .groupby("month_start", as_index=False)["amount"]
        .sum()
        .rename(columns={"amount": "subscriptions_estimate"})
    )
    monthly = monthly.merge(subs, on="month_start", how="left")

# 1) Clean up columns and nulls
monthly = monthly.copy()
for col in ["top_category", "top_merchant"]:
    if col not in monthly.columns:
        monthly[col] = None
monthly["subscriptions_estimate"] = monthly["subscriptions_estimate"].fillna(0.0)
monthly["mom_outflows_pct"] = monthly["mom_outflows_pct"].fillna(0.0)

# 2) Build summary dict
summary = {}
for _, row in monthly.iterrows():
    ms = pd.to_datetime(row["month_start"])
    date_key = ms.date().isoformat()

    top_cat = (row["top_category"] or "—")
    top_merch = (row["top_merchant"] or "—")
    mom_pct = float(row["mom_outflows_pct"])
    subs_amt = float(row["subscriptions_estimate"])

    headline = f"Spending {mom_pct:+.0%} vs last month; top category: {top_cat}; top merchant: {top_merch}."
    bullets = [f"Subscriptions estimated: ${subs_amt:,.0f}"]

    summary[date_key] = {"headline": headline, "bullets": bullets}

# 3) Write JSON
with open(OUT_DIR / "ai_summary.json", "w") as f:
    json.dump(summary, f, indent=2)

print("ai_summary.json written ✅")


ai_summary.json written ✅


In [16]:
import pandas as pd
from pathlib import Path
import math

# --- Paths ---
BASE = Path(r"C:\Users\kosis\Downloads\Automation\spending-dashboard")
TX_PATH = BASE / "data/processed/transactions_enriched.csv"
OUT_DIR = BASE / "data/processed"
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("OUT_DIR:", OUT_DIR.resolve())

# --- Load transactions ---
tx = pd.read_csv(TX_PATH)
tx["date"] = pd.to_datetime(tx["date"], errors="coerce")
tx["amount"] = pd.to_numeric(tx["amount"], errors="coerce")
tx = tx.dropna(subset=["date","amount"])

# --- Outflows-only & month key ---
spend = tx[tx["amount"] > 0].copy()
spend["month_start"] = spend["date"].values.astype("datetime64[M]")

# --- Monthly totals ---
monthly = (spend
    .groupby("month_start", as_index=False)
    .agg(total_outflows=("amount","sum"))
    .sort_values("month_start"))

# --- MoM % ---
monthly["prev"] = monthly["total_outflows"].shift(1)
monthly["mom_outflows_pct"] = ((monthly["total_outflows"] - monthly["prev"]) / monthly["prev"]).fillna(0)

# --- Top category per month ---
top_cat = (spend.groupby(["month_start","category_final"], as_index=False)["amount"].sum()
               .sort_values(["month_start","amount"], ascending=[True, False])
               .drop_duplicates("month_start")
               .rename(columns={"category_final":"top_category"}))

# --- Top merchant per month ---
top_merch = (spend.groupby(["month_start","display_name_final"], as_index=False)["amount"].sum()
                 .sort_values(["month_start","amount"], ascending=[True, False])
                 .drop_duplicates("month_start")
                 .rename(columns={"display_name_final":"top_merchant"}))

monthly = (monthly
           .merge(top_cat, on="month_start", how="left")
           .merge(top_merch, on="month_start", how="left"))

# --- Subscription estimate (robust) ---
def to_text(x):
    import pandas as pd
    if isinstance(x, (list, tuple, set)):
        return ",".join(map(str, x))
    if pd.isna(x):
        return ""
    return str(x)

if "tags_final" not in spend.columns:
    spend["tags_final"] = ""

spend["tags_text"] = spend["tags_final"].apply(to_text).str.lower()
spend["is_subscription_like"] = spend["tags_text"].str.contains("subscription", na=False)

# recurring merchants heuristic: appears in >=3 distinct months
rec_counts = spend.groupby(["display_name_final","month_start"]).size().reset_index(name="n")
rec_months = rec_counts.groupby("display_name_final")["month_start"].nunique()
rec_merchants = set(rec_months[rec_months >= 3].index)
spend["is_subscription_like"] = spend["is_subscription_like"] | spend["display_name_final"].isin(rec_merchants)

subs = (spend[spend["is_subscription_like"]]
        .groupby("month_start", as_index=False)["amount"].sum()
        .rename(columns={"amount":"subscriptions_estimate"}))

monthly = monthly.merge(subs, on="month_start", how="left")
monthly["subscriptions_estimate"] = monthly["subscriptions_estimate"].fillna(0)

# --- Save ---
out_path = OUT_DIR / "ai_insights.csv"
monthly.to_csv(out_path, index=False)
print("Wrote:", out_path.resolve())

# --- Quick verify ---
print("Columns:", list(monthly.columns))
display(monthly.tail())
print("Dir listing:", [p.name for p in OUT_DIR.glob("ai_*.*")])


OUT_DIR: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed
Wrote: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\ai_insights.csv
Columns: ['month_start', 'total_outflows', 'prev', 'mom_outflows_pct', 'top_category', 'amount_x', 'top_merchant', 'amount_y', 'subscriptions_estimate']


Unnamed: 0,month_start,total_outflows,prev,mom_outflows_pct,top_category,amount_x,top_merchant,amount_y,subscriptions_estimate
4,2025-05-01,1553.8,636.38,1.441623,government_fees,416.99,NC COURT PAYMENTS,272.99,443.08
5,2025-06-01,5658.3,1553.8,2.641588,auto_service,4995.72,FLETCHER JONES,4995.72,336.01
6,2025-07-01,993.88,5658.3,-0.82435,credit_card_payment,581.26,AUTOMATIC PAYMENT - THANK,581.26,720.24
7,2025-08-01,1084.33,993.88,0.091007,credit_card_payment,639.65,AUTOMATIC PAYMENT - THANK,639.65,837.57
8,2025-09-01,39.0,1084.33,-0.964033,shopping,39.0,AMAZON MKTPLACE PMTS,39.0,39.0


Dir listing: ['ai_anomalies.csv', 'ai_forecast.csv', 'ai_insights.csv', 'ai_summary.json']
