In [37]:
import os, json
from pathlib import Path
from dotenv import load_dotenv

# -------------------------
# Load environment
# -------------------------
# Absolute project root; the config paths built further down hang off REPO.
REPO = Path(r"C:\Users\kosis\Downloads\Automation\spending-dashboard").resolve()
# Load variables from the repo-level .env file; the Plaid client below reads
# PLAID_CLIENT_ID / PLAID_SECRET through os.getenv.
load_dotenv(dotenv_path=REPO / ".env")

# -------------------------
# Plaid imports
# -------------------------
from plaid.configuration import Configuration, Environment
from plaid.api import plaid_api
from plaid import ApiClient
from plaid.model.products import Products
from plaid.model.sandbox_public_token_create_request import SandboxPublicTokenCreateRequest
from plaid.model.item_public_token_exchange_request import ItemPublicTokenExchangeRequest

# -------------------------
# Plaid client (sandbox)
# -------------------------
# Credentials are read from the environment; os.getenv yields None when a
# variable is unset, and that None is passed straight through to the SDK.
configuration = Configuration(
    host=Environment.Sandbox,
    api_key=dict(
        clientId=os.getenv("PLAID_CLIENT_ID"),
        secret=os.getenv("PLAID_SECRET"),
    ),
)
api_client = ApiClient(configuration)
client = plaid_api.PlaidApi(api_client)

# -------------------------
# File paths
# -------------------------
# Config directory is created on demand; both JSON files live directly in it.
CONFIG_DIR = REPO.joinpath("config")
CONFIG_DIR.mkdir(parents=True, exist_ok=True)
ITEMS_JSON, CURSORS_JSON = (
    CONFIG_DIR / file_name for file_name in ("plaid_items.json", "plaid_cursors.json")
)

# -------------------------
# Create sandbox Item
# -------------------------
# Build the "transactions" product value. If the constructor call raises, fall
# back to the class attribute (presumably to cope with differing SDK versions).
try:
    prod_tx = Products("transactions")
except Exception:
    prod_tx = Products.TRANSACTIONS

def create_sandbox_item(institution_id="ins_3"):  # First Platypus Bank
    """Create a sandbox Item and return its identifiers.

    Requests a sandbox public token for ``institution_id`` with the
    ``prod_tx`` product, then exchanges that public token through the
    module-level ``client``.

    Parameters
    ----------
    institution_id : str
        Institution to create the Item for (default ``"ins_3"``).

    Returns
    -------
    dict
        ``{"item_id": ..., "access_token": ...}`` taken from the exchange
        response.
    """
    public_token_resp = client.sandbox_public_token_create(
        SandboxPublicTokenCreateRequest(
            institution_id=institution_id,
            initial_products=[prod_tx],
        )
    )
    exchange_req = ItemPublicTokenExchangeRequest(
        public_token=public_token_resp["public_token"]
    )
    exchange_resp = client.item_public_token_exchange(exchange_req)
    return {
        "item_id": exchange_resp["item_id"],
        "access_token": exchange_resp["access_token"],
    }

# Each run of this cell creates a brand-new sandbox Item.
item = create_sandbox_item()

# -------------------------
# Save configs
# -------------------------
# Both files are overwritten: the items file holds only this Item, and the
# cursor file is reset to an empty cursor for it.
ITEMS_JSON.write_text(json.dumps({"items": [item]}, indent=2), encoding="utf-8")
CURSORS_JSON.write_text(
    json.dumps({"transactions": {item["item_id"]: ""}}, indent=2),
    encoding="utf-8",
)

# Only the first 28 characters of the access token are echoed.
token_prefix = item["access_token"][:28]
print("✔ Sandbox Item created")
for written_path in (ITEMS_JSON, CURSORS_JSON):
    print("✔ Wrote:", written_path)
print("item_id:", item["item_id"])
print("access_token (prefix):", token_prefix + "…")


✔ Sandbox Item created
✔ Wrote: C:\Users\kosis\Downloads\Automation\spending-dashboard\config\plaid_items.json
✔ Wrote: C:\Users\kosis\Downloads\Automation\spending-dashboard\config\plaid_cursors.json
item_id: V6NJvaWpwgtPd56vnMAbFKxraAkqKbTWDQ5lk
access_token (prefix): access-sandbox-023548ca-0020…


In [38]:
from pathlib import Path
import os, json
from dotenv import load_dotenv
from plaid.configuration import Configuration, Environment
from plaid.api import plaid_api
from plaid import ApiClient
from plaid.model.transactions_sync_request import TransactionsSyncRequest
from plaid.model.transactions_sync_request_options import TransactionsSyncRequestOptions

# --- setup: repo paths, credentials, persisted items and cursors ---
REPO = Path(r"C:\Users\kosis\Downloads\Automation\spending-dashboard").resolve()
load_dotenv(REPO / ".env")
cfg_dir = REPO / "config"

items_doc = json.loads((cfg_dir / "plaid_items.json").read_text())
items = items_doc["items"]
cursors_doc = json.loads((cfg_dir / "plaid_cursors.json").read_text())
cursors = cursors_doc.get("transactions", {})

plaid_config = Configuration(
    host=Environment.Sandbox,
    api_key={"clientId": os.getenv("PLAID_CLIENT_ID"), "secret": os.getenv("PLAID_SECRET")},
)
api = plaid_api.PlaidApi(ApiClient(plaid_config))

# Dry run: page through /transactions/sync for every item and only count the
# changes. Nothing is written back; the final cursor is printed, not saved.
for it in items:
    item_id = it["item_id"]
    token = it["access_token"]
    cursor = cursors.get(item_id, "")
    print(f"\n=== Item {item_id} ===")
    print("starting cursor:", repr(cursor))

    added = modified = removed = 0
    latest_cursor = cursor

    while True:
        # 'cursor' is only sent when non-empty; an empty one is simply omitted
        req_kwargs = {
            "access_token": token,
            "count": 500,
            "options": TransactionsSyncRequestOptions(include_personal_finance_category=False),
        }
        if latest_cursor:
            req_kwargs["cursor"] = latest_cursor

        resp = api.transactions_sync(TransactionsSyncRequest(**req_kwargs))
        added += len(resp["added"])
        modified += len(resp["modified"])
        removed += len(resp["removed"])
        latest_cursor = resp["next_cursor"]
        has_more = resp["has_more"]
        if not has_more:
            break

    print(f"added={added} | modified={modified} | removed={removed}")
    print("next_cursor (NOT saved):", latest_cursor)



=== Item V6NJvaWpwgtPd56vnMAbFKxraAkqKbTWDQ5lk ===
starting cursor: ''
added=0 | modified=0 | removed=0
next_cursor (NOT saved): 


In [39]:
import os, json
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
from plaid.configuration import Configuration, Environment
from plaid.api import plaid_api
from plaid import ApiClient
from plaid.model.transactions_sync_request import TransactionsSyncRequest
from plaid.model.transactions_sync_request_options import TransactionsSyncRequestOptions

# --- paths/env
REPO = Path(r"C:\Users\kosis\Downloads\Automation\spending-dashboard").resolve()
DATA_DIR = REPO.joinpath("data")
INTERIM = DATA_DIR.joinpath("interim")
INTERIM.mkdir(parents=True, exist_ok=True)
SILVER = INTERIM.joinpath("transactions_canonical.csv")

CFG = REPO.joinpath("config")
ITEMS_JSON = CFG.joinpath("plaid_items.json")
CURSORS_JSON = CFG.joinpath("plaid_cursors.json")

load_dotenv(dotenv_path=REPO / ".env")

# --- plaid client (sandbox host, credentials from the environment)
plaid_config = Configuration(
    host=Environment.Sandbox,
    api_key={"clientId": os.getenv("PLAID_CLIENT_ID"), "secret": os.getenv("PLAID_SECRET")},
)
client = plaid_api.PlaidApi(ApiClient(plaid_config))

# --- load config: known items and the per-item sync cursors
items = json.loads(ITEMS_JSON.read_text())["items"]
cursors = json.loads(CURSORS_JSON.read_text()).get("transactions", {})

# --- load current Silver (if exists), otherwise start from an empty frame
# Everything is read as strings; amount/date are cast later.
SILVER_COLUMNS = [
    "transaction_id", "item_id", "account_id", "date", "name", "merchant_name",
    "amount", "pending",
]
silver_df = (
    pd.read_csv(SILVER, dtype=str)
    if SILVER.exists()
    else pd.DataFrame(columns=SILVER_COLUMNS)
)

# --- helper: upsert/delete by transaction_id
# The key column is forced to str so membership tests in the helpers line up.
silver_df["transaction_id"] = silver_df["transaction_id"].astype(str)

def upsert_rows(df, rows):
    """Insert-or-replace rows in ``df`` keyed on ``transaction_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Current table; must contain a ``transaction_id`` column.
    rows : list[dict]
        Incoming records, each carrying a ``transaction_id`` key.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself (same object) when ``rows`` is empty; otherwise a new
        frame in which every incoming id appears exactly once, holding the
        incoming values, appended after the untouched existing rows.
    """
    if not rows:
        return df
    incoming = pd.DataFrame(rows)
    if incoming.empty:
        return df
    incoming["transaction_id"] = incoming["transaction_id"].astype(str)
    # The same id may occur more than once in one batch; keep only the last
    # occurrence so the result never holds two rows for one transaction_id.
    incoming = incoming.drop_duplicates(subset="transaction_id", keep="last")
    # Drop existing rows that are being replaced, then append the new versions.
    survivors = df[~df["transaction_id"].isin(incoming["transaction_id"])]
    return pd.concat([survivors, incoming], ignore_index=True)

def delete_rows(df, removed_ids):
    """Drop from ``df`` every row whose ``transaction_id`` was removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``transaction_id`` column.
    removed_ids : sequence
        Records indexable by ``"transaction_id"``; may be empty or ``None``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when there is nothing to remove, otherwise the filtered
        rows (original index labels are kept).
    """
    if removed_ids:
        gone = pd.Series([entry["transaction_id"] for entry in removed_ids], dtype=str)
        keep_mask = ~df["transaction_id"].isin(gone)
        return df[keep_mask]
    return df

# --- run incremental sync for each item and update Silver in-memory
# next_cursors collects the final cursor per item; it is not written to disk here.
next_cursors = {}
for it in items:
    item_id = it["item_id"]
    token = it["access_token"]
    cursor = cursors.get(item_id, "")

    # normalize to our Silver schema
    def to_row(t):
        """Project one Plaid transaction onto the Silver columns."""
        return dict(
            transaction_id=t.get("transaction_id"),
            item_id=item_id,
            account_id=t.get("account_id"),
            date=t.get("date"),
            name=t.get("name"),
            merchant_name=t.get("merchant_name"),
            amount=str(t.get("amount")),   # keep as string for now; Power BI will parse
            pending=str(t.get("pending", False)),
        )

    latest_cursor = cursor
    total_added = total_modified = total_removed = 0

    while True:
        # The cursor is only included once there is a non-empty one to send.
        req_kwargs = {
            "access_token": token,
            "count": 500,
            "options": TransactionsSyncRequestOptions(include_personal_finance_category=False),
        }
        if latest_cursor:
            req_kwargs["cursor"] = latest_cursor
        resp = client.transactions_sync(TransactionsSyncRequest(**req_kwargs))

        # keep posted only (pending == False); removals are taken as-is
        added = [t for t in resp["added"] if not t.get("pending", False)]
        modified = [t for t in resp["modified"] if not t.get("pending", False)]
        removed = resp["removed"]

        add_rows = list(map(to_row, added))
        mod_rows = list(map(to_row, modified))

        # Apply the page: adds, then modifications, then deletions.
        silver_df = upsert_rows(silver_df, add_rows)
        silver_df = upsert_rows(silver_df, mod_rows)
        silver_df = delete_rows(silver_df, removed)

        total_added += len(added)
        total_modified += len(modified)
        total_removed += len(removed)

        latest_cursor = resp["next_cursor"]
        has_more = resp["has_more"]
        if not has_more:
            break

    next_cursors[item_id] = latest_cursor
    print(f"[{item_id}] added={total_added} modified={total_modified} removed={total_removed} | next_cursor set.")

# --- write Silver
silver_df.to_csv(SILVER, index=False)
print("✔ Silver written:", SILVER, "| rows:", len(silver_df))


[V6NJvaWpwgtPd56vnMAbFKxraAkqKbTWDQ5lk] added=0 modified=0 removed=0 | next_cursor set.
✔ Silver written: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\interim\transactions_canonical.csv | rows: 386


In [40]:
# update cursors on disk only after Silver is safely written
# Relies on `cursors`, `next_cursors` and CURSORS_JSON left in the kernel by
# the sync cell; the merged mapping replaces the file's previous content.
cursors.update(next_cursors)
CURSORS_JSON.write_text(
    json.dumps({"transactions": cursors}, indent=2),
    encoding="utf-8",
)

print("✔ Cursors updated:", CURSORS_JSON)

✔ Cursors updated: C:\Users\kosis\Downloads\Automation\spending-dashboard\config\plaid_cursors.json


In [41]:
import os, re, json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

# repo + paths
REPO = Path(r"C:\Users\kosis\Downloads\Automation\spending-dashboard").resolve()
load_dotenv(dotenv_path=REPO / ".env")

DATA, DOCS, CONFIG = (REPO / sub_dir for sub_dir in ("data", "docs", "config"))
INTERIM, PROCESSED = DATA / "interim", DATA / "processed"

# Output folders are created up front; the input folder is expected to exist.
for out_dir in (PROCESSED, DOCS):
    out_dir.mkdir(parents=True, exist_ok=True)

SILVER = INTERIM / "transactions_canonical.csv"
ENRICHED = PROCESSED / "transactions_enriched.csv"
UNKNOWN = DOCS / "review_unknowns.csv"
YAML_PATH = CONFIG / "categories.yaml"

# Stop here if the Silver CSV has not been produced yet.
assert SILVER.exists(), "Missing data/interim/transactions_canonical.csv"

# Every column is read as a string.
df = pd.read_csv(SILVER, dtype=str)
print("Loaded Silver rows:", len(df))


Loaded Silver rows: 386


In [43]:
# Step 5 — Silver → Gold (single, updated cell)

import os, re, json, yaml
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

# --- Paths & env ---
REPO = Path(r"C:\Users\kosis\Downloads\Automation\spending-dashboard").resolve()
load_dotenv(dotenv_path=REPO / ".env")

DATA = REPO.joinpath("data")
INTERIM = DATA.joinpath("interim")
PROCESSED = DATA.joinpath("processed")
DOCS = REPO.joinpath("docs")
CONFIG = REPO.joinpath("config")

# Make sure both output directories exist before anything is written.
for out_dir in (PROCESSED, DOCS):
    out_dir.mkdir(parents=True, exist_ok=True)

SILVER = INTERIM.joinpath("transactions_canonical.csv")
ENRICHED = PROCESSED.joinpath("transactions_enriched.csv")
UNKNOWN = DOCS.joinpath("review_unknowns.csv")
YAML_PATH = CONFIG.joinpath("categories.yaml")

# The Silver CSV is the only required input of this cell.
assert SILVER.exists(), "Missing data/interim/transactions_canonical.csv"

# --- Load Silver ---
# All columns come in as strings; amount/date are typed further down.
df = pd.read_csv(SILVER, dtype=str)
print("Loaded Silver rows:", len(df))

# --- Helpers ---
def normalize_merchant_key(s: str) -> str:
    """Return an upper-case, punctuation-free, single-spaced key for ``s``.

    Every character other than ``A-Z``, ``0-9`` or whitespace becomes a space
    (after upper-casing), whitespace runs collapse to one space, and the ends
    are trimmed. Non-string input (``None``, NaN, ...) yields ``""``.
    """
    if isinstance(s, str):
        alnum_only = re.sub(r"[^A-Z0-9\s]", " ", s.upper())
        return re.sub(r"\s+", " ", alnum_only).strip()
    return ""

# --- Load YAML map (optional) ---
# categories.yaml may define `merchants` (exact matches) and `patterns`
# (regex rules); a missing file or missing sections leave the lists empty.
yaml_map = {"merchants": [], "patterns": []}
if YAML_PATH.exists():
    with open(YAML_PATH, "r", encoding="utf-8") as f:
        yraw = yaml.safe_load(f) or {}
    yaml_map["merchants"] = yraw.get("merchants", []) or []
    yaml_map["patterns"]  = yraw.get("patterns", []) or []
else:
    print("Note: categories.yaml not found; unmatched merchants will be logged to review_unknowns.csv")


def _tags_to_csv(tags):
    """Render a YAML ``tags`` value as one comma-joined string.

    A list/tuple is joined element-wise (elements are stringified first so
    numeric tags do not break the join). A scalar such as ``tags: groceries``
    is returned as a single tag instead of being split into characters.
    Missing or empty values give ``""``.
    """
    if not tags:
        return ""
    if isinstance(tags, (list, tuple)):
        return ",".join(str(tag) for tag in tags)
    return str(tags)


# Build exact match map
# Keys are normalized with normalize_merchant_key; entries whose `match`
# normalizes to "" are skipped.
exact_map = {}
for m in yaml_map["merchants"]:
    key = normalize_merchant_key(m.get("match", ""))
    if key:
        exact_map[key] = {
            "display_name_final": m.get("display_name") or key,
            "category_final": m.get("category") or "",
            "tags_final": _tags_to_csv(m.get("tags")),
            "confidence_final": "yaml",
            "source_final": "plaid",
        }

# Compile regex rules
# Each rule is (compiled case-insensitive pattern, field mapping); patterns
# that fail to compile are reported and skipped rather than aborting the cell.
regex_rules = []
for p in yaml_map["patterns"]:
    rx = p.get("regex")
    if rx:
        try:
            regex_rules.append((
                re.compile(rx, re.I),
                {
                    "display_name_final": p.get("display_name") or "",
                    "category_final": p.get("category") or "",
                    "tags_final": _tags_to_csv(p.get("tags")),
                    "confidence_final": "yaml",
                    "source_final": "plaid",
                }
            ))
        except re.error as e:
            print("Skipped bad regex:", rx, "|", e)

# --- Core field types ---
# Values that cannot be parsed become NaN / NaT instead of raising.
parsed_dates = pd.to_datetime(df["date"], errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")     # Plaid: outflows +, inflows -
df["date"] = parsed_dates.dt.date

# --- Derive description & merchant_key (prefer merchant_name fallback to name) ---
# Blank and missing merchant names are treated alike before falling back.
merchant_or_blank = df["merchant_name"].fillna("")
desc = merchant_or_blank.replace("", pd.NA).fillna(df["name"])
df["description"] = desc
df["merchant_key"] = desc.map(normalize_merchant_key)

# --- Enrichment scaffold ---
# One row per transaction, all fields blank except the constant source.
ENRICH_COLS = [
    "display_name_final", "category_final", "tags_final",
    "confidence_final", "source_final",
]
enriched = pd.DataFrame(
    {
        "display_name_final": "",
        "category_final": "",
        "tags_final": "",
        "confidence_final": "",
        "source_final": "plaid",
    },
    index=df.index,
)

# Exact matches
# Rows whose merchant_key is a key of exact_map take that entry's five fields.
mask_exact = df["merchant_key"].isin(exact_map.keys())
if mask_exact.any():
    exact_records = [exact_map[k] for k in df.loc[mask_exact, "merchant_key"]]
    enriched.loc[mask_exact, ENRICH_COLS] = pd.DataFrame(exact_records, columns=ENRICH_COLS).to_numpy()

# Regex matches for remaining blanks
# The first rule (in YAML order) whose pattern is found in the description wins.
to_regex = enriched["display_name_final"].eq("")
if to_regex.any() and regex_rules:
    candidates = df.loc[to_regex, "description"].fillna("")
    regex_rows = []
    for i, text in candidates.items():
        hit = next(
            (mapping for rx, mapping in regex_rules if rx.search(text or "")),
            None,
        )
        if hit is None:
            # No rule applied: leave the row blank for the fallback below.
            regex_rows.append(("", "", "", "", "plaid"))
        else:
            regex_rows.append((
                hit["display_name_final"] or df.at[i, "merchant_key"],
                hit["category_final"],
                hit["tags_final"],
                "yaml",
                "plaid",
            ))
    enriched.loc[to_regex, ENRICH_COLS] = pd.DataFrame(regex_rows, columns=ENRICH_COLS).to_numpy()

# Fallback: use merchant_key if still blank
still_blank = enriched["display_name_final"].eq("")
enriched.loc[still_blank, "display_name_final"] = df.loc[still_blank, "merchant_key"]

# --- Non-spend flow detection ---
# A row is a non-spend flow when any keyword below appears (case-insensitive)
# in either `name` or `merchant_name`, or when its amount is negative.
patterns_non_spend = [
    r"\bPAYMENT\b", r"\bAUTOPAY\b", r"\bDIRECT\s?PAY\b", r"\bCREDIT\b", r"\bREFUND\b",
    r"\bTRANSFER\b", r"\bZELLE\b", r"\bVENMO\b", r"\bREVERSAL\b"
]
rx_non_spend = re.compile("|".join(patterns_non_spend), re.I)
name_hit = df["name"].fillna("").str.contains(rx_non_spend)
merchant_hit = df["merchant_name"].fillna("").str.contains(rx_non_spend)
# Also treat inflows (amount < 0) as non-spend by default
is_inflow = df["amount"] < 0
is_non_spend = name_hit | merchant_hit | is_inflow

# --- Month start (as Python date series) ---
# Truncate each timestamp to month precision, then widen back to days to get
# the first day of that month.
parsed_dates = pd.to_datetime(df["date"], errors="coerce")
month_floor = parsed_dates.values.astype("datetime64[M]")
first_of_month = month_floor.astype("datetime64[D]")
month_start = pd.Series(pd.DatetimeIndex(first_of_month).date, index=df.index)

# --- Assemble Gold ---
# One row per Silver transaction. `description` carries the raw Plaid `name`
# (not the merchant-preferred text stored in df["description"]).
gold = pd.DataFrame({
    "date": df["date"],
    # Missing ids/names are blanked first: astype(str) alone would turn a
    # missing value into the literal text "nan" in the CSV outputs.
    "account": df["account_id"].fillna("").astype(str),
    "description": df["name"].fillna("").astype(str),
    "merchant_key": df["merchant_key"].astype(str),
    "display_name_final": enriched["display_name_final"].astype(str),
    "category_final": enriched["category_final"].astype(str),
    "tags_final": enriched["tags_final"].astype(str),
    # Rows no YAML rule touched get "plaid" as their confidence label.
    "confidence_final": enriched["confidence_final"].replace("", "plaid"),
    "source_final": enriched["source_final"],
    "amount": df["amount"],
    "is_necessity": False,
    "is_non_spend_flow": is_non_spend.astype(bool),
    "month_start": month_start,
})

# --- Write outputs ---
gold.to_csv(ENRICHED, index=False)

# Unknowns = spend rows with no category yet; written as distinct
# (merchant_key, description) pairs sorted by merchant_key for manual review.
unknown_mask = (gold["category_final"] == "") & (~gold["is_non_spend_flow"])
unknowns = gold.loc[unknown_mask, ["merchant_key","description"]].drop_duplicates().sort_values("merchant_key")
unknowns.to_csv(UNKNOWN, index=False)

print("✔ Wrote:", ENRICHED,  "| rows:", len(gold))
print("✔ Logged unknowns:", UNKNOWN, "| rows:", len(unknowns))

# Preview
gold.head(5)


Loaded Silver rows: 386
✔ Wrote: C:\Users\kosis\Downloads\Automation\spending-dashboard\data\processed\transactions_enriched.csv | rows: 386
✔ Logged unknowns: C:\Users\kosis\Downloads\Automation\spending-dashboard\docs\review_unknowns.csv | rows: 12


Unnamed: 0,date,account,description,merchant_key,display_name_final,category_final,tags_final,confidence_final,source_final,amount,is_necessity,is_non_spend_flow,month_start
0,2025-08-30,ozDabnNg3KhjmPE6KMbjfjgDx7DK1aioAo7yB,Uber 072515 SF**POOL**,UBER,UBER,,,plaid,plaid,6.33,False,False,2025-08-01
1,2025-08-17,ozDabnNg3KhjmPE6KMbjfjgDx7DK1aioAo7yB,Uber 063015 SF**POOL**,UBER,UBER,,,plaid,plaid,5.4,False,False,2025-08-01
2,2025-08-15,ozDabnNg3KhjmPE6KMbjfjgDx7DK1aioAo7yB,United Airlines,UNITED AIRLINES,UNITED AIRLINES,,,plaid,plaid,-500.0,False,True,2025-08-01
3,2025-08-14,ozDabnNg3KhjmPE6KMbjfjgDx7DK1aioAo7yB,McDonald's,MCDONALD S,MCDONALD S,,,plaid,plaid,12.0,False,False,2025-08-01
4,2025-08-14,ozDabnNg3KhjmPE6KMbjfjgDx7DK1aioAo7yB,Starbucks,STARBUCKS,STARBUCKS,,,plaid,plaid,4.33,False,False,2025-08-01
