# This is a notebook used to perform spend analysis.
Using a comprehensive collection of spend and supplier data, this notebook sets out to analyse the distribution of spend for a provided Class2 Code.
The user will be presented with a high-level overview of spend per Class3, per Group Vendor, and per Country of Origin. This notebook also contains an export element that combines order data with spend data, performs K-means clustering on purchase amount, and saves the result to an .xlsx file with the columns ProductNumber, ProductDescription, and SoldQuantity, all grouped per cluster.

The insights provided by this notebook may be included when tendering assortment.

In [9]:
# Bootstrap + explicit table FQN
from __future__ import annotations
from pathlib import Path
import os, sys
from dataclasses import dataclass
from dotenv import load_dotenv

# Single source of truth for the table this notebook reads. FULL_TABLE is assigned
# from this constant instead of being assembled from environment variables.
# NOTE(review): the three-part id looks like project.dataset.table — confirm.
REQUIRED_TABLE_FQN = "kramp-sharedmasterdata-prd.MadsH.purchase_data"  # authoritative table id


def find_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that looks like the project root.

    A directory qualifies when it contains an entry named ``.env`` or ``src``.
    The search covers the resolved *start* itself plus at most five of its
    ancestors (six candidates in total), nearest first.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first qualifying directory, or the resolved *start* when none of the
        inspected candidates qualifies.
    """
    origin = start.resolve()
    markers = (".env", "src")
    # origin followed by its ancestors, capped at six candidates in total.
    for candidate in (origin, *origin.parents)[:6]:
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return origin

# Locate the project root from the current working directory and put it at the
# front of sys.path so the later `src.*` imports can resolve, then load ROOT/.env.
ROOT = find_root(Path.cwd())
sys.path.insert(0, str(ROOT))
load_dotenv(ROOT.joinpath(".env"))

# Environment-driven settings, read after .env has been loaded; each has a default.
PROJECT_ID = os.environ.get("PROJECT_ID", "kramp-sharedmasterdata-prd")
BQ_LOCATION = os.environ.get("BQ_LOCATION", "EU")

# FULL_TABLE comes straight from REQUIRED_TABLE_FQN and is never composed from
# PROJECT_ID, so a PROJECT_ID that mistakenly holds a full table id cannot change
# which table is used.
FULL_TABLE = REQUIRED_TABLE_FQN

@dataclass
class Params:
    """Run parameters for the analysis cells.

    Attributes:
        min_total_eur: Minimum total (EUR) a group needs in order to be kept,
            e.g. 1000 to drop tiny groups. ``None`` keeps every group.
        force_refresh: Refresh flag intended for a later caching layer
            (``read_or_query``); not consumed by the code visible here.
    """

    min_total_eur: float | None = None
    force_refresh: bool = False

# Parameter set for this run, created with all defaults.
P = Params()
# Bare expression on the cell's last line so the notebook echoes the table id in use.
FULL_TABLE

'kramp-sharedmasterdata-prd.MadsH.purchase_data'

In [10]:
# Core imports for vendor distribution EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src.utils.validate import expect_non_empty
from src.utils.spend import aggregate_group_vendor, distribution_summary  # friendly names inside

# Widen pandas' console/notebook rendering: show up to 120 columns and allow
# 160 characters per line before wrapping.
pd.options.display.max_columns = 120
pd.options.display.width = 160


In [16]:
# Cell 3 — Top 20 per-year, € formatting, scatter (via utils) — hardened
import os
from pathlib import Path
from src.utils.validate import expect_non_empty
from src.utils.spend import (
    topN_by_year, overview_topN, pivot_topN_yearly, fmt_eur, plot_scatter_eur
)

# Reporting window from the environment (defaults 2020-2025). Sorting the pair
# keeps the window valid even when YEAR_START and YEAR_END are supplied inverted.
year_start, year_end = sorted(
    (int(os.getenv("YEAR_START", "2020")), int(os.getenv("YEAR_END", "2025")))
)

# Fetch the per-year top-20 rows and stop early if nothing came back.
df = topN_by_year(FULL_TABLE, year_start, year_end, limit=20)
expect_non_empty(df)

# Overview table, with the period total rendered through fmt_eur.
overview = overview_topN(df)
overview_formats = {"Total Over Period (EUR)": fmt_eur}
display(overview.style.format(overview_formats))

# Yearly pivot, with every column rendered through fmt_eur.
pivot = pivot_topN_yearly(df)
display(pivot.style.format(dict.fromkeys(pivot.columns, fmt_eur)))

scatter_title = f"Top 20 Group Vendors — Yearly Spend (EUR), {year_start}-{year_end}"
# logy=False keeps a linear y-axis; switch to True if magnitudes vary a lot.
plot_scatter_eur(df, title=scatter_title, logy=False)

# (optional) export for sharing
# out = Path("data/outputs"); out.mkdir(parents=True, exist_ok=True)
# overview.to_csv(out / "top20_overview.csv", index=False)
# pivot.to_csv(out / "top20_yearly_pivot.csv")


ImportError: cannot import name 'topN_by_year' from 'src.utils.spend' (C:\Users\madsh\Data Analysis\tenders-1\src\utils\spend.py)