# ABCDE Segmentation of spend

This notebook performs a simple ABCDE-segmentation of spend data, and exports results to .xlsx files.
Run each cell in the order they appear.

The scope (class2/brand) can be adjusted by changing the query. If you change it, check that the exported columns are still compatible with the export function.

### Imports, data loading & preprocessing

In [None]:
# --- Standard library ---
import importlib
import os
import re
import sys
from pathlib import Path

# --- Third party ---
import numpy as np
import pandas as pd
from dotenv import load_dotenv

# Make attached helper modules importable (these are present next to the notebook or in /mnt/data).
# This has to run BEFORE any `source.*` / `sql_queries` import below; otherwise those
# imports fail whenever the helpers are only reachable through one of these directories.
EXTRA_MODULE_DIRS = [
    ".", "/mnt/data",  # adjust if your helpers live elsewhere
]
for p in EXTRA_MODULE_DIRS:
    if p not in sys.path:
        sys.path.insert(0, p)

# --- Project helpers (must come after the sys.path setup above) ---
import source.data_processing.export_utils as eu
eu = importlib.reload(eu)  # re-execute export_utils so edits made to it are picked up on re-run
from source.data_prep import field_desc_utils as fdesc
from source.data_prep import field_value_utils as fval
from source.data_processing import analysis_utils as au
from source.data_processing.analysis_utils import compute_abcde_per_class4
from source.data_processing.export_utils import export_year_split_purchase_quantity
from source.data_processing import export_utils as xpu
import sql_queries as qreg
from source.db_connect import bigquery_connector

# --- Configuration from the environment ---
load_dotenv()  # reads .env in the working directory
PROJECT_ID = os.getenv("PROJECT_ID")
DATASET_ID = os.getenv("DATASET_ID")
TABLE_ID   = os.getenv("TABLE_ID")
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./output")  # falls back to ./output when unset
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)  # create the export folder up front
print("PROJECT_ID:", PROJECT_ID, "| DATASET_ID:", DATASET_ID, "| TABLE_ID:", TABLE_ID)


In [None]:
# Build the fully qualified table name (project.dataset.table) from the environment settings.
# In this cell it is only printed for traceability.
fqtn = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}"
print("Reading from:", fqtn)

# Repeated import of the connector module (it is also imported in the first cell).
from source.db_connect import bigquery_connector
bq = bigquery_connector.BigQueryConnector(project_id=PROJECT_ID)  # uses default creds or GOOGLE_APPLICATION_CREDENTIALS
# NOTE(review): fqtn is not passed to the fetch below, so the table that is actually read
# is decided inside analysis_utils - confirm it matches the name printed above.
df = au.fetch_purchase_data(bq_client=bq)
print(df.shape, "rows x cols")
# Last expression of the cell: shows the first three rows as the cell output.
df.head(3)



In [None]:
# Preprocess the purchase data and list the resulting columns for debugging.
# If the fetch cell has not been run yet, `df` does not exist in the notebook
# namespace, so load it here first.
if "df" not in globals():
    fqtn = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}"
    print("Reading from:", fqtn)
    bq = bigquery_connector.BigQueryConnector(project_id=PROJECT_ID)
    df = au.fetch_purchase_data(bq_client=bq)
    print(df.shape, "rows x cols")
    # Not the last expression of the cell, so this preview is evaluated but not displayed.
    df.head(3)

# Rebind df to the preprocessed frame; every later cell works on this version.
df = au.preprocess_detailed_data(df)
print("Columns in DataFrame after preprocessing:", list(df.columns))

# Ensure key columns exist with consistent names
def _resolve(df, candidates):
    """Return the column label of ``df`` that matches one of ``candidates``.

    Candidates are tried in order. Each one is first looked up verbatim and
    then in normalised form (lower-cased, every character outside ``a-z0-9``
    removed), so ``"purchase_amount_eur"`` finds ``"Purchase Amount Eur"``.
    According to the original author's note this mirrors
    ``analysis_utils._resolve_col``.

    Args:
        df: DataFrame whose column labels are searched.
        candidates: Non-empty sequence of column names, most preferred first.

    Returns:
        The matching label exactly as it appears in ``df.columns``. When no
        candidate matches, ``candidates[0]`` is returned unchanged, so the
        caller must check that the resolved name really exists.

    Raises:
        IndexError: If ``candidates`` is empty.
    """
    if len(candidates) == 0:
        raise IndexError("_resolve() needs at least one candidate column name")

    def norm(label):
        # str() first: column labels are not always strings (e.g. integer years),
        # and calling .lower() on those would raise AttributeError.
        return re.sub(r"[^a-z0-9]", "", str(label).lower())

    # Normalised label -> original label. When two labels normalise to the same
    # key, the later column wins.
    by_norm = {norm(col): col for col in df.columns}

    for cand in candidates:
        if cand in df.columns:
            return cand
        key = norm(cand)
        if key in by_norm:
            return by_norm[key]

    return candidates[0]  # fallback to first candidate, but user must check resolved value

# Column names used by the rest of the notebook. Class4 is fixed; the others are
# resolved against the preprocessed frame so spelling variants are tolerated.
COL_CLASS4 = "Class4"
COL_PNUM   = _resolve(df, ["ProductNumber"])
COL_PDESC  = _resolve(df, ["ProductDescription"])
COL_EUR    = _resolve(df, ["Amount Eur", "Purchase Amount Eur", "amount_eur", "purchase_amount_eur"])
COL_QTY    = _resolve(df, ["Purchase Quantity", "Quantity"])
print("Resolved:", {"Class4":COL_CLASS4, "ProductNumber":COL_PNUM, "ProductDescription":COL_PDESC, "€":COL_EUR, "Qty":COL_QTY})

# _resolve returns its first candidate when nothing matches, so a missing column
# would otherwise only surface as a bare KeyError on the lines below.
_unresolved = [name for name in (COL_EUR, COL_QTY) if name not in df.columns]
if _unresolved:
    raise KeyError(
        f"Required column(s) not found after preprocessing: {_unresolved}. "
        f"Available columns: {list(df.columns)}"
    )

# Normalize spend to numeric; values that cannot be parsed count as 0 spend.
df[COL_EUR] = pd.to_numeric(df[COL_EUR], errors="coerce").fillna(0.0)
# Quantities that cannot be parsed become NaN (they are not zero-filled).
df[COL_QTY] = pd.to_numeric(df[COL_QTY], errors="coerce")



### ABCDE Segmentation

In [None]:
# Total spend per (Class4, ProductNumber, ProductDescription) from the preprocessed df.
# NOTE(review): the column names are hard-coded here, whereas the preprocessing cell
# resolves them into COL_CLASS4 / COL_PNUM / COL_PDESC / COL_EUR - confirm they agree.
product_spend_df = df.groupby(
    ["Class4", "ProductNumber", "ProductDescription"],
    as_index=False,
)["Purchase Amount Eur"].sum()
print(product_spend_df.head())

In [None]:
# Run the ABCDE segmentation on the per-product spend table; later cells read the
# "ProductNumber" and "Segmentation" columns of the result.
# NOTE(review): presumably tiers are ranked within each Class4 (going by the helper's
# name) - confirm in analysis_utils.
# NOTE(review): column names are hard-coded here, whereas the preprocessing cell
# resolves them into COL_CLASS4 / COL_PNUM / COL_EUR - confirm they agree.
segmented = compute_abcde_per_class4(
    df=product_spend_df,
    class_col="Class4",
    product_col="ProductNumber",      # unused in function, but fine
    spend_col="Purchase Amount Eur",
)


In [None]:
# Attach the Segmentation tag to every purchase row of df.
#
# `segmented` has one row per (Class4, ProductNumber, ProductDescription), so a
# product that occurs in several classes or with several descriptions appears more
# than once. Merging on ProductNumber without deduplicating first would repeat each
# matching purchase row once per occurrence and inflate any later spend/quantity sum.
# Keep one tag per product (the first one encountered) before merging.
_product_tags = segmented[["ProductNumber", "Segmentation"]].drop_duplicates(
    subset="ProductNumber", keep="first"
)
purchase_with_seg = df.merge(
    _product_tags,
    on="ProductNumber",
    how="left",
    validate="many_to_one",  # fail loudly if the tag table is ever not unique per product
)
print(purchase_with_seg.head())

In [None]:
# Summarize tiers by Class4
# 1. Aggregate EUR per (Class4, product) from the preprocessed df
# 2. Segment those per-product totals and attach one tag per product
# 3. Count products and sum spend per (Class4, Segmentation)

_keys = [COL_CLASS4, COL_PNUM]

# 1) Aggregate EUR per product
per_product = df.groupby(_keys, as_index=False)[COL_EUR].sum()

# 2) Segment the aggregated totals, in line with the other segmentation cells, which
#    also aggregate before segmenting. Passing the raw purchase lines would tag single
#    lines instead of products.
#    A copy is passed in case the helper adds its columns in place.
seg_tags = compute_abcde_per_class4(per_product.copy(), COL_CLASS4, COL_PNUM, COL_EUR)
# Keep only the key columns and the tag: no other column can then collide with
# per_product in the merge, so the spend column keeps its name (no _x/_y suffixes).
seg_tags = seg_tags[_keys + ["Segmentation"]].drop_duplicates(subset=_keys)

per_product = per_product.merge(
    seg_tags,
    on=_keys,
    how="left",
    validate="one_to_one",  # both sides are unique per (Class4, product)
)

# Print columns for debugging
print("Columns in per_product:", list(per_product.columns))

# The spend column is unambiguous now; the name is kept for any code that reads it.
actual_eur_col = COL_EUR
print(f"Using EUR column for aggregation: {actual_eur_col}")

# 3) Summary by Class4 and Segmentation
summary = (
    # dropna=False keeps products without a tag as their own group, so their spend
    # is not silently left out of the totals checked in the validation step.
    per_product.groupby([COL_CLASS4, "Segmentation"], as_index=False, dropna=False)
    .agg(
        products=(COL_PNUM, "nunique"),
        purchase_amount_eur_total=(actual_eur_col, "sum")
    )
    .sort_values([COL_CLASS4, "Segmentation"])
)
# Format # products as e.g. 1,000
summary["products_fmt"] = summary["products"].apply(lambda x: f"{x:,}")
# Format purchase_amount_eur_total as e.g. 1,000,000 EUR
summary["purchase_amount_eur_total_fmt"] = summary["purchase_amount_eur_total"].apply(lambda x: f"{x:,.0f} EUR")
# Last expression of the cell: display the formatted summary.
summary[[COL_CLASS4, "Segmentation", "products_fmt", "purchase_amount_eur_total_fmt"]]

### Data validation step

In [None]:
# Reference total: sum the spend column over every row of df, with no grouping or
# deduplication. The next cell compares the summary total against this value.
raw_total = df[COL_EUR].sum()
print(f"Raw total from original table: {raw_total:,.0f} EUR")

In [None]:
# Validation: the spend summed over all (Class4, Segmentation) groups in `summary`
# should equal the raw total computed from df.
total_summary = summary["purchase_amount_eur_total"].sum()
print(f"Summary tiers by Class4 total: {total_summary:,.0f} EUR")

# np.isclose compares with a tolerance, so float rounding does not cause a false alarm.
_totals_agree = np.isclose(total_summary, raw_total)
print(
    "Summary tiers by Class4 total matches raw total from original table."
    if _totals_agree
    else "Summary tiers by Class4 total does NOT match raw total from original table!"
)

### Export results to .xlsx

In [None]:
import importlib
import source.data_processing.export_utils as eu
eu = importlib.reload(eu)  # re-execute export_utils so edits made to it are picked up on re-run

from source.data_processing.analysis_utils import compute_abcde_per_class4
from source.data_processing.export_utils import export_year_split_purchase_quantity, fetch_year_purchase_quantity

# Table holding the per-year purchase quantities. Defined once so that the preview
# fetch and the export below always read the same table.
# NOTE(review): hard-coded rather than built from PROJECT_ID / DATASET_ID / TABLE_ID -
# confirm this is intended.
YEAR_TABLE = "kramp-sharedmasterdata-prd.MadsH.purchase_data"

# 1) Compute per-product spend for ABCDE from your preprocessed df
# NOTE(review): column names are hard-coded here, whereas the preprocessing cell
# resolves them into COL_* constants - confirm they agree.
product_spend_df = (
    df
      .groupby(["Class4", "ProductNumber", "ProductDescription"], as_index=False)
      .agg({"Purchase Amount Eur": "sum"})
)

segmented = compute_abcde_per_class4(
    df=product_spend_df,
    class_col="Class4",
    product_col="ProductNumber",
    spend_col="Purchase Amount Eur",
)

# 2) Merge Segmentation back into your enriched df
# `segmented` can hold several rows per product (one per Class4 / description).
# Deduplicate before merging so purchase rows are not multiplied. Step 3 reduces the
# result to one row per product anyway, so segmentation_df is expected to come out
# the same while the intermediate frame stays at the size of df.
product_tags = segmented[["ProductNumber", "Segmentation"]].drop_duplicates(
    subset="ProductNumber", keep="first"
)
purchase_with_seg = df.merge(
    product_tags,
    on="ProductNumber",
    how="left",
    validate="many_to_one",  # fail loudly if the tag table is ever not unique per product
)

# 3) Build segmentation_df with only columns that exist in the DataFrame, now including Class4 and ProductDescription
pretty_cols = [
    "Class4",
    "ProductNumber",
    "ProductDescription",
    "Segmentation",
    "SalesRounding",
]
existing_cols = [col for col in pretty_cols if col in purchase_with_seg.columns]
segmentation_df = purchase_with_seg[existing_cols].drop_duplicates("ProductNumber")

print("segmentation_df:", segmentation_df.shape, segmentation_df.columns.tolist())

# Per-year purchase quantities. This frame is required: it is merged with
# segmentation_df below and the merged result is what gets passed to the export.
df_year = fetch_year_purchase_quantity(
    bq,
    table=YEAR_TABLE,
)
print("df_year:", df_year.shape, df_year.columns.tolist())

# Standardize class4 column name in df_year to match segmentation_df
if 'class4' in df_year.columns:
    df_year = df_year.rename(columns={'class4': 'Class4'})

# If df_year has no ProductDescription, look it up by ProductNumber in segmentation_df
if 'ProductDescription' not in df_year.columns and 'ProductDescription' in segmentation_df.columns:
    df_year['ProductDescription'] = df_year['ProductNumber'].map(
        dict(zip(segmentation_df['ProductNumber'], segmentation_df['ProductDescription']))
    )

# 4) Merge segmentation_df into df_year on ProductNumber only (to avoid Class4 mismatch).
#    Columns present on both sides keep their df_year name; the segmentation_df copy
#    gets the "_seg" suffix.
df_year_merged = df_year.merge(
    segmentation_df,
    on=["ProductNumber"],
    how="left",
    suffixes=("", "_seg")
)

# A "_seg" column only exists when the unsuffixed column exists as well, so the
# description column can never be missing entirely at this point. What can happen is
# that single descriptions are empty in df_year: fill those from segmentation_df.
if 'ProductDescription' in df_year_merged.columns and 'ProductDescription_seg' in df_year_merged.columns:
    df_year_merged['ProductDescription'] = df_year_merged['ProductDescription'].fillna(
        df_year_merged['ProductDescription_seg']
    )

# 5) Call the export utility with the merged DataFrame, including Class4 and ProductDescription if present.
#    OUTPUT_DIR is the folder configured (and created) in the first cell; it defaults to "./output".
written_files = export_year_split_purchase_quantity(
    bq,
    output_dir=OUTPUT_DIR,
    table=YEAR_TABLE,
    fmt_thousands=True,
    merged_header_label="PurchaseQuantity",
    segmentation_df=df_year_merged,
    segmentation_col="Segmentation",
)

print("\nWritten Excel files:")
for f in written_files:
    print(" -", f)
