# Part 1: Spend analysis

### 📦 Cell 1 — Imports & prep

In [None]:
# Silence warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from source.db_connect.bigquery_connector import BigQueryConnector
from source.data_processing.analysis_utils import (
    preprocess_detailed_data,
    compute_abc_tiers,
    fetch_purchase_data_enriched,
    )
from source.data_processing.normalization_utils import resolve_all, resolve_spend

# Load data: fetch the enriched purchase data through the BigQuery connector,
# then hand it to the project's preprocessing helper.
bq = BigQueryConnector()
raw = fetch_purchase_data_enriched(bq)
df = preprocess_detailed_data(raw)

# Normalize / standardize all core columns (spend, class3, vendor, product identifiers)
df = resolve_all(df)

# Ensure spend column name: when the expected column is missing, resolve_spend()
# returns the frame together with the name of the spend column to use from here on.
SPEND_COL = 'total_spend'
if SPEND_COL not in df.columns:
    df, SPEND_COL = resolve_spend(df, spend_col=SPEND_COL)

# Add ABC tiers (tier column requested as 'abc_tier')
df = compute_abc_tiers(df, spend_col=SPEND_COL, tier_col='abc_tier')

# Analysis year inference: take the most frequent numeric value of 'year'
# (first one returned on ties). Fall back to the current calendar year when the
# column is absent OR contains no numeric values at all -- in the latter case
# mode() is empty and .iloc[0] would raise IndexError.
YEAR = datetime.now().year
if 'year' in df.columns:
    numeric_years = pd.to_numeric(df['year'], errors='coerce').dropna()
    if not numeric_years.empty:
        YEAR = int(numeric_years.mode().iloc[0])

print(f"Rows: {len(df):,} | Cols: {len(df.columns)} | YEAR={YEAR}")
print('Columns:', df.columns.tolist())
print(f"Sample data:\n{df.head()}")

### 📈 Cell 2 — Scatter: distribution of spend per Class3

In [None]:
# Scatter plot of spend per Class3 (all years pooled, no year facet).
# Plotting libraries are also imported in the first cell; repeated here as in
# the original cell.
import matplotlib.pyplot as plt
import seaborn as sns

# Work on a copy so the shared frame is left untouched; non-numeric spend -> 0.
work = df.copy()
work[SPEND_COL] = pd.to_numeric(work[SPEND_COL], errors="coerce").fillna(0)

# One grouped view of spend per Class3, reused for ranks, totals and counts.
spend_by_class3 = work.groupby("Class3")[SPEND_COL]

# X position: rank of each row inside its Class3 (1 = highest spend);
# method="first" breaks ties by row order so every point gets its own rank.
work["rank_in_class3"] = spend_by_class3.rank(method="first", ascending=False)

# Legend order: Class3 values from highest to lowest total spend.
class3_spend = spend_by_class3.sum().sort_values(ascending=False)
class3_order = class3_spend.index.tolist()

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=work,
    x="rank_in_class3",
    y=SPEND_COL,
    hue="Class3",
    hue_order=class3_order,
    alpha=0.6,
    s=25,
    ax=ax,
)
ax.set_yscale("log")  # spend is heavy-tailed, so use a log axis
ax.set_title("Spend distribution per Class3 (log scale)")
ax.set_xlabel("Rank within Class3 (1 = highest spend)")
ax.set_ylabel("Spend")
# Park the legend outside the plotting area, to the right.
ax.legend(title="Class3 (desc spend)", bbox_to_anchor=(1.02, 1), loc="upper left")
fig.tight_layout()
plt.show()

# Optional: number of rows per Class3, labelled as product counts.
summary_counts = (
    spend_by_class3
    .count()
    .rename("product_count")
    .reset_index()
)
print("Sample product counts per Class3:")
print(summary_counts.head(15))

### 🏷️ Cell 3 — Scatter: distribution of spend by Group Vendor

In [None]:
# Scatter plot of spend by vendor group, coloured by the top-N vendors.
# Work on a copy so the shared frame is left untouched; non-numeric spend -> 0.
work = df.copy()
work[SPEND_COL] = pd.to_numeric(work[SPEND_COL], errors="coerce").fillna(0)

# Keep the legend readable: only the N biggest vendors (by total spend) get
# their own colour, everything else is folded into "Other".
N_VENDORS = 10
spend_per_vendor = work.groupby("GroupVendor")[SPEND_COL].sum()
top_vendors = spend_per_vendor.sort_values(ascending=False).head(N_VENDORS).index
is_top_vendor = work["GroupVendor"].isin(top_vendors)
work["VendorTop"] = np.where(is_top_vendor, work["GroupVendor"], "Other")

# X position: rank over the whole data set (1 = highest spend).
work["rank_global"] = work[SPEND_COL].rank(method="first", ascending=False)

# Legend order: descending total spend, with "Other" moved to the end if present.
vendor_spend = work.groupby("VendorTop")[SPEND_COL].sum().sort_values(ascending=False)
vendor_order = vendor_spend.index.tolist()
if "Other" in vendor_order:
    vendor_order.remove("Other")
    vendor_order.append("Other")

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=work,
    x="rank_global",
    y=SPEND_COL,
    hue="VendorTop",
    hue_order=vendor_order,
    alpha=0.6,
    s=25,
    ax=ax,
)
ax.set_yscale("log")
ax.set_title(f"Spend distribution by Group Vendor (Top {N_VENDORS})")
ax.set_xlabel("Global rank (1 = highest spend)")
ax.set_ylabel("Spend")
# Park the legend outside the plotting area, to the right.
ax.legend(title="GroupVendor (desc spend)", bbox_to_anchor=(1.02, 1), loc="upper left")
fig.tight_layout()
plt.show()


### 🧮 Cell 4 — Count of products by spend interval

In [None]:
# Bar chart: number of rows falling into each log-spaced spend interval.
# Work on a copy so the shared frame is left untouched; non-numeric spend -> 0.
work = df.copy()
work[SPEND_COL] = pd.to_numeric(work[SPEND_COL], errors="coerce").fillna(0)

# Number of spend intervals (bars) to draw.
BINS = 15

# Log-spaced edges need a strictly positive range, so derive the bounds from
# rows with spend > 0 only. (Taking the minimum of all non-zero values would
# pick up negative amounts and collapse the lower edge to the 1e-6 floor.)
positive_spend = work.loc[work[SPEND_COL] > 0, SPEND_COL]

if positive_spend.empty:
    # No positive value -> no valid log range; the bounds would be NaN and
    # pd.cut would fail, so report and skip the chart instead.
    print("No positive spend values found; skipping spend-interval chart.")
else:
    min_pos = max(positive_spend.min(), 1e-6)
    max_val = max(positive_spend.max(), 1)
    if max_val <= min_pos:
        # All positive values are identical: widen the range by one decade so
        # the bin edges stay unique (pd.cut rejects duplicate edges).
        max_val = min_pos * 10

    # geomspace's third argument is the number of EDGES; BINS intervals need
    # BINS + 1 edges.
    bins = np.geomspace(min_pos, max_val, BINS + 1)

    # Zero / negative spend is clipped up to the lowest edge, so those rows are
    # counted in the first interval (include_lowest keeps the edge itself).
    work["spend_bin"] = pd.cut(work[SPEND_COL].clip(lower=min_pos), bins=bins, include_lowest=True)
    counts = work["spend_bin"].value_counts().sort_index()

    plt.figure(figsize=(12, 5))
    counts.plot(kind="bar")
    plt.title("Count of products by spend interval (log-spaced bins)")
    plt.xlabel("Spend interval (EUR)")
    plt.ylabel("Count of products")
    plt.tight_layout()
    plt.show()

    # (Optional) show a small table
    display(counts.to_frame("count").head(10))


### 🤖 Cell 5 — Cluster products at Class3 level by purchase amount (EUR)

In [None]:
# Run the project's per-Class3 product clustering on the spend column and
# preview the result.
from source.data_processing.class3_analysis import cluster_products

k = 3  # value passed to cluster_products as `k`

# Best-effort cell: any failure is reported as a message instead of stopping
# the notebook run.
try:
    df_class3_clusters = cluster_products(df, spend_col=SPEND_COL, k=k)
    if not df_class3_clusters.empty:
        print(df_class3_clusters.head())
    else:
        print("No clusters produced (empty after filtering).")
except Exception as err:
    print("Clustering failed:", err)

### 📤 Cell 6 — Export per Class3 to .xlsx with normalized columns and a top metadata row

In [None]:
import os
import importlib
import source.data_processing.export_utils as export_utils

# Pick up the latest version of the export helpers after code edits
# (the module may already be loaded in this kernel).
importlib.reload(export_utils)

# Options common to both export runs: thousands formatting on, and the
# prepared frame supplied as segmentation data keyed on the ABC tier column.
shared_export_options = {
    "fmt_thousands": True,
    "segmentation_df": df,
    "segmentation_col": "abc_tier",
}

# Run 1: export restricted by a Level2 filter (e.g. 'Threaded Fasteners').
LEVEL2_FILTER = ["Threaded Fasteners"]  # or any valid value
output_dir_level2 = os.path.join(os.getcwd(), "exports_per_class3_year_split_level2")
paths_level2 = export_utils.export_year_split_purchase_quantity(
    bq,
    output_dir_level2,
    level2_filter=LEVEL2_FILTER,
    **shared_export_options,
)
print({"files_written": len(paths_level2), "output_dir": output_dir_level2})

# Run 2: same export without the Level2 filter (all data).
output_dir_all = os.path.join(os.getcwd(), "exports_per_class3_year_split")
paths_all = export_utils.export_year_split_purchase_quantity(
    bq,
    output_dir_all,
    **shared_export_options,
)
print({"files_written": len(paths_all), "output_dir": output_dir_all})