# Spend Analysis Workflow
Step-by-step spend analysis, segmentation, dashboarding, and export.

## Step 1: Import, preprocess & load data

### 📦 Cell 1 — Imports & prep

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from datetime import datetime
from source.db_connect.bigquery_connector import BigQueryConnector
from source.data_processing.analysis_utils import (
    preprocess_detailed_data, compute_abc_tiers, fetch_purchase_data_enriched
    )
from source.data_processing.normalization_utils import resolve_all, resolve_spend
from source.data_prep.field_value_utils import parse_eur

# --- Load -------------------------------------------------------------------
# Open the BigQuery connector and fetch the enriched purchase data through it.
bq = BigQueryConnector()
raw = fetch_purchase_data_enriched(bq)
print('Raw columns:', raw.columns.tolist())
if 'Class2' not in raw.columns:
    print('Raw Class2 column missing.')
else:
    print('Raw unique Class2 values:', raw['Class2'].unique())


def _report_stage(header, frame):
    """Print *header*, then the columns, distinct Class2 values and head of *frame*.

    Used after each transformation step so a dropped or renamed Class2 column
    is visible immediately in the cell output.
    """
    print(header)
    print('Columns:', frame.columns.tolist())
    if 'Class2' not in frame.columns:
        print('Class2 column missing.')
    else:
        print('Unique Class2 values:', frame['Class2'].unique())
    print(f'Sample data:\n{frame.head()}')


# --- Preprocess and normalise ----------------------------------------------
df = preprocess_detailed_data(raw)
_report_stage('After preprocess_detailed_data:', df)

df = resolve_all(df)
_report_stage('After resolve_all:', df)

# --- Spend column and ABC tiers ---------------------------------------------
# Prefer 'total_spend'; when it is absent, resolve_spend hands back the frame
# together with the name of the spend column to use instead.
SPEND_COL = 'total_spend'
if SPEND_COL not in df.columns:
    df, SPEND_COL = resolve_spend(df, spend_col=SPEND_COL)
# NOTE(review): presumably this adds the 'abc_tier' column that the export cell
# looks for later - confirm against compute_abc_tiers.
df = compute_abc_tiers(df, spend_col=SPEND_COL, tier_col='abc_tier')

# --- Final summary of the prepared frame ------------------------------------
print(f"Rows: {df.shape[0]:,} | Cols: {df.shape[1]}")
print('Columns:', df.columns.tolist())
print(f"Sample data:\n{df.head()}")


## Step 2: Spend dashboards & summary tables

In [None]:
from source.data_processing.analysis_utils import make_spend_col

# Column holding the year used for every per-year aggregation in this cell.
YEAR_COL = 'Authorization'

print('Columns available for analysis:', df.columns.tolist())
# make_spend_col hands back the frame plus the name of the spend column it
# settled on; 'Amount Eur_numeric' is passed as the preferred choice.
df, spend_col = make_spend_col(df, prefer='Amount Eur_numeric')
print(f"Spend column used: {spend_col}")

def fmt_eur(val):
    """Render *val* as a whole-number EUR amount with thousands separators.

    Null values (None, NaN, ...) become an empty string. Anything that cannot
    be formatted as a number falls back to its plain ``str()`` form, so the
    function never raises when mapped over mixed data.
    """
    try:
        if not pd.notnull(val):
            return ""
        return f"{val:,.0f} EUR"
    except Exception:
        # Best-effort display helper: non-numeric input is shown unchanged.
        return str(val)

def _is_plain_number(value):
    """Return True for int, float or complex values, excluding bool."""
    return isinstance(value, (int, float, complex)) and not isinstance(value, bool)


def _format_axis_as_eur(axis):
    """Label the major ticks of *axis* with fmt_eur.

    A formatter is used instead of ``set_yticklabels`` because fixed labels are
    only valid for fixed tick positions; with an automatic locator they can end
    up next to the wrong ticks once the view changes.
    """
    from matplotlib.ticker import FuncFormatter  # local import keeps the cell self-contained

    axis.set_major_formatter(FuncFormatter(lambda tick, _pos: fmt_eur(tick)))


if YEAR_COL in df.columns and df[YEAR_COL].notnull().any():
    print(f"Year column non-null count: {df[YEAR_COL].notnull().sum()}")
    print(f"Year column unique values: {sorted(df[YEAR_COL].dropna().unique())}")

    # --- Total spend per year ------------------------------------------------
    spend_by_year = df.groupby(YEAR_COL)[spend_col].sum().sort_index()
    spend_by_year_fmt = spend_by_year.apply(fmt_eur)
    print('spend_by_year head (formatted):', spend_by_year_fmt.head())
    if spend_by_year.empty or not spend_by_year.apply(_is_plain_number).any():
        print('No numeric data to plot for spend_by_year. Check data and column names.')
    else:
        # Plot spend trend by year
        plt.figure(figsize=(8, 4))
        ax = spend_by_year.plot(marker='o')
        ax.set_title('Total Spend by Year')
        ax.set_ylabel('Purchase Amount EUR')
        ax.set_xlabel('Year')
        ax.grid(True)
        _format_axis_as_eur(ax.yaxis)
        plt.show()

    # --- Top 10 group vendors per year ---------------------------------------
    if 'GroupVendor' not in df.columns:
        # Without this guard the groupby below raises KeyError.
        print("Required column 'GroupVendor' missing from DataFrame; skipping vendor breakdown.")
    else:
        spend_by_vendor_year = (
            df.groupby(['GroupVendor', YEAR_COL])[spend_col].sum().unstack().fillna(0)
        )
        top_vendors = (
            spend_by_vendor_year.sum(axis=1).sort_values(ascending=False).head(10).index
        )
        top_vendor_table = spend_by_vendor_year.loc[top_vendors]
        # Column-wise Series.map replaces DataFrame.applymap, which is
        # deprecated since pandas 2.1.
        top_vendor_table_fmt = top_vendor_table.apply(lambda col: col.map(fmt_eur))
        print('top_vendor_table head (formatted):', top_vendor_table_fmt.head())
        has_numbers = (
            top_vendor_table.apply(lambda col: col.map(_is_plain_number)).any().any()
        )
        if top_vendor_table.empty or not has_numbers:
            print('No numeric data to plot for top_vendor_table. Check data and column names.')
        else:
            # Plot top 10 vendors by year (transposed so years sit on the x-axis)
            ax2 = top_vendor_table.T.plot(kind='bar', figsize=(10, 6))
            ax2.set_title('Top 10 Group Vendors by Year')
            ax2.set_ylabel('Purchase Amount EUR')
            ax2.set_xlabel('Year')
            ax2.legend(title='GroupVendor', bbox_to_anchor=(1.05, 1), loc='upper left')
            _format_axis_as_eur(ax2.yaxis)
            plt.tight_layout()
            plt.show()
else:
    print("No valid year column for spend analysis.")

## Step 3: Segment products per Class3 and visualize

In [None]:
# Segment products per Class3 by spend_col and visualize by year
# Note: '54 | Fasteners' is a Class2 value, not Class3.
def fmt_eur(val):
    """Render *val* as a whole-number EUR amount with thousands separators.

    Null values (None, NaN, ...) become an empty string. Anything that cannot
    be formatted as a number falls back to its plain ``str()`` form, so the
    function never raises when mapped over mixed data.
    """
    try:
        if not pd.notnull(val):
            return ""
        return f"{val:,.0f} EUR"
    except Exception:
        # Best-effort display helper: non-numeric input is shown unchanged.
        return str(val)

# Visualization by Class3 on x-axis: one stacked bar per Class3, one segment per year.
from matplotlib.ticker import FuncFormatter  # local import keeps the cell self-contained

if 'Class3' in df.columns and 'Authorization' in df.columns:
    # Pivot to Class3 rows x year columns; combinations without spend become 0.
    class3_year = df.groupby(['Class3', 'Authorization'])[spend_col].sum().unstack().fillna(0)
    # Formatted copy for display only. Column-wise Series.map replaces
    # DataFrame.applymap, which is deprecated since pandas 2.1.
    class3_year_fmt = class3_year.apply(lambda col: col.map(fmt_eur))
    print('Class3 x Year spend head (formatted):', class3_year_fmt.head())
    if class3_year.empty:
        # Plotting an empty frame raises, so report it instead.
        print('No data to plot for Class3 by year. Check data and column names.')
    else:
        ax = class3_year.plot(kind='bar', stacked=True, figsize=(12, 6))
        ax.set_title('Purchase Amount EUR by Class3 and Year')
        ax.set_ylabel('Purchase Amount EUR')
        ax.set_xlabel('Class3')
        # Bar plots place one fixed tick per category, so explicit labels are safe on x.
        ax.set_xticklabels(class3_year.index, rotation=45, ha='right')
        # The y-axis keeps its automatic locator, so use a formatter rather than
        # fixed labels; fixed labels can end up next to the wrong ticks.
        ax.yaxis.set_major_formatter(FuncFormatter(lambda tick, _pos: fmt_eur(tick)))
        plt.tight_layout()
        plt.show()
else:
    print("Required columns 'Class3' or 'Authorization' missing from DataFrame.")



### 🤖 Cell 4 — Cluster products at Class3 level by purchase amount (EUR)

In [None]:
# Cluster products within each Class3 using the spend column.
from source.data_processing.class3_analysis import cluster_products

# Passed straight through as cluster_products' `k` argument
# (presumably the number of clusters - confirm against the helper).
k = 3

# NOTE(review): this cell passes SPEND_COL (set in the first cell) while the
# dashboard cells use `spend_col` from make_spend_col - confirm which is intended.
try:
    df_class3_clusters = cluster_products(df, spend_col=SPEND_COL, k=k)
    if not df_class3_clusters.empty:
        print(df_class3_clusters.head())
    else:
        print("No clusters produced (empty after filtering).")
except Exception as exc:
    # Best-effort cell: report the failure and let the notebook continue.
    print("Clustering failed:", exc)

### 📤 Cell 5 — Export per Class3 to .xlsx with normalized columns and product dimensions

In [None]:
import os
import importlib
import source.data_processing.export_utils as export_utils

# Product attribute (enrichment) fields expected in the export.
ENRICHMENT_FIELDS = [
    'head_shape', 'thread_type', 'head_height', 'head_outside_diameter_width', 'quality',
    'surface_treatment', 'material', 'din_standard', 'weight_per_100_pcs', 'content_in_sales_unit',
    'thread_diameter', 'length', 'height', 'total_height', 'width', 'iso_standard', 'inside_diameter',
    'outside_diameter', 'thickness', 'designed_for_thread', 'total_length', 'head_type', 'thread_length',
]

# Check if segmentation column exists before export
segmentation_col = "abc_tier"
if segmentation_col not in df.columns:
    print(f"Warning: Segmentation column '{segmentation_col}' not found in DataFrame. Export will exclude segmentation.")
    segmentation_col = None

# Every field df should carry before the export: identifying/spend columns,
# then the enrichment fields above, then the sales rounding.
REQUIRED_EXPORT_FIELDS = [
    'year_authorization', 'ProductNumber', 'ProductDescription', 'purchase_amount_eur',
    *ENRICHMENT_FIELDS,
    'salesRounding',
]
missing_fields = [f for f in REQUIRED_EXPORT_FIELDS if f not in df.columns]
print(f"Missing enrichment fields in df before export: {missing_fields}")

# If anything is missing, fetch the product details and merge in the new columns.
if not missing_fields:
    print("All required enrichment fields are present in df.")
elif 'ProductNumber' not in df.columns:
    # ProductNumber is both the lookup input and the merge key.
    print("Cannot fetch enrichment data: 'ProductNumber' column missing from df.")
else:
    from source.data_processing.product_utils import get_product_details_mapping
    enrich_df = get_product_details_mapping(bq, df['ProductNumber'])
    if enrich_df is None or enrich_df.empty:
        print("No enrichment data fetched or enrichment table is empty.")
    elif 'ProductNumber' not in enrich_df.columns:
        print("Enrichment data has no 'ProductNumber' column; cannot merge.")
    else:
        print(f"Fetched enrichment fields: {enrich_df.columns.tolist()}")
        # Bring in only columns df does not have yet. Merging overlapping
        # columns would rename both sides to '<name>_x' / '<name>_y', so fields
        # that were already present would disappear under their original names.
        new_cols = [
            col for col in enrich_df.columns
            if col != 'ProductNumber' and col not in df.columns
        ]
        # One row per product so the left merge cannot multiply df rows.
        # NOTE(review): keeps the first row per ProductNumber - confirm that is
        # the right pick if the mapping can return several rows per product.
        enrich_unique = enrich_df[['ProductNumber', *new_cols]].drop_duplicates(subset='ProductNumber')
        df = df.merge(enrich_unique, on='ProductNumber', how='left')
        print(f"After enrichment merge, df columns: {df.columns.tolist()}")
        still_missing = [f for f in REQUIRED_EXPORT_FIELDS if f not in df.columns]
        if still_missing:
            print(f"Fields still missing after enrichment merge: {still_missing}")

output_dir_all = os.path.join(os.getcwd(), "exports_per_class3_year_split")
paths_all = export_utils.export_year_split_purchase_quantity(
    bq,
    output_dir_all,
    fmt_thousands=True,
    segmentation_df=df,
    # Already either the column name or None (see the check above).
    segmentation_col=segmentation_col,
)
print({"files_written": len(paths_all), "output_dir": output_dir_all})

# The export will only contain: ProductNumber, ProductDescription, salesRounding, segmentation (if available), enrichment fields, and PurchaseQuantity by year (e.g., PurchaseQuantity.2020)