# Tender Material Generator
This notebook fetches any assortment in scope based on provided filtering.
The notebook prepares and cleans the data, before loading it into a data frame.
The notebook will provide the user with summary statistics and key insights for internal usage.
The notebook generates a .csv file suitable for external sharing during RFx processes.

In [None]:
# Import data-analysis, statistics, and plotting packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
from scipy.stats import mode
from collections import Counter

warnings.filterwarnings('ignore')

from datetime import datetime

# Unified imports from data_processing (__init__ now exposes helpers & registry)
from source.data_processing import (
    clustering_params as params,
    clustering_pipeline as pipeline,
    clustering_utils as utils,
    fetch_super_table,
    fetch_super_table_for_clustering,
    pivot_attributes_wide,
    fetch_distinct_values,
    apply_df_filters,
    compute_totals,
    summarize_super_table,
    build_year_metrics_if_missing,
    SQL_SCRIPT_PATHS,
    load_sql_script,
)

# Visualization utilities (may rely on pipeline state)
from source.data_processing.clustering_visualization_utils import (
    build_totals,
    plot_overall_cluster_metrics,
    plot_yearly_trends,
    plot_feature_scatter,
    plot_origin_heatmap,
    plot_brand_supplier_bars,
    summarize_cluster_tables,
)

# Print a start timestamp for this run (local time, second resolution).
print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
# Show the entries of SQL_SCRIPT_PATHS in sorted order
# (these are the keys if it is a mapping — confirm against data_processing).
print("Available SQL scripts:", sorted(SQL_SCRIPT_PATHS))

## Step 1: Setting the Scope
First, we set the scope by filtering on Class2, Class3, Class4, Brand Name, Country of Origin, and Group Supplier.

In [None]:
# Create and configure clustering pipeline
from source.data_processing.clustering_pipeline import ClusteringPipeline

# NOTE: this rebinds the name `pipeline`, which the imports cell also uses as
# an alias for the `clustering_pipeline` module. From here on `pipeline` is the
# ClusteringPipeline instance that the following cells operate on.
pipeline = ClusteringPipeline(
    min_transactions=0,
    class2_description='Fasteners',
    class3_description='',  # presumably an empty string means "no Class3 filter" — confirm
    product_description_keyword='Bolt',
    group_supplier='Stafa Group',
    brand_name='Kramp',
    countryoforigin='CN',
    max_clusters=10,
)

print("Clustering pipeline configured.")

In [None]:
# Announce the run and record when it started.
print("🚀 Starting clustering analysis pipeline...")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Step 1: Load data via pipeline (uses filters in params)
# The failure is printed for visibility in the cell output and then re-raised,
# so execution stops here instead of continuing without data.
try:
    pipeline.load_data()
except Exception as e:
    print("Load failure:", e)
    raise

# Step 2: Prepare features (adds totals internally)
pipeline.prepare_features()

# Step 3: Optimize cluster numbers
opt = pipeline.optimize_clusters()
# Only the three "optimal_*" entries are shown; a missing key raises KeyError.
print("Optimization summary (truncated):", {k: opt[k] for k in ['optimal_elbow','optimal_silhouette','optimal_calinski']})

## Step 2: Running Clustering Pipeline

In [None]:
# Map products to clusters (kmeans example)
# Bail out with a message when no 'kmeans' entry exists yet; otherwise keep
# only the item identifier and its assigned cluster and preview the first rows.
if 'kmeans' not in pipeline.clustering_results:
    print("kmeans results not available yet.")
else:
    clustered_df = pipeline.clustering_results['kmeans']['df_clustered']
    mapped_products = clustered_df[['item_number', 'kmeans_cluster']]
    print(mapped_products.head())

In [None]:
# Advanced feature engineering placeholder (aligns with analysis_params spec)
import analysis_params as P

# Feature engineering needs the prepared feature table; stop early without it.
prepared_features = pipeline.features
if prepared_features is None:
    raise RuntimeError("Pipeline features not prepared yet.")

# `spec` drives which engineered columns get built; `X` is a private copy so
# the pipeline's own feature table is not modified by this cell.
spec = P.feature_spec()
X = prepared_features.copy()

def _combine(cols, strategy="sum"):
    """Collapse several columns of the module-level frame ``X`` into one Series.

    Args:
        cols: Column labels to combine. Labels that are not present in
            ``X.columns`` are skipped silently.
        strategy: ``"latest"`` takes, per row, the last non-missing value in
            the order given by ``cols``; ``"mean"`` averages the non-missing
            values; any other value (including the default ``"sum"``) adds
            the non-missing values.

    Returns:
        A Series aligned with ``X.index``. Values that cannot be parsed as
        numbers are treated as missing.
    """
    existing = [c for c in cols if c in X.columns]
    s = X[existing].apply(pd.to_numeric, errors='coerce')
    if strategy == "latest":
        if not existing:
            # A zero-column frame has no "last column": `.iloc[:, -1]` would
            # raise IndexError. Return all-missing, like "mean" does.
            return pd.Series(float("nan"), index=X.index)
        # Forward-fill across columns so the final column carries the last
        # non-missing value of each row.
        return s.ffill(axis=1).iloc[:, -1]
    if strategy == "mean":
        return s.mean(axis=1, skipna=True)
    # Fallback for "sum" and for any unrecognised strategy name.
    return s.sum(axis=1, skipna=True)

# Build one engineered column per spec entry. Each entry names its source
# columns under "from" and may name a combine strategy (default "sum").
for feat_name, cfg in spec.items():
    source_cols = cfg["from"]
    how = cfg.get("strategy", "sum")
    X[feat_name] = _combine(source_cols, how)

# Report and preview the columns listed in P.CLUSTER_FEATURES.
features = list(P.CLUSTER_FEATURES)
print("Engineered feature columns added:", features)
X[features].head()