# Tender Material Generator
This notebook fetches the assortment in scope, based on the filters provided.
It prepares and cleans the data before loading it into a DataFrame.
It then gives the user summary statistics and key insights for internal use.
Finally, it generates a .csv file suitable for external sharing in RFx processes.

In [None]:
# Import data-analysis, statistics and plotting packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
from scipy.stats import mode
from collections import Counter

warnings.filterwarnings('ignore')

# Import parameters
import source.data_processing.clustering_params as params
from datetime import datetime

# Import the new clustering pipeline.
# Failures are reported rather than raised so the rest of this cell still runs;
# ClusteringPipeline is left undefined when the import fails.
try:
    from source.data_processing.clustering_pipeline import (
        ClusteringPipeline
    )
except SyntaxError as e:
    print(f"Syntax error in clustering_pipeline.py: {e}")
    # Point at the location carried by the exception instead of a fixed line
    # number, which goes stale as soon as the module is edited.
    if e.lineno and e.filename:
        location = f"line {e.lineno} in {e.filename}"
    else:
        location = "clustering_pipeline.py"
    print(f"Please check {location} for syntax issues")
except ImportError as e:
    print(f"Import error: {e}")
else:
    print("Successfully imported ClusteringPipeline")

# Import functions for data fetching and cleaning
from source.data_processing.analysis_utils import (
    fetch_tender_material,
    pivot_attributes_wide,
    fetch_distinct_values,
    apply_df_filters,
)

# import functions to visualize and analyze clustering results
from source.data_processing.clustering_visualization_utils import (
    build_totals,
    plot_overall_cluster_metrics,
    plot_yearly_trends,
    plot_feature_scatter,
    plot_origin_heatmap,
    plot_brand_supplier_bars,
    summarize_cluster_tables,
)

print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## Step 1: Setting the Scope
First, we set the scope by filtering on Class2, Class3, Class4, Brand Name, Country of Origin, and Group Supplier.

In [None]:
# --- Simple filtering (Class2, Class3, Class4, BrandName, CountryOfOrigin, GroupSupplier) ---
from product_utils import (
    filter_by_class2,
    filter_by_class3,
    filter_by_class4,
    filter_by_brand_name,
    filter_by_country_of_origin,   # NEW
    filter_by_group_supplier,      # NEW
)

# 1) Filter configuration: a falsy value (None or []) switches that filter off
CLASS2 = ["Fasteners"]              # e.g. ["Tractors","Implements"]
CLASS3 = None                       # e.g. ["Hydraulics"]
CLASS4 = None                       # e.g. ["1234 - Filters"]
BRANDNAME = ["Kramp"]               # e.g. ["Kramp"]
COUNTRYOFORIGIN = None              # e.g. ["Germany", "PL", "CN"]
GROUPSUPPLIER = None                # e.g. ["Kerbl Group"]
NEGATE = False                      # set True to invert selection

# 2) Pair every filter function with its configured values, in the order
#    the filters are applied.
_configured_filters = (
    (filter_by_class2, CLASS2),
    (filter_by_class3, CLASS3),
    (filter_by_class4, CLASS4),
    (filter_by_brand_name, BRANDNAME),
    (filter_by_country_of_origin, COUNTRYOFORIGIN),
    (filter_by_group_supplier, GROUPSUPPLIER),
)

# Start from a copy so df itself is not modified, then run only the filters
# that have values configured.
df_f = df.copy()
for _apply_filter, _wanted in _configured_filters:
    if _wanted:
        df_f = _apply_filter(df_f, _wanted, negate=NEGATE)

print(f"Rows after filtering: {len(df_f):,}")
df_f.head()


In [None]:
# NOTE(review): `pipeline` is not created in the cells visible here; presumably
# it is a ClusteringPipeline instance built earlier. Confirm before running.
# The banner is a plain string (nothing to interpolate) and starts with the
# rocket emoji, which had been garbled by a wrong text encoding.
print("🚀 Starting full clustering analysis pipeline...")
print(f"   Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Step 1: Load data
pipeline.load_data()

# Step 2: Prepare features
pipeline.prepare_features()

# Step 3: Optimize clusters
pipeline.optimize_clusters()


In [None]:
# Cluster count passed to the clustering call(s) in this cell.
NUMBER_CLUSTERS = 5
# Step 4: Run clustering methods
# K-means runs only when enabled via params.INCLUDE_KMEANS.
if params.INCLUDE_KMEANS:
    pipeline.run_kmeans_clustering(NUMBER_CLUSTERS)

# Hierarchical clustering is currently commented out:
# if params.INCLUDE_HIERARCHICAL:
#     pipeline.run_hierarchical_clustering(NUMBER_CLUSTERS)

# Step 5: Compare methods
pipeline.compare_clustering_methods()

# Step 6: Generate visualizations
# Skipped unless params.SHOW_VISUALIZATIONS is truthy.
if params.SHOW_VISUALIZATIONS:
    pipeline.generate_visualizations()

In [None]:
# Methods whose results are handed to export_results; the trailing comment
# lists the other values the author noted.
CHOSEN_METHOD = ['kmeans'] # ['kmeans', 'hierarchical', 'dbscan']
# Step 7: Export results
# Skipped unless params.EXPORT_RESULTS is truthy.
if params.EXPORT_RESULTS:
    pipeline.export_results(methods=CHOSEN_METHOD)


## Step 2: Running the Clustering Pipeline

In [None]:
# Look up the k-means output frame, then keep only the product identifier
# and its assigned cluster label.
_kmeans_frame = pipeline.clustering_results['kmeans']['df_clustered']
_mapping_columns = ['ProductNumber', 'kmeans_cluster']
mapped_products = _kmeans_frame[_mapping_columns]

In [None]:
# Project module providing feature_spec() and CLUSTER_FEATURES, aliased as P.
import analysis_params as P

# spec is consumed further down as {feature_name: cfg}, where each cfg is
# indexed with "from" (source columns) and "strategy" (how to combine them).
spec = P.feature_spec()
# Build features from df:
# - for each key in spec, combine columns in spec[key]["from"] using the chosen strategy.
# - create df[feat_name] = ...
# The derived columns are written to X, a copy of df, so df itself is not modified.
X = df.copy()

def _combine(cols, strategy="sum"):
    s = X[cols].apply(pd.to_numeric, errors="coerce")
    if strategy == "latest":
        # pick the last non-null by year ordering
        return s.ffill(axis=1).iloc[:, -1]
    if strategy == "mean":
        return s.mean(axis=1, skipna=True)
    return s.sum(axis=1, skipna=True)  # default sum

# Add one derived column to X per entry in spec. Entries are processed in
# spec's iteration order, and _combine reads its input columns from X.
for feat_name, cfg in spec.items():
    X[feat_name] = _combine(cfg["from"], cfg["strategy"])

# Copy the configured feature names into a new list; the trailing comment
# shows the values the author expected.
features = list(P.CLUSTER_FEATURES)  # ["feat_purchase_amount_eur", "feat_quantity_sold"]
