In [None]:
# Import standard-library and third-party packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
from scipy.stats import mode
from collections import Counter

warnings.filterwarnings('ignore')

# Import parameters
import source.data_processing.clustering_params as params
from datetime import datetime

# Import the new clustering pipeline
from source.data_processing.clustering_pipeline import (
    ClusteringPipeline
)

# Import functions for data fetching
from source.data_processing.analysis_utils import (
    fetch_purchase_data,
)

# Visualization helper module previously referenced (clustering_visualization_utils) was removed.
# Use pipeline.generate_visualizations() after running clustering methods instead.


# Ensure the notebook sees the latest module changes.
# importlib.reload() re-executes a module in place, but names that were bound
# earlier via `from module import name` keep pointing at the pre-reload
# objects, so they are re-bound explicitly once the reloads are done.
import importlib
import source.data_processing.clustering_utils as cu
import source.data_processing.clustering_pipeline as cp
import source.data_processing.analysis_utils as au
importlib.reload(cu)
importlib.reload(cp)
importlib.reload(au)

# Re-bind the from-imported names to the freshly reloaded definitions.
ClusteringPipeline = cp.ClusteringPipeline
fetch_purchase_data = au.fetch_purchase_data

print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## STEP 1: Running clustering pipeline

First we run all the steps to cluster the products based on their sales distribution.

One can opt for a standard pipeline. However, here the code is built to support two decisions:
1. What is the optimal number of clusters?
2. Which clustering algorithm is preferred?

In [None]:
# Create and configure the pipeline.
# class3_description / product_description are passed as None here
# (presumably meaning "no filter" — confirm against ClusteringPipeline).
pipeline = ClusteringPipeline(
    min_transactions=1,
    class3_description=None,
    product_description=None,
    max_clusters=10,
)

# The status message previously contained a mis-encoded (mojibake) emoji.
print("🚀 Pipeline configured and ready!")

In [None]:
# Announce the run. The first message has no placeholders, so it is a plain
# string; its emoji was previously mis-encoded (mojibake).
print("🚀 Starting full clustering analysis pipeline...")
print(f"   Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Step 1: Load data
pipeline.load_data()

# Step 2: Prepare features (runs after load_data() on the same pipeline object)
pipeline.prepare_features()

In [None]:
# Value handed to the k-means step below (presumably the number of clusters to fit).
NUMBER_CLUSTERS = 5
# Step 4: Run clustering methods (k-means only runs when params.INCLUDE_KMEANS is truthy)
if params.INCLUDE_KMEANS:
    pipeline.run_kmeans_clustering(NUMBER_CLUSTERS)

# Hierarchical clustering is currently disabled; uncomment to run it with the same NUMBER_CLUSTERS.
# if params.INCLUDE_HIERARCHICAL:
#     pipeline.run_hierarchical_clustering(NUMBER_CLUSTERS)

# Step 5: Compare methods
# NOTE(review): this call is unconditional, so it also runs when no clustering
# method above was enabled — confirm compare_clustering_methods() copes with that.
pipeline.compare_clustering_methods()

# Step 6: Generate visualizations (only when params.SHOW_VISUALIZATIONS is truthy)
if params.SHOW_VISUALIZATIONS:
    pipeline.generate_visualizations()

In [None]:
# Methods whose results are exported; this is a list, so several methods can be named at once.
CHOSEN_METHOD = ['kmeans'] # full set noted by the author: ['kmeans', 'hierarchical', 'dbscan']
# Step 7: Export results (skipped unless params.EXPORT_RESULTS is truthy)
if params.EXPORT_RESULTS:
    pipeline.export_results(methods=CHOSEN_METHOD)


## STEP 2: Get all mapped products

In [None]:
# Build the product -> cluster lookup from the k-means output:
# keep only the product identifier and its assigned k-means cluster label.
cluster_columns = ['ProductNumber', 'kmeans_cluster']
mapped_products = pipeline.clustering_results['kmeans']['df_clustered'][cluster_columns]

In [None]:
# Collect the distinct product numbers that received a cluster label.
# drop_duplicates() keeps first-occurrence order, so the list (and the SQL
# string built from it) is the same on every run, unlike iterating a set.
list_products = mapped_products['ProductNumber'].drop_duplicates().tolist()

# Create a SQL-formatted string with the list of products: 'a', 'b', 'c'.
# str() guards against non-string product numbers, which would otherwise make
# str.join raise a TypeError.
# NOTE(review): values are interpolated verbatim and are not escaped; only use
# this string with trusted product numbers, or switch to query parameters.
products_list = "'" + "', '".join(str(product) for product in list_products) + "'"

# The message previously claimed "with duplicates" although they were removed,
# and its emoji was mis-encoded (mojibake).
print(f"📊 Prepared product list for {len(list_products)} unique products (duplicates removed)")

## STEP 3: Cluster Analysis Pipeline

Now we'll run comprehensive analysis per cluster to understand the business characteristics of each product group.

## STEP 4: Data Preparation and Feature Engineering

Before running the comprehensive analysis, we need to prepare additional features and metrics.

## STEP 5: Comprehensive Cluster Analysis

## STEP 6: Save clustering results to BigQuery

In [None]:
# Save clustering results to BigQuery.
# Passes method='kmeans', the same method the product-to-cluster mapping in this notebook is built from.
pipeline.save_results_to_bq(method='kmeans')