# Prep

In [438]:
# Import packages
import pandas as pd
import plotly.graph_objects as go
import networkx as nx
from ds_common_utils.aux.io.snowflake_tools import SnowflakeTools
from math import log
from networkx.algorithms.community import greedy_modularity_communities
import matplotlib.pyplot as plt
import community.community_louvain as community_louvain
import numpy as np

In [439]:
# Open the Snowflake session used by every query in this notebook.
con = SnowflakeTools().get_snowflake_ctx(
    method='token',
    user='250807',
    role='INSIGHT_ANALYST_MERCH_DE_GENERAL_PRD',
    warehouse='INSIGHT_ANALYST_WH',
)

# Pair-level association rows (PRIMARY_ID / SECONDARY_ID plus metrics) -> df
with open('sql/item_association.sql', 'r') as query:
    association_sql = query.read()
df = pd.read_sql_query(association_sql, con)

# Class list (ITEM_CLASS_NAME) used to split primary from secondary classes
with open('sql/excluded_class_list.sql', 'r') as query:
    class_list_sql = query.read()
target_item_class = pd.read_sql_query(class_list_sql, con)

In [440]:
# Load the supplier overview; the bare last expression renders the frame.
with open('sql/overview.sql', 'r') as query:
    overview_sql = query.read()
supplier_items = pd.read_sql_query(overview_sql, con)
supplier_items

Unnamed: 0,ITEM_NUMBER,ITEM_DESCRIPTION,TRANSACTIONS,SALES_SUM,QUANTITY_SUM
0,TOTAL,TOTAL,1579,72647610,99792
1,0823814,SKYLIGHT FIXED VELUX++780X1400MM FSM082004A,112,4735300,7140
2,0820072,SKYLIGHT FLAT ROOF VELUX++665X1275MM FCM224600...,90,2864119,5460
3,0415002,SKYLIGHT OPENABLE ELEC VELUX++780X980MM VSEM0...,21,2640826,1400
4,0823806,SKYLIGHT FIXED VELUX++780X1180MM FSM062004A,81,2571554,4165
...,...,...,...,...,...
154,0811278,VELUX SKYLIGHT ACC++SOLAR HONEYCOMB FSCD C041045,2,0,0
155,0176088,VELUX FLASHING SLATE++EDL S06 0000J,2,0,0
156,0296335,VELUX FLASHING KIT++EDLMK08 0000,4,0,0
157,0296321,VELUX FLASHING KIT++EDLCK02 0000,2,0,0


Across 3 years (2022-06-18 to 2025-06-18), 158 distinct items from this supplier were sold. Total sales were $72.6m and total quantity was about 100k units.

# Parameter tuning

In [441]:
TRX_THRESHOLD = 10 # Minimum PAIR_TRANSACTIONS a pair needs to be kept
QUANTILE = 0.975 # Quantile of METRIC used as cut-off; pairs scoring below it are dropped
TOP_N = 5 # Number of highest-METRIC pairs kept per PRIMARY_ID (these become the graph edges)
RESO = 0 # Passed as `resolution` to Louvain; >1.0 → more, smaller communities (NOTE(review): confirm direction for the installed python-louvain version)
METRIC = 'JACCARD' # Column used for the cut-off, the ranking and the edge weight: LIFT or JACCARD

3Y preset (alternative values for the parameters above):
TRX_THRESHOLD = 10 # Min N frequent pairs
QUANTILE = 0.9 # Nth percentile of metric similarity
TOP_N = 3 # top-N associated item classes per primary class
RESO = 0.5 # >1.0 → more, smaller communities
METRIC = 'JACCARD' # LIFT or JACCARD

# Working

In [448]:
# Keep pairs whose primary class is in the class list and whose secondary class is not.
class_names = target_item_class['ITEM_CLASS_NAME']
primary_listed = df['PRIMARY_ID'].isin(class_names)
secondary_listed = df['SECONDARY_ID'].isin(class_names)
cross_pairs = df.loc[primary_listed & ~secondary_listed]

# Apply the transaction floor first, so the quantile cut-off is computed
# only over the pairs that survive it.
frequent_pairs = cross_pairs.loc[cross_pairs['PAIR_TRANSACTIONS'] >= TRX_THRESHOLD]
threshold = frequent_pairs[METRIC].quantile(QUANTILE)
filtered_df = frequent_pairs.loc[frequent_pairs[METRIC] >= threshold]

# Per primary class, keep the TOP_N pairs with the highest METRIC.
ranked_pairs = filtered_df.sort_values(METRIC, ascending=False)
top_edges = ranked_pairs.groupby('PRIMARY_ID').head(TOP_N)

In [449]:
# Pairs that passed the filters, highest PAIR_TRANSACTIONS first.
filtered_df.sort_values('PAIR_TRANSACTIONS', ascending=False)

Unnamed: 0,PRIMARY_ID,SECONDARY_ID,PRIMARY_TRANSACTIONS,SECONDARY_TRANSACTIONS,TRX_START_MONTH,TRX_END_MONTH,PAIR_TRANSACTIONS,LIFT,JACCARD
1785,500 ROOF WINDOWS,500 BUILDING SUPPLIES OFFRANGE,1090,39,2022-08-01,2025-05-01,324,11.280169,0.402484
2066,500 SKYLIGHT ACCESSORIES,500 BUILDING SUPPLIES OFFRANGE,670,39,2022-10-01,2025-05-01,258,14.613088,0.572062
1791,500 ROOF WINDOWS,500 CONNECTORS,1090,15,2022-08-01,2025-05-01,244,22.08685,0.283391
1860,500 ROOF WINDOWS,500 STRUCT FRAMG H1 2 KD LINKD,1090,12,2024-08-01,2025-05-01,240,27.155963,0.278422
2122,500 SKYLIGHT ACCESSORIES,500 STRUCT FRAMG H1 2 KD LINKD,670,12,2024-07-01,2025-05-01,168,30.925373,0.326848


In [450]:
# Marker sizes for primary nodes: PRIMARY_TRANSACTIONS rescaled linearly to 10–40.

# Raw transaction count per primary class (later rows overwrite earlier ones
# for the same PRIMARY_ID).
node_size_map = dict(zip(top_edges['PRIMARY_ID'], top_edges['PRIMARY_TRANSACTIONS']))

# Work on a float array so the min/range arithmetic below is well defined.
sizes = np.array(list(node_size_map.values()), dtype=float)

# Min-max normalise to 0–1. When there is only one primary node, or every
# count is identical, the range is 0 and the division would produce NaN
# marker sizes, so fall back to the smallest size for all nodes instead.
size_range = np.ptp(sizes) if sizes.size else 0.0
if size_range > 0:
    norm_sizes = (sizes - sizes.min()) / size_range
else:
    norm_sizes = np.zeros_like(sizes)
scaled_sizes = 10 + norm_sizes * 30                 # scale to 10–40

# Final size map: primary class → marker size
scaled_size_map = dict(zip(node_size_map.keys(), scaled_sizes))

# Step 1: Build the full graph.
# One undirected edge per (PRIMARY_ID, SECONDARY_ID) row of top_edges; the
# value of the METRIC column is stored under the edge attribute 'weight'.
G = nx.Graph()
G.add_weighted_edges_from(
    top_edges[['PRIMARY_ID', 'SECONDARY_ID', METRIC]].itertuples(index=False, name=None),
    weight='weight',
)

# Step 2: Louvain community detection.
# The weight key must be the edge *attribute name* ('weight'), not the name of
# the DataFrame column: an attribute that is absent from the edges would leave
# the clustering effectively unweighted. random_state pins the otherwise
# randomised node ordering so the segments are reproducible across runs.
partition = community_louvain.best_partition(
    G,
    weight='weight',
    resolution=RESO,
    random_state=42,
)

# Step 3: Layout (seeded, so node positions are stable between runs).
pos = nx.spring_layout(G, weight='weight', seed=42)

# Step 4: Edge trace (every edge in G is drawn; no filter on weight).
# Each edge contributes (start, end, None); the None entries break the line
# so consecutive edges are not joined to each other.
edge_x = []
edge_y = []
for src, dst in G.edges():
    (src_x, src_y), (dst_x, dst_y) = pos[src], pos[dst]
    edge_x.extend([src_x, dst_x, None])
    edge_y.extend([src_y, dst_y, None])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    hoverinfo='none',
    line={'width': 1, 'color': 'gray'},
)

# Step 5: Node trace with Louvain cluster coloring.
# Colors are reused cyclically when there are more clusters than entries.
color_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
              '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
              '#bcbd22', '#17becf']  # Extend if you expect >10 clusters

node_x, node_y, node_color, node_size, hover_text = [], [], [], [], []

for node in G.nodes():
    x, y = pos[node]
    cluster = partition[node]
    # Only primary classes have an entry in the size maps; every other node
    # gets the smallest marker.
    size = scaled_size_map.get(node, 10)

    # The hover text reports the raw transaction count, not the 10–40 marker
    # size derived from it. Nodes without a count are shown as 'n/a'.
    transactions = node_size_map.get(node)
    if transactions is None or pd.isna(transactions):
        transactions_label = 'n/a'
    else:
        transactions_label = str(int(transactions))

    node_x.append(x)
    node_y.append(y)
    node_color.append(color_list[cluster % len(color_list)])
    node_size.append(size)
    hover_text.append(
        f"Product ID: {node}<br>Cluster: {cluster}<br>Transactions: {transactions_label}"
    )

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=[str(node) for node in G.nodes()],
    textposition='top center',
    marker=dict(
        color=node_color,
        size=node_size,  # per-node size from scaled_size_map
        line=dict(width=2, color='black')
    ),
    textfont=dict(
        size=16,          # increase for visibility
        color='black',
        family='Arial'
    )
)
# Step 6: Plot.
# Axes carry no meaning for a force-directed layout, so grid, zero line and
# tick labels are hidden on both.
hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)

layout = go.Layout(
    title=dict(text='Louvain Product Clustering (All Edges)', x=0.5, font=dict(size=16)),
    width=1200,
    height=800,
    showlegend=False,
    hovermode='closest',
    margin=dict(b=20, l=20, r=20, t=40),
    xaxis=dict(hidden_axis),
    yaxis=dict(hidden_axis),
)

fig = go.Figure(data=[edge_trace, node_trace], layout=layout)

# Results

In [451]:
# Render the network figure (edge trace + node trace) built in the previous cell.
fig.show()

In [452]:
# One row per graph node: node name (as PRIMARY_ID) and its Louvain segment.
partition_df = pd.DataFrame(list(partition.items()), columns=['PRIMARY_ID', 'segment'])

# Attach the segment to every filtered pair, then collapse to one row per
# distinct (primary class, transaction count, segment) combination.
labelled_df = (
    filtered_df
    .merge(partition_df, on='PRIMARY_ID', how='left')
    .loc[:, ['PRIMARY_ID', 'PRIMARY_TRANSACTIONS', 'segment']]
    .drop_duplicates()
)
labelled_df.sort_values('segment')

Unnamed: 0,PRIMARY_ID,PRIMARY_TRANSACTIONS,segment
3,500 SKYLIGHT ACCESSORIES,670,0
0,500 ROOF WINDOWS,1090,1


New skylight framing or structural retrofit:
Segment 0 – Roof cutout prep and reinforcement

Roof recladding or cut-in during roof works:
Segment 1 – Mixed Site Prep, Sheathing, and Noise

Full-frame integration for high-quality install:
Segment 2 – Fibre Cement Integration & Segment 3 – Connector Framing & Timber

Internal ceiling refit or thermal efficiency upgrade:
Segment 4 – Insulation & Finishing Support

In [453]:
# Write the class → segment table to the working directory, without the index column.
labelled_df.to_csv('Labelled_class_segment.csv', index=False)