# Prep

In [1]:
# Import packages
import pandas as pd
import plotly.graph_objects as go
import networkx as nx
from ds_common_utils.aux.io.snowflake_tools import SnowflakeTools
from math import log
from networkx.algorithms.community import greedy_modularity_communities
import matplotlib.pyplot as plt
import community.community_louvain as community_louvain
import numpy as np

In [2]:
# Get connection
con = SnowflakeTools().get_snowflake_ctx(
     method = 'token',
    user = '250807',
    role = 'INSIGHT_ANALYST_MERCH_DE_GENERAL_PRD',
    warehouse = 'INSIGHT_ANALYST_WH'
)
with open('sql/item_association.sql', 'r') as query:
        df = pd.read_sql_query(query.read(), con)
with open('sql/excluded_class_list.sql', 'r') as query:
        target_item_class = pd.read_sql_query(query.read(), con)

Generating new token with 'INSIGHT_ANALYST_MERCH_DE_GENERAL_PRD' role...
  password = getpass("Enter Password: ", stream=sys.stderr)
Saved token with 'SESSION:ROLE-ANY' scope and '2025-06-19-04-04-40' expiry...


In [55]:
with open('sql/overview.sql', 'r') as query:
        supplier_items = pd.read_sql_query(query.read(), con)
supplier_items        

# Parameter tuning

In [48]:
TRX_THRESHOLD = 10 # Min N frequent pairs
QUANTILE = 0.9 # Nth percentile of metric similarity
TOP_N = 10 # top-N associated item classes per primary class
RESO = 1 # >1.0 → more, smaller communities
METRIC = 'JACCARD' # LIFT or JACCARD

2Y
TRX_THRESHOLD = 10 # Min N frequent pairs
QUANTILE = 0.95 # Nth percentile of Jaccard similarity
TOP_N = 20 # top-N associated item classes per primary class
RESO = 0.1 # >1.0 → more, smaller communities
METRIC = 'JACCARD' # LIFT or JACCARD

# Working

In [49]:
filtered_df = df[
    ~(
        df['PRIMARY_ID'].isin(target_item_class['ITEM_CLASS_NAME']) |
        df['SECONDARY_ID'].isin(target_item_class['ITEM_CLASS_NAME'])
    )
]

filtered_df = filtered_df[filtered_df['PAIR_TRANSACTIONS'] >= TRX_THRESHOLD]
threshold = filtered_df[METRIC].quantile(QUANTILE)
filtered_df = filtered_df[filtered_df[METRIC] >= threshold]
top_edges = filtered_df.sort_values(METRIC, ascending=False).groupby('PRIMARY_ID').head(TOP_N)

In [50]:
filtered_df.sort_values(by='PAIR_TRANSACTIONS', ascending=False)

Unnamed: 0,PRIMARY_ID,SECONDARY_ID,PRIMARY_TRANSACTIONS,SECONDARY_TRANSACTIONS,TRX_START_MONTH,TRX_END_MONTH,PAIR_TRANSACTIONS,LIFT,JACCARD
408,500 CONNECTORS,500 BUILDING SUPPLIES OFFRANGE,15,39,2025-03-01,2025-05-01,48,45.538462,8.0
189,500 BUILDING WRAP,500 BUILDING SUPPLIES OFFRANGE,14,39,2025-03-01,2025-05-01,48,31.912088,9.6
720,500 FIBRE CEMENT SHEETING ACC,500 BUILDING SUPPLIES OFFRANGE,11,39,2025-03-01,2025-05-01,48,45.426573,24.0
153,500 BUILDING SUPPLIES OFFRANGE,500 BUILDING WRAP,39,14,2025-03-01,2025-05-01,48,70.241758,9.6
155,500 BUILDING SUPPLIES OFFRANGE,500 CONNECTORS,39,15,2025-03-01,2025-05-01,48,65.558974,8.0
...,...,...,...,...,...,...,...,...,...
1343,500 PAINT ADHESIVES,500 SAFETY ANTI SLIP,9,1,2022-10-01,2022-10-01,8,321.777778,4.0
1344,500 PAINT ADHESIVES,500 SHEETING NAILS,9,3,2022-08-01,2022-08-01,8,107.259259,2.0
1345,500 PAINT ADHESIVES,500 SHRUB GENERAL,9,1,2022-10-01,2022-10-01,8,321.777778,4.0
1353,500 PAINT ADHESIVES,500 VACUUM CLEANING ACCESS,9,1,2022-10-01,2022-10-01,8,321.777778,4.0


In [51]:
mode='lines',
line_shape='spline'
    
# Build node size map from your DataFrame
node_size_map = dict(zip(top_edges['PRIMARY_ID'], top_edges['PRIMARY_TRANSACTIONS']))

# Convert values to NumPy array
sizes = np.array(list(node_size_map.values()))

# Use np.ptp instead of .ptp()
norm_sizes = (sizes - sizes.min()) / np.ptp(sizes)  # scale to 0–1
scaled_sizes = 10 + norm_sizes * 30                 # scale to 10–40

# Final size map: Product ID → Scaled Size
scaled_size_map = dict(zip(node_size_map.keys(), scaled_sizes))

# Step 1: Build the full graph
G = nx.Graph()
for _, row in top_edges.iterrows():
    G.add_edge(row['PRIMARY_ID'], row['SECONDARY_ID'], weight=row[METRIC])

# Step 2: Louvain community detection
partition = community_louvain.best_partition(G, weight=METRIC, resolution=RESO)

# Step 3: Layout
pos = nx.spring_layout(G, weight='weight', seed=42
                      )

# Step 4: Edge trace (no filter on weight)
edge_x, edge_y = [], []
for u, v in G.edges():
    x0, y0 = pos[u]
    x1, y1 = pos[v]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=1, color='gray'),
    hoverinfo='none',
    mode='lines'
)

# Step 5: Node trace with Louvain cluster coloring
color_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
              '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
              '#bcbd22', '#17becf']  # Extend if you expect >10 clusters

node_x, node_y, node_color, node_size, hover_text = [], [], [], [], []

for node in G.nodes():
    x, y = pos[node]
    cluster = partition[node]
    size = scaled_size_map.get(node, 10)  # default to small if missing

    node_x.append(x)
    node_y.append(y)
    node_color.append(color_list[cluster % len(color_list)])
    node_size.append(size)
    hover_text.append(f"Product ID: {node}<br>Cluster: {cluster}<br>Transactions: {int(size)}")

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=[str(node) for node in G.nodes()],
    textposition='top center',
    marker=dict(
        color=node_color,
        size=node_size,  # <--- size now dynamic!
        line=dict(width=2, color='black')
    ),
    textfont=dict(
    size=16,          # increase for visibility
    color='black',
    family='Arial'
)
)
# Step 6: Plot
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        width=1200,          # increase width
    height=800, 
        title=dict(text='Louvain Product Clustering (All Edges)', x=0.5, font=dict(size=16)),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=20, r=20, t=40),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
    )
)

# Results

In [52]:
fig.show()

In [53]:
partition_df = pd.DataFrame.from_dict(partition, orient='index', columns=['segment']).reset_index()
partition_df.rename(columns={'index': 'PRIMARY_ID'}, inplace=True)
labelled_df = filtered_df.merge(partition_df, on='PRIMARY_ID', how='left')
labelled_df = labelled_df[['PRIMARY_ID','PRIMARY_TRANSACTIONS', 'segment']].drop_duplicates()
labelled_df.sort_values(by='segment')

Unnamed: 0,PRIMARY_ID,PRIMARY_TRANSACTIONS,segment
195,500 PLSTR FILLERS N FINISHING,1,0
126,500 MDF BOARDS,1,0
186,500 PINE MOULDINGS TREATED EA,2,0
189,500 PINE MOULDINGS UT CTL,1,0
116,500 INSULATION BLANKET,1,0
...,...,...,...
219,500 SHRUB GENERAL,1,4
211,500 SAFETY ANTI SLIP,1,4
302,503 TIMBER LATTICE,1,4
284,501 POOL AND SPA CHEMICALS,1,4


In [54]:
labelled_df.to_csv('Labelled_class_segment.csv',index=False)