# Prep

In [75]:
# Import packages
import pandas as pd
import plotly.graph_objects as go
import networkx as nx
from ds_common_utils.aux.io.snowflake_tools import SnowflakeTools
from math import log
from networkx.algorithms.community import greedy_modularity_communities
import matplotlib.pyplot as plt
import community.community_louvain as community_louvain
import numpy as np

In [542]:
# Get connection
con = SnowflakeTools().get_snowflake_ctx(
     method = 'token',
    user = '250807',
    role = 'INSIGHT_ANALYST_MERCH_DE_GENERAL_PRD',
    warehouse = 'INSIGHT_ANALYST_WH'
)
with open('sql/item_association.sql', 'r') as query:
        df = pd.read_sql_query(query.read(), con)
with open('sql/excluded_class_list.sql', 'r') as query:
        target_item_class = pd.read_sql_query(query.read(), con)

# Parameter tuning

In [632]:
TRX_THRESHOLD = 10 # Min N frequent pairs
QUANTILE = 0.92 # Nth percentile of Jaccard similarity
TOP_N = 5 # top-N associated item classes per primary class
RESO = 1 # >1.0 → more, smaller communities

# Working

In [633]:
# filtered_df = df[
#     ~(
#         df['PRIMARY_ID'].isin(target_item_class['ITEM_CLASS_NAME']) |
#         df['SECONDARY_ID'].isin(target_item_class['ITEM_CLASS_NAME'])
#     )
# ]

filtered_df = df[(df['PRIMARY_TRANSACTIONS'] > 1 ) & ( df['SECONDARY_TRANSACTIONS'] > 1)]

filtered_df = filtered_df[filtered_df['PAIR_TRANSACTIONS'] >= TRX_THRESHOLD]
threshold = filtered_df['JACCARD'].quantile(QUANTILE)
filtered_df = filtered_df[filtered_df['JACCARD'] >= threshold]
top_edges = filtered_df.sort_values('JACCARD', ascending=False).groupby('PRIMARY_ID').head(TOP_N)

In [634]:
filtered_df.sort_values(by='PAIR_TRANSACTIONS', ascending=False)

Unnamed: 0,PRIMARY_ID,SECONDARY_ID,PRIMARY_TRANSACTIONS,SECONDARY_TRANSACTIONS,TRX_START_MONTH,TRX_END_MONTH,PAIR_TRANSACTIONS,LIFT,JACCARD
863,500 ROOF WINDOWS,500 STRUCT FRAMG H1 2 KD LINKD,413,12,2024-08-01,2025-05-01,120,13.7046,0.3934426
992,500 STRUCT FRAMG H1 2 KD LINKD,500 ROOF WINDOWS,12,413,2024-08-01,2025-05-01,120,9.418886,0.3934426
240,500 CONNECTORS,500 ROOF WINDOWS,7,413,2024-10-01,2025-05-01,84,7.234867,0.25
939,500 SKYLIGHT ACCESSORIES,500 STRUCT FRAMG H1 2 KD LINKD,269,12,2024-07-01,2025-05-01,84,14.728625,0.4263959
826,500 ROOF WINDOWS,500 CONNECTORS,413,7,2024-10-01,2025-05-01,84,16.445521,0.25
994,500 STRUCT FRAMG H1 2 KD LINKD,500 SKYLIGHT ACCESSORIES,12,269,2024-07-01,2025-05-01,84,10.122677,0.4263959
381,500 FIBRE CEMENT SHEETING ACC,500 ROOF WINDOWS,7,413,2024-08-01,2025-05-01,72,7.297129,0.2068966
836,500 ROOF WINDOWS,500 FIBRE CEMENT SHEETING ACC,413,7,2024-08-01,2025-05-01,72,14.09616,0.2068966
905,500 SKYLIGHT ACCESSORIES,500 BUILDING SUPPLIES OFFRANGE,269,8,2024-07-01,2025-05-01,60,15.780669,0.2764977
242,500 CONNECTORS,500 SKYLIGHT ACCESSORIES,7,269,2024-10-01,2025-05-01,60,7.934148,0.2777778


In [670]:
mode='lines',
line_shape='spline'
    
# Build node size map from your DataFrame
node_size_map = dict(zip(top_edges['PRIMARY_ID'], top_edges['PRIMARY_TRANSACTIONS']))

# Convert values to NumPy array
sizes = np.array(list(node_size_map.values()))

# Use np.ptp instead of .ptp()
norm_sizes = (sizes - sizes.min()) / np.ptp(sizes)  # scale to 0–1
scaled_sizes = 10 + norm_sizes * 30                 # scale to 10–40

# Final size map: Product ID → Scaled Size
scaled_size_map = dict(zip(node_size_map.keys(), scaled_sizes))

# Step 1: Build the full graph
G = nx.Graph()
for _, row in top_edges.iterrows():
    G.add_edge(row['PRIMARY_ID'], row['SECONDARY_ID'], weight=row['JACCARD'])

# Step 2: Louvain community detection
partition = community_louvain.best_partition(G, weight='JACCARD', resolution=RESO)

# Step 3: Layout
pos = nx.spring_layout(G, weight='weight', seed=42
                      )

# Step 4: Edge trace (no filter on weight)
edge_x, edge_y = [], []
for u, v in G.edges():
    x0, y0 = pos[u]
    x1, y1 = pos[v]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=1, color='gray'),
    hoverinfo='none',
    mode='lines'
)

# Step 5: Node trace with Louvain cluster coloring
color_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
              '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
              '#bcbd22', '#17becf']  # Extend if you expect >10 clusters

node_x, node_y, node_color, node_size, hover_text = [], [], [], [], []

for node in G.nodes():
    x, y = pos[node]
    cluster = partition[node]
    size = scaled_size_map.get(node, 10)  # default to small if missing

    node_x.append(x)
    node_y.append(y)
    node_color.append(color_list[cluster % len(color_list)])
    node_size.append(size)
    hover_text.append(f"Product ID: {node}<br>Cluster: {cluster}<br>Transactions: {int(size)}")

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=[str(node) for node in G.nodes()],
    textposition='top center',
    marker=dict(
        color=node_color,
        size=node_size,  # <--- size now dynamic!
        line=dict(width=2, color='black')
    ),
    textfont=dict(
    size=16,          # increase for visibility
    color='black',
    family='Arial'
)
)
# Step 6: Plot
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        width=1200,          # increase width
    height=800, 
        title=dict(text='Louvain Product Clustering (All Edges)', x=0.5, font=dict(size=16)),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=20, r=20, t=40),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
    )
)

# Results

### PRE-INSTALLATION (Segment 2: Roof Framing & Structural Integration)
**Item Classes:**
- 500 BUILDING BRACKETS  
- 500 TIMBER WEATHERBOARDS LINKD  
- 500 STRUCT FRAMG H1 2 KD LINKD  
- 500 LAMINATED TIMBER  
- 500 WEATHERBOARDS  
- 500 GLASSWOOL INSULATION BATTS  
- 500 COLLATED SCREWS  
- 500 NON STRUCT TRTD PINE GAUGE  
- 500 WEDGES AND SPACERS

**Interpretation:**  
Materials for reinforcing roof structure, re-cladding, and thermally insulating the skylight area. These support compliance, durability, and weather-tightness.

### INSTALLATION (Segment 0: Skylight Core + Install Essentials)
**Item Classes:**
- 500 BUILDING WRAP  
- 500 CONNECTORS  
- 500 FIBRE CEMENT SHEETING ACC  
- 500 SKYLIGHT ACCESSORIES  
- 500 ROOF WINDOWS

**Interpretation:**  
This segment includes the central components required for skylight installation: the roof window unit itself, accessory kits, moisture wrap, and structural fixings.

Noted: Supplier VELUX NEW ZEALAND LTD sells products in ROOF WINDOWS and SKYLIGHT ACCESSORIES

### POST-INSTALLATION (Segment 1: Shaft Build & Interior Finishing)
**Item Classes:**
- 500 BUILDING SUPPLIES OFFRANGE  ???
- 500 COLLATED NAILS  
- 500 PAINT ADHESIVES  
- 500 PLYWOOD SHEETS

**Interpretation:**  
General building materials used to line the skylight shaft or interior roof space allowing fast installation and clean finishes.

In [671]:
fig.show()

In [672]:
partition_df = pd.DataFrame.from_dict(partition, orient='index', columns=['segment']).reset_index()
partition_df.rename(columns={'index': 'PRIMARY_ID'}, inplace=True)
labelled_df = filtered_df.merge(partition_df, on='PRIMARY_ID', how='left')
labelled_df = labelled_df[['PRIMARY_ID','PRIMARY_TRANSACTIONS', 'segment']].drop_duplicates()
labelled_df.sort_values(by='segment')

Unnamed: 0,PRIMARY_ID,PRIMARY_TRANSACTIONS,segment
0,500 BUILDING BRACKETS,3,0
36,500 TIMBER WEATHERBOARDS LINKD,2,0
26,500 STRUCT FRAMG H1 2 KD LINKD,12,0
13,500 NON STRUCT TRTD PINE GAUGE,3,0
37,500 WEATHERBOARDS,3,0
11,500 GLASSWOOL INSULATION BATTS,2,0
12,500 LAMINATED TIMBER,3,0
6,500 COLLATED SCREWS,2,0
38,500 WEDGES AND SPACERS,2,0
5,500 COLLATED NAILS,5,1


In [673]:
labelled_df.to_csv('Labelled_class_segment.csv',index=False)

In [674]:
all_products = pd.concat([
    top_edges[['PRIMARY_ID', 'PRIMARY_TRANSACTIONS']].rename(columns={'PRIMARY_ID': 'Product_ID', 'PRIMARY_TRANSACTIONS': 'Transactions'}),
    top_edges[['SECONDARY_ID', 'SECONDARY_TRANSACTIONS']].rename(columns={'SECONDARY_ID': 'Product_ID', 'SECONDARY_TRANSACTIONS': 'Transactions'})
])

all_products['segment'] = all_products['Product_ID'].map(partition)
all_products.sort_values(by='segment')

Unnamed: 0,Product_ID,Transactions,segment
76,500 BUILDING BRACKETS,3,0
1072,500 STRUCT FRAMG H1 2 KD LINKD,12,0
998,500 TIMBER WEATHERBOARDS LINKD,2,0
963,500 COLLATED SCREWS,2,0
1001,500 WEDGES AND SPACERS,2,0
...,...,...,...
383,500 FIBRE CEMENT SHEETING ACC,7,2
836,500 ROOF WINDOWS,413,2
381,500 FIBRE CEMENT SHEETING ACC,7,2
939,500 SKYLIGHT ACCESSORIES,269,2
