In [None]:
pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


In [None]:
# ============================================================
# Final LSD Vertex Approach – Amazon Dataset
# Structurally identical to the Twitch PT version
# ============================================================

import math
import json
import torch
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
from torch_geometric.datasets import Amazon
from sklearn.preprocessing import MinMaxScaler
from collections import Counter, defaultdict
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ============================================================
# CONFIGURATION
# ============================================================
# Stand-in probability for values missing from a distribution, so math.log() in the WKL sums stays defined.
_EPS = 1e-12
# True: score the reference group R against all graph nodes; False: against the prototype's own ranking.
COMPARE_R_TO_GLOBAL = True
# Hop cutoff used by the BFS that ranks nodes around a prototype.
MAX_RADIUS = 5
# Number of highest-degree nodes tried as prototypes.
N_PROTOTYPES = 50
# NOTE(review): not referenced anywhere in this cell — confirm whether a redundancy filter was meant to use it.
JACCARD_THRESHOLD = 0.6
# Maximum number of subgroups kept per feature.
TOP_K = 100

# Run banner.
print("==============================================================")
print("        Final LSD Vertex Approach – Amazon Dataset             ")
print("==============================================================\n")

# ============================================================
# [1] Load dataset
# ============================================================
print("[1] Loading dataset...")

# Label only: the path below is hard-coded, so changing this name alone does not switch datasets.
dataset_name = "Computers"  # or 'Photo'
# SECURITY: weights_only=False allows torch.load to unpickle arbitrary Python objects — load trusted files only.
data = torch.load('data/Amazon/Computers_subgraph.pt', weights_only=False)

# Quick size summary of the loaded object.
print(f"--> Nodes (V): {data.num_nodes}")
print(f"--> Edges (E): {data.num_edges}")
print(f"--> Feature dim: {data.num_features}\n")

# ============================================================
# [2] Build graph
# ============================================================
print("[2] Building graph...")
# Converted to NumPy; it is transposed below so that each row is one (u, v) pair.
edge_index = data.edge_index.numpy()
# Undirected simple graph: one edge per unordered node pair.
G = nx.Graph()
# Register every node id up front so nodes without edges are still present.
G.add_nodes_from(range(data.num_nodes))
G.add_edges_from(edge_index.T)
print(f"Graph built: |V|={G.number_of_nodes()}, |E|={G.number_of_edges()}\n")

# ============================================================
# [3] Generate synthetic Amazon-style attributes
# ============================================================
print("[3] Creating synthetic Amazon-style node attributes...")

# Degree of every node, in G.nodes() iteration order.
deg = np.array([G.degree(n) for n in G.nodes()])
# Per-node norm of the feature row (torch.Tensor.norm over dim=1).
feat_norm = data.x.norm(dim=1).numpy()
# Rescale both columns to [0, 1]: column 0 is degree, column 1 is the feature norm.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(np.vstack([deg, feat_norm]).T)
popularity, richness = scaled[:, 0], scaled[:, 1]

# 33% / 66% cut points; the "price" cuts come from the degree-based score and the
# "review" cuts from the feature-norm score.
price_q1, price_q2 = np.quantile(popularity, [0.33, 0.66])
review_q1, review_q2 = np.quantile(richness, [0.33, 0.66])

def price_tier(v):
    """Return the price-tier label for a scaled popularity score.

    Values below the module-level ``price_q1`` are "budget", values below
    ``price_q2`` are "midrange", and everything else is "premium".
    """
    if v < price_q1:
        return "budget"
    return "midrange" if v < price_q2 else "premium"

def review_band(v):
    """Return the review-band label for a scaled richness score.

    Values below the module-level ``review_q1`` are "low", values below
    ``review_q2`` are "medium", and everything else is "high".
    """
    if v < review_q1:
        return "low"
    return "medium" if v < review_q2 else "high"

# Combined raw score and its median are computed once; previously np.median() was
# re-evaluated over the whole array for every single node (quadratic work).
combined_score = deg + feat_norm
featured_cutoff = np.median(combined_score)

# One row per node with the four synthetic attributes.
targets = pd.DataFrame({
    "id": np.arange(data.num_nodes),
    "price_tier": [price_tier(v) for v in popularity],
    "review_band": [review_band(v) for v in richness],
    # Alternating flag: even node ids are 1, odd node ids are 0.
    "electronics": [int(i % 2 == 0) for i in range(data.num_nodes)],
    # 1 when the node's combined score is strictly above the median.
    "featured": [int(score > featured_cutoff) for score in combined_score],
})

# Copy the attributes onto the graph nodes; itertuples avoids building a Series per row.
for row in targets.itertuples(index=False):
    G.nodes[row.id].update({
        "PriceTier": row.price_tier,
        "ReviewBand": row.review_band,
        "Electronics": row.electronics,
        "Featured": row.featured,
    })

print("--> Attributes attached to graph nodes.\n")

# ============================================================
# [4] Quality metrics (WKL divergence)
# ============================================================
def _dist(vals):
    n = len(vals)
    if n == 0:
        return {}
    c = Counter(vals)
    return {k: v / n for k, v in c.items()}

def wkl_quality_nodes(S_nodes, R_nodes, binary_attrs, nominal_attrs):
    """Size-weighted KL-style quality of node set S against reference set R.

    For every attribute, the value distributions of S and R are compared with
    sum(ps * log(ps / pr)); missing values use ``_EPS`` as their probability.
    Binary attributes are evaluated over the outcomes 0 and 1, nominal ones
    over the union of observed values. The summed divergence is scaled by
    ``len(S_nodes) / len(R_nodes)``. Returns 0.0 when either set is empty.
    Attributes are read from the module-level graph ``G``; nodes lacking an
    attribute are skipped for that attribute.
    """
    size_s, size_r = len(S_nodes), len(R_nodes)
    if not (size_s and size_r):
        return 0.0

    def attr_distribution(nodes, attr):
        # Distribution over the nodes that actually carry this attribute.
        return _dist([G.nodes[u][attr] for u in nodes if attr in G.nodes[u]])

    total = 0.0
    for attr in binary_attrs:
        p_s = attr_distribution(S_nodes, attr)
        p_r = attr_distribution(R_nodes, attr)
        for outcome in (0, 1):
            ps = p_s.get(outcome, _EPS)
            pr = p_r.get(outcome, _EPS)
            total += ps * math.log(ps / pr)
    for attr in nominal_attrs:
        p_s = attr_distribution(S_nodes, attr)
        p_r = attr_distribution(R_nodes, attr)
        for outcome in set(p_s) | set(p_r):
            ps = p_s.get(outcome, _EPS)
            pr = p_r.get(outcome, _EPS)
            total += ps * math.log(ps / pr)
    return (size_s / size_r) * total

# ============================================================
# [5] Search utilities
# ============================================================
def rank_nodes_by_distance(G, source):
    """Return nodes within ``MAX_RADIUS`` hops of ``source``, nearest first.

    The sort is stable, so nodes at equal distance keep the order in which
    the shortest-path search reported them.
    """
    hop_count = nx.single_source_shortest_path_length(G, source, cutoff=MAX_RADIUS)
    return sorted(hop_count, key=hop_count.get)

def find_best_q_node(G, proto, binary_attrs, nominal_attrs, all_nodes):
    """Run the two-stage prefix search around one prototype node.

    Stage 1 picks the prefix length ``rho`` (>= 2) of the distance ranking
    that maximises quality against the baseline (all nodes when
    ``COMPARE_R_TO_GLOBAL`` is set, otherwise the ranking itself).
    Stage 2 picks the prefix length ``sigma`` (< rho) of that reference group
    that maximises quality against the reference group.

    Returns None when fewer than two nodes are ranked; otherwise a dict with
    keys prototype, q, q_rg, rho, sigma, n_nodes and members.
    """
    ranking = rank_nodes_by_distance(G, proto)
    n_ranked = len(ranking)
    if n_ranked < 2:
        return None

    baseline = ranking if not COMPARE_R_TO_GLOBAL else all_nodes

    # Stage 1: best reference prefix; strict '>' keeps the earliest maximum.
    best_rho = 0
    best_q_rg = -float("inf")
    for rho in range(2, n_ranked + 1):
        score = wkl_quality_nodes(ranking[:rho], baseline, binary_attrs, nominal_attrs)
        if score > best_q_rg:
            best_rho, best_q_rg = rho, score

    # Stage 2: best subgroup prefix inside the chosen reference group.
    reference = ranking[:best_rho]
    best_sigma = 0
    best_q_sr = -float("inf")
    best_members = None
    for sigma in range(1, best_rho):
        candidate = reference[:sigma]
        score = wkl_quality_nodes(candidate, reference, binary_attrs, nominal_attrs)
        if score > best_q_sr:
            best_sigma, best_q_sr, best_members = sigma, score, candidate

    result = {}
    result["prototype"] = proto
    result["q"] = best_q_sr
    result["q_rg"] = best_q_rg
    result["rho"] = best_rho
    result["sigma"] = best_sigma
    result["n_nodes"] = n_ranked
    result["members"] = best_members or []
    return result

# ============================================================
# [6] Top-k subgroup search
# ============================================================
def find_top_k_subgroups(G, attrs, max_radius=5, top_k=100):
    """Search subgroups around the highest-degree prototypes and keep the best ``top_k``.

    ``attrs`` is split into binary ("Electronics", "Featured") and nominal
    ("PriceTier", "ReviewBand") attributes; every result row is labelled with
    ``attrs[0]``. Rows are sorted by ``q`` in descending order.

    NOTE(review): ``max_radius`` is accepted but never forwarded; the search
    radius comes from the module-level ``MAX_RADIUS`` inside
    ``rank_nodes_by_distance``.
    """
    all_nodes = list(G.nodes())
    # Highest degree first; ties keep G.nodes() order.
    by_degree = sorted(all_nodes, key=lambda u: G.degree(u), reverse=True)

    # The attribute split does not depend on the prototype, so build it once.
    binary_attrs = [a for a in attrs if a in ["Electronics", "Featured"]]
    nominal_attrs = [a for a in attrs if a in ["PriceTier", "ReviewBand"]]

    rows = []
    for proto in by_degree[:N_PROTOTYPES]:
        res = find_best_q_node(G, proto, binary_attrs, nominal_attrs, all_nodes)
        if not res:
            continue
        res["feature"] = attrs[0]
        # Replace the member list with a comma-separated string column.
        res["Members"] = ",".join(str(member) for member in res.pop("members"))
        rows.append(res)

    frame = pd.DataFrame(rows)
    if frame.empty:
        return pd.DataFrame(columns=["prototype", "feature", "q", "q_rg", "rho", "sigma", "n_nodes", "Members"])
    return frame.sort_values("q", ascending=False).head(top_k)

# ============================================================
# [7] Run subgroup search per feature
# ============================================================
# Features searched one at a time; each run uses a single-attribute list.
attrs_list = ['Electronics', 'Featured', 'PriceTier', 'ReviewBand']

for FEATURE in attrs_list:
    attrs = [FEATURE]
    print(f"Running top-k subgroup search using single feature: {FEATURE}")
    top_subgroups_single = find_top_k_subgroups(G, attrs, max_radius=MAX_RADIUS, top_k=TOP_K)
    # display() is the notebook's rich renderer; it is not defined in a plain Python script.
    display(top_subgroups_single.head())
    # NOTE(review): the file name hard-codes "top100" regardless of TOP_K.
    top_subgroups_single.to_csv(f'amazon_top100_subgroups_{FEATURE}.csv', index=False)

# ============================================================
# [8] Feature-level Jaccard Distance computation
# ============================================================
print("\n[8] Computing feature-level Jaccard distances (using node attributes)...\n")

num_nodes = len(G.nodes())
all_nodes_set = set(G.nodes())

# Map every (feature, value-as-string) pair to the set of nodes carrying it.
feature_to_nodes = defaultdict(set)
for node in G.nodes():
    for feature in ['Electronics', 'Featured', 'PriceTier', 'ReviewBand']:
        val = G.nodes[node].get(feature, None)
        if val is None:
            continue
        feature_to_nodes[(feature, str(val))].add(node)

# Jaccard distance between each feature-value node set and the full node set.
jaccard_records = []
for (feat, val), nodeset in feature_to_nodes.items():
    # Ignore very small groups.
    if len(nodeset) < 5:
        continue
    jaccard_sim = len(nodeset & all_nodes_set) / len(nodeset | all_nodes_set)
    jaccard_dist = 1 - jaccard_sim
    jaccard_records.append({
        "Feature": feat,
        "Value": val,
        "Size": len(nodeset),
        "Jaccard Distance": jaccard_dist,
    })

# Explicit columns keep the frame well-formed when no record qualified; without
# them sort_values() raised KeyError and the "No valid ..." branch was unreachable.
jaccard_df = pd.DataFrame(
    jaccard_records,
    columns=["Feature", "Value", "Size", "Jaccard Distance"],
)
jaccard_df = jaccard_df.sort_values("Jaccard Distance", ascending=False)
jaccard_df.to_csv("amazon_featurewise_jaccard.csv", index=False)

print("=== Feature-wise Jaccard Distances ===")
if not jaccard_df.empty:
    print(jaccard_df.head(10).to_string(index=False))
else:
    print("No valid feature-value pairs found.")
print("\nSaved amazon_featurewise_jaccard.csv")
print("==============================================================")


        Final LSD Vertex Approach – Amazon Dataset             

[1] Loading dataset...
--> Nodes (V): 1900
--> Edges (E): 31864
--> Feature dim: 767

[2] Building graph...
Graph built: |V|=1900, |E|=31864

[3] Creating synthetic Amazon-style node attributes...
--> Attributes attached to graph nodes.

Running top-k subgroup search using single feature: Electronics


Unnamed: 0,prototype,q,q_rg,rho,sigma,n_nodes,feature,Members
37,590,0.298627,0.000766,6,1,1900,Electronics,590
31,397,0.277987,0.001043,7,1,1900,Electronics,397
9,1234,0.25993,0.001332,8,1,1900,Electronics,1234
43,1093,0.116734,0.001318,25,5,1900,Electronics,1093892735
29,1196,0.102035,0.002032,17,1,1900,Electronics,1196


Running top-k subgroup search using single feature: Featured


Unnamed: 0,prototype,q,q_rg,rho,sigma,n_nodes,feature,Members
25,27,0.016583,0.060159,760,211,1900,Featured,"27,9,23,43,86,119,136,152,160,163,169,186,248,..."
29,1196,0.014771,0.03707,667,223,1900,Featured,"1196,1,3,7,33,39,40,51,63,75,77,93,97,114,129,..."
48,1387,0.013004,0.091923,701,134,1900,Featured,"1387,22,27,40,43,46,74,83,93,94,117,126,147,15..."
41,260,0.012501,0.062529,815,273,1900,Featured,"260,21,41,46,50,60,73,100,108,118,150,152,160,..."
46,1178,0.012083,0.061351,599,339,1900,Featured,"1178,16,23,27,35,86,100,102,108,119,130,135,14..."


Running top-k subgroup search using single feature: PriceTier


Unnamed: 0,prototype,q,q_rg,rho,sigma,n_nodes,feature,Members
34,1678,0.034106,0.095377,598,240,1900,PriceTier,"1678,9,13,24,27,40,43,47,72,90,93,119,123,135,..."
49,67,0.031941,0.078923,480,245,1900,PriceTier,"67,21,23,24,35,39,40,47,63,93,130,139,156,163,..."
29,1196,0.030547,0.048657,667,153,1900,PriceTier,"1196,1,3,7,33,39,40,51,63,75,77,93,97,114,129,..."
35,1526,0.025743,0.067294,707,299,1900,PriceTier,"1526,0,13,21,22,41,46,50,62,72,73,74,90,94,99,..."
22,1412,0.024935,0.080475,866,240,1900,PriceTier,"1412,23,27,43,86,100,119,150,151,152,157,159,1..."


Running top-k subgroup search using single feature: ReviewBand


Unnamed: 0,prototype,q,q_rg,rho,sigma,n_nodes,feature,Members
0,1282,0.071335,0.00251,10,2,1900,ReviewBand,12820
13,1325,0.019903,0.003858,219,95,1900,ReviewBand,"1325,9,33,35,69,85,86,89,93,100,104,110,115,11..."
28,1823,0.016022,0.004485,426,180,1900,ReviewBand,"1823,9,24,27,33,35,52,72,81,93,104,115,126,136..."
24,604,0.014626,0.008056,273,32,1900,ReviewBand,"604,5,10,25,31,32,47,67,86,104,107,119,120,139..."
27,834,0.014325,0.008278,256,80,1900,ReviewBand,"834,0,21,22,46,50,60,73,74,83,94,102,150,167,1..."



[8] Computing feature-level Jaccard distances (using node attributes)...

=== Feature-wise Jaccard Distances ===
    Feature    Value  Size  Jaccard Distance
  PriceTier   budget   612          0.677895
 ReviewBand   medium   627          0.670000
 ReviewBand      low   627          0.670000
  PriceTier midrange   633          0.666842
 ReviewBand     high   646          0.660000
  PriceTier  premium   655          0.655263
   Featured        1   950          0.500000
Electronics        1   950          0.500000
   Featured        0   950          0.500000
Electronics        0   950          0.500000

Saved amazon_featurewise_jaccard.csv


In [None]:
# ==============================================================
# Final LSD Edges Approach – Twitch PT Dataset (Top-100 per feature)
# ==============================================================

import math
from collections import Counter, deque, defaultdict
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ============================================================
# CONFIGURATION
# ============================================================
# Stand-in probability for values missing from a distribution, so math.log() stays defined.
_EPS = 1e-12
# Upper bound on the number of prototypes returned by select_prototypes().
MAX_PROTOTYPES = 50
# NOTE(review): QUALITY_THRESHOLD, PATIENCE_K and N_RHO_LIMIT are not referenced in this cell.
QUALITY_THRESHOLD = -1.0
PATIENCE_K = 8
# True: score the reference group R against all graph edges; False: against the prototype's own out-edges.
COMPARE_R_TO_GLOBAL = True
N_RHO_LIMIT = None
TOP_K = 100  # top-k per feature

# Run banner.
print("==============================================================")
print("         Final LSD Edges Approach – Twitch PT Dataset          ")
print("==============================================================\n")

# ============================================================
# [1] Load dataset
# ============================================================
# Edge list ("from", "to") and per-node attribute table, read from the working directory.
edges = pd.read_csv("edges.csv")
targets = pd.read_csv("target.csv")

# Compare all ids as strings so edge endpoints and target keys match regardless of CSV dtype.
edges["from"] = edges["from"].astype(str)
edges["to"] = edges["to"].astype(str)
if "id" in targets.columns: targets["id"] = targets["id"].astype(str)
if "new_id" in targets.columns: targets["new_id"] = targets["new_id"].astype(str)

# Use whichever id column overlaps most with the node ids that appear in the edge list.
edge_nodes = set(edges["from"]).union(set(edges["to"]))
cand_cols = [c for c in ["id", "new_id"] if c in targets.columns]
if not cand_cols:
    raise RuntimeError("target.csv must contain 'id' or 'new_id' column.")
best_col = max(cand_cols, key=lambda c: len(edge_nodes & set(targets[c].astype(str))))
targets["_key"] = targets[best_col].astype(str)

# Required attribute columns; missing flags default to 0, non-numeric days/views become NaN.
for col in ["mature", "partner", "days", "views"]:
    if col not in targets.columns:
        raise RuntimeError(f"Missing column {col}")
targets["mature"] = targets["mature"].fillna(0).astype(int)
targets["partner"] = targets["partner"].fillna(0).astype(int)
targets["days"] = pd.to_numeric(targets["days"], errors="coerce")
targets["views"] = pd.to_numeric(targets["views"], errors="coerce")

# Row lookup by node key; for duplicate keys the last row wins.
t_by_id = {r["_key"]: r for _, r in targets.iterrows()}

# ============================================================
# [2] Build directed graph
# ============================================================
# Directed graph; edges and their derived attributes are added further down.
G = nx.DiGraph()
# 33% / 66% cut points used to band views and account age (days).
views_q1, views_q2 = targets["views"].quantile([0.33, 0.66])
days_q1, days_q2 = targets["days"].quantile([0.33, 0.66])
# Threshold for the HighActivity flag.
mean_days = float(targets["days"].mean())

def _band(v, q1, q2, labels=("low","medium","high")):
    if pd.isna(v): return "unknown"
    if v < q1: return labels[0]
    elif v < q2: return labels[1]
    return labels[2]

def _age(v):
    """Bucket an account age in days into "young", "mid" or "old".

    Cut points are the module-level ``days_q1`` and ``days_q2``; missing
    values map to "unknown".
    """
    if pd.isna(v):
        return "unknown"
    if v < days_q1:
        return "young"
    return "mid" if v < days_q2 else "old"

def _get(nid):
    """Return the target row stored for ``nid`` in ``t_by_id``, or None when the id is unknown."""
    return t_by_id.get(nid)

def derive_edge_targets(i, j):
    """Build the attribute dict for edge (i, j) from the two endpoint rows.

    Every attribute is emitted twice, once for the source (``_src``) and once
    for the destination (``_dst``). Endpoints without a target row get 0 for
    the binary flags and "unknown" for the banded attributes.
    """
    src_row, dst_row = _get(i), _get(j)

    def as_float(row, key, default=np.nan):
        # Numeric field, or the default when the row or the value is missing.
        if row is None or pd.isna(row[key]):
            return default
        return float(row[key])

    def as_flag(row, key):
        # Integer flag, 0 when the row is missing.
        return 0 if row is None else int(row[key])

    def high_activity(days):
        # 1 when days exceeds the mean; a missing value counts as 0.
        return 0 if pd.isna(days) else int(days > mean_days)

    src_days = as_float(src_row, "days")
    dst_days = as_float(dst_row, "days")
    src_views = as_float(src_row, "views")
    dst_views = as_float(dst_row, "views")

    attributes = {}
    attributes["ExplicitLanguage_src"] = as_flag(src_row, "mature")
    attributes["ExplicitLanguage_dst"] = as_flag(dst_row, "mature")
    attributes["Partner_src"] = as_flag(src_row, "partner")
    attributes["Partner_dst"] = as_flag(dst_row, "partner")
    attributes["HighActivity_src"] = high_activity(src_days)
    attributes["HighActivity_dst"] = high_activity(dst_days)
    attributes["ViewsBand_src"] = _band(src_views, views_q1, views_q2)
    attributes["ViewsBand_dst"] = _band(dst_views, views_q1, views_q2)
    attributes["AgeBand_src"] = _age(src_days)
    attributes["AgeBand_dst"] = _age(dst_days)
    return attributes

# Insert every edge together with its derived source/destination attributes.
for src, dst in zip(edges["from"], edges["to"]):
    edge_attrs = derive_edge_targets(src, dst)
    G.add_edge(src, dst, **edge_attrs)

print(f"Graph built: |V|={G.number_of_nodes():,}, |E|={G.number_of_edges():,}\n")

# ============================================================
# [3] Quality function
# ============================================================
def _dist(vals):
    n=len(vals)
    if n==0: return {}
    c=Counter(vals)
    return {k:v/n for k,v in c.items()}

def wkl_quality_edges(S,R,bin_attrs,nom_attrs):
    """Size-weighted KL-style quality of edge set S against reference set R.

    ``S`` and ``R`` are sequences of ``(u, v, attr_dict)`` triples. For each
    attribute the value distributions are compared with
    sum(ps * log(ps / pr)), using ``_EPS`` for values missing on either side.
    Binary attributes are evaluated over the outcomes 0 and 1, nominal ones
    over the union of observed values. The total is scaled by
    ``len(S) / len(R)``; 0.0 is returned when either set is empty.
    """
    n_sub, n_ref = len(S), len(R)
    if not n_sub or not n_ref:
        return 0.0

    def column_dist(edge_list, attr):
        # Distribution of one attribute over the edges' data dicts.
        return _dist([data[attr] for _, _, data in edge_list])

    total = 0.0
    for attr in bin_attrs:
        p_sub = column_dist(S, attr)
        p_ref = column_dist(R, attr)
        for outcome in (0, 1):
            ps = p_sub.get(outcome, _EPS)
            pr = p_ref.get(outcome, _EPS)
            total += ps * math.log(ps / pr)
    for attr in nom_attrs:
        p_sub = column_dist(S, attr)
        p_ref = column_dist(R, attr)
        for outcome in set(p_sub) | set(p_ref):
            ps = p_sub.get(outcome, _EPS)
            pr = p_ref.get(outcome, _EPS)
            total += ps * math.log(ps / pr)
    return (n_sub / n_ref) * total

def rank_out_edges(G,p):
    """Return the out-edges of ``p`` as ``(u, v, data)`` tuples, in the graph's own iteration order."""
    collected = []
    for src, dst, data in G.out_edges(p, data=True):
        collected.append((src, dst, data))
    return collected

# ============================================================
# [4] Modes
# ============================================================
# One search mode per feature. Each mode lists the edge attributes (source and
# destination variants) scored as binary (outcomes 0/1) or nominal (observed values).
MODES={
 "ExplicitLanguage":{"binary":["ExplicitLanguage_src","ExplicitLanguage_dst"],"nominal":[]},
 "Partner":{"binary":["Partner_src","Partner_dst"],"nominal":[]},
 "HighActivity":{"binary":["HighActivity_src","HighActivity_dst"],"nominal":[]},
 "ViewsBand":{"binary":[],"nominal":["ViewsBand_src","ViewsBand_dst"]},
 "AgeBand":{"binary":[],"nominal":["AgeBand_src","AgeBand_dst"]}
}

# ============================================================
# [5] Prototypes
# ============================================================
def select_prototypes(G):
    """Greedily pick up to ``MAX_PROTOTYPES`` high-degree, mutually non-adjacent nodes.

    Nodes are visited in descending degree of the undirected view of ``G``;
    picking a node excludes the node itself and all of its neighbours.

    Returns:
        list: the selected nodes in selection order (highest degree first).
        The result used to be built from a set, which made the order depend on
        string hashing and therefore vary between kernel sessions.
    """
    UG = G.to_undirected()
    degs = dict(UG.degree())
    selected, excluded = [], set()
    for node in sorted(degs, key=degs.get, reverse=True):
        if node in excluded:
            continue
        selected.append(node)
        excluded.add(node)
        excluded.update(UG.neighbors(node))
        if len(selected) >= MAX_PROTOTYPES:
            break
    return selected

# Prototype nodes whose out-edges are searched in the main loop.
protos=select_prototypes(G)
print(f"Selected {len(protos)} prototypes.\n")

# ============================================================
# [6] LSD core
# ============================================================
def find_best_q(G,p,bin_attrs,nom_attrs,global_edges):
    """Run the two-stage prefix search over the out-edges of prototype ``p``.

    Stage 1 picks the prefix length ``rho`` (>= 2) of the out-edge list that
    maximises quality against the baseline (``global_edges`` when
    ``COMPARE_R_TO_GLOBAL`` is set, otherwise the out-edge list itself).
    Stage 2 picks the prefix length ``sigma`` (< rho) of that reference group
    that maximises quality against the reference group.

    Returns:
        None when ``p`` has fewer than two out-edges; otherwise a dict with
        keys prototype, rho, sigma, q, q_rg and n_nodes (``n_nodes`` holds the
        number of out-edges considered).
    """
    ranked=rank_out_edges(G,p)
    if len(ranked)<2:
        return None
    base=global_edges if COMPARE_R_TO_GLOBAL else ranked

    # Stage 1. The winner is tracked in best_rho throughout; the original stored
    # it in a second name (q_rho) that was unbound if no candidate ever won.
    best_rho,best_q_rg=0,-float("inf")
    for rho in range(2,len(ranked)+1):
        q_rg=wkl_quality_edges(ranked[:rho],base,bin_attrs,nom_attrs)
        if q_rg>best_q_rg:
            best_q_rg,best_rho=q_rg,rho

    # Stage 2: best subgroup prefix inside the chosen reference group.
    R_best=ranked[:best_rho]
    best_sigma,best_q_sr=0,-float("inf")
    for sigma in range(1,best_rho):
        q_sr=wkl_quality_edges(R_best[:sigma],R_best,bin_attrs,nom_attrs)
        if q_sr>best_q_sr:
            best_q_sr,best_sigma=q_sr,sigma
    return {"prototype":p,"rho":best_rho,"sigma":best_sigma,"q":best_q_sr,"q_rg":best_q_rg,"n_nodes":len(ranked)}

# ============================================================
# [7] Main loop
# ============================================================
# Baseline for stage 1: every edge of the graph together with its attribute dict.
global_edges=[(u,v,d) for u,v,d in G.edges(data=True)]
records=[]
# One search per (prototype, feature mode); prototypes with fewer than two out-edges yield no row.
for proto in tqdm(protos,desc="Running LSD"):
    for feat_name,cfg in MODES.items():
        r=find_best_q(G,proto,cfg["binary"],cfg["nominal"],global_edges)
        if r:
            r["feature"]=feat_name
            records.append(r)

df=pd.DataFrame(records)
if df.empty: raise RuntimeError("No results.")

# ============================================================
# [8] Top-100 per feature
# ============================================================
# Keep the TOP_K best rows (by q) per feature and write one CSV per feature.
topk_all=[]
for feat in MODES.keys():
    topk=df[df["feature"]==feat].sort_values("q",ascending=False).head(TOP_K)
    if not topk.empty:
        topk_all.append(topk)
        topk.to_csv(f"twitchpt_top{TOP_K}_subgroups_{feat}.csv",index=False)
        print(f"Saved top {TOP_K} subgroups for {feat} → twitchpt_top{TOP_K}_subgroups_{feat}.csv")

# Combined export. The name and message now follow TOP_K like the per-feature
# files do (they hard-coded "100" before); with TOP_K = 100 the output is unchanged.
combined_path=f"twitchpt_top{TOP_K}_subgroups_all.csv"
df_top=pd.concat(topk_all)
df_top.to_csv(combined_path,index=False)
print(f"\nCombined top-{TOP_K} subgroups across all features saved → {combined_path}")

# ============================================================
# [9] Table 2 (unchanged)
# ============================================================
# Group edges by (attribute name, value-as-string).
feature_to_edges=defaultdict(set)
for u,v,d in G.edges(data=True):
    for fname,fval in d.items():
        feature_to_edges[(fname,str(fval))].add((u,v))
num_edges=len(G.edges())
# "Jaccard Distance" here is 1 - |group| / |E|; groups with fewer than 20 edges are dropped.
table2=[(f"{fn}={fv}",len(eds),1-(len(eds)/num_edges))
        for (fn,fv),eds in feature_to_edges.items() if len(eds)>=20]
table2_df=pd.DataFrame(table2,columns=["Feature","Size","Jaccard Distance"]).sort_values("Jaccard Distance",ascending=False)
table2_df.to_csv("twitchpt_edges_feature_jaccard_table.csv",index=False)

print("\nTop of Table 2:")
print(table2_df.head(10).to_string(index=False))
print("\nDone ✅")


         Final LSD Edges Approach – Twitch PT Dataset          

Graph built: |V|=1,912, |E|=31,299

Selected 50 prototypes.



Running LSD: 100%|██████████| 50/50 [02:18<00:00,  2.77s/it]


Saved top 100 subgroups for ExplicitLanguage → twitchpt_top100_subgroups_ExplicitLanguage.csv
Saved top 100 subgroups for Partner → twitchpt_top100_subgroups_Partner.csv
Saved top 100 subgroups for HighActivity → twitchpt_top100_subgroups_HighActivity.csv
Saved top 100 subgroups for ViewsBand → twitchpt_top100_subgroups_ViewsBand.csv
Saved top 100 subgroups for AgeBand → twitchpt_top100_subgroups_AgeBand.csv

Combined top-100 subgroups across all features saved → twitchpt_top100_subgroups_all.csv

Top of Table 2:
             Feature  Size  Jaccard Distance
   ViewsBand_src=low  3666          0.882872
   ViewsBand_dst=low  3920          0.874756
ViewsBand_dst=medium  6247          0.800409
ViewsBand_src=medium  6385          0.796000
   AgeBand_src=young  6852          0.781079
   AgeBand_dst=young  6856          0.780951
     AgeBand_dst=mid 10680          0.658775
     AgeBand_src=mid 11269          0.639957
  HighActivity_dst=0 11647          0.627879
  HighActivity_src=0 11947     

In [None]:
# ==============================================================
# Final LSD Vertex Approach – Twitch PT Dataset
# Unified, consistent with the Edge LSD logic
# Includes:
#   - Distance-based prototype selection (teacher feedback)
#   - LSD (KL divergence) quality evaluation on vertices
#   - Jaccard distance filtering (redundancy control)
#   - Table 2: Feature-specific vs Global Jaccard distances
# ==============================================================

import pandas as pd
import numpy as np
import networkx as nx
from collections import Counter, defaultdict, deque
from tqdm import tqdm
import math
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ============================================================
# CONFIGURATION
# ============================================================
# Stand-in probability for values missing from a distribution, so math.log() stays defined.
_EPS = 1e-12
# Upper bound on the number of prototypes returned by select_prototypes().
MAX_PROTOTYPES = 50
# NOTE(review): QUALITY_THRESHOLD and PATIENCE_K are not referenced in this cell.
QUALITY_THRESHOLD = -1.0
PATIENCE_K = 8
# True: score the reference group R against all graph nodes; False: against the prototype's neighbours.
COMPARE_R_TO_GLOBAL = True
# Optional cap on the reference-group size tried per prototype; None means no cap.
N_RHO_LIMIT = None
# Maximum number of rows kept per feature.
TOP_K = 100

# Run banner.
print("==============================================================")
print("          Final LSD Vertex Approach – Twitch PT Dataset         ")
print("==============================================================\n")

# ============================================================
# [1] Load dataset
# ============================================================
# Edge list ("from", "to") and per-node attribute table, read from the working directory.
edges = pd.read_csv("edges.csv")
targets = pd.read_csv("target.csv")

# Compare all ids as strings so edge endpoints and target keys match regardless of CSV dtype.
edges["from"] = edges["from"].astype(str)
edges["to"] = edges["to"].astype(str)
if "id" in targets.columns:
    targets["id"] = targets["id"].astype(str)
if "new_id" in targets.columns:
    targets["new_id"] = targets["new_id"].astype(str)

# Use whichever id column overlaps most with the node ids that appear in the edge list.
edge_nodes = set(edges["from"]).union(set(edges["to"]))
cand_cols = [c for c in ["id", "new_id"] if c in targets.columns]
# Fail with a clear message instead of the opaque ValueError that max() raises on
# an empty list (same check as in the edge-based cell).
if not cand_cols:
    raise RuntimeError("target.csv must contain 'id' or 'new_id' column.")
best_col = max(cand_cols, key=lambda c: len(edge_nodes & set(targets[c].astype(str))))
targets["_key"] = targets[best_col].astype(str)

# Required attribute columns; missing flags default to 0, non-numeric days/views become NaN.
for col in ["mature", "partner", "days", "views"]:
    if col not in targets.columns:
        raise RuntimeError(f"Missing column {col}")
targets["mature"] = targets["mature"].fillna(0).astype(int)
targets["partner"] = targets["partner"].fillna(0).astype(int)
targets["days"] = pd.to_numeric(targets["days"], errors="coerce")
targets["views"] = pd.to_numeric(targets["views"], errors="coerce")

# ============================================================
# [2] Build graph and enrich with node attributes
# ============================================================
# Undirected graph: nodes come from the target keys, edges from the edge list.
# Edge endpoints that are missing from the targets are added by add_edges_from without attributes.
G = nx.Graph()
G.add_nodes_from(targets["_key"].unique())
G.add_edges_from(edges[["from", "to"]].values.tolist())

# 33% / 66% cut points used to band views and account age (days).
views_q1, views_q2 = targets["views"].quantile([0.33, 0.66])
days_q1, days_q2 = targets["days"].quantile([0.33, 0.66])
# Threshold for the HighActivity flag.
mean_days = float(targets["days"].mean())

def _band(v, q1, q2, labels=("low","medium","high")):
    if pd.isna(v): return "unknown"
    if v < q1: return labels[0]
    elif v < q2: return labels[1]
    return labels[2]

def _age_band(d):
    """Bucket an account age in days into "young", "mid" or "old".

    Cut points are the module-level ``days_q1`` and ``days_q2``; missing
    values map to "unknown".
    """
    if pd.isna(d):
        return "unknown"
    if d < days_q1:
        return "young"
    return "mid" if d < days_q2 else "old"

# Attach the five derived attributes to each node listed in the targets table.
for _, r in targets.iterrows():
    G.nodes[r["_key"]].update({
        "ExplicitLanguage": int(r["mature"]),
        "Partner": int(r["partner"]),
        # 1 when days exceeds the mean; a missing value counts as 0.
        "HighActivity": int(r["days"] > mean_days if not pd.isna(r["days"]) else 0),
        "ViewsBand": _band(r["views"], views_q1, views_q2),
        "AgeBand": _age_band(r["days"]),
    })

print(f"Graph built: |V|={G.number_of_nodes():,}, |E|={G.number_of_edges():,}\n")

# ============================================================
# [3] Helper: distributions and quality
# ============================================================
def _dist(vals):
    n = len(vals)
    if n == 0: return {}
    c = Counter(vals)
    return {k: v / n for k, v in c.items()}

def wkl_quality_nodes(S_nodes, R_nodes, binary_attrs, nominal_attrs):
    """Size-weighted KL-style quality of node set S against reference set R.

    For every attribute the value distributions of S and R are compared with
    sum(ps * log(ps / pr)), using ``_EPS`` for values missing on either side.
    Binary attributes are evaluated over the outcomes 0 and 1, nominal ones
    over the union of observed values. The total is scaled by
    ``len(S_nodes) / len(R_nodes)``.

    Attributes are read from the module-level graph ``G``. Nodes that do not
    carry an attribute are skipped for that attribute instead of raising
    KeyError; this happens for edge endpoints that have no row in target.csv.
    For fully attributed node sets the result is unchanged.

    Returns:
        float: the quality score, or 0.0 when either node set is empty.
    """
    nS, nR = len(S_nodes), len(R_nodes)
    if nS == 0 or nR == 0:
        return 0.0

    def _attr_dist(nodes, attr):
        # Only nodes that actually have the attribute contribute to the distribution.
        return _dist([G.nodes[n][attr] for n in nodes if attr in G.nodes[n]])

    qsum = 0.0
    for attr in binary_attrs:
        P_S, P_R = _attr_dist(S_nodes, attr), _attr_dist(R_nodes, attr)
        for y in (0, 1):
            ps, pr = P_S.get(y, _EPS), P_R.get(y, _EPS)
            qsum += ps * math.log(ps / pr)
    for attr in nominal_attrs:
        P_S, P_R = _attr_dist(S_nodes, attr), _attr_dist(R_nodes, attr)
        for y in set(P_S) | set(P_R):
            ps, pr = P_S.get(y, _EPS), P_R.get(y, _EPS)
            qsum += ps * math.log(ps / pr)
    return (nS / nR) * qsum

# ============================================================
# [4] Modes
# ============================================================
# One search mode per feature: the node attributes scored as binary (outcomes 0/1)
# or nominal (observed values).
MODES = {
    "ExplicitLanguage": {"binary": ["ExplicitLanguage"], "nominal": []},
    "Partner": {"binary": ["Partner"], "nominal": []},
    "HighActivity": {"binary": ["HighActivity"], "nominal": []},
    "ViewsBand": {"binary": [], "nominal": ["ViewsBand"]},
    "AgeBand": {"binary": [], "nominal": ["AgeBand"]},
}

# ============================================================
# [5] Prototype selection
# ============================================================
def select_prototypes(G):
    """Greedily choose up to ``MAX_PROTOTYPES`` high-degree, mutually non-adjacent nodes.

    Candidates are visited in descending degree; choosing a node blocks the
    node itself and all of its neighbours from later selection. The result is
    a list in selection order.
    """
    degree_of = dict(G.degree())
    chosen = []
    blocked = set()
    for candidate in sorted(degree_of, key=degree_of.get, reverse=True):
        if candidate not in blocked:
            chosen.append(candidate)
            blocked.add(candidate)
            blocked.update(G.neighbors(candidate))
            if len(chosen) >= MAX_PROTOTYPES:
                break
    return chosen

# Prototype nodes whose neighbourhoods are searched in the main loop.
prototypes = select_prototypes(G)
print(f"--> Selected {len(prototypes)} prototypes (non-overlapping).\n")

# ============================================================
# [6] LSD per prototype
# ============================================================
def find_best_q_for_prototype(G, proto, binary_attrs, nominal_attrs, global_nodes):
    """Run the two-stage prefix search over the direct neighbours of ``proto``.

    Neighbours are taken in the graph's adjacency order. Stage 1 picks the
    prefix length ``rho`` (>= 2, capped by ``N_RHO_LIMIT`` when set) that
    maximises quality against the baseline (``global_nodes`` when
    ``COMPARE_R_TO_GLOBAL`` is set, otherwise the neighbour list). Stage 2
    picks the prefix length ``sigma`` (< rho) of that reference group that
    maximises quality against the reference group.

    Returns None when ``proto`` has fewer than two neighbours; otherwise a
    dict with keys prototype, rho, sigma, q, q_rg and n_nodes.
    """
    neighbours = list(G.neighbors(proto))
    n_neighbours = len(neighbours)
    if n_neighbours < 2:
        return None

    baseline = neighbours if not COMPARE_R_TO_GLOBAL else global_nodes
    if N_RHO_LIMIT is None:
        max_rho = n_neighbours
    else:
        max_rho = min(N_RHO_LIMIT, n_neighbours)

    # Stage 1: strict '>' keeps the earliest maximum.
    best_rho = 0
    best_q_rg = -float("inf")
    for rho in range(2, max_rho + 1):
        score = wkl_quality_nodes(neighbours[:rho], baseline, binary_attrs, nominal_attrs)
        if score > best_q_rg:
            best_rho, best_q_rg = rho, score

    # Stage 2: best subgroup prefix inside the chosen reference group.
    reference = neighbours[:best_rho]
    best_sigma = 0
    best_q_sr = -float("inf")
    for sigma in range(1, best_rho):
        score = wkl_quality_nodes(reference[:sigma], reference, binary_attrs, nominal_attrs)
        if score > best_q_sr:
            best_sigma, best_q_sr = sigma, score

    result = {}
    result["prototype"] = proto
    result["rho"] = best_rho
    result["sigma"] = best_sigma
    result["q"] = best_q_sr
    result["q_rg"] = best_q_rg
    result["n_nodes"] = n_neighbours
    return result

# ============================================================
# [7] Main LSD loop
# ============================================================
# Baseline for stage 1: every node of the graph.
global_nodes = list(G.nodes())
records = []

# One search per (prototype, feature mode); prototypes with fewer than two neighbours yield no row.
for proto in tqdm(prototypes, desc="Running LSD on Twitch PT vertices"):
    for feat_name, cfg in MODES.items():
        res = find_best_q_for_prototype(G, proto, cfg["binary"], cfg["nominal"], global_nodes)
        if res:
            res["feature"] = feat_name
            records.append(res)

results = pd.DataFrame.from_records(records)
if results.empty:
    raise RuntimeError("No LSD vertex results produced.")

# ============================================================
# [8] Top-100 per feature
# ============================================================
# Keep the TOP_K best rows (by q) per feature and write one CSV per feature.
topk_all = []
for feat in MODES.keys():
    topk = results[results["feature"] == feat].sort_values("q", ascending=False).head(TOP_K)
    if not topk.empty:
        topk_all.append(topk)
        # Build the path once so the log line names the file that is actually
        # written (the message used to omit the "vertex_" part).
        out_path = f"twitchpt_vertex_top{TOP_K}_subgroups_{feat}.csv"
        topk.to_csv(out_path, index=False)
        print(f"Saved top {TOP_K} subgroups for {feat} → {out_path}")

if topk_all:
    # Vertex-specific name: the previous "twitchpt_top100_subgroups_all.csv"
    # overwrote the combined file produced by the edge-based cell.
    combined_path = f"twitchpt_vertex_top{TOP_K}_subgroups_all.csv"
    pd.concat(topk_all).to_csv(combined_path, index=False)
    print(f"Combined top-{TOP_K} subgroups saved → {combined_path}")

# ============================================================
# [9] Table 2: Feature-specific vs Global Jaccard
# ============================================================
# Group nodes by (attribute name, value-as-string); nodes without attributes contribute nothing.
feature_to_nodes = defaultdict(set)
for n, d in G.nodes(data=True):
    for fname, fval in d.items():
        feature_to_nodes[(fname, str(fval))].add(n)

num_nodes = len(G.nodes())
# "Jaccard Distance" here is 1 - |group| / |V|; groups with fewer than 20 nodes are dropped.
table2 = [(f"{fn}={fv}", len(nodes), 1 - len(nodes)/num_nodes)
          for (fn, fv), nodes in feature_to_nodes.items() if len(nodes) >= 20]
table2_df = pd.DataFrame(table2, columns=["Feature", "Size", "Jaccard Distance"]) \
              .sort_values("Jaccard Distance", ascending=False)
table2_df.to_csv("twitchpt_vertices_feature_jaccard_table.csv", index=False)

print("\nTop of Table 2:")
print(table2_df.head(10).to_string(index=False))
print("\nDone ✅")


          Final LSD Vertex Approach – Twitch PT Dataset         

Graph built: |V|=1,912, |E|=31,299

--> Selected 50 prototypes (non-overlapping).



Running LSD on Twitch PT vertices: 100%|██████████| 50/50 [00:16<00:00,  2.99it/s]

Saved top 100 subgroups for ExplicitLanguage → twitchpt_top100_subgroups_ExplicitLanguage.csv
Saved top 100 subgroups for Partner → twitchpt_top100_subgroups_Partner.csv
Saved top 100 subgroups for HighActivity → twitchpt_top100_subgroups_HighActivity.csv
Saved top 100 subgroups for ViewsBand → twitchpt_top100_subgroups_ViewsBand.csv
Saved top 100 subgroups for AgeBand → twitchpt_top100_subgroups_AgeBand.csv
Combined top-100 subgroups saved → twitchpt_top100_subgroups_all.csv

Top of Table 2:
           Feature  Size  Jaccard Distance
         Partner=1   279          0.854079
     AgeBand=young   631          0.669979
  ViewsBand=medium   631          0.669979
       AgeBand=mid   631          0.669979
     ViewsBand=low   631          0.669979
       AgeBand=old   650          0.660042
    ViewsBand=high   650          0.660042
ExplicitLanguage=1   661          0.654289
    HighActivity=1   952          0.502092
    HighActivity=0   960          0.497908

Done ✅





In [None]:
# ==============================================================
# Final LSD Edges Approach – Amazon Dataset (Top-100 per feature)
# Unified with the Twitch PT Edge approach
# ==============================================================

import torch
from torch_geometric.datasets import Amazon
import networkx as nx
import pandas as pd
import numpy as np
from collections import Counter, defaultdict, deque
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ============================================================
# CONFIGURATION
# ============================================================
# Stand-in probability for a value that is absent from one of the two distributions
# being compared (keeps the log-ratio in wkl_quality_edges finite).
_EPS = 1e-12
# Upper bound on the number of prototype nodes returned by select_prototypes.
MAX_PROTOTYPES = 50
# NOTE(review): not referenced anywhere else in this cell.
QUALITY_THRESHOLD = -1.0
# NOTE(review): not referenced anywhere else in this cell.
PATIENCE_K = 8
# True: a prototype's neighbourhood is scored against all graph edges;
# False: it is scored against the prototype's own out-edge list.
COMPARE_R_TO_GLOBAL = True
# Optional cap on the neighbourhood size rho; None means "use every out-edge".
N_RHO_LIMIT = None
# Number of best-scoring rows exported per mode.
TOP_K = 100

print("==============================================================")
print("         Final LSD Edges Approach – Amazon Dataset             ")
print("==============================================================\n")

# ============================================================
# [1] Load dataset
# ============================================================
dataset_name = "Computers"
print(f"[1] Loading dataset {dataset_name}...")
# The path is derived from dataset_name so the announced dataset and the file that is
# actually read cannot drift apart when the name is changed.
# weights_only=False is required because the file holds a pickled graph data object
# (written by the subgraph extractor cell), not a plain tensor state dict. Unpickling
# executes arbitrary code, so only load files you produced yourself.
data = torch.load(f'data/Amazon/{dataset_name}_subgraph.pt', weights_only=False)

print(f"--> Nodes (V): {data.num_nodes}")
print(f"--> Edges (E): {data.num_edges}")
print(f"--> Feature dim: {data.num_features}\n")

# ============================================================
# [2] Build graph + compute node and edge features
# ============================================================
print("[2] Building graph and computing edge-level features...")

# Directed graph over every node id; add_nodes_from keeps isolated nodes in the graph.
edge_index = data.edge_index.numpy()
G = nx.DiGraph()
G.add_nodes_from(range(data.num_nodes))
G.add_edges_from(edge_index.T)

# Total degree: each edge_index column contributes one count to both of its endpoints.
# np.add.at accumulates correctly when the same node id occurs many times.
deg = np.zeros(data.num_nodes, dtype=int)
np.add.at(deg, edge_index[0], 1)
np.add.at(deg, edge_index[1], 1)

# Min-max scale (degree, feature-vector norm) column-wise into [0, 1].
feature_norms = data.x.norm(dim=1).numpy()
scaler = MinMaxScaler()
scaled = scaler.fit_transform(np.vstack([deg, feature_norms]).T)
popularity_score = scaled[:, 0]
richness_score = scaled[:, 1]

# Node-level synthetic attributes derived from the two scaled scores and the class label.
price_tier = pd.qcut(popularity_score, q=3, labels=["budget", "midrange", "premium"])
review_score = (richness_score * 5).round(2)
category_label = data.y.numpy()

node_targets = pd.DataFrame(
    {
        "id": np.arange(data.num_nodes),
        "price_tier": price_tier.astype(str),
        "review_score": review_score,
        "category_label": category_label,
    }
)

# Lookup table: node id -> that node's row of node_targets.
t_by_id = dict((row["id"], row) for _, row in node_targets.iterrows())

def derive_edge_features(u, v):
    """Build the attribute dict for the edge u -> v from its endpoints' node rows.

    Both endpoints are looked up in the module-level ``t_by_id`` table.

    Returns:
        ``None`` when either endpoint has no entry in ``t_by_id``; otherwise a dict
        with the keys ``price_src``, ``price_dst``, ``category_src`` and
        ``category_dst`` (categories converted to ``int``).
    """
    src_row = t_by_id.get(u)
    dst_row = t_by_id.get(v)
    if src_row is None or dst_row is None:
        return None

    features = {}
    features["price_src"] = src_row["price_tier"]
    features["price_dst"] = dst_row["price_tier"]
    features["category_src"] = int(src_row["category_label"])
    features["category_dst"] = int(dst_row["category_label"])
    return features

# Copy the endpoint-derived attributes onto each edge. The dict yielded with data=True
# is the edge's own attribute dict, so updating it updates the graph in place.
for u, v, edge_attrs in G.edges(data=True):
    feats = derive_edge_features(u, v)
    if feats:
        edge_attrs.update(feats)

print(f"--> Graph built: |V|={G.number_of_nodes():,}, |E|={G.number_of_edges():,}\n")

# ============================================================
# [3] LSD Weighted KL divergence for edges
# ============================================================
def _dist(vals):
    n = len(vals)
    if n == 0: return {}
    c = Counter(vals)
    return {k: v / n for k, v in c.items()}

def wkl_quality_edges(S_edges, R_edges, nominal_attrs):
    """Size-weighted KL divergence of the edge set ``S_edges`` against ``R_edges``.

    For every attribute in ``nominal_attrs`` the value distributions of both edge
    sets are compared with sum(p_S * log(p_S / p_R)); a value missing from one side
    is given probability ``_EPS``. The per-attribute sums are added up and scaled by
    ``len(S_edges) / len(R_edges)``.

    Args:
        S_edges: sequence of ``(u, v, attr_dict)`` triples (the candidate subgroup).
        R_edges: sequence of ``(u, v, attr_dict)`` triples (the reference set).
        nominal_attrs: attribute names read from each ``attr_dict``.

    Returns:
        The weighted divergence, or ``0.0`` if either edge set is empty.
    """
    size_s = len(S_edges)
    size_r = len(R_edges)
    if not size_s or not size_r:
        return 0.0

    divergence = 0.0
    for name in nominal_attrs:
        sub_dist = _dist([attrs[name] for _src, _dst, attrs in S_edges])
        ref_dist = _dist([attrs[name] for _src, _dst, attrs in R_edges])
        for value in set(sub_dist) | set(ref_dist):
            p_sub = sub_dist.get(value, _EPS)
            p_ref = ref_dist.get(value, _EPS)
            divergence += p_sub * np.log(p_sub / p_ref)

    weight = size_s / size_r
    return weight * divergence

def rank_out_edges(G, proto):
    """Return the out-edges of ``proto`` as a list of ``(u, v, attr_dict)`` triples.

    Despite the name, no sorting is applied: the edges come back in the order in
    which ``G.out_edges`` yields them.
    """
    edges = []
    for src, dst, attrs in G.out_edges(proto, data=True):
        edges.append((src, dst, attrs))
    return edges

# ============================================================
# [4] Modes (Amazon-specific)
# ============================================================
# Analysis modes. Each entry lists, under "nominal", the edge attributes whose value
# distributions are compared by wkl_quality_edges. The mode name is stored in the
# "feature" column of the result table.
MODES = {
    # Price tier of the source and destination node.
    "price": {"nominal": ["price_src", "price_dst"]},
    # Category label of the source and destination node.
    "category": {"nominal": ["category_src", "category_dst"]},
    # All four endpoint attributes together.
    "price_category": {"nominal": ["price_src", "price_dst", "category_src", "category_dst"]},
}

# ============================================================
# [5] Prototype selection
# ============================================================
def select_prototypes(G):
    """Greedily pick high-degree nodes that are pairwise non-adjacent.

    Nodes of the undirected view of ``G`` are visited in descending degree order
    (ties keep the graph's node order). A node is taken unless it, or one of its
    neighbours, was taken before. Selection stops once ``MAX_PROTOTYPES`` nodes have
    been collected.

    Returns:
        The selected nodes, in the order they were picked.
    """
    undirected = G.to_undirected()

    degree_of = {}
    for node in undirected.nodes():
        degree_of[node] = undirected.degree(node)
    candidates = sorted(degree_of, key=degree_of.get, reverse=True)

    chosen = []
    blocked = set()
    for node in candidates:
        if node not in blocked:
            chosen.append(node)
            # Block the pick itself and everything adjacent to it.
            blocked.add(node)
            blocked.update(undirected.neighbors(node))
            if len(chosen) >= MAX_PROTOTYPES:
                break
    return chosen

# Degree-ordered greedy pick of mutually non-adjacent prototype nodes (at most MAX_PROTOTYPES).
# NOTE(review): the recorded run reports a single prototype for the Computers subgraph —
# presumably every sampled node is adjacent to the top-degree hub; confirm this is intended.
prototypes = select_prototypes(G)
print(f"--> Selected {len(prototypes)} prototypes (non-overlapping).\n")

# ============================================================
# [6] Find best q for each prototype
# ============================================================
def find_best_q_for_prototype(G, proto, nominal_attrs, global_edges):
    """Find the best neighbourhood size (rho) and subgroup size (sigma) for a prototype.

    Phase 1 scores every prefix ``ranked[:rho]`` of the prototype's out-edge list
    (rho from 2 up to the allowed maximum) against the baseline and keeps the rho with
    the highest quality. Phase 2 scores every strict prefix ``R_best[:sigma]``
    (sigma from 1 to rho - 1) against that best neighbourhood and keeps the best sigma.
    On ties the smallest rho / sigma wins because only strictly better scores replace
    the incumbent.

    Args:
        G: graph passed to ``rank_out_edges``.
        proto: prototype node whose out-edges are analysed.
        nominal_attrs: edge attribute names compared by ``wkl_quality_edges``.
        global_edges: list of all ``(u, v, attr_dict)`` edges; used as the baseline
            when ``COMPARE_R_TO_GLOBAL`` is true.

    Returns:
        ``None`` when the prototype has fewer than two out-edges or when
        ``N_RHO_LIMIT`` admits no neighbourhood of size >= 2; otherwise a dict with
        the keys ``prototype``, ``rho``, ``sigma``, ``q`` (phase-2 quality),
        ``q_rg`` (phase-1 quality) and ``n_nodes`` (number of out-edges considered).
    """
    ranked = rank_out_edges(G, proto)
    if len(ranked) < 2:
        return None

    max_rho = len(ranked) if N_RHO_LIMIT is None else min(N_RHO_LIMIT, len(ranked))
    if max_rho < 2:
        # No admissible neighbourhood: without this guard the loops below never run
        # and a meaningless record (rho=0, sigma=0, q=-inf) would reach the results.
        return None

    baseline = global_edges if COMPARE_R_TO_GLOBAL else ranked

    # Phase 1: neighbourhood size that stands out most against the baseline.
    best_rho, best_q_rg = 0, -float("inf")
    for rho in range(2, max_rho + 1):
        R = ranked[:rho]
        q_rg = wkl_quality_edges(R, baseline, nominal_attrs)
        if q_rg > best_q_rg:
            best_q_rg, best_rho = q_rg, rho

    # Phase 2: strict prefix of that neighbourhood that stands out most against it.
    R_best = ranked[:best_rho]
    best_sigma, best_q_sr = 0, -float("inf")
    for sigma in range(1, best_rho):
        S = R_best[:sigma]
        q_sr = wkl_quality_edges(S, R_best, nominal_attrs)
        if q_sr > best_q_sr:
            best_q_sr, best_sigma = q_sr, sigma

    # "n_nodes" is kept as the column name for compatibility; it counts out-edges.
    return {"prototype": proto, "rho": best_rho, "sigma": best_sigma, "q": best_q_sr,
            "q_rg": best_q_rg, "n_nodes": len(ranked)}

# ============================================================
# [7] Main LSD loop
# ============================================================
# Baseline edge list, materialised once and shared by all prototypes and modes.
global_edges = list(G.edges(data=True))

# One result row per (prototype, mode) pair that yields a result.
records = []
for proto in tqdm(prototypes, desc="Running LSD on Amazon edges"):
    for feat_name, cfg in MODES.items():
        res = find_best_q_for_prototype(G, proto, cfg["nominal"], global_edges)
        if not res:
            continue
        res["feature"] = feat_name
        records.append(res)

results = pd.DataFrame.from_records(records)
if results.empty:
    raise RuntimeError("No LSD edge results produced.")

# ============================================================
# [8] Top-100 per feature
# ============================================================
# Export the TOP_K best rows (by q) of each mode, then all of them combined.
# Every printed path is the path that was actually written, and every file name is
# derived from TOP_K instead of a hard-coded 100.
topk_all = []
for feat in MODES.keys():
    topk = results[results["feature"] == feat].sort_values("q", ascending=False).head(TOP_K)
    if not topk.empty:
        out_path = f"amazon_edges_top{TOP_K}_subgroups_{feat}.csv"
        topk_all.append(topk)
        topk.to_csv(out_path, index=False)
        print(f"Saved top {TOP_K} subgroups for {feat} → {out_path}")

if topk_all:
    # Same "amazon_edges_" prefix as the per-feature files. The previous name
    # ("amazon_top100_subgroups_all.csv") carried no edges tag and ignored TOP_K.
    # NOTE(review): update any downstream reader of the old combined file name.
    combined_path = f"amazon_edges_top{TOP_K}_subgroups_all.csv"
    pd.concat(topk_all).to_csv(combined_path, index=False)
    print(f"Combined top-{TOP_K} subgroups saved → {combined_path}")

# ============================================================
# [9] Table 2 – Feature-specific Jaccard distances
# ============================================================
# Group edges by every (attribute, value) pair they carry.
feature_to_edges = defaultdict(set)
for u, v, d in G.edges(data=True):
    edge_key = (u, v)
    for fname, fval in d.items():
        feature_to_edges[fname, str(fval)].add(edge_key)

# For groups of at least 20 edges: size and 1 - |group| / |E| (the Jaccard distance
# between the group and the full edge set, of which the group is a subset).
num_edges = G.number_of_edges()
table2 = [
    (f"{name}={value}", len(members), 1 - len(members) / num_edges)
    for (name, value), members in feature_to_edges.items()
    if len(members) >= 20
]
table2_df = pd.DataFrame(table2, columns=["Feature", "Size", "Jaccard Distance"])
table2_df = table2_df.sort_values("Jaccard Distance", ascending=False)
table2_df.to_csv("amazon_edges_feature_jaccard_table.csv", index=False)

print("\nTop of Table 2:")
print(table2_df.head(10).to_string(index=False))
print("\nDone ✅")


         Final LSD Edges Approach – Amazon Dataset             

[1] Loading dataset Computers...
--> Nodes (V): 1900
--> Edges (E): 31864
--> Feature dim: 767

[2] Building graph and computing edge-level features...
--> Graph built: |V|=1,900, |E|=31,864

--> Selected 1 prototypes (non-overlapping).



Running LSD on Amazon edges: 100%|██████████| 1/1 [00:31<00:00, 31.04s/it]


Saved top 100 subgroups for price → amazon_top100_subgroups_price.csv
Saved top 100 subgroups for category → amazon_top100_subgroups_category.csv
Saved top 100 subgroups for price_category → amazon_top100_subgroups_price_category.csv
Combined top-100 subgroups saved → amazon_top100_subgroups_all.csv

Top of Table 2:
       Feature  Size  Jaccard Distance
category_src=0   127          0.996014
category_dst=2   146          0.995418
category_dst=0   151          0.995261
category_src=6   157          0.995073
category_src=2   170          0.994665
category_dst=6   180          0.994351
category_dst=3   754          0.976337
category_dst=8   949          0.970217
category_src=3  1012          0.968240
category_src=8  1203          0.962246

Done ✅


In [None]:
# ==============================================================
# Subgraph Extractor – Amazon Dataset (approx. |V|=1900, |E|=31k)
# ==============================================================

import torch
from torch_geometric.datasets import Amazon
import networkx as nx
import numpy as np
import random

# Number of nodes at which the BFS sampling loop further down stops.
TARGET_NODES = 1900
# Amazon dataset variant; also used to build the output file name.
DATASET_NAME = "Computers"

print("==============================================================")
print(f" Extracting computable Amazon subgraph (~{TARGET_NODES} nodes)")
print("==============================================================\n")

# Load dataset
# Instantiating the dataset with root="data/Amazon" stores it under that directory.
dataset = Amazon(root="data/Amazon", name=DATASET_NAME)
data = dataset[0]
edge_index = data.edge_index.numpy()

# Build undirected graph for sampling
# nx.Graph is undirected, so an edge listed in both directions collapses to one edge.
G_full = nx.Graph()
# Register every node id explicitly so nodes without edges are present too.
G_full.add_nodes_from(range(data.num_nodes))
G_full.add_edges_from(edge_index.T)
print(f"Full graph: |V|={G_full.number_of_nodes():,}, |E|={G_full.number_of_edges():,}")

# --- Step 1: Start from a high-degree node ---
deg_sorted = sorted(G_full.degree, key=lambda x: x[1], reverse=True)
seed_node = deg_sorted[0][0]

# --- Step 2: BFS expansion until ~TARGET_NODES ---
from collections import deque  # local import: FIFO queue with O(1) pops from the left

# Dedicated, seeded generator so the sampled subgraph is identical on every run and
# the global `random` state is left untouched.
SAMPLING_SEED = 42
rng = random.Random(SAMPLING_SEED)

visited = set()
queue = deque([seed_node])

while queue and len(visited) < TARGET_NODES:
    # Normalise to a plain int so `visited` holds a single key type.
    node = int(queue.popleft())
    if node in visited:
        continue
    visited.add(node)
    neighbors = list(G_full.neighbors(node))
    rng.shuffle(neighbors)
    # Already-visited neighbours would be skipped on pop anyway; dropping them here
    # keeps the queue short without changing the visiting order.
    queue.extend(nb for nb in neighbors if nb not in visited)

# One canonical node order, used for BOTH the feature/label rows and the edge
# relabelling below. Previously rows followed list(visited) while edges followed
# G_sub.nodes(); those two orders are not guaranteed to agree, which could attach
# features and labels to the wrong node ids.
sub_nodes = sorted(visited)
G_sub = G_full.subgraph(sub_nodes).copy()

print(f"Subgraph extracted: |V|={G_sub.number_of_nodes():,}, |E|={G_sub.number_of_edges():,}\n")

# --- Step 3: Build new edge_index and feature matrix ---
node_map = {old: i for i, old in enumerate(sub_nodes)}
# G_sub is undirected, so each edge is listed once (a single direction) here.
# reshape(-1, 2) keeps the (2, E) layout even when the subgraph has no edges.
edges_sub = np.array(
    [[node_map[u], node_map[v]] for u, v in G_sub.edges()], dtype=np.int64
).reshape(-1, 2).T

# Create reduced dataset
# NOTE(review): clone() copies every attribute of `data`; only x, y and edge_index are
# replaced below — confirm `data` carries no other per-node attributes.
data_sub = data.clone()
data_sub.x = data.x[sub_nodes]
data_sub.y = data.y[sub_nodes]
data_sub.edge_index = torch.tensor(edges_sub, dtype=torch.long)

# --- Step 4: Save subgraph for reuse ---
subgraph_path = f"data/Amazon/{DATASET_NAME}_subgraph.pt"
torch.save(data_sub, subgraph_path)
print(f"Saved subgraph → {subgraph_path}")

# The saved file is a pickled data object, so the hint shows the same
# weights_only=False call that the analysis cells of this notebook use to load it.
print("==============================================================")
print("You can now load this subset using:")
print(f"  data = torch.load('{subgraph_path}', weights_only=False)")
print("==============================================================")


 Extracting computable Amazon subgraph (~1900 nodes)

Full graph: |V|=13,752, |E|=245,861
Subgraph extracted: |V|=1,900, |E|=31,864

Saved subgraph → data/Amazon/Computers_subgraph.pt
You can now load this subset using:
  data = torch.load('data/Amazon/Computers_subgraph.pt')
