In [1]:
import igraph as ig
import networkx as nx
import torch

def nx_to_igraph(nx_g):
    g = ig.Graph()
    g.add_vertices(list(nx_g.nodes()))
    edges = list(nx_g.edges())
    g.add_edges(edges)

    # Copy node attributes
    for node in g.vs:
        node_id = node.index
        node_attrs = nx_g.nodes[node_id]
        for key, value in node_attrs.items():
            node[key] = value

    return g


In [2]:
import networkx as nx

def hetero_to_networkx(data):
    G = nx.Graph()
    node_id_map = {}
    curr_id = 0

    for ntype in data.node_types:
        for i in range(data[ntype].num_nodes):
            G.add_node(curr_id, node_type=ntype, orig_id=i)
            node_id_map[(ntype, i)] = curr_id
            curr_id += 1

    for src_type, rel, dst_type in data.edge_types:
        edge_index = data[(src_type, rel, dst_type)].edge_index
        for src, dst in zip(edge_index[0].tolist(), edge_index[1].tolist()):
            src_global = node_id_map[(src_type, src)]
            dst_global = node_id_map[(dst_type, dst)]
            G.add_edge(src_global, dst_global, edge_type=rel)

    return G, node_id_map


In [3]:
import leidenalg

data = torch.load('../hetero_graph.pt', weights_only=False)
# Convert your heterogeneous graph
G_nx, node_id_map = hetero_to_networkx(data)
G_ig = nx_to_igraph(G_nx)

# Run Leiden algorithm
partition = leidenalg.find_partition(G_ig, leidenalg.ModularityVertexPartition)

# Map: community_id → list of node IDs
from collections import defaultdict
community_nodes = defaultdict(list)
for node, comm_id in enumerate(partition.membership):
    community_nodes[comm_id].append(node)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def compute_suspiciousness(G, community_nodes_dict):
    scores = {}
    for comm_id, nodes in community_nodes_dict.items():
        subG = G.subgraph(nodes)
        num_nodes = subG.number_of_nodes()
        num_edges = subG.number_of_edges()
        max_edges = num_nodes * (num_nodes - 1) / 2
        density = num_edges / max_edges if max_edges > 0 else 0

        reviewers = [n for n in nodes if G.nodes[n].get('node_type') == 'reviewer']
        products = [n for n in nodes if G.nodes[n].get('node_type') == 'product']
        rev_to_prod_ratio = len(reviewers) / (len(products) + 1e-5)

        avg_degree = sum(dict(subG.degree()).values()) / num_nodes if num_nodes > 0 else 0

        # Heuristic score: you can adjust weights
        score = 0.4 * density + 0.4 * rev_to_prod_ratio + 0.2 * avg_degree
        scores[comm_id] = score
    return scores


In [5]:
suspiciousness_scores = compute_suspiciousness(G_nx, community_nodes)

In [6]:
# Sort original suspiciousness scores
sorted_sus = sorted(suspiciousness_scores.items(), key=lambda x: x[1], reverse=True)

# Normalize by number of nodes in each community
normalized_sus = []
for comm_id, score in sorted_sus:
    num_nodes = len(community_nodes[comm_id])
    norm_score = score / (num_nodes + 1e-6)  # to avoid divide-by-zero
    normalized_sus.append((comm_id, score, norm_score, num_nodes))

# Sort by normalized suspiciousness score
normalized_sus.sort(key=lambda x: x[2], reverse=True)


In [7]:
print("Top suspicious communities (Normalized Suspiciousness per Node):")
for comm_id, raw_score, norm_score, num_nodes in normalized_sus[:15]:
    print(f"Community {comm_id}: Norm Suspiciousness = {norm_score:.4f} | Raw = {raw_score:.2f} | Nodes = {num_nodes}")


Top suspicious communities (Normalized Suspiciousness per Node):
Community 34: Norm Suspiciousness = 0.2029 | Raw = 14.41 | Nodes = 71
Community 23: Norm Suspiciousness = 0.2009 | Raw = 44.00 | Nodes = 219
Community 21: Norm Suspiciousness = 0.2009 | Raw = 45.20 | Nodes = 225
Community 19: Norm Suspiciousness = 0.2008 | Raw = 47.60 | Nodes = 237
Community 12: Norm Suspiciousness = 0.2004 | Raw = 108.40 | Nodes = 541
Community 38: Norm Suspiciousness = 0.2002 | Raw = 8.41 | Nodes = 42
Community 1: Norm Suspiciousness = 0.2001 | Raw = 388.40 | Nodes = 1941
Community 26: Norm Suspiciousness = 0.2000 | Raw = 34.40 | Nodes = 172
Community 16: Norm Suspiciousness = 0.2000 | Raw = 84.40 | Nodes = 422
Community 6: Norm Suspiciousness = 0.2000 | Raw = 179.60 | Nodes = 898
Community 5: Norm Suspiciousness = 0.1998 | Raw = 186.40 | Nodes = 933
Community 8: Norm Suspiciousness = 0.1997 | Raw = 153.20 | Nodes = 767
Community 10: Norm Suspiciousness = 0.1997 | Raw = 145.20 | Nodes = 727
Community 11

In [12]:
def get_reviewer_asin_from_community(G, community_nodes, community_id):
    nodes = community_nodes[community_id]
    reviewers = []
    products = []

    for n in nodes:
        node_data = G.nodes[n]
        if node_data.get('node_type') == 'reviewer':
            reviewers.append(node_data.get('reviewerId', n))  # fallback to node ID if missing
        elif node_data.get('node_type') == 'product':
            products.append(node_data.get('asin', n))

    return reviewers, products

# Example usage:
community_id = 34  # Change this to the desired suspicious community number
reviewers, products = get_reviewer_asin_from_community(G_nx, community_nodes, community_id)

print(f"Community {community_id} contains:")
print(f"🧑 Reviewers ({len(reviewers)}):")
print(reviewers[:20])  # Show first 20

print(f"\n📦 Products ({len(products)}):")
print(products[:20])  # Show first 20


Community 34 contains:
🧑 Reviewers (35):
[132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151]

📦 Products (1):
[0]
