In [1]:
import multiprocessing
from itertools import combinations

import ensembleclustering as CE
import igraph as ig
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.metrics import adjusted_mutual_info_score
from tqdm import tqdm

In [2]:
# Load the GitHub graph data: the edge list first, then the node table
edges, nodes = (
    pd.read_csv(csv_path)
    for csv_path in ("musae_git_edges.csv", "musae_git_target.csv")
)

In [3]:
# Create Graph from Edge List
# TupleList numbers vertices in order of first appearance in the edge list and
# stores the original node id in the "name" vertex attribute.
g = ig.Graph.TupleList(edges.itertuples(index=False), directed=False)

# Mapping Node IDs
# Map each original node id to its vertex index in `g` (not to its row position
# in `nodes`), so that `graph_idx` lines up with the membership vectors of the
# clusterings computed on `g`. Ids that never occur in the edge list have no
# vertex and therefore map to NaN.
id_map = {node: idx for idx, node in enumerate(g.vs['name'])}
nodes['graph_idx'] = nodes['id'].map(id_map)

def compute_ami(partition1, partition2=None):
    """Adjusted mutual information (AMI) between two clusterings.

    Two call forms are supported:

    * ``compute_ami(partition1, partition2)`` returns the AMI score.
    * ``compute_ami((name1, partition1, name2, partition2))`` -- a single
      4-tuple, the shape of the items in ``partition_pairs`` -- returns
      ``(f"{name1} vs {name2}", score)`` so a list of such results can be fed
      directly to ``dict()``.

    Each partition may be an object exposing a ``membership`` attribute or a
    plain sequence/array of labels; both are accepted so the raw consensus
    label array can be compared with the clustering objects.

    Returns:
        The score alone for the two-argument form, or a ``(label, score)``
        tuple for the single-tuple form.
    """
    pair_label = None
    if partition2 is None:
        # Single-argument form: unpack the named pair.
        name1, partition1, name2, partition2 = partition1
        pair_label = f"{name1} vs {name2}"

    # Fall back to the object itself when it is already a label sequence.
    labels1 = getattr(partition1, "membership", partition1)
    labels2 = getattr(partition2, "membership", partition2)
    score = adjusted_mutual_info_score(labels1, labels2)

    return score if pair_label is None else (pair_label, score)

In [None]:
# Define clustering functions for parallel execution
def run_louvain():
    """Run ``community_multilevel`` on the global graph ``g`` and return its result."""
    louvain_clusters = g.community_multilevel()
    return louvain_clusters

def run_leiden():
    """Run ``community_leiden`` on the global graph ``g`` with the modularity objective."""
    leiden_clusters = g.community_leiden(objective_function='modularity')
    return leiden_clusters

def run_infomap():
    """Run ``community_infomap`` on the global graph ``g`` and return its result."""
    infomap_clusters = g.community_infomap()
    return infomap_clusters

def run_label_propagation():
    """Run ``community_label_propagation`` on the global graph ``g`` and return its result."""
    lp_clusters = g.community_label_propagation()
    return lp_clusters

def run_girvan_newman():
    """Run ``community_edge_betweenness`` on ``g`` and convert the hierarchy with ``n=2``."""
    return g.community_edge_betweenness().as_clustering(n=2)

def run_fastgreedy():
    """Run ``community_fastgreedy`` on ``g`` and convert the hierarchy with default settings."""
    return g.community_fastgreedy().as_clustering()

# Dispatch the six community-detection routines to joblib workers.
# Results come back in the same order as the functions are listed here.
num_cores = min(6, multiprocessing.cpu_count())  # never more than 6 workers
results = Parallel(n_jobs=num_cores)(
    job()
    for job in map(
        delayed,
        (
            run_louvain,
            run_leiden,
            run_infomap,
            run_label_propagation,
            run_girvan_newman,
            run_fastgreedy,
        ),
    )
)

In [None]:
# Unpack the clusterings in the order the functions were dispatched
(
    louvain_partition,
    leiden_partition,
    infomap_partition,
    lp_partition,
    gn_partition,
    cnm_partition,
) = results

# Label vectors of the three base clusterings used for the ensemble
labels_louvain, labels_infomap, labels_lp = (
    np.array(base_partition.membership)
    for base_partition in (louvain_partition, infomap_partition, lp_partition)
)

# Stack the label vectors row-wise: one row per base clustering
base_clusterings = np.array((labels_louvain, labels_infomap, labels_lp))

def run_consensus_clustering():
    """Call ``CE.cluster_ensembles`` on the global ``base_clusterings`` with ``nclass=5``."""
    consensus = CE.cluster_ensembles(base_clusterings, nclass=5, verbose=True)
    return consensus

# Run consensus clustering.
# Called directly: joblib.Parallel expects an iterable of delayed tasks, so
# handing it the single (func, args, kwargs) tuple produced by delayed(...)()
# fails while unpacking, and a pool adds nothing for one task with n_jobs=1.
consensus_labels = run_consensus_clustering()

In [None]:
# Collect every clustering under its display name.
# "ECG" holds the consensus labels; the other entries are the clustering
# objects unpacked from `results`.
partitions = dict(
    [
        ("ECG", consensus_labels),
        ("Leiden", leiden_partition),
        ("Louvain", louvain_partition),
        ("Infomap", infomap_partition),
        ("Label Propagation", lp_partition),
        ("Girvan-Newman", gn_partition),
        ("CNM", cnm_partition),
    ]
)

In [None]:
# Display the collected partitions as this cell's output
partitions

In [None]:
# Prepare pairs for parallel processing: every unordered pair of named partitions
partition_pairs = [
    (name1, part1, name2, part2)
    for (name1, part1), (name2, part2) in combinations(partitions.items(), 2)
]

# Use parallel processing for AMI computation.
# compute_ami is given the two partitions as separate arguments and yields only
# the score, so the pair label is attached afterwards.
num_cores = multiprocessing.cpu_count()  # Detect available CPU cores
ami_values = Parallel(n_jobs=num_cores)(
    delayed(compute_ami)(part1, part2) for _, part1, _, part2 in partition_pairs
)

# Pair each score with a "<name1> vs <name2>" label (Parallel preserves input order)
ami_results = [
    (f"{name1} vs {name2}", score)
    for (name1, _, name2, _), score in zip(partition_pairs, ami_values)
]

# Convert results to a dictionary
ami_scores = dict(ami_results)

# Print results
for key, value in ami_scores.items():
    print(f"{key}: {round(value, 3)}")