In [20]:
import random
from itertools import combinations

import ensembleclustering as CE
import igraph as ig
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score

In [2]:
# Read the GitHub ML graph CSVs: the edge list first, then the node table
edges, nodes = (
    pd.read_csv(csv_name)
    for csv_name in ("musae_git_edges.csv", "musae_git_target.csv")
)

In [3]:
# Create Graph from Edge List
# TupleList adds a vertex the first time its ID appears in the edge list and
# keeps the original ID in the "name" vertex attribute, so a vertex's index is
# generally NOT the row position of that ID in `nodes`.
g = ig.Graph.TupleList(edges.itertuples(index=False), directed=False)

# Mapping Node IDs
# Derive the ID -> vertex-index map from the graph itself so that
# nodes['graph_idx'] matches the positions used by partition.membership.
# IDs that never occur in an edge are absent from the graph and map to NaN.
id_map = {node: idx for idx, node in enumerate(g.vs['name'])}
nodes['graph_idx'] = nodes['id'].map(id_map)

def compute_ami(partition1, partition2):
    """Return the adjusted mutual information score of two partitions.

    Each argument only needs a ``membership`` attribute; both attributes are
    handed unchanged to ``adjusted_mutual_info_score``.
    """
    labels_a = partition1.membership
    labels_b = partition2.membership
    return adjusted_mutual_info_score(labels_a, labels_b)

### Clustering Algorithms

In [6]:
print("Running Louvain...")
# python-igraph draws its random numbers from Python's `random` module by
# default; seeding here keeps this cell's partition the same on every run.
random.seed(42)
louvain_partition = g.community_multilevel()

Running Louvain...


In [7]:
print("Running Leiden...")
# python-igraph draws its random numbers from Python's `random` module by
# default; seeding here keeps this cell's partition the same on every run.
random.seed(42)
# objective_function is set explicitly so modularity is the quantity optimised.
leiden_partition = g.community_leiden(objective_function='modularity')

Running Leiden...


In [8]:
print("Running Infomap...")
# python-igraph draws its random numbers from Python's `random` module by
# default; seeding here keeps this cell's partition the same on every run.
random.seed(42)
infomap_partition = g.community_infomap()

Running Infomap...


In [9]:
print("Running Label Propagation...")
# python-igraph draws its random numbers from Python's `random` module by
# default; seeding here keeps this cell's partition the same on every run.
random.seed(42)
lp_partition = g.community_label_propagation()

Running Label Propagation...


In [10]:
print("Running CNM...")
# community_fastgreedy() yields a hierarchy object rather than a flat
# partition; it is kept in its own variable so it can be re-cut later.
cnm_hierarchy = g.community_fastgreedy()
# No cluster count is passed, so igraph chooses where to cut the hierarchy
# (NOTE(review): per igraph docs this is the level it hints as optimal — confirm).
cnm_partition = cnm_hierarchy.as_clustering()

Running CNM...


#### Girvan-Newman (edge betweenness): skipped because it was too slow on this graph

In [13]:
#print("Running Girvan-Newman...")
#gn_hierarchy = g.community_edge_betweenness()
#gn_partition = gn_hierarchy.as_clustering(n=2)

In [11]:
print("Running ECG (ClusterEnsembles)...")
# Pull the per-vertex labels out of the three base partitions in one pass
labels_louvain, labels_infomap, labels_lp = (
    base.membership
    for base in (louvain_partition, infomap_partition, lp_partition)
)
# One row per base clustering, one column per vertex
base_clusterings = np.array([labels_louvain, labels_infomap, labels_lp])
# Consensus clustering over the base label rows, requesting 5 classes
consensus_labels = CE.cluster_ensembles(
    base_clusterings,
    verbose=True,
    nclass=5,
)

Running ECG (ClusterEnsembles)...
Cluster Ensembles
    - number of classes: 5
    - solver: hbgf
    - length of base clustering labels: 37700
    - number of base clusters: 3


In [14]:
# Convert ECG to dummy clustering object
# (a minimal stand-in so raw consensus labels can be passed to compute_ami,
# which reads the `.membership` attribute of its arguments)
class DummyPartition:
    """Wrap a label sequence and expose it as ``membership``."""
    def __init__(self, labels):
        # Stored as given: no copy and no type conversion.
        self.membership = labels

# Create dummy partition for ECG: wraps consensus_labels so the result can be
# compared through the same `.membership` attribute as the igraph partitions
ecg_partition = DummyPartition(consensus_labels)

In [16]:
# Display name -> partition object; insertion order fixes the order in which
# pairs are later compared and printed.
partitions = dict([
    ("ECG", ecg_partition),
    ("Leiden", leiden_partition),
    ("Louvain", louvain_partition),
    ("Infomap", infomap_partition),
    ("Label Propagation", lp_partition),
    #("Girvan-Newman", gn_partition),
    ("CNM", cnm_partition),
])

In [22]:
# Score every unordered pair of partitions, keyed "<first> vs <second>"
ami_scores = {
    f"{left_name} vs {right_name}": compute_ami(left_part, right_part)
    for (left_name, left_part), (right_name, right_part)
    in combinations(partitions.items(), 2)
}

# Report each pair's score rounded to three decimals
for pair_label, score in ami_scores.items():
    print(f"{pair_label}: {round(score, 3)}")

ECG vs Leiden: 0.314
ECG vs Louvain: 0.446
ECG vs Infomap: 0.382
ECG vs Label Propagation: 0.001
ECG vs CNM: 0.207
Leiden vs Louvain: 0.553
Leiden vs Infomap: 0.348
Leiden vs Label Propagation: 0.006
Leiden vs CNM: 0.304
Louvain vs Infomap: 0.328
Louvain vs Label Propagation: 0.006
Louvain vs CNM: 0.295
Infomap vs Label Propagation: 0.002
Infomap vs CNM: 0.29
Label Propagation vs CNM: 0.013
