⬆️ markov_clustering

GiulioRossetti · May 13, 2024 · fec921c · fec921c
1 parent b66e83b
commit fec921c
Show file tree

Hide file tree

Showing 7 changed files with 258 additions and 50 deletions.
diff --git a/cdlib/algorithms/crisp_partition.py b/cdlib/algorithms/crisp_partition.py
@@ -35,6 +35,7 @@
 from cdlib.algorithms.internal.MCODE import m_code
 from cdlib.algorithms.internal.RSC import rsc_evaluate_graph
 from cdlib.prompt_utils import report_missing_packages, prompt_import_failure
+import cdlib.algorithms.internal.markov_clustering as mc
 
 import warnings
 
@@ -90,12 +91,6 @@
     by = None
 
 
-try:
-    import markov_clustering as mc
-except ModuleNotFoundError:
-    missing_packages.add("markov_clustering")
-    by = None
-
 # try:
 #    import karateclub
 # except ModuleNotFoundError:
@@ -1674,13 +1669,6 @@ def markov_clustering(
     .. note:: Reference implementation: https://github.com/GuyAllard/markov_clustering
     """
 
-    if mc is None:
-        raise Exception(
-            "===================================================== \n"
-            "The markov clustering algorithm seems not to be installed (or incorrectly installed). \n"
-            "Please resolve with: pip install markov_clustering"
-        )
-
     g = convert_graph_formats(g_original, nx.Graph)
     g, maps = nx_node_integer_mapping(g)
 

diff --git a/cdlib/algorithms/internal/RSC.py b/cdlib/algorithms/internal/RSC.py
@@ -35,7 +35,7 @@ def __regularized_laplacian_matrix(adj_matrix, tau):
 
     # diags will be zero at points where there is no edge and/or the node you are at
     #  ignore the error and make it zero later
-    #with scipy.errstate(divide="ignore"):
+    # with scipy.errstate(divide="ignore"):
     diags_sqrt = 1.0 / np.sqrt(diags)
     diags_sqrt[np.isinf(diags_sqrt)] = 0
     D = sp.spdiags(diags_sqrt, [0], m, n, format="csr")

diff --git a/cdlib/algorithms/internal/markov_clustering.py b/cdlib/algorithms/internal/markov_clustering.py
@@ -0,0 +1,238 @@
+import numpy as np
+from scipy.sparse import isspmatrix, dok_matrix, csc_matrix
+import sklearn.preprocessing
+
+
+def sparse_allclose(a, b, rtol=1e-5, atol=1e-8):
+    """
+    Version of np.allclose for use with sparse matrices
+    """
+    c = np.abs(a - b) - rtol * np.abs(b)
+    # noinspection PyUnresolvedReferences
+    return c.max() <= atol
+
+
+def normalize(matrix):
+    """
+    Normalize the columns of the given matrix
+
+    :param matrix: The matrix to be normalized
+    :returns: The normalized matrix
+    """
+    return sklearn.preprocessing.normalize(matrix, norm="l1", axis=0)
+
+
+def inflate(matrix, power):
+    """
+    Apply cluster inflation to the given matrix by raising
+    each element to the given power.
+
+    :param matrix: The matrix to be inflated
+    :param power: Cluster inflation parameter
+    :returns: The inflated matrix
+    """
+    if isspmatrix(matrix):
+        return normalize(matrix.power(power))
+
+    return normalize(np.power(matrix, power))
+
+
+def expand(matrix, power):
+    """
+    Apply cluster expansion to the given matrix by raising
+    the matrix to the given power.
+
+    :param matrix: The matrix to be expanded
+    :param power: Cluster expansion parameter
+    :returns: The expanded matrix
+    """
+
+    if isspmatrix(matrix):
+        return matrix**power
+    else:
+        import scipy.sparse as sp
+
+        matrix = sp.csr_matrix(matrix)
+        return matrix**power
+
+    return np.linalg.matrix_power(matrix, power)
+
+
+def add_self_loops(matrix, loop_value):
+    """
+    Add self-loops to the matrix by setting the diagonal
+    to loop_value
+
+    :param matrix: The matrix to add loops to
+    :param loop_value: Value to use for self-loops
+    :returns: The matrix with self-loops
+    """
+    shape = matrix.shape
+    assert shape[0] == shape[1], "Error, matrix is not square"
+
+    if isspmatrix(matrix):
+        new_matrix = matrix.todok()
+    else:
+        new_matrix = matrix.copy()
+
+    for i in range(shape[0]):
+        new_matrix[i, i] = loop_value
+
+    if isspmatrix(matrix):
+        return new_matrix.tocsc()
+
+    return new_matrix
+
+
+def prune(matrix, threshold):
+    """
+    Prune the matrix so that very small edges are removed.
+    The maximum value in each column is never pruned.
+
+    :param matrix: The matrix to be pruned
+    :param threshold: The value below which edges will be removed
+    :returns: The pruned matrix
+    """
+    if isspmatrix(matrix):
+        pruned = dok_matrix(matrix.shape)
+        pruned[matrix >= threshold] = matrix[matrix >= threshold]
+        pruned = pruned.tocsc()
+    else:
+        pruned = matrix.copy()
+        pruned[pruned < threshold] = 0
+
+    # keep max value in each column. same behaviour for dense/sparse
+    num_cols = matrix.shape[1]
+    row_indices = matrix.argmax(axis=0).reshape((num_cols,))
+    col_indices = np.arange(num_cols)
+    pruned[row_indices, col_indices] = matrix[row_indices, col_indices]
+
+    return pruned
+
+
+def converged(matrix1, matrix2):
+    """
+    Check for convergence by determining if
+    matrix1 and matrix2 are approximately equal.
+
+    :param matrix1: The matrix to compare with matrix2
+    :param matrix2: The matrix to compare with matrix1
+    :returns: True if matrix1 and matrix2 approximately equal
+    """
+    if isspmatrix(matrix1) or isspmatrix(matrix2):
+        return sparse_allclose(matrix1, matrix2)
+
+    return np.allclose(matrix1, matrix2)
+
+
+def iterate(matrix, expansion, inflation):
+    """
+    Run a single iteration (expansion + inflation) of the mcl algorithm
+
+    :param matrix: The matrix to perform the iteration on
+    :param expansion: Cluster expansion factor
+    :param inflation: Cluster inflation factor
+    """
+
+    # matrix = matrix.todense()
+
+    # Expansion
+    matrix = expand(matrix, expansion)
+
+    # Inflation
+    matrix = inflate(matrix, inflation)
+
+    return matrix
+
+
+def get_clusters(matrix):
+    """
+    Retrieve the clusters from the matrix
+
+    :param matrix: The matrix produced by the MCL algorithm
+    :returns: A list of tuples where each tuple represents a cluster and
+              contains the indices of the nodes belonging to the cluster
+    """
+    if not isspmatrix(matrix):
+        # cast to sparse so that we don't need to handle different
+        # matrix types
+        matrix = csc_matrix(matrix)
+
+    # get the attractors - non-zero elements of the matrix diagonal
+    attractors = matrix.diagonal().nonzero()[0]
+
+    # somewhere to put the clusters
+    clusters = set()
+
+    # the nodes in the same row as each attractor form a cluster
+    for attractor in attractors:
+        cluster = tuple(matrix.getrow(attractor).nonzero()[1].tolist())
+        clusters.add(cluster)
+
+    return sorted(list(clusters))
+
+
+def run_mcl(
+    matrix,
+    expansion=2,
+    inflation=2,
+    loop_value=1,
+    iterations=100,
+    pruning_threshold=0.001,
+    pruning_frequency=1,
+    convergence_check_frequency=1,
+    verbose=False,
+):
+    """
+    Perform MCL on the given similarity matrix
+
+    :param matrix: The similarity matrix to cluster
+    :param expansion: The cluster expansion factor
+    :param inflation: The cluster inflation factor
+    :param loop_value: Initialization value for self-loops
+    :param iterations: Maximum number of iterations
+           (actual number of iterations will be less if convergence is reached)
+    :param pruning_threshold: Threshold below which matrix elements will be set
+           set to 0
+    :param pruning_frequency: Perform pruning every 'pruning_frequency'
+           iterations.
+    :param convergence_check_frequency: Perform the check for convergence
+           every convergence_check_frequency iterations
+    :param verbose: Print extra information to the console
+    :returns: The final matrix
+    """
+    assert expansion > 1, "Invalid expansion parameter"
+    assert inflation > 1, "Invalid inflation parameter"
+    assert loop_value >= 0, "Invalid loop_value"
+    assert iterations > 0, "Invalid number of iterations"
+    assert pruning_threshold >= 0, "Invalid pruning_threshold"
+    assert pruning_frequency > 0, "Invalid pruning_frequency"
+    assert convergence_check_frequency > 0, "Invalid convergence_check_frequency"
+
+    # Initialize self-loops
+    if loop_value > 0:
+        matrix = add_self_loops(matrix, loop_value)
+
+    # Normalize
+    matrix = normalize(matrix)
+
+    # iterations
+    for i in range(iterations):
+
+        # store current matrix for convergence checking
+        last_mat = matrix.copy()
+
+        # perform MCL expansion and inflation
+
+        matrix = iterate(matrix, expansion, inflation)
+
+        # prune
+        if pruning_threshold > 0 and i % pruning_frequency == pruning_frequency - 1:
+            matrix = prune(matrix, pruning_threshold)
+
+        # Check for convergence
+        if i % convergence_check_frequency == convergence_check_frequency - 1:
+            if converged(matrix, last_mat):
+                break
+
+    return matrix
diff --git a/cdlib/test/test_community_discovery_models.py b/cdlib/test/test_community_discovery_models.py
@@ -54,10 +54,6 @@
 except ModuleNotFoundError:
     by = None
 
-try:
-    import markov_clustering as mc
-except ModuleNotFoundError:
-    mc = None
 
 try:
     from cdlib.algorithms.internal.LPAM import LPAM
@@ -360,24 +356,14 @@ def test_osse(self):
 
     def test_markov_clustering(self):
 
-        if mc is not None:
-            g = get_string_graph()
-
-            communities = algorithms.markov_clustering(g)
-            self.assertEqual(type(communities.communities), list)
-            if len(communities.communities) > 0:
-                self.assertEqual(type(communities.communities[0]), list)
-                if len(communities.communities[0]) > 0:
-                    self.assertEqual(type(communities.communities[0][0]), str)
-
-            g = nx.karate_club_graph()
+        g = nx.karate_club_graph()
 
-            communities = algorithms.markov_clustering(g)
-            self.assertEqual(type(communities.communities), list)
-            if len(communities.communities) > 0:
-                self.assertEqual(type(communities.communities[0]), list)
-                if len(communities.communities[0]) > 0:
-                    self.assertEqual(type(communities.communities[0][0]), int)
+        communities = algorithms.markov_clustering(g)
+        self.assertEqual(type(communities.communities), list)
+        if len(communities.communities) > 0:
+            self.assertEqual(type(communities.communities[0]), list)
+            if len(communities.communities[0]) > 0:
+                self.assertEqual(type(communities.communities[0][0]), int)
 
     # def test_bigClam(self):
     #     if karateclub is None:

diff --git a/requirements.txt b/requirements.txt
@@ -1,18 +1,18 @@
 numpy
-scikit-learn
+scikit - learn
 tqdm
-networkx>=3.0
+networkx >= 3.0
 demon
-python-louvain>=0.16
-scipy>=1.10
+python - louvain >= 0.16
+scipy >= 1.10
 pulp
 seaborn
 pandas
 eva_lcd
 bimlpa
-python-igraph>=0.10
+python - igraph >= 0.10
 angelcommunity
 pooch
 dynetx
 thresholdclustering
-python-Levenshtein
+python - Levenshtein
diff --git a/requirements_optional.txt b/requirements_optional.txt
@@ -3,5 +3,4 @@ networkit
 pycombo
 leidenalg
 infomap>=1.3.0
-wurlitzer>=1.0.2
-markov_clustering
+wurlitzer>=1.0.2
diff --git a/setup.py b/setup.py
@@ -54,12 +54,9 @@
             "GraphRicciCurvature",
             "networkit",
             "pycombo",
-            "leidenalg"
-        ],
-        "pypi": [
-            "bayanpy",
-            "pyclustering"
+            "leidenalg",
         ],
+        "pypi": ["bayanpy", "pyclustering"],
         "all": [
             "infomap>=1.3.0",
             "wurlitzer>=1.0.2",
@@ -68,8 +65,8 @@
             "pycombo",
             "leidenalg",
             "bayanpy",
-            "pyclustering"
-        ]
+            "pyclustering",
+        ],
     },
     packages=find_packages(
         exclude=["*.test", "*.test.*", "test.*", "test", "cdlib.test", "cdlib.test.*"]