Skip to content

Commit

Permalink
⬆️ markov_clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
GiulioRossetti committed May 13, 2024
1 parent b66e83b commit fec921c
Show file tree
Hide file tree
Showing 7 changed files with 258 additions and 50 deletions.
14 changes: 1 addition & 13 deletions cdlib/algorithms/crisp_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from cdlib.algorithms.internal.MCODE import m_code
from cdlib.algorithms.internal.RSC import rsc_evaluate_graph
from cdlib.prompt_utils import report_missing_packages, prompt_import_failure
import cdlib.algorithms.internal.markov_clustering as mc

import warnings

Expand Down Expand Up @@ -90,12 +91,6 @@
by = None


try:
import markov_clustering as mc
except ModuleNotFoundError:
missing_packages.add("markov_clustering")
by = None

# try:
# import karateclub
# except ModuleNotFoundError:
Expand Down Expand Up @@ -1674,13 +1669,6 @@ def markov_clustering(
.. note:: Reference implementation: https://github.com/GuyAllard/markov_clustering
"""

if mc is None:
raise Exception(
"===================================================== \n"
"The markov clustering algorithm seems not to be installed (or incorrectly installed). \n"
"Please resolve with: pip install markov_clustering"
)

g = convert_graph_formats(g_original, nx.Graph)
g, maps = nx_node_integer_mapping(g)

Expand Down
2 changes: 1 addition & 1 deletion cdlib/algorithms/internal/RSC.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __regularized_laplacian_matrix(adj_matrix, tau):

# diags will be zero at points where there is no edge and/or the node you are at
# ignore the error and make it zero later
#with scipy.errstate(divide="ignore"):
# with scipy.errstate(divide="ignore"):
diags_sqrt = 1.0 / np.sqrt(diags)
diags_sqrt[np.isinf(diags_sqrt)] = 0
D = sp.spdiags(diags_sqrt, [0], m, n, format="csr")
Expand Down
238 changes: 238 additions & 0 deletions cdlib/algorithms/internal/markov_clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
import numpy as np
from scipy.sparse import isspmatrix, dok_matrix, csc_matrix
import sklearn.preprocessing


def sparse_allclose(a, b, rtol=1e-5, atol=1e-8):
    """
    Counterpart of np.allclose that can be used with sparse matrices.

    :param a: The first matrix
    :param b: The second matrix; the relative tolerance is scaled by its
        absolute values
    :param rtol: Relative tolerance
    :param atol: Absolute tolerance
    :returns: True if the largest entry of ``|a - b| - rtol * |b|`` does not
        exceed atol
    """
    abs_difference = np.abs(a - b)
    relative_slack = rtol * np.abs(b)
    # noinspection PyUnresolvedReferences
    return (abs_difference - relative_slack).max() <= atol


def normalize(matrix):
    """
    L1-normalize the given matrix column by column (axis=0).
    :param matrix: The matrix to be normalized
    :returns: The column-normalized matrix as produced by
        sklearn.preprocessing.normalize
    """
    column_normalized = sklearn.preprocessing.normalize(matrix, norm="l1", axis=0)
    return column_normalized


def inflate(matrix, power):
    """
    Perform the inflation step: raise every element of the matrix to the
    given power, then re-normalize the result with normalize().
    :param matrix: The matrix to be inflated
    :param power: Cluster inflation parameter
    :returns: The inflated matrix
    """
    # scipy sparse matrices use their own element-wise power method,
    # everything else goes through numpy
    if isspmatrix(matrix):
        powered = matrix.power(power)
    else:
        powered = np.power(matrix, power)

    return normalize(powered)


def expand(matrix, power):
    """
    Apply cluster expansion to the given matrix by raising
    the matrix to the given power (matrix product, not element-wise).

    Dense input is converted to a scipy CSR matrix first, so the result
    is always a scipy sparse matrix.
    :param matrix: The matrix to be expanded
    :param power: Cluster expansion parameter
    :returns: The expanded matrix (scipy sparse matrix)
    """
    if not isspmatrix(matrix):
        # local import: csr_matrix is not among the module-level imports
        from scipy.sparse import csr_matrix

        matrix = csr_matrix(matrix)

    # for scipy sparse matrices ``**`` is the matrix power
    return matrix**power


def add_self_loops(matrix, loop_value):
    """
    Return a copy of the matrix whose diagonal entries are all set
    to loop_value (i.e. every node gets a self-loop).
    :param matrix: The square matrix to add loops to
    :param loop_value: Value to use for self-loops
    :returns: The matrix with self-loops; CSC format for sparse input,
        a copy of the same kind for any other input
    """
    dims = matrix.shape
    assert dims[0] == dims[1], "Error, matrix is not square"

    sparse_input = isspmatrix(matrix)

    # sparse input is edited through a DOK copy, other input through a plain copy
    looped = matrix.todok() if sparse_input else matrix.copy()

    for node in range(dims[0]):
        looped[node, node] = loop_value

    return looped.tocsc() if sparse_input else looped


def prune(matrix, threshold):
    """
    Drop the entries of the matrix that are smaller than the threshold.
    The entry selected by argmax in each column is always written back,
    so the maximum of a column is never pruned.
    :param matrix: The matrix to be pruned
    :param threshold: The value below which edges will be removed
    :returns: The pruned matrix (CSC format for sparse input)
    """
    if isspmatrix(matrix):
        # copy only the surviving entries into an empty DOK matrix
        keep_mask = matrix >= threshold
        result = dok_matrix(matrix.shape)
        result[keep_mask] = matrix[keep_mask]
        result = result.tocsc()
    else:
        result = matrix.copy()
        result[result < threshold] = 0

    # restore the per-column maximum. same behaviour for dense/sparse
    n_columns = matrix.shape[1]
    top_rows = matrix.argmax(axis=0).reshape((n_columns,))
    columns = np.arange(n_columns)
    result[top_rows, columns] = matrix[top_rows, columns]

    return result


def converged(matrix1, matrix2):
    """
    Tell whether two successive matrices are approximately equal,
    which is the convergence criterion.
    :param matrix1: The matrix to compare with matrix2
    :param matrix2: The matrix to compare with matrix1
    :returns: True if matrix1 and matrix2 approximately equal
    """
    any_sparse = isspmatrix(matrix1) or isspmatrix(matrix2)

    # np.allclose is only used when both operands are non-sparse
    are_close = sparse_allclose if any_sparse else np.allclose
    return are_close(matrix1, matrix2)


def iterate(matrix, expansion, inflation):
    """
    Perform one round of the mcl algorithm: expansion followed by inflation
    :param matrix: The matrix to perform the iteration on
    :param expansion: Cluster expansion factor
    :param inflation: Cluster inflation factor
    :returns: The matrix after expansion and inflation
    """
    expanded = expand(matrix, expansion)
    return inflate(expanded, inflation)


def get_clusters(matrix):
    """
    Extract the clusters encoded in the matrix
    :param matrix: The matrix produced by the MCL algorithm
    :returns: A sorted list of tuples where each tuple represents a cluster
        and contains the indices of the nodes belonging to the cluster
    """
    # always work on a sparse matrix so a single code path is enough
    sparse = matrix if isspmatrix(matrix) else csc_matrix(matrix)

    # attractors are the nodes with a non-zero entry on the diagonal
    attractor_ids = sparse.diagonal().nonzero()[0]

    # the non-zero columns of an attractor's row form its cluster; the set
    # merges attractors that yield the same cluster
    unique_clusters = {
        tuple(sparse.getrow(attractor).nonzero()[1].tolist())
        for attractor in attractor_ids
    }

    return sorted(unique_clusters)


def run_mcl(
    matrix,
    expansion=2,
    inflation=2,
    loop_value=1,
    iterations=100,
    pruning_threshold=0.001,
    pruning_frequency=1,
    convergence_check_frequency=1,
    verbose=False,
):
    """
    Perform MCL on the given similarity matrix
    :param matrix: The similarity matrix to cluster
    :param expansion: The cluster expansion factor
    :param inflation: The cluster inflation factor
    :param loop_value: Initialization value for self-loops
        (no self-loops are added when it is 0)
    :param iterations: Maximum number of iterations
        (actual number of iterations will be less if convergence is reached)
    :param pruning_threshold: Threshold below which matrix elements will be
        set to 0 (pruning is disabled when it is 0)
    :param pruning_frequency: Perform pruning every 'pruning_frequency'
        iterations.
    :param convergence_check_frequency: Perform the check for convergence
        every convergence_check_frequency iterations
    :param verbose: Print extra information to the console
    :returns: The final matrix
    :raises AssertionError: If one of the parameters is out of range
    """
    assert expansion > 1, "Invalid expansion parameter"
    assert inflation > 1, "Invalid inflation parameter"
    assert loop_value >= 0, "Invalid loop_value"
    assert iterations > 0, "Invalid number of iterations"
    assert pruning_threshold >= 0, "Invalid pruning_threshold"
    assert pruning_frequency > 0, "Invalid pruning_frequency"
    assert convergence_check_frequency > 0, "Invalid convergence_check_frequency"

    def log(message):
        # progress output is opt-in: nothing is printed unless verbose is set
        if verbose:
            print(message)

    log("-" * 50)
    log("MCL Parameters")
    log("Expansion: {}".format(expansion))
    log("Inflation: {}".format(inflation))
    if pruning_threshold > 0:
        log(
            "Pruning threshold: {}, frequency: {} iteration{}".format(
                pruning_threshold,
                pruning_frequency,
                "s" if pruning_frequency > 1 else "",
            )
        )
    else:
        log("No pruning")
    log(
        "Convergence check: {} iteration{}".format(
            convergence_check_frequency,
            "s" if convergence_check_frequency > 1 else "",
        )
    )
    log("Maximum iterations: {}".format(iterations))
    log("-" * 50)

    # Initialize self-loops
    if loop_value > 0:
        matrix = add_self_loops(matrix, loop_value)

    # Normalize
    matrix = normalize(matrix)

    # iterations
    for i in range(iterations):
        log("Iteration {}".format(i + 1))

        # store current matrix for convergence checking
        last_mat = matrix.copy()

        # perform MCL expansion and inflation
        matrix = iterate(matrix, expansion, inflation)

        # prune
        if pruning_threshold > 0 and i % pruning_frequency == pruning_frequency - 1:
            log("Pruning")
            matrix = prune(matrix, pruning_threshold)

        # Check for convergence
        if i % convergence_check_frequency == convergence_check_frequency - 1:
            log("Checking for convergence")
            if converged(matrix, last_mat):
                log(
                    "Converged after {} iteration{}".format(
                        i + 1, "s" if i > 0 else ""
                    )
                )
                break

    log("-" * 50)

    return matrix
28 changes: 7 additions & 21 deletions cdlib/test/test_community_discovery_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,6 @@
except ModuleNotFoundError:
by = None

try:
import markov_clustering as mc
except ModuleNotFoundError:
mc = None

try:
from cdlib.algorithms.internal.LPAM import LPAM
Expand Down Expand Up @@ -360,24 +356,14 @@ def test_osse(self):

def test_markov_clustering(self):

if mc is not None:
g = get_string_graph()

communities = algorithms.markov_clustering(g)
self.assertEqual(type(communities.communities), list)
if len(communities.communities) > 0:
self.assertEqual(type(communities.communities[0]), list)
if len(communities.communities[0]) > 0:
self.assertEqual(type(communities.communities[0][0]), str)

g = nx.karate_club_graph()
g = nx.karate_club_graph()

communities = algorithms.markov_clustering(g)
self.assertEqual(type(communities.communities), list)
if len(communities.communities) > 0:
self.assertEqual(type(communities.communities[0]), list)
if len(communities.communities[0]) > 0:
self.assertEqual(type(communities.communities[0][0]), int)
communities = algorithms.markov_clustering(g)
self.assertEqual(type(communities.communities), list)
if len(communities.communities) > 0:
self.assertEqual(type(communities.communities[0]), list)
if len(communities.communities[0]) > 0:
self.assertEqual(type(communities.communities[0][0]), int)

# def test_bigClam(self):
# if karateclub is None:
Expand Down
12 changes: 6 additions & 6 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
numpy
scikit-learn
scikit - learn
tqdm
networkx>=3.0
networkx >= 3.0
demon
python-louvain>=0.16
scipy>=1.10
python - louvain >= 0.16
scipy >= 1.10
pulp
seaborn
pandas
eva_lcd
bimlpa
python-igraph>=0.10
python - igraph >= 0.10
angelcommunity
pooch
dynetx
thresholdclustering
python-Levenshtein
python - Levenshtein
3 changes: 1 addition & 2 deletions requirements_optional.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,4 @@ networkit
pycombo
leidenalg
infomap>=1.3.0
wurlitzer>=1.0.2
markov_clustering
wurlitzer>=1.0.2
11 changes: 4 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,9 @@
"GraphRicciCurvature",
"networkit",
"pycombo",
"leidenalg"
],
"pypi": [
"bayanpy",
"pyclustering"
"leidenalg",
],
"pypi": ["bayanpy", "pyclustering"],
"all": [
"infomap>=1.3.0",
"wurlitzer>=1.0.2",
Expand All @@ -68,8 +65,8 @@
"pycombo",
"leidenalg",
"bayanpy",
"pyclustering"
]
"pyclustering",
],
},
packages=find_packages(
exclude=["*.test", "*.test.*", "test.*", "test", "cdlib.test", "cdlib.test.*"]
Expand Down

0 comments on commit fec921c

Please sign in to comment.