 # EMD

In [1]:
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [None]:
%cd /content/BCOT
%load_ext autoreload
%autoreload 2
    
import sys

import scipy.sparse as sp
from bcot.utils import read_dataset
from bcot.bcot import BCOT
from sklearn.metrics import normalized_mutual_info_score as nmi 
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import davies_bouldin_score
from bcot.metrics import clustering_accuracy
import numpy as np
from time import time 
from tqdm import tqdm

# EMD experiment: for every dataset and every cost scaling L(X) = -scaler * X,
# run BCOT `n_runs` times and report mean/std of the clustering metrics.
n_runs = 20
plot_block_structure = False

# Per-run statistics collected for each (dataset, scaler) configuration.
metric_names = ('time', 'acc', 'nmi', 'ari', 'db', 'avg_n_clust')

#for dataset in ['wiki', 'pubmed', 'ng20']:
#for dataset in ['wiki', 'pubmed', 'acm', 'dblp', 'ohscal', 'ng20']:
for dataset in ['pubmed','acm']:
  features, labels, n_classes = read_dataset(dataset, sparse=True)
  n, d = features.shape
  k = n_classes

  # The Davies-Bouldin score is computed on the dense feature matrix. The
  # conversion does not depend on the run or the scaler, so do it once per
  # dataset instead of once per run. If the dense matrix does not fit in
  # memory the score is reported as NaN (best-effort, as before).
  try:
    dense_features = features.toarray()
  except MemoryError:
    dense_features = None

  for scaler in [1, n, d, k]:

    metrics = {name: [] for name in metric_names}

    for _ in tqdm(range(n_runs)):
      t0 = time()
      M = -features * scaler

      Z_p, W_p = BCOT(M, n_classes, algorithm='emd', reg=1, n_iter=100)

      # Hard assignments: index of the largest entry along the last axis.
      Z = Z_p.argmax(-1)
      W = W_p.argmax(-1)

      # Timing covers cost construction, BCOT and the argmax only; the
      # metric computations below are excluded.
      metrics['time'].append(time()-t0)
      metrics['acc'].append(clustering_accuracy(labels, Z)*100)
      metrics['nmi'].append(nmi(labels, Z)*100)
      metrics['ari'].append(ari(labels, Z)*100)

      db = np.nan
      if dense_features is not None:
        try:
          db = davies_bouldin_score(dense_features, Z)
        except ValueError:
          # NOTE(review): ValueError is what scikit-learn raises for a
          # degenerate labelling (e.g. a single cluster); confirm no other
          # exception type was being relied on by the former bare `except:`.
          db = np.nan
      metrics['db'].append(db)
      metrics['avg_n_clust'].append(len(np.unique(Z)))

    # Timings are rounded to 2 decimals, every other metric to 1.
    # `name` (not `k`) is used as loop variable so the cluster count `k`
    # is not visually shadowed.
    results = {
      'mean': {name: np.mean(vals).round(2 if name == 'time' else 1)
               for name, vals in metrics.items()},
      'std': {name: np.std(vals).round(2 if name == 'time' else 1)
              for name, vals in metrics.items()}
    }

    means = results['mean']
    std = results['std']
    print(f'### Results on {dataset}')
    print(f'L(X)=-{scaler}X')
    #print(f"{means['acc']}±{std['acc']} & {means['nmi']}±{std['nmi']} & {means['ari']}±{std['ari']}")
    print(f"acc: {means['acc']}±{std['acc']} & nmi: {means['nmi']}±{std['nmi']} & ari: {means['ari']}±{std['ari']}")
    print(f"time: {means['time']}±{std['time']}")
    print(f"db-index: {means['db']}±{std['db']}")
    print(f"number of clusters: {means['avg_n_clust']}")
    print()

    if plot_block_structure:
      import matplotlib.pyplot as plt

      # Visualises Z_p @ W_p.T from the LAST run of this configuration, with
      # rows and columns reordered by their hard assignments.
      fig = plt.figure(figsize=(6, 4))
      ax = fig.add_subplot(111)

      # from bcot.utils import binarize
      # Z_p = binarize(Z_p.argmax(-1), k)
      # W_p = binarize(W_p.argmax(-1), k)

      ax.imshow((Z_p @ W_p.T)[Z.argsort()][:, W.argsort()], interpolation='nearest')
      ax.set_aspect('auto')

      plt.xticks([])
      plt.yticks([])
      # Include the scaler in the file name; previously every scaler wrote to
      # the same '<dataset>-block-structure.pdf' and overwrote the last plot.
      plt.savefig(f'{dataset}-scaler{scaler}-block-structure.pdf')
      plt.show()
      plt.close(fig)

[Errno 2] No such file or directory: '/content/BCOT'
/volume1/scratch/zopdebee/GitHub/HeNCler/BCOT-main


100%|████████████████████████████████████████████████████████████████████████████████| 20/20 [01:16<00:00,  3.82s/it]


### Results on pubmed
L(X)=-1X
acc: 55.2±2.5 & nmi: 16.5±1.3 & ari: 13.6±1.5
time: 3.72±1.05
db-index: 6.9±0.3
number of clusters: 3.0



100%|████████████████████████████████████████████████████████████████████████████████| 20/20 [01:20<00:00,  4.03s/it]


### Results on pubmed
L(X)=-19717X
acc: 50.6±4.2 & nmi: 14.8±1.8 & ari: 11.5±2.0
time: 3.93±1.27
db-index: 7.1±0.3
number of clusters: 3.0



100%|█████████████████████████████████████████████| 20/20 [01:28<00:00,  4.44s/it]


### Results on pubmed
L(X)=-500X
acc: 53.0±4.1 & nmi: 15.0±2.5 & ari: 12.2±2.4
time: 4.34±0.81
db-index: 7.1±0.5
number of clusters: 3.0



100%|█████████████████████████████████████████████| 20/20 [01:33<00:00,  4.69s/it]


### Results on pubmed
L(X)=-3X
acc: 53.3±3.3 & nmi: 15.5±1.9 & ari: 12.4±2.1
time: 4.59±1.32
db-index: 6.9±0.2
number of clusters: 3.0



 70%|███████████████████████████████▍             | 14/20 [00:09<00:04,  1.49it/s]

# Sinkhorn 

In [None]:
%cd /content/BCOT
%load_ext autoreload
%autoreload 2

import sys
import warnings
warnings.filterwarnings("ignore")

import scipy.sparse as sp
from bcot.utils import read_dataset
from bcot.bcot import BCOT
from sklearn.metrics import normalized_mutual_info_score as nmi 
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import davies_bouldin_score
from bcot.metrics import clustering_accuracy
import numpy as np
from time import time 

# Sinkhorn experiment: for every dataset, regularisation strength `reg` and
# cost scaling L(X) = -scaler * X, run BCOT `n_runs` times, report mean/std of
# the clustering metrics and optionally plot the block structure.
n_runs = 1
plot_block_structure = True

# Per-run statistics collected for each (dataset, reg, scaler) configuration.
metric_names = ('time', 'acc', 'nmi', 'ari', 'db', 'avg_n_clust')

for dataset in ['wiki']:
  features, labels, n_classes = read_dataset(dataset, sparse=True)
  n, d = features.shape
  k = n_classes

  # The Davies-Bouldin score is computed on the dense feature matrix. The
  # conversion does not depend on reg, scaler or run, so do it once per
  # dataset. If the dense matrix does not fit in memory the score is reported
  # as NaN (best-effort, as before).
  try:
    dense_features = features.toarray()
  except MemoryError:
    dense_features = None

  # NOTE(review): the original grid ended in "1, 1", which ran the reg=1
  # configuration twice. The duplicate is dropped here; if a larger value
  # (e.g. 10) was intended, add it to the list.
  for reg in [.0001, .001, .01, .1, 1]:
    for scaler in [1, n, d, k]:

      metrics = {name: [] for name in metric_names}

      for _ in range(n_runs):
        t0 = time()
        M = -features * scaler

        Z_p, W_p = BCOT(M, n_classes, algorithm='sinkhorn', reg=reg, n_iter=100)

        # Hard assignments: index of the largest entry along the last axis.
        Z = Z_p.argmax(-1)
        W = W_p.argmax(-1)

        # Timing covers cost construction, BCOT and the argmax only; the
        # metric computations below are excluded.
        metrics['time'].append(time()-t0)
        metrics['acc'].append(clustering_accuracy(labels, Z)*100)
        metrics['nmi'].append(nmi(labels, Z)*100)
        metrics['ari'].append(ari(labels, Z)*100)

        db = np.nan
        if dense_features is not None:
          try:
            db = davies_bouldin_score(dense_features, Z)
          except ValueError:
            # NOTE(review): ValueError is what scikit-learn raises for a
            # degenerate labelling (e.g. a single cluster); confirm no other
            # exception type was being relied on by the former bare `except:`.
            db = np.nan
        metrics['db'].append(db)
        metrics['avg_n_clust'].append(len(np.unique(Z)))

      # Timings are rounded to 2 decimals, every other metric to 1.
      # `name` (not `k`) is used as loop variable so the cluster count `k`
      # is not visually shadowed.
      results = {
        'mean': {name: np.mean(vals).round(2 if name == 'time' else 1)
                 for name, vals in metrics.items()},
        'std': {name: np.std(vals).round(2 if name == 'time' else 1)
                for name, vals in metrics.items()}
      }

      means = results['mean']
      std = results['std']
      print(f'### Results on {dataset}')
      print(f'L(X)=-{scaler}X, λ={reg}')
      print(f"{means['acc']}±{std['acc']} & {means['nmi']}±{std['nmi']} & {means['ari']}±{std['ari']}")
      print(f"time: {means['time']}±{std['time']}")
      print(f"db-index: {means['db']}±{std['db']}")
      print(f"number of clusters: {means['avg_n_clust']}")
      print()

      if plot_block_structure:
        import matplotlib.pyplot as plt

        # Visualises Z_p @ W_p.T from the LAST run of this configuration,
        # with rows and columns reordered by their hard assignments.
        fig = plt.figure(figsize=(6, 4))
        ax = fig.add_subplot(111)

        # from bcot.utils import binarize
        # Z_p = binarize(Z_p.argmax(-1), k)
        # W_p = binarize(W_p.argmax(-1), k)

        ax.imshow((Z_p @ W_p.T)[Z.argsort()][:, W.argsort()], interpolation='nearest')
        ax.set_aspect('auto')

        plt.xticks([])
        plt.yticks([])
        # Include scaler and reg in the file name; previously every
        # configuration wrote to the same '<dataset>-block-structure.pdf',
        # so only the last plot survived.
        plt.savefig(f'{dataset}-scaler{scaler}-reg{reg}-block-structure.pdf')
        plt.show()
        # Release the figure; this loop creates one per configuration.
        plt.close(fig)
        