 # EMD

In [1]:
# tqdm supplies the progress bar wrapped around the run loop of the EMD experiment cell.
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [12]:
%cd /content/BCOT
%load_ext autoreload
%autoreload 2
    
import sys

import scipy.sparse as sp
from bcot.utils import read_dataset
from bcot.bcot import BCOT
from sklearn.metrics import normalized_mutual_info_score as nmi 
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import davies_bouldin_score
from bcot.metrics import clustering_accuracy
import numpy as np
from time import time 
from tqdm import tqdm

# --- EMD experiment -----------------------------------------------------------
# For every dataset and every scaling of the negated feature matrix, run BCOT
# with the 'emd' algorithm n_runs times, score the resulting row clustering
# against the ground-truth labels, and print mean/std of each metric.
n_runs = 20                   # number of BCOT runs per (dataset, scaler) setting
plot_block_structure = False  # True: also save and show the block-structure figure

# Alternative dataset selections kept for convenience:
#for dataset in ['wiki', 'pubmed', 'ng20']:
#for dataset in ['wiki', 'pubmed', 'acm', 'dblp', 'ohscal', 'ng20']:
for dataset in ['wiki']:
  features, labels, n_classes = read_dataset(dataset, sparse=True)
  n, d = features.shape
  k = n_classes

  # The matrix handed to BCOT is M = -scaler * features; sweep the scaler over
  # 1, the number of rows, the number of columns and the number of classes.
  for scaler in [1, n, d, k]:

    # One list of per-run values per metric (insertion order matters only for display).
    metrics = {name: [] for name in ('time', 'acc', 'nmi', 'ari', 'db', 'avg_n_clust')}

    for _ in tqdm(range(n_runs)):
      t0 = time()
      M = -features * scaler

      # Z_p / W_p: presumably soft row / column cluster memberships -- confirm
      # against bcot.bcot.BCOT.
      Z_p, W_p = BCOT(M, n_classes, algorithm='emd', reg=1, n_iter=100)

      # Hard assignments: position of the largest entry along the last axis.
      Z = Z_p.argmax(-1)
      W = W_p.argmax(-1)

      # The timer stops here, so 'time' covers building M, the BCOT call and
      # the argmax, but none of the scoring below.
      metrics['time'].append(time()-t0)
      # acc / nmi / ari are stored scaled by 100.
      metrics['acc'].append(clustering_accuracy(labels, Z)*100)
      metrics['nmi'].append(nmi(labels, Z)*100)
      metrics['ari'].append(ari(labels, Z)*100)
      try:
        metrics['db'].append(davies_bouldin_score(features.toarray(), Z))
      except Exception:
        # Best-effort metric: record NaN when the score cannot be computed
        # (e.g. scikit-learn rejects a labelling with a single cluster).
        # Catching Exception rather than using a bare except keeps
        # KeyboardInterrupt / SystemExit able to stop a long sweep.
        metrics['db'].append(np.nan)
      metrics['avg_n_clust'].append(len(np.unique(Z)))

    # Aggregate over runs; 'time' is rounded to 2 decimals, everything else to 1.
    # The comprehension variables are deliberately not called `k`, which is the
    # number of classes used in the scaler sweep above.
    results = {
      'mean': {name: np.mean(vals).round(1 if name != 'time' else 2) for name, vals in metrics.items()},
      'std': {name: np.std(vals).round(1 if name != 'time' else 2) for name, vals in metrics.items()}
    }

    means = results['mean']
    std = results['std']
    print(f'### Results on {dataset}')
    print(f'L(X)=-{scaler}X')
    # Compact variant of the next line without the metric names:
    #print(f"{means['acc']}±{std['acc']} & {means['nmi']}±{std['nmi']} & {means['ari']}±{std['ari']}", sep=',')
    print(f"acc: {means['acc']}±{std['acc']} & nmi: {means['nmi']}±{std['nmi']} & ari: {means['ari']}±{std['ari']}")
    print(f"time: {means['time']}±{std['time']}")
    print(f"db-index: {means['db']}±{std['db']}")
    print(f"number of clusters: {means['avg_n_clust']}")
    print()

    if plot_block_structure:
      # Imported lazily so matplotlib is only required when plotting is enabled.
      import matplotlib.pyplot as plt

      fig, ax = plt.subplots(figsize=(6, 4))

      # from bcot.utils import binarize
      # Z_p = binarize(Z_p.argmax(-1), k)
      # W_p = binarize(W_p.argmax(-1), k)

      # Uses Z_p / W_p / Z / W of the LAST run only; rows and columns are
      # reordered by their hard cluster index so same-cluster entries are adjacent.
      ax.imshow((Z_p @ W_p.T)[Z.argsort()][:, W.argsort()], interpolation='nearest')
      ax.set_aspect('auto')

      ax.set_xticks([])
      ax.set_yticks([])
      # NOTE(review): the file name depends on the dataset only, so each scaler
      # iteration overwrites the PDF written by the previous one -- confirm intended.
      plt.savefig(f'{dataset}-block-structure.pdf')
      plt.show()
      # Release the figure; this block runs once per (dataset, scaler) setting.
      plt.close(fig)

[Errno 2] No such file or directory: '/content/BCOT'
/volume1/scratch/zopdebee/GitHub/HeNCler/BCOT-main
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


100%|█████████████████████████████████████████████| 20/20 [00:51<00:00,  2.59s/it]


### Results on wiki
L(X)=-1X
acc: 48.4±3.5 & nmi: 47.1±2.5 & ari: 29.4±2.8
time: 2.44±0.45
db-index: 5.9±0.2
number of clusters: 17.0



100%|█████████████████████████████████████████████| 20/20 [01:02<00:00,  3.13s/it]


### Results on wiki
L(X)=-2405X
acc: 49.3±2.6 & nmi: 47.7±1.9 & ari: 30.2±2.0
time: 2.89±0.51
db-index: 5.9±0.2
number of clusters: 17.0



100%|█████████████████████████████████████████████| 20/20 [01:08<00:00,  3.43s/it]


### Results on wiki
L(X)=-4973X
acc: 49.5±1.9 & nmi: 47.6±1.1 & ari: 30.2±1.5
time: 3.05±0.5
db-index: 5.9±0.2
number of clusters: 17.0



100%|█████████████████████████████████████████████| 20/20 [01:04<00:00,  3.23s/it]

### Results on wiki
L(X)=-17X
acc: 48.6±3.3 & nmi: 47.1±2.0 & ari: 29.5±2.3
time: 2.88±0.59
db-index: 5.9±0.2
number of clusters: 17.0






# Sinkhorn 

In [3]:
%cd /content/BCOT
%load_ext autoreload
%autoreload 2

import sys
import warnings
warnings.filterwarnings("ignore")

import scipy.sparse as sp
from bcot.utils import read_dataset
from bcot.bcot import BCOT
from sklearn.metrics import normalized_mutual_info_score as nmi 
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import davies_bouldin_score
from bcot.metrics import clustering_accuracy
import numpy as np
from time import time 

# --- Sinkhorn experiment --------------------------------------------------------
# For every dataset, regularisation value and scaling of the negated feature
# matrix, run BCOT with the 'sinkhorn' algorithm n_runs times, score the
# resulting row clustering against the ground-truth labels, and print mean/std
# of each metric.
n_runs = 1                    # number of BCOT runs per (dataset, reg, scaler) setting
plot_block_structure = False  # True: also save and show the block-structure figure

for dataset in ['dblp']:
  features, labels, n_classes = read_dataset(dataset, sparse=True)
  n, d = features.shape
  k = n_classes

  # Regularisation values passed to BCOT as `reg` (printed as λ below).
  # NOTE(review): 1 is listed twice, so the λ=1 settings are run and reported
  # twice -- possibly a typo for 10; confirm before changing the grid.
  for reg in [.0001, .001, .01, .1, 1, 1]:
    # The matrix handed to BCOT is M = -scaler * features; sweep the scaler over
    # 1, the number of rows, the number of columns and the number of classes.
    for scaler in [1, n, d, k]:

      # One list of per-run values per metric (insertion order matters only for display).
      metrics = {name: [] for name in ('time', 'acc', 'nmi', 'ari', 'db', 'avg_n_clust')}

      for _ in range(n_runs):
        t0 = time()
        M = -features * scaler

        # Z_p / W_p: presumably soft row / column cluster memberships -- confirm
        # against bcot.bcot.BCOT.
        Z_p, W_p = BCOT(M, n_classes, algorithm='sinkhorn', reg=reg, n_iter=100)

        # Hard assignments: position of the largest entry along the last axis.
        Z = Z_p.argmax(-1)
        W = W_p.argmax(-1)

        # The timer stops here, so 'time' covers building M, the BCOT call and
        # the argmax, but none of the scoring below.
        metrics['time'].append(time()-t0)
        # acc / nmi / ari are stored scaled by 100.
        metrics['acc'].append(clustering_accuracy(labels, Z)*100)
        metrics['nmi'].append(nmi(labels, Z)*100)
        metrics['ari'].append(ari(labels, Z)*100)
        try:
          metrics['db'].append(davies_bouldin_score(features.toarray(), Z))
        except Exception:
          # Best-effort metric: record NaN when the score cannot be computed
          # (the recorded output shows NaN whenever only one cluster is found).
          # Catching Exception rather than using a bare except keeps
          # KeyboardInterrupt / SystemExit able to stop a long sweep.
          metrics['db'].append(np.nan)
        metrics['avg_n_clust'].append(len(np.unique(Z)))

      # Aggregate over runs; 'time' is rounded to 2 decimals, everything else to 1.
      # The comprehension variables are deliberately not called `k`, which is the
      # number of classes used in the scaler sweep above.
      results = {
        'mean': {name: np.mean(vals).round(1 if name != 'time' else 2) for name, vals in metrics.items()},
        'std': {name: np.std(vals).round(1 if name != 'time' else 2) for name, vals in metrics.items()}
      }

      means = results['mean']
      std = results['std']
      print(f'### Results on {dataset}')
      print(f'L(X)=-{scaler}X, λ={reg}')
      # Compact variant of the next line without the metric names:
      #print(f"{means['acc']}±{std['acc']} & {means['nmi']}±{std['nmi']} & {means['ari']}±{std['ari']}", sep=',')
      print(f"acc: {means['acc']}±{std['acc']} & nmi: {means['nmi']}±{std['nmi']} & ari: {means['ari']}±{std['ari']}")
      print(f"time: {means['time']}±{std['time']}")
      print(f"db-index: {means['db']}±{std['db']}")
      print(f"number of clusters: {means['avg_n_clust']}")
      print()

      if plot_block_structure:
        # Imported lazily so matplotlib is only required when plotting is enabled.
        import matplotlib.pyplot as plt

        fig, ax = plt.subplots(figsize=(6, 4))

        # from bcot.utils import binarize
        # Z_p = binarize(Z_p.argmax(-1), k)
        # W_p = binarize(W_p.argmax(-1), k)

        # Uses Z_p / W_p / Z / W of the LAST run only; rows and columns are
        # reordered by their hard cluster index so same-cluster entries are adjacent.
        ax.imshow((Z_p @ W_p.T)[Z.argsort()][:, W.argsort()], interpolation='nearest')
        ax.set_aspect('auto')

        ax.set_xticks([])
        ax.set_yticks([])
        # NOTE(review): the file name depends on the dataset only, so each
        # (reg, scaler) iteration overwrites the PDF written by the previous
        # one -- confirm intended.
        plt.savefig(f'{dataset}-block-structure.pdf')
        plt.show()
        # Release the figure; this block runs once per (dataset, reg, scaler) setting.
        plt.close(fig)
        

[Errno 2] No such file or directory: '/content/BCOT'
/volume1/scratch/zopdebee/GitHub/HeNCler/BCOT-main
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
### Results on dblp
L(X)=-1X, λ=0.0001
acc: 29.5±0.0 & nmi: 0.0±0.0 & ari: 0.0±0.0
time: 0.06±0.0
db-index: nan±nan
number of clusters: 1.0

### Results on dblp
L(X)=-4057X, λ=0.0001
acc: 29.5±0.0 & nmi: 0.0±0.0 & ari: 0.0±0.0
time: 0.06±0.0
db-index: nan±nan
number of clusters: 1.0

### Results on dblp
L(X)=-334X, λ=0.0001
acc: 29.5±0.0 & nmi: 0.0±0.0 & ari: 0.0±0.0
time: 0.06±0.0
db-index: nan±nan
number of clusters: 1.0

### Results on dblp
L(X)=-4X, λ=0.0001
acc: 29.5±0.0 & nmi: 0.0±0.0 & ari: 0.0±0.0
time: 0.06±0.0
db-index: nan±nan
number of clusters: 1.0

### Results on dblp
L(X)=-1X, λ=0.001
acc: 49.5±0.0 & nmi: 19.4±0.0 & ari: 18.7±0.0
time: 0.62±0.0
db-index: 7.1±0.0
number of clusters: 4.0

### Results on dblp
L(X)=-4057X, λ=0.001
acc: 29.5±0.0 & nmi: 0.0±0.0 & ari: 0.0±0.0
time: 0.06±0