 # EMD

In [1]:
%cd /content/BCOT
%load_ext autoreload
%autoreload 2

import sys

import scipy.sparse as sp
from bcot.utils import read_dataset
from bcot.bcot import BCOT
from sklearn.metrics import davies_bouldin_score
from bcot.metrics import average_pmi_per_cluster
import numpy as np
from time import time 

# Experiment sweep for BCOT with algorithm='emd': for every dataset and every
# scaling factor of the negated feature matrix, run BCOT `n_runs` times and
# report mean/std of the Davies-Bouldin index and the average PMI per cluster.
#
# NOTE(review): the output recorded for this cell ends in
# "TypeError: loop of ufunc does not support argument 0 of type coo_matrix
# which has no callable log method". The failing call is not shown in the
# captured output, so confirm whether BCOT or average_pmi_per_cluster needs a
# dense input with the installed SciPy version.
n_runs = 2  # repetitions per (dataset, scaler) configuration
plot_block_structure = True  # not read anywhere in this cell

# Each dataset name is paired with the number of clusters requested from BCOT.
for dataset, n_clusters in zip(['wiki', 'pubmed', 'acm', 'dblp', 'ohscal', 'ng20'], [23, 3, 18, 2, 100, 140]):
  features, labels, _ = read_dataset(dataset, sparse=True)
  n, d = features.shape
  k = n_clusters

  # Sweep the factor applied to the negated features: L(X) = -scaler * X.
  for scaler in [1, n, d, k]:

    # One list of per-run scores for each reported metric.
    metrics = {}
    metrics['db'] = []
    metrics['pmi'] = []

    for _ in range(n_runs):
      M = -features * scaler

      Z_p, W_p = BCOT(M, n_clusters, algorithm='emd', reg=1, n_iter=100)

      # Hard assignments: index of the largest entry along the last axis.
      Z = Z_p.argmax(-1)
      W = W_p.argmax(-1)

      # Best-effort DB index: a failed computation is recorded as NaN so the
      # sweep keeps going. `except Exception` (rather than a bare `except`)
      # still lets KeyboardInterrupt/SystemExit stop the run.
      try:
        metrics['db'].append(davies_bouldin_score(features.toarray(), Z))
      except Exception:
        metrics['db'].append(np.nan)
      metrics['pmi'].append(average_pmi_per_cluster(features.T, W))

    # Aggregate over runs. The comprehension variable is deliberately not
    # called `k`, to avoid confusion with the cluster count above.
    results = {
      'mean': {name: np.mean(scores).round(4) for name, scores in metrics.items()},
      'std': {name: np.std(scores).round(2) for name, scores in metrics.items()}
    }

    means = results['mean']
    std = results['std']
    print(f'### Results on {dataset}')
    print(f'L(X)=-{scaler}X')
    print(f"pmi:{means['pmi']}±{std['pmi']}")
    print(f"db-index: {means['db']}±{std['db']}")
    print()

[Errno 2] No such file or directory: '/content/BCOT'
/volume1/scratch/zopdebee/GitHub/HeNCler/BCOT-main


TypeError: loop of ufunc does not support argument 0 of type coo_matrix which has no callable log method

# Sinkhorn 

In [None]:
%cd /content/BCOT
%load_ext autoreload
%autoreload 2

import sys

import scipy.sparse as sp
from bcot.utils import read_dataset
from bcot.bcot import BCOT
from sklearn.metrics import davies_bouldin_score
from bcot.metrics import average_pmi_per_cluster
import numpy as np
from time import time 

# Experiment sweep for BCOT with algorithm='sinkhorn': for every dataset,
# every scaling factor of the negated feature matrix and every `reg` value,
# run BCOT `n_runs` times and report mean/std of the Davies-Bouldin index and
# the average PMI per cluster.
n_runs = 2  # repetitions per (dataset, scaler, reg) configuration
plot_block_structure = True  # not read anywhere in this cell

# Each dataset name is paired with the number of clusters requested from BCOT.
for dataset, n_clusters in zip(['wiki', 'pubmed', 'acm', 'dblp', 'ohscal', 'ng20'], [23, 3, 18, 2, 100, 140]):
  features, labels, _ = read_dataset(dataset, sparse=True)
  n, d = features.shape
  k = n_clusters

  # Sweep the factor applied to the negated features: L(X) = -scaler * X.
  for scaler in [1, n, d, k]:
    # Values passed to BCOT as `reg`.
    # NOTE(review): the grid previously listed 1 twice, which only repeated
    # the same configuration; if the last entry was meant to be 10, add it.
    for reg in [.0001, .001, .01, .1, 1]:
      # One list of per-run scores for each reported metric.
      metrics = {}
      metrics['db'] = []
      metrics['pmi'] = []

      for _ in range(n_runs):
        M = -features * scaler

        Z_p, W_p = BCOT(M, n_clusters, algorithm='sinkhorn', reg=reg, n_iter=100)

        # Hard assignments: index of the largest entry along the last axis.
        Z = Z_p.argmax(-1)
        W = W_p.argmax(-1)

        # Best-effort DB index: a failed computation is recorded as NaN so
        # the sweep keeps going. `except Exception` (rather than a bare
        # `except`) still lets KeyboardInterrupt/SystemExit stop the run.
        try:
          metrics['db'].append(davies_bouldin_score(features.toarray(), Z))
        except Exception:
          metrics['db'].append(np.nan)
        metrics['pmi'].append(average_pmi_per_cluster(features.T, W))

      # Aggregate over runs. The comprehension variable is deliberately not
      # called `k`, to avoid confusion with the cluster count above.
      results = {
        'mean': {name: np.mean(scores).round(4) for name, scores in metrics.items()},
        'std': {name: np.std(scores).round(2) for name, scores in metrics.items()}
      }

      means = results['mean']
      std = results['std']
      print(f'### Results on {dataset}')
      print(f'L(X)=-{scaler}X')
      # Without this line the blocks printed for different `reg` values of
      # the same (dataset, scaler) pair cannot be told apart.
      print(f'reg={reg}')
      print(f"pmi:{means['pmi']}±{std['pmi']}")
      print(f"db-index: {means['db']}±{std['db']}")
      print()