In [10]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Looking in indexes: https://download.pytorch.org/whl/cu126
Collecting torch
  Downloading https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu126/torchvision-0.21.0%2Bcu126-cp312-cp312-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu126/torchaudio-2.6.0%2Bcu126-cp312-cp312-linux_x86_64.whl.metadata (6.6 kB)
Collecting filelock (from torch)
  Downloading https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
Collecting networkx (from torch)
  Downloading https://download.pytorch.org/whl/networkx-3.3-py3-none-any.whl.me

 # EMD

In [2]:
#%cd /content/BCOT
%load_ext autoreload
%autoreload 2
    
import sys

import scipy.sparse as sp
from bcot.utils import read_dataset
from bcot.bcot import BCOT
from sklearn.metrics import davies_bouldin_score
from bcot.metrics import average_pmi_per_cluster
import numpy as np
#import cupy as np
from time import time 
from tqdm import tqdm

# Scratch sanity check: print log(w) next to log1p(w - w/w). Since w/w is 1,
# both lines are expected to show the same number.
w = 11.7
for value in (np.log(w), np.log1p(w - np.divide(w, w))):
  print(value)
print()


# --- EMD experiment -----------------------------------------------------------
# For every dataset and every scaling of the input matrix, run BCOT with
# algorithm='emd' n_runs times and report mean±std of two scores:
#   * 'db'  - davies_bouldin_score of the row assignment Z on the features
#   * 'pmi' - average_pmi_per_cluster of the column assignment W on features.T
n_runs = 10  # independent BCOT runs per (dataset, scaler) setting
plot_block_structure = True  # NOTE(review): not read anywhere in this cell

# Larger sweep kept for reference (adds 'ohscal' and 'ng20'):
#for dataset, n_clusters in zip(['wiki', 'pubmed', 'acm', 'dblp', 'ohscal', 'ng20'], [23, 3, 18, 2, 100, 140]):
for dataset, n_clusters in zip(['wiki', 'pubmed', 'acm', 'dblp'], [23, 3, 18, 2]):
  print(dataset + '\t' + str(n_clusters))
  features, labels, _ = read_dataset(dataset, sparse=True)
  n, d = features.shape  # number of rows, number of columns
  k = n_clusters

  # Sweep the multiplier applied to -features: 1, #rows, #columns, #clusters.
  for scaler in [1, n, d, k]:

    metrics = {'db': [], 'pmi': []}

    for _ in tqdm(range(n_runs)):
      # Rebuilt on every run: BCOT is not visible from this notebook, so it is
      # not assumed to leave its input untouched.
      M = -features * scaler
      Z_p, W_p = BCOT(M, n_clusters, algorithm='emd', reg=1, n_iter=100)

      # Hard assignments: index of the largest entry along the last axis.
      Z = Z_p.argmax(-1)
      W = W_p.argmax(-1)

      # Best effort: a failing DB score is recorded as NaN instead of aborting
      # the sweep. `except Exception` (rather than a bare `except:`) keeps
      # KeyboardInterrupt/SystemExit working, and the reason is reported.
      try:
        metrics['db'].append(davies_bouldin_score(features.toarray(), Z))
      except Exception as err:
        print(f'davies_bouldin_score failed on {dataset} (scaler={scaler}): {err!r}', file=sys.stderr)
        metrics['db'].append(np.nan)
      metrics['pmi'].append(average_pmi_per_cluster(features.T, W))

    # Aggregate over runs. The comprehension variable is deliberately not
    # called `k`, which already names the cluster count above.
    results = {
      'mean': {name: np.mean(vals).round(4) for name, vals in metrics.items()},
      'std': {name: np.std(vals).round(2) for name, vals in metrics.items()}
    }

    means = results['mean']
    std = results['std']
    print(f'### Results on {dataset}')
    print(f'L(X)=-{scaler}X')
    print(f"pmi:{means['pmi']}±{std['pmi']}")
    print(f"db-index: {means['db']}±{std['db']}")
    print()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2.4595888418037104
2.4595888418037104

wiki	23


100%|█████████████████████████████████████████████| 10/10 [01:25<00:00,  8.56s/it]


### Results on wiki
L(X)=-1X
pmi:0.6426±0.02
db-index: 5.7586±0.15



100%|█████████████████████████████████████████████| 10/10 [01:23<00:00,  8.36s/it]


### Results on wiki
L(X)=-2405X
pmi:0.6401±0.02
db-index: 5.7812±0.15



100%|█████████████████████████████████████████████| 10/10 [01:25<00:00,  8.60s/it]


### Results on wiki
L(X)=-4973X
pmi:0.636±0.02
db-index: 5.7651±0.13



100%|█████████████████████████████████████████████| 10/10 [01:30<00:00,  9.00s/it]


### Results on wiki
L(X)=-23X
pmi:0.6332±0.02
db-index: 5.7517±0.12

pubmed	3


100%|█████████████████████████████████████████████| 10/10 [00:49<00:00,  4.91s/it]


### Results on pubmed
L(X)=-1X
pmi:0.5551±0.03
db-index: 7.0678±0.35



100%|█████████████████████████████████████████████| 10/10 [00:36<00:00,  3.63s/it]


### Results on pubmed
L(X)=-19717X
pmi:0.5697±0.01
db-index: 6.9578±0.23



100%|█████████████████████████████████████████████| 10/10 [00:33<00:00,  3.31s/it]


### Results on pubmed
L(X)=-500X
pmi:0.5626±0.01
db-index: 6.9977±0.2



100%|█████████████████████████████████████████████| 10/10 [00:38<00:00,  3.87s/it]


### Results on pubmed
L(X)=-3X
pmi:0.5299±0.04
db-index: 7.228±0.57

acm	18


100%|█████████████████████████████████████████████| 10/10 [00:19<00:00,  1.99s/it]


### Results on acm
L(X)=-1X
pmi:0.2691±0.01
db-index: 10.4101±0.19



100%|█████████████████████████████████████████████| 10/10 [00:20<00:00,  2.00s/it]


### Results on acm
L(X)=-3025X
pmi:0.2714±0.01
db-index: 10.3857±0.1



100%|█████████████████████████████████████████████| 10/10 [00:20<00:00,  2.00s/it]


### Results on acm
L(X)=-1870X
pmi:0.2713±0.0
db-index: 10.3467±0.19



100%|█████████████████████████████████████████████| 10/10 [00:20<00:00,  2.00s/it]


### Results on acm
L(X)=-18X
pmi:0.2745±0.0
db-index: 10.2382±0.11

dblp	2


100%|█████████████████████████████████████████████| 10/10 [00:01<00:00,  9.72it/s]


### Results on dblp
L(X)=-1X
pmi:0.2137±0.04
db-index: 7.6442±0.66



100%|█████████████████████████████████████████████| 10/10 [00:01<00:00,  6.16it/s]


### Results on dblp
L(X)=-4057X
pmi:0.2162±0.03
db-index: 7.3792±0.66



100%|█████████████████████████████████████████████| 10/10 [00:01<00:00,  7.06it/s]


### Results on dblp
L(X)=-334X
pmi:0.1931±0.04
db-index: 7.2328±0.91



100%|█████████████████████████████████████████████| 10/10 [00:01<00:00,  9.10it/s]

### Results on dblp
L(X)=-2X
pmi:0.2047±0.04
db-index: 7.4938±0.89






# Sinkhorn 

In [None]:
%cd /content/BCOT
%load_ext autoreload
%autoreload 2

import sys

import scipy.sparse as sp
from bcot.utils import read_dataset
from bcot.bcot import BCOT
from sklearn.metrics import davies_bouldin_score
from bcot.metrics import average_pmi_per_cluster
import numpy as np
from time import time 

# --- Sinkhorn experiment ------------------------------------------------------
# For every dataset, every scaling of the input matrix and every `reg` value,
# run BCOT with algorithm='sinkhorn' n_runs times and report mean±std of:
#   * 'db'  - davies_bouldin_score of the row assignment Z on the features
#   * 'pmi' - average_pmi_per_cluster of the column assignment W on features.T
n_runs = 2  # independent BCOT runs per (dataset, scaler, reg) setting
plot_block_structure = True  # NOTE(review): not read anywhere in this cell

# Values passed to BCOT as `reg`. The original list ended in a second `1`,
# which only re-ran an identical configuration; it is dropped here.
# NOTE(review): the duplicate may have been meant as 10 - confirm and add it.
reg_values = [.0001, .001, .01, .1, 1]

for dataset, n_clusters in zip(['wiki', 'pubmed', 'acm', 'dblp', 'ohscal', 'ng20'], [23, 3, 18, 2, 100, 140]):
  features, labels, _ = read_dataset(dataset, sparse=True)
  n, d = features.shape  # number of rows, number of columns
  k = n_clusters

  # Sweep the multiplier applied to -features: 1, #rows, #columns, #clusters.
  for scaler in [1, n, d, k]:
    for reg in reg_values:
      metrics = {'db': [], 'pmi': []}

      for _ in range(n_runs):
        # Rebuilt on every run: BCOT is not visible from this notebook, so it
        # is not assumed to leave its input untouched.
        M = -features * scaler

        Z_p, W_p = BCOT(M, n_clusters, algorithm='sinkhorn', reg=reg, n_iter=100)

        # Hard assignments: index of the largest entry along the last axis.
        Z = Z_p.argmax(-1)
        W = W_p.argmax(-1)

        # Best effort: a failing DB score is recorded as NaN instead of
        # aborting the sweep. `except Exception` (rather than a bare `except:`)
        # keeps KeyboardInterrupt/SystemExit working, and the reason is shown.
        try:
          metrics['db'].append(davies_bouldin_score(features.toarray(), Z))
        except Exception as err:
          print(f'davies_bouldin_score failed on {dataset} (scaler={scaler}, reg={reg}): {err!r}', file=sys.stderr)
          metrics['db'].append(np.nan)
        metrics['pmi'].append(average_pmi_per_cluster(features.T, W))

      # Aggregate over runs. The comprehension variable is deliberately not
      # called `k`, which already names the cluster count above.
      results = {
        'mean': {name: np.mean(vals).round(4) for name, vals in metrics.items()},
        'std': {name: np.std(vals).round(2) for name, vals in metrics.items()}
      }

      means = results['mean']
      std = results['std']
      print(f'### Results on {dataset}')
      print(f'L(X)=-{scaler}X')
      # Without this line the result blocks for different `reg` values are
      # indistinguishable in the output.
      print(f'reg={reg}')
      print(f"pmi:{means['pmi']}±{std['pmi']}")
      print(f"db-index: {means['db']}±{std['db']}")
      print()