# Clustering experiment

Dataset links:
* [ecoli](https://archive.ics.uci.edu/ml/datasets/ecoli)
* [olive](https://www.scss.tcd.ie/~arwhite/Teaching/STU33011/Lab4.html)
* [satellite](https://archive.ics.uci.edu/ml/datasets/Statlog+(Landsat+Satellite))
* [vehicle](https://archive.ics.uci.edu/ml/datasets/Statlog+(Vehicle+Silhouettes))
* [berkeley](https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/bsds/)

In [1]:
import numpy as np
from sklearn import preprocessing, datasets, cluster
from sklearn.mixture import GaussianMixture

from mixes import GMN, DGMM, GMM, stopping_criterion
from utils import *

%load_ext autoreload
%autoreload 2

## Wine dataset
(Num datapoints: `178`, num features: `13`, num clusters: `3`)

In [206]:
# Wine experiment: load the scikit-learn wine data and standardize every
# feature to zero mean / unit variance before clustering.
wine_data, wine_labels = datasets.load_wine(return_X_y=True)
wine_data = preprocessing.scale(wine_data)

# Seed NumPy's global RNG so re-running this cell starts from the same random
# state no matter which cells were executed before it. scikit-learn estimators
# built with random_state=None draw from this generator; whether `mixes` uses
# the same generator is not visible from this notebook (TODO confirm).
np.random.seed(0)


def wine_deep_model(model_cls, **annealing):
    """Build a model factory that uses the shared wine hyper-parameters.

    Args:
        model_cls: Model class to instantiate (DGMM or GMN).
        **annealing: Annealing-related keyword arguments forwarded unchanged
            to the model constructor.

    Returns:
        A one-argument callable that takes an evaluator and returns a new,
        unfitted ``model_cls`` instance.
    """
    return lambda e: model_cls([3, 1], [3, 2], init='kmeans',
                               update_rate=0.1, num_iter=200,
                               evaluator=e, **annealing)


# Every entry takes an evaluator and returns a fresh model. KMeans accepts the
# argument only to match that call signature and ignores it.
alg_functions = [
    lambda e: cluster.KMeans(n_clusters=3, n_init=1, max_iter=100),
    lambda e: GMM(3, init='kmeans', update_rate=1, num_iter=100, evaluator=e),
    wine_deep_model(DGMM, use_annealing=False),
    wine_deep_model(DGMM, use_annealing=True, annealing_start_v=0.5),
    wine_deep_model(GMN, use_annealing=False),
    wine_deep_model(GMN, use_annealing=True, annealing_start_v=0.5),
]
alg_names = ["KMeans", "GMM", "DGMM", "DGMM ann", "GMN", "GMN ann"]
# Names label the rows of the result table, so the two lists must line up.
assert len(alg_functions) == len(alg_names)

wine_result = test_algorithms_on_data(alg_functions, alg_names,
                                      wine_data, wine_labels, num_repeats=10)
wine_result

Num datapoints: 178, num features: 13, num clusters: 3
Alg KMeans run  10 /  10
Alg GMM run  10 /  10
Alg DGMM run  10 /  10
Alg DGMM ann run  10 /  10
Alg GMN run  10 /  10
Alg GMN ann run  10 /  10


Unnamed: 0,sil,sil best,m.r.,m.r. best,ari,ari best,log lik,log lik best
KMeans,0.283272,0.284859,0.041573,0.033708,0.873413,0.897495,-,-
GMM,0.278107,0.278107,0.016854,0.016854,0.945885,0.945885,-2073.051599,-2073.051599
DGMM,0.27885,0.280492,0.007865,0.011236,0.975108,0.963651,-2307.503359,-2227.307357
DGMM ann,0.278271,0.27978,0.007865,0.0,0.974955,1.0,-2283.907814,-2230.823199
GMN,0.278381,0.280008,0.011798,0.016854,0.962416,0.948536,-2282.672645,-2245.640753
GMN ann,0.27864,0.27978,0.005618,0.0,0.982537,1.0,-2298.617126,-2216.986327


## Ecoli dataset
(Num datapoints: `336`, num features: `7`, num clusters: `8`)

In [208]:
# Ecoli experiment: load the data through the project helper and standardize
# every feature to zero mean / unit variance before clustering.
ecoli_data, ecoli_labels = load_ecoli()
ecoli_data = preprocessing.scale(ecoli_data)

# Seed NumPy's global RNG so re-running this cell starts from the same random
# state no matter which cells were executed before it. scikit-learn estimators
# built with random_state=None draw from this generator; whether `mixes` uses
# the same generator is not visible from this notebook (TODO confirm).
np.random.seed(0)


def ecoli_deep_model(model_cls, **annealing):
    """Build a model factory that uses the shared ecoli hyper-parameters.

    Args:
        model_cls: Model class to instantiate (DGMM or GMN).
        **annealing: Annealing-related keyword arguments forwarded unchanged
            to the model constructor.

    Returns:
        A one-argument callable that takes an evaluator and returns a new,
        unfitted ``model_cls`` instance.
    """
    return lambda e: model_cls([8, 4, 1], [6, 5, 4], init='kmeans',
                               update_rate=1e-3, num_iter=200, evaluator=e,
                               var_regularization=1e-10, **annealing)


# Every entry takes an evaluator and returns a fresh model. The two baselines
# ignore the evaluator. The "GMM" row here is scikit-learn's GaussianMixture,
# not the GMM class from `mixes`.
alg_functions = [
    lambda e: cluster.KMeans(n_clusters=8, n_init=1, max_iter=100),
    lambda e: GaussianMixture(8, max_iter=100),
    ecoli_deep_model(DGMM, use_annealing=False),
    ecoli_deep_model(DGMM, use_annealing=True, annealing_start_v=0.7),
    ecoli_deep_model(GMN, use_annealing=False),
    ecoli_deep_model(GMN, use_annealing=True, annealing_start_v=0.7),
]
alg_names = ["KMeans", "GMM", "DGMM", "DGMM ann", "GMN", "GMN ann"]
# Names label the rows of the result table, so the two lists must line up.
assert len(alg_functions) == len(alg_names)

ecoli_result = test_algorithms_on_data(alg_functions, alg_names,
                                       ecoli_data, ecoli_labels, num_repeats=10)
ecoli_result

Num datapoints: 336, num features: 7, num clusters: 8
Alg KMeans run  10 /  10
Alg GMM run  10 /  10
Alg DGMM run  10 /  10
Alg DGMM ann run  10 /  10
Alg GMN run  10 /  10
Alg GMN ann run  10 /  10


Unnamed: 0,sil,sil best,m.r.,m.r. best,ari,ari best,log lik,log lik best
KMeans,0.27165,0.349556,0.335119,0.1875,0.528851,0.717215,-,-
GMM,0.200657,0.268006,0.239286,0.247024,0.653183,0.646588,-,-
DGMM,0.336975,0.350948,0.180952,0.172619,0.749611,0.770124,35.302275,122.020485
DGMM ann,0.334669,0.350032,0.190179,0.175595,0.734427,0.768221,-177.752903,-136.358985
GMN,0.347735,0.355547,0.173214,0.157738,0.764307,0.789363,-74.111751,-2.14935
GMN ann,0.335814,0.345483,0.187202,0.178571,0.736743,0.756,-274.026009,-233.98987


## Satellite dataset
(Num datapoints: `6435`, num features: `36`, num clusters: `6`)

In [204]:
# Satellite experiment: load the data through the project helper and
# standardize every feature to zero mean / unit variance before clustering.
satellite_data, satellite_labels = load_satellite()
satellite_data = preprocessing.scale(satellite_data)
# One criterion object is shared by every deep model and every repeat.
# NOTE(review): if the criterion keeps state between calls, sharing it would
# couple the runs - confirm against `mixes.stopping_criterion`.
stopping_crit = stopping_criterion.create_log_lik_criterion(1e-2)

# Seed NumPy's global RNG so re-running this cell starts from the same random
# state no matter which cells were executed before it. scikit-learn estimators
# built with random_state=None draw from this generator; whether `mixes` uses
# the same generator is not visible from this notebook (TODO confirm).
np.random.seed(0)


def satellite_deep_model(model_cls, **annealing):
    """Build a model factory that uses the shared satellite hyper-parameters.

    Args:
        model_cls: Model class to instantiate (DGMM or GMN).
        **annealing: Annealing-related keyword arguments forwarded unchanged
            to the model constructor.

    Returns:
        A one-argument callable that takes an evaluator and returns a new,
        unfitted ``model_cls`` instance.
    """
    return lambda e: model_cls([6, 5, 1], [13, 5, 1], init='kmeans',
                               update_rate=0.1, num_iter=100, evaluator=e,
                               var_regularization=0.05,
                               stopping_criterion=stopping_crit, **annealing)


# Every entry takes an evaluator and returns a fresh model. The two baselines
# ignore the evaluator. The "GMM" row here is scikit-learn's GaussianMixture,
# not the GMM class from `mixes`.
alg_functions = [
    lambda e: cluster.KMeans(n_clusters=6, n_init=1, max_iter=100),
    lambda e: GaussianMixture(6, max_iter=100),
    satellite_deep_model(DGMM, use_annealing=False),
    satellite_deep_model(DGMM, use_annealing=True, annealing_start_v=0.5),
    satellite_deep_model(GMN, use_annealing=False),
    satellite_deep_model(GMN, use_annealing=True, annealing_start_v=0.5),
]
alg_names = ["KMeans", "GMM", "DGMM", "DGMM ann", "GMN", "GMN ann"]
# Names label the rows of the result table, so the two lists must line up.
assert len(alg_functions) == len(alg_names)

satellite_result = test_algorithms_on_data(alg_functions, alg_names,
                                           satellite_data, satellite_labels, num_repeats=10)
satellite_result

Num datapoints: 6435, num features: 36, num clusters: 6
Alg KMeans run  10 /  10
Alg GMM run  10 /  10
Alg DGMM run  10 /  10
Alg DGMM ann run  10 /  10
Alg GMN run  10 /  10
Alg GMN ann run  10 /  10


Unnamed: 0,sil,sil best,m.r.,m.r. best,ari,ari best,log lik,log lik best
KMeans,0.352098,0.352316,0.321197,0.321523,0.529934,0.529496,-,-
GMM,0.084326,0.086656,0.413302,0.405905,0.466858,0.467825,-,-
DGMM,0.235615,0.277924,0.282145,0.292308,0.571499,0.568355,-37147.434483,-34689.404335
DGMM ann,0.236024,0.259256,0.279611,0.280963,0.578266,0.573748,-36394.537222,-32938.001466
GMN,0.181137,0.200782,0.301445,0.32634,0.535684,0.489159,-33000.603463,-31244.530441
GMN ann,0.148977,0.207256,0.304848,0.21927,0.523701,0.630084,-32625.665866,-30385.554444


## Digits dataset
(Num datapoints: `1797`, num features: `64`, num clusters: `10`)


In [150]:
# Digits experiment: load the scikit-learn digits data.
# NOTE(review): unlike the other experiments in this notebook, the features
# are not passed through preprocessing.scale here - confirm this is intended.
digits_data, digits_labels = datasets.load_digits(return_X_y=True)
# One criterion object is shared by every deep model and every repeat.
# NOTE(review): if the criterion keeps state between calls, sharing it would
# couple the runs - confirm against `mixes.stopping_criterion`.
stopping_crit = stopping_criterion.create_log_lik_criterion(0.05)

# Seed NumPy's global RNG so re-running this cell starts from the same random
# state no matter which cells were executed before it. scikit-learn estimators
# built with random_state=None draw from this generator; whether `mixes` uses
# the same generator is not visible from this notebook (TODO confirm).
np.random.seed(0)


def digits_deep_model(model_cls, **annealing):
    """Build a model factory that uses the shared digits hyper-parameters.

    Args:
        model_cls: Model class to instantiate (DGMM or GMN).
        **annealing: Annealing-related keyword arguments forwarded unchanged
            to the model constructor. When empty, the constructor's own
            annealing defaults apply.

    Returns:
        A one-argument callable that takes an evaluator and returns a new,
        unfitted ``model_cls`` instance.
    """
    return lambda e: model_cls([10, 5, 2], [10, 6, 2], init='kmeans',
                               update_rate=1, num_iter=100, evaluator=e,
                               var_regularization=2e-3,
                               stopping_criterion=stopping_crit, **annealing)


# Every entry takes an evaluator and returns a fresh model. The two baselines
# ignore the evaluator. The "GMM" row here is scikit-learn's GaussianMixture,
# not the GMM class from `mixes`.
alg_functions = [
    lambda e: cluster.KMeans(n_clusters=10, n_init=1, max_iter=100),
    lambda e: GaussianMixture(10, max_iter=100),
    digits_deep_model(DGMM),
    digits_deep_model(DGMM, use_annealing=True, annealing_start_v=0.5),
    digits_deep_model(GMN),
    digits_deep_model(GMN, use_annealing=True, annealing_start_v=0.5),
]
alg_names = ["KMeans", "GMM", "DGMM", "DGMM ann", "GMN", "GMN ann"]
# Names label the rows of the result table, so the two lists must line up.
assert len(alg_functions) == len(alg_names)

digits_result = test_algorithms_on_data(alg_functions, alg_names,
                                        digits_data, digits_labels, num_repeats=10)
digits_result

Num datapoints: 1797, num features: 64, num clusters: 10
Alg KMeans run  10 /  10
Alg GMM run  10 /  10
Alg DGMM run   9 /  10

  exp_v /= denom
  exp_vv /= denom
  exp_w /= denom
  exp_vw /= denom
  exp_ww /= denom


Alg DGMM run  10 /  10
Alg DGMM ann run  10 /  10
Alg GMN run   2 /  10

  exp_v /= denom
  exp_vv /= denom
  exp_w /= denom
  exp_vw /= denom
  exp_ww /= denom


Alg GMN run  10 /  10
Alg GMN ann run  10 /  10


Unnamed: 0,sil,sil best,m.r.,m.r. best,ari,ari best,log lik,log lik best
KMeans,0.18371,0.188035,0.257596,0.300501,0.632725,0.589879,-,-
GMM,0.167104,0.182951,0.245075,0.214246,0.640889,0.693401,-,-
DGMM,0.174209,0.17581,0.185309,0.187535,0.698282,0.694344,-125913.98566,-121578.144908
DGMM ann,0.173633,0.174811,0.183027,0.184752,0.699174,0.69692,-126293.674141,-124374.805845
GMN,0.171386,0.172556,0.184975,0.185865,0.701635,0.701463,-120121.258641,-119606.375907
GMN ann,0.171167,0.171851,0.183417,0.183083,0.70448,0.707283,-119781.096078,-119229.425226


## Vehicle dataset
(Num datapoints: `846`, num features: `18`, num clusters: `4`)

In [199]:
# Vehicle experiment: load the data through the project helper and
# standardize every feature to zero mean / unit variance before clustering.
vehicle_data, vehicle_labels = load_vehicle()
vehicle_data = preprocessing.scale(vehicle_data)

# Seed NumPy's global RNG so re-running this cell starts from the same random
# state no matter which cells were executed before it. scikit-learn estimators
# built with random_state=None draw from this generator; whether `mixes` uses
# the same generator is not visible from this notebook (TODO confirm).
np.random.seed(0)


def vehicle_deep_model(model_cls):
    """Build a factory for an annealed model with the vehicle hyper-parameters.

    Args:
        model_cls: Model class to instantiate (DGMM or GMN).

    Returns:
        A one-argument callable that takes an evaluator and returns a new,
        unfitted ``model_cls`` instance.
    """
    return lambda e: model_cls([4, 3, 2], [7, 3, 3], init='kmeans',
                               update_rate=0.1, use_annealing=True,
                               annealing_start_v=0.5, num_iter=200,
                               evaluator=e, var_regularization=2e-6)


# Every entry takes an evaluator and returns a fresh model. The two baselines
# ignore the evaluator. The "GMM" row here is scikit-learn's GaussianMixture,
# not the GMM class from `mixes`. Only the annealed deep variants are run.
alg_functions = [
    lambda e: cluster.KMeans(n_clusters=4, n_init=1, max_iter=100),
    lambda e: GaussianMixture(4, max_iter=100),
    vehicle_deep_model(DGMM),
    vehicle_deep_model(GMN),
]
alg_names = ["KMeans", "GMM", "DGMM ann", "GMN ann"]
# Names label the rows of the result table, so the two lists must line up.
assert len(alg_functions) == len(alg_names)

vehicle_result = test_algorithms_on_data(alg_functions, alg_names,
                                         vehicle_data, vehicle_labels, num_repeats=20)
vehicle_result

Num datapoints: 846, num features: 18, num clusters: 4
Alg KMeans run  20 /  20
Alg GMM run  20 /  20
Alg DGMM ann run  16 /  20

  exp_v /= denom
  exp_vv /= denom
  exp_w /= denom
  exp_vw /= denom
  exp_ww /= denom


Alg DGMM ann run  20 /  20
Alg GMN ann run   2 /  20

  exp_v /= denom
  exp_vv /= denom
  exp_w /= denom
  exp_vw /= denom
  exp_ww /= denom


Alg GMN ann run   2 /  20

  exp_v /= denom
  exp_vv /= denom
  exp_w /= denom
  exp_vw /= denom
  exp_ww /= denom


Alg GMN ann run   4 /  20

  exp_v /= denom
  exp_vv /= denom
  exp_w /= denom
  exp_vw /= denom
  exp_ww /= denom


Alg GMN ann run   4 /  20

  exp_v /= denom
  exp_vv /= denom
  exp_w /= denom
  exp_vw /= denom
  exp_ww /= denom


Alg GMN ann run   9 /  20

  exp_v /= denom
  exp_vv /= denom
  exp_w /= denom
  exp_vw /= denom
  exp_ww /= denom


Alg GMN ann run  20 /  20


Unnamed: 0,sil,sil best,m.r.,m.r. best,ari,ari best,log lik,log lik best
KMeans,0.25387,0.305422,0.638416,0.641844,0.069479,0.076041,-,-
GMM,0.261065,0.287192,0.619031,0.602837,0.095354,0.091301,-,-
DGMM ann,0.185077,0.259449,0.599291,0.654846,0.108794,0.069609,-3209.319502,-1599.871559
GMN ann,0.192866,0.255594,0.585343,0.591017,0.129833,0.114005,-2608.730137,-1289.025369
