- .[Davies Bouldin](#DBI)
- .[Silhouette Score](#sil)

# Davies Bouldin Index <a id='DBI'></a>

In [18]:
import random
import numpy as np
from sklearn.metrics import pairwise_distances, davies_bouldin_score

n_feature = 5
n_sample = 100
X = [[random.random() for _ in range(n_feature)] for _ in range(n_sample)]
labels = [random.randint(1, 3) for _ in range(n_sample)]
X, labels = np.array(X), np.array(labels)



def get_center_and_intra(cluster_x: np.array) -> tuple:
    center = np.mean(cluster_x, axis=0)
    print(center.shape)
    dist = pairwise_distances(cluster_x, [center])
    return (center, np.mean(dist))

In [19]:
from tqdm import tqdm
center_and_intras = []
for i in range(1, 4):
    cluster_x = X[labels == i]
    center_and_intras.append(get_center_and_intra(cluster_x))

(43, 5)
(5,)
(30, 5)
(5,)
(27, 5)
(5,)


In [22]:
from collections import namedtuple
Distance = namedtuple('Distance', ['cluster', 'dist'])


cluster2other = dict()
for i, t2 in enumerate(center_and_intras, 1):
    for j, other_t2 in enumerate(center_and_intras, 1):
        if j <= i:
            continue
        delta = t2[1] + other_t2[1]
        sigma = np.linalg.norm(t2[0] - other_t2[0])
        distance = delta / sigma
        cluster2other.setdefault(i, []).append(Distance(cluster=j, dist=distance))
        cluster2other.setdefault(j, []).append(Distance(cluster=i, dist=distance))

In [26]:
cluster2nearest = {}
for i, dists in cluster2other.items():
    cluster2nearest[i] = max(dists, key=lambda x: x.dist)

    
dbi = np.mean([v.dist for v in cluster2nearest.values()])
dbi

10.257773667103406

In [27]:
davies_bouldin_score(X, labels)

10.25777366710325

# Silhouette Score <a id='sil'></a>

In [43]:
from sklearn.metrics import silhouette_score, silhouette_samples
silhouette_score(X, labels)

-0.026550268419739464

In [29]:
import inspect

# inspect.getsourcelines(silhouette_score)

(['@_deprecate_positional_args\n',
  "def silhouette_score(X, labels, *, metric='euclidean', sample_size=None,\n",
  '                     random_state=None, **kwds):\n',
  '    """Compute the mean Silhouette Coefficient of all samples.\n',
  '\n',
  '    The Silhouette Coefficient is calculated using the mean intra-cluster\n',
  '    distance (``a``) and the mean nearest-cluster distance (``b``) for each\n',
  '    sample.  The Silhouette Coefficient for a sample is ``(b - a) / max(a,\n',
  '    b)``.  To clarify, ``b`` is the distance between a sample and the nearest\n',
  '    cluster that the sample is not a part of.\n',
  '    Note that Silhouette Coefficient is only defined if number of labels\n',
  '    is 2 <= n_labels <= n_samples - 1.\n',
  '\n',
  '    This function returns the mean Silhouette Coefficient over all samples.\n',
  '    To obtain the values for each sample, use :func:`silhouette_samples`.\n',
  '\n',
  '    The best value is 1 and the worst value is -1. Values 

In [55]:
def get_silhouette_for_sample(x, cluster_x, other_clusters):
    """ 
    a meansures how close to the points in the same cluster
    inters measure how close to the points of another cluster
    b is the smallest among inters and represents how close to the nearest neighboring cluster
    """
    n_cluster = cluster_x.shape[0]
    if n_cluster == 1: return 0
    a = np.sum(pairwise_distances(cluster_x, [x])) / (n_cluster - 1)
    inters = []
    for other_cluster in other_clusters:
        inters.append(np.mean(pairwise_distances(other_cluster, [x]))) 
    b = min(inters) 
    return (b - a) / max(a, b)

In [56]:
X.shape

(100, 5)

In [57]:
x = X[:3, :]
x, np.delete(x, 0, 0)

(array([[0.65481691, 0.25052482, 0.20312108, 0.26463258, 0.86589611],
        [0.20183729, 0.31023726, 0.6077858 , 0.87119358, 0.53492397],
        [0.26669158, 0.3043665 , 0.18981735, 0.99389037, 0.63526697]]),
 array([[0.20183729, 0.31023726, 0.6077858 , 0.87119358, 0.53492397],
        [0.26669158, 0.3043665 , 0.18981735, 0.99389037, 0.63526697]]))

In [58]:
label2index = dict(zip(range(1, 4), [labels == i for i in range(1, 4)]))
silhouettes = list()

for k, v in label2index.items():
    print(k, np.sum(v))

for x, label in zip(X, labels):
    cluster_x = X[label2index[label]]
    other_clusters = [X[label2index[other_label]] for other_label in range(1, 4) if other_label != label]
    silhouette = get_silhouette_for_sample(x, cluster_x, other_clusters)
    silhouettes.append(silhouette)

1 43
2 30
3 27


In [59]:
np.mean(silhouettes)

-0.026550268508579694

In [60]:
i

99

In [61]:
silhouette_samples(X, labels)

array([-0.01911961, -0.00319283,  0.01774095, -0.02866315, -0.04380257,
       -0.06636078, -0.05874458, -0.10084966, -0.02507859, -0.03774477,
       -0.00797902,  0.03115136, -0.10394859,  0.01490487, -0.02499371,
       -0.06937265, -0.05467665, -0.07873286, -0.06165016, -0.02409868,
        0.0159463 ,  0.00309462, -0.09990845,  0.02490216, -0.03354179,
       -0.09984039,  0.03272453, -0.00711942,  0.04043652,  0.02534043,
       -0.04877535,  0.03443575, -0.0032145 , -0.05108119,  0.00473856,
       -0.06391584,  0.00579958, -0.02054897, -0.05452016, -0.05437354,
       -0.09029871, -0.046282  , -0.08639628, -0.07650065, -0.07409867,
        0.03055817, -0.11967971,  0.03561882,  0.0300124 ,  0.01857143,
        0.04404687,  0.02198653,  0.00205605, -0.00677625,  0.01148252,
       -0.00805039,  0.01558129, -0.00081476, -0.07337459, -0.07131647,
        0.00670091, -0.07623232, -0.08539074,  0.05407924, -0.02699979,
        0.00232213,  0.00529427,  0.00203591,  0.01580862, -0.08