## SSE

As input, we will receive a list of cluster along with the points that belong to them, i.e.:
- input=[cluster1, cluster2, cluster3 ...]
- clusterN=[x1,x2,x3...]

In [146]:
import numpy as np

In [147]:
# Initializing List of Clusters
oned_cluster = [np.random.uniform(low=1.0, high=10.0, size=20)
for i in range(10)
]
oned_cluster

[array([8.71407265, 4.7946932 , 1.79705735, 2.90306727, 1.37523127,
        6.78390669, 6.84173478, 2.36184571, 5.77767627, 5.94836718,
        6.03772351, 8.65510138, 1.93398252, 4.05465124, 2.53908789,
        3.13888579, 7.66966205, 9.17810104, 9.20963084, 8.62152819]),
 array([2.99082701, 2.38563204, 6.55661603, 9.33521507, 5.24448525,
        4.98602399, 4.17600096, 1.40723284, 4.72391048, 7.72533852,
        3.38377655, 9.8119346 , 6.6318284 , 2.04691671, 5.10591734,
        8.87278571, 1.3668101 , 1.36628833, 9.72463681, 3.4225437 ]),
 array([4.55967393, 6.3409259 , 8.93179832, 4.25353744, 3.3870529 ,
        2.59976815, 8.66039815, 7.21606847, 1.48886069, 2.05905185,
        5.28186796, 9.92029772, 1.19161147, 7.26488166, 9.99593562,
        5.98098151, 6.59439035, 9.32734906, 6.29432151, 3.26366384]),
 array([9.6818429 , 4.31451283, 4.34733118, 5.06044967, 8.11969421,
        5.35139182, 7.75551925, 9.6008486 , 6.15372447, 9.51768517,
        5.71228917, 1.66865356, 8.47606619

In [148]:
# Compute SSE
def sse_1d(clusters):
    sse=list()
    for c in clusters:
        mu=np.mean(c)
        sse.extend([(ci-mu)**2 for ci in c])
    return np.sum(sse)

In [149]:
sse_1d(oned_cluster)

1374.1014314845224

In [150]:
second_clusters = [np.random.uniform(low=1.0, high=10.0, size=20)
for y in range(10)
]
twod_clusters = list(zip(oned_cluster, second_clusters))
twod_clusters[0]

(array([8.71407265, 4.7946932 , 1.79705735, 2.90306727, 1.37523127,
        6.78390669, 6.84173478, 2.36184571, 5.77767627, 5.94836718,
        6.03772351, 8.65510138, 1.93398252, 4.05465124, 2.53908789,
        3.13888579, 7.66966205, 9.17810104, 9.20963084, 8.62152819]),
 array([5.47918119, 6.26711198, 2.12559996, 4.4134696 , 4.42760332,
        2.78578823, 5.60131072, 4.48526372, 2.78934258, 6.23122225,
        7.43383958, 5.71984534, 6.72296382, 7.862001  , 4.63826502,
        6.83826875, 1.38703755, 4.89957616, 5.53532337, 4.16217714]))

In [151]:
# Pair-wise squared distances
def sse_2d(clusters):
    sse_x = list()
    sse_y = list()

    for x, y in clusters:
        mu_x = np.mean(x)
        mu_y = np.mean(y)
        for ci in x:
            sse_x.extend([(ci-mu_x)**2])
        for cj in y:
            sse_y.extend([(cj-mu_y)**2])
    sum_x = np.sum(sse_x)
    sum_y = np.sum(sse_y)    
    return [sum_x, sum_y]
        

sse_2d(twod_clusters)

[1374.1014314845224, 1209.8144935991659]

## Davies-Bouldin Index

Implement the DB-Index


In [152]:
# within-to-between cluster distance
def d_ij(ci, cj):
    mu_i = np.mean(ci)
    mu_j = np.mean(cj)
    return abs(mu_i - mu_j)

### Average distance between each point in a cluster

In [153]:
# within-to-between cluster distance
def d_bar(ci):
    centroid = np.mean(ci)
    dist = []
    for pt in ci:
        dist.append(abs(pt-centroid))  # absolute distance from centroid eg. pt=1, dist=abs(1-5)=4
    return np.mean(dist)

In [154]:
def Dij(ci, cj):
    return (
                (d_bar(ci) + d_bar(cj)) / # intra-cluster distance
                d_ij(ci, cj) # inter-cluster distance
            )

In [155]:
# DB Index: DB(C)
def db_index(clusters):
    dbi_sum=0
    for i, ic in enumerate(clusters):
        Dij_val = []
        for k, kc in enumerate(clusters):
            if i != k:
                Dij_val.append(Dij(ic,kc))
        dbi_sum += max(Dij_val)
    return dbi_sum

In [156]:
db_index(oned_cluster)

2310.08462857775