In [178]:
import numpy as np

In [179]:
def getDistances(A):
    if not isinstance(A, np.ndarray):
        A = A.values.reshape(-1, A.shape[-1])

    n = A.shape[0]
    Distance = np.zeros((n,n))
    for i in range(n):
        for j in range(i, n):
            Distance[i,j] = np.linalg.norm(A[i] - A[j])
            Distance[j,i] = Distance[i,j]
    return Distance

In [180]:
def getW(D,U,V):
    weight = 0
    for i in U:
        for j in V:
            weight += D[i,j]
    return weight
    

In [181]:
def getWIn(D,C):
    Win = 0
    clusters = set(C)
    for cluster in clusters:
        indices = [i for i, x in enumerate(C) if x == cluster]
        Win += getW(D, indices, indices)
    return round(Win/2, 0)

In [182]:
def getWOut(D,C):
    Wout = 0
    clusters = set(C)
    for cluster in clusters:
        indices = [i for i, x in enumerate(C) if x == cluster]
        for i in indices:
            for j in range(len(D)):
                if j in indices:
                    continue
                Wout += D[i,j]
    return round(Wout/2,0)

In [183]:
def getNIn(C):
    Nin = 0
    clusters = set(C)
    for cluster in clusters:
        n = [i for i, x in enumerate(C) if x == cluster]
        Nin += len(n) * (len(n)-1)/2
    return Nin

In [184]:
def getNOut(C):
    n = len(C)
    Nout = (n*(n-1)/2) - getNIn(C)
    return Nout

In [185]:
def getBetaCV(A, C):
    D = getDistances(A)
    Win = getWIn(D, C)
    Wout = getWOut(D, C)
    Nin = getNIn(C)
    Nout = getNOut(C)
    BetaCV = (Win / Nin) / (Wout / Nout)
    return BetaCV

In [186]:
def getCIndex(A,C):
    W = getDistances(A)
    Nin = getNIn(C)
    Wflat = np.ravel(W)
    Wmin = np.sum(np.partition(Wflat, int(Nin))[:int(Nin)])
    Wmax = np.sum(np.partition(Wflat, -int(Nin))[-int(Nin):])
    Win = getWIn(W, C)
    CIndex = ((Win - Wmin)/(Wmax - Wmin))/2
    return CIndex

In [194]:
def getNormalizedCut(A, C):
    win = getWIn(A, C)
    wout = getWOut(A, C)

    vol = [getW(A, [i], range(len(A))) for i in range(len(C))]

    normalized_cut = 0
    for i in range(len(C)):
        normalized_cut += wout / vol[i]

    return normalized_cut

In [188]:
def getDunn(A, C):
    D = getDistances(A=A)

    w_min_out = np.inf
    for i in range(len(C)):
        for j in range(i+1, len(C)):
            i_points = np.where(np.array(C) == i)[0]
            j_points = np.where(np.array(C) == j)[0]
            matrix_dist = D[np.ix_(i_points, j_points)]
            dist_min = np.min(matrix_dist)
            if dist_min < w_min_out:
                w_min_out = dist_min

    w_max_in = 0
    for i in range(len(C)):
        i_points = np.where(np.array(C) == i)[0]
        matrix_dist = D[np.ix_(i_points, i_points)]
        dist_max = np.max(matrix_dist)
        if dist_max > w_max_in:
            w_max_in = dist_max

    dunn = w_min_out / w_max_in
    return dunn

In [189]:
def getDaviesBouldin(A, C):
    D = getDistances(A=A)
    
    centroids = []
    for id_cluster in set(C):
        cluster_points = A[np.where(np.array(C) == id_cluster)]
        centroids.append(np.mean(cluster_points, axis=0))

        dispersion = []
        for id_cluster in set(C):
            cluster_points = A[np.where(np.array(C) == id_cluster)]
            centroid = centroids[id_cluster]
            dispersion_cluster = np.mean(np.linalg.norm(cluster_points - centroid, axis=1))
            dispersion.append(dispersion_cluster)

        distances_centroid = []
        for i in range(len(set(C))):
            for j in range(i+1, len(set(C))):
                distance = np.linalg.norm(centroids[i] - centroids[j])
                distances_centroid.append(distance)

        DaviesBouldin = np.mean([(dispersion[i] + dispersion[j]) / distances_centroid[idx] for idx, (i, j) in enumerate(zip(range(len(set(C))), range(len(set(C))))) if i != j])
        return DaviesBouldin

        


In [190]:
def getSilhouette(A, C):
    D = getDistances(A=A)
    n = len(A)
    SilhouetteCoefficient = 0

    for i in range(n):
        id_cluster = C[i]
        points_cluster = np.where(np.array(C) == id_cluster)[0]
        outside_mean = np.mean(D[i, points_cluster])

        min_out = np.inf
        for j in range(len(set(C))):
            if j != id_cluster:
                other_cluster_points = np.where(np.array(C) == j)[0]
                inside_mean = np.mean(D[i, other_cluster_points])
                if inside_mean < min_out:
                    min_out = inside_mean

        silhouette_i = (min_out - outside_mean)/ max(outside_mean, min_out)
        SilhouetteCoefficient += silhouette_i

    SilhouetteCoefficient /= n
    return SilhouetteCoefficient
                

In [191]:
def getMetric(A, C, metric):
    if metric == "beta":
        return getBetaCV(A, C)
    elif metric == "cindex":
        return getCIndex(A, C)
    elif metric == "nc":
        return getNormalizedCut(A, C)
    elif metric == "dunn":
        return getDunn(A, C)
    elif metric == "db":
        return getDaviesBouldin(A, C)
    elif metric == "sil":
        return getSilhouette(A, C)
    else:
        raise ValueError("No hay metrica")

In [192]:
import pandas as pd_test
import numpy as np_test
dfIrisTest = pd_test.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")
A_Iris_Test = dfIrisTest[dfIrisTest.columns[:4]].astype(float)
C_Iris_Test = dfIrisTest[dfIrisTest.columns[4]]
print(type(A_Iris_Test))
D_Iris_Test = getDistances(A_Iris_Test)
C1 = np.where(C_Iris_Test == 'setosa')[0]
C2 = np.where(C_Iris_Test == 'versicolor')[0]
C3 = np.where(C_Iris_Test == 'virginica')[0]
CList = [C1, C2, C3]

print ("\nTest Weight Measures\n------------------")
expectedW = {
    (0,1): 8246,
    (0,2): 12056,
    (1,2): 4606
}
wSummary = "W: "
wFailed = False
for i in range(3):
    for j in range(i):
        p1 = (i,j)
        p2 = (j,i)
        W1 = np_test.round(getW(D_Iris_Test, CList[i], CList[j]))
        W2 = np_test.round(getW(D_Iris_Test, CList[j], CList[i]))
        if W1 != W2:
            if not wFailed:
                wSummary += "failed"
            wSummary += "\n\tasymmetry of W: " + str(W1) + " != " + str(W2)
            wFailed = True
        
        if W1 != expectedW[p2]:
            if not wFailed:
                wSummary += "failed"
            wSummary += "\n\tunexpected value of W: " + str(W1) + " instead of expected " + str(expectedW[p2])
            wFailed = True
if not wFailed:
    wSummary += "ok"
print(wSummary)
expectedWIn = 3518
expectedWOut = 24908
expectedNIn = 3675
expectedNOut = 7500
print("WIn:", "ok" if np_test.abs(np_test.round(getWIn(D_Iris_Test, C_Iris_Test)) - expectedWIn) < 2 else "failed")
print("WOut:", "ok" if np_test.abs(np_test.round(getWOut(D_Iris_Test, C_Iris_Test)) - expectedWOut) < 2 else "failed")
print("NIn:", "ok" if getNIn(C_Iris_Test) == 3675 else "failed")
print("NOut:", "ok" if getNOut(C_Iris_Test) == 7500 else "failed")

<class 'pandas.core.frame.DataFrame'>

Test Weight Measures
------------------
W: ok
WIn: ok
WOut: ok
NIn: ok
NOut: ok


In [193]:
import pandas as pd_test
import numpy as np_test
dfIrisTest = pd_test.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")
A_Iris_Test = dfIrisTest[dfIrisTest.columns[:4]].astype(float)
C_Iris_Test = dfIrisTest[dfIrisTest.columns[4]]
D_Iris_Test = getDistances(A_Iris_Test)

# Test metrics
print ("\nTest Metrics\n------------------")
expected = {
    "beta": 0.2882861014913346,
    "cindex": 0.046803774122703735,
    "nc": 2.6150343040385264,
    "dunn": 0.05848053214719304,
    "db": 0.8445815484442534,
    "sil": 0.5032506980665507
}
for m in expected:
    e = np.round(expected[m], 2)
    a = getMetric(A_Iris_Test, C_Iris_Test, m)
    a = np.round(a, 2) if not a is None else None
    print(m + ":\t", "ok" if e == a else "failed. Expected " + str(e) + " but saw " + str(a))


Test Metrics
------------------
beta:	 ok
cindex:	 ok


NameError: name 'NormalizedCut' is not defined