In [1]:
import numpy as np

In [2]:
def getDistances(A):
    """Build the symmetric matrix of pairwise Euclidean distances between rows.

    Parameters
    ----------
    A : numpy.ndarray or object with ``.values`` and ``.shape``
        An ndarray is used as is. Anything else (e.g. a pandas DataFrame) is
        converted through ``A.values`` and reshaped so that the number of
        columns equals the last dimension of ``A``.

    Returns
    -------
    numpy.ndarray
        An ``(n, n)`` float array whose entry ``[i, j]`` is the Euclidean norm
        of ``row_i - row_j``; ``n`` is the number of rows.
    """
    if isinstance(A, np.ndarray):
        points = A
    else:
        points = A.values.reshape(-1, A.shape[-1])

    count = points.shape[0]
    Distance = np.zeros((count, count))

    # Walk the upper triangle (diagonal included) in row-major order and
    # mirror every value into the lower triangle.
    upper_rows, upper_cols = np.triu_indices(count)
    for r, c in zip(upper_rows, upper_cols):
        Distance[r, c] = Distance[c, r] = np.linalg.norm(points[r] - points[c])
    return Distance

In [3]:
def getW(D, U, V):
    """Add up ``D[i, j]`` for every index ``i`` in ``U`` paired with every ``j`` in ``V``.

    ``D`` is indexed with an ``(i, j)`` tuple; ``U`` and ``V`` are iterables of
    indices. Returns ``0`` when no pair is visited.
    """
    total = 0
    for row in U:
        for col in V:
            total += D[row, col]
    return total
    

In [4]:
def getWIn(D, C):
    """Total weight of the pairs whose two points carry the same cluster label.

    ``D`` is the matrix handed to ``getW``; ``C`` holds one label per point,
    in the same order as the rows of ``D``.

    For each distinct label the positions of its members are collected and
    ``getW`` is evaluated on that set against itself, which visits every
    ordered pair; the grand total is therefore halved. The result is rounded
    to zero decimal places before being returned.
    """
    labels = set(C)
    total = 0
    for label in labels:
        members = []
        for position, value in enumerate(C):
            if value == label:
                members.append(position)
        total += getW(D, members, members)
    return round(total / 2, 0)

In [5]:
def getWOut(D, C):
    """Total weight of the pairs whose two points carry different cluster labels.

    Parameters
    ----------
    D : 2-D array indexed as ``D[i, j]``
        Pairwise weight (distance) matrix; ``len(D)`` is the number of points.
    C : sequence of labels
        One cluster label per point, in the same order as the rows of ``D``.

    Returns
    -------
    float
        Half of the sum of ``D[i, j]`` over every ordered pair ``(i, j)`` with
        ``i`` inside a cluster and ``j`` outside it, rounded to zero decimal
        places. Halving compensates for each unordered pair being visited once
        from each side.
    """
    n = len(D)
    Wout = 0
    for cluster in set(C):
        members = [i for i, x in enumerate(C) if x == cluster]
        # Set lookup keeps the membership test O(1); testing against the list
        # made the innermost loop scan the whole cluster for every j.
        member_lookup = set(members)
        for i in members:
            for j in range(n):
                if j not in member_lookup:
                    Wout += D[i, j]
    return round(Wout / 2, 0)

In [6]:
def getNIn(C):
    """Count the unordered pairs of points that share a cluster label.

    Parameters
    ----------
    C : sequence of labels
        One cluster label per point.

    Returns
    -------
    int
        Sum over clusters of ``size * (size - 1) / 2``. The count is returned
        as an integer so it can be used directly as an index or slice bound;
        ``size * (size - 1)`` is always even, so floor division is exact.
    """
    Nin = 0
    for cluster in set(C):
        size = sum(1 for x in C if x == cluster)
        Nin += size * (size - 1) // 2
    return Nin

In [7]:
def getNOut(C):
    """Count the unordered pairs of points that carry different cluster labels.

    Computed as the number of all unordered pairs among ``len(C)`` points
    minus the same-cluster pair count reported by ``getNIn``.
    """
    point_count = len(C)
    all_pairs = point_count * (point_count - 1) / 2
    return all_pairs - getNIn(C)

In [8]:
def getBetaCV(A, C):
    """Compute the BetaCV clustering-validity measure.

    BetaCV is the ratio of the mean intra-cluster weight to the mean
    inter-cluster weight::

        BetaCV = (Win / Nin) / (Wout / Nout)

    Parameters
    ----------
    A : 2-D array indexed as ``A[i, j]``
        Forwarded unchanged to ``getWIn`` and ``getWOut`` as their weight
        matrix argument, so it has to be a pairwise distance matrix.
        NOTE(review): ``getCIndex`` uses the same parameter name for raw
        points - confirm what callers pass here.
    C : sequence of labels
        One cluster label per point.

    Returns
    -------
    float
        The BetaCV value. No guard is applied for zero denominators
        (``Nin``, ``Nout`` or ``Wout`` equal to zero).
    """
    Win = getWIn(A, C)
    Wout = getWOut(A, C)
    Nin = getNIn(C)
    Nout = getNOut(C)
    # Ratio of means, not a ratio of differences.
    mean_intra = Win / Nin
    mean_inter = Wout / Nout
    BetaCV = mean_intra / mean_inter
    return BetaCV

In [9]:
def getCIndex(A, C):
    """Compute the C-index clustering-validity measure.

    With ``Nin`` the number of same-cluster pairs::

        CIndex = (Win - Wmin) / (Wmax - Wmin)

    where ``Wmin`` is the sum of the ``Nin`` smallest pairwise distances and
    ``Wmax`` the sum of the ``Nin`` largest ones.

    Parameters
    ----------
    A : numpy.ndarray or object accepted by ``getDistances``
        The data points, one per row.
    C : sequence of labels
        One cluster label per point, in row order.

    Returns
    -------
    float
        The C-index. ``Win`` is taken from ``getWIn``. No guard is applied
        for ``Wmax == Wmin``.
    """
    W = getDistances(A)
    # The pair count is used as a slice bound, so it must be an integer.
    Nin = int(getNIn(C))

    # Every unordered pair exactly once (strict upper triangle), ascending.
    pair_distances = np.sort(W[np.triu_indices(len(W), k=1)])
    Wmin = pair_distances[:Nin].sum()
    # Slice from an explicit start: a plain [-Nin:] would select the whole
    # array when Nin is 0.
    Wmax = pair_distances[len(pair_distances) - Nin:].sum()

    Win = getWIn(W, C)
    CIndex = (Win - Wmin) / (Wmax - Wmin)
    return CIndex

In [10]:
import pandas as pd_test
import numpy as np_test

# Load the Iris data: the first four columns are the numeric attributes,
# the fifth column is the species label used as the cluster assignment.
dfIrisTest = pd_test.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")
A_Iris_Test = dfIrisTest[dfIrisTest.columns[:4]].astype(float)
C_Iris_Test = dfIrisTest[dfIrisTest.columns[4]]
print(type(A_Iris_Test))
D_Iris_Test = getDistances(A_Iris_Test)

# Row positions of each species, used as the index sets passed to getW.
C1 = np_test.where(C_Iris_Test == 'setosa')[0]
C2 = np_test.where(C_Iris_Test == 'versicolor')[0]
C3 = np_test.where(C_Iris_Test == 'virginica')[0]
CList = [C1, C2, C3]

print ("\nTest Weight Measures\n------------------")
# Expected rounded between-cluster weights, keyed by (smaller, larger) cluster position.
expectedW = {
    (0,1): 8246,
    (0,2): 12056,
    (1,2): 4606
}
wSummary = "W: "
wFailed = False
for i in range(3):
    for j in range(i):
        # j < i here, so (j, i) is the key order used by expectedW.
        p2 = (j,i)
        W1 = np_test.round(getW(D_Iris_Test, CList[i], CList[j]))
        W2 = np_test.round(getW(D_Iris_Test, CList[j], CList[i]))
        if W1 != W2:
            if not wFailed:
                wSummary += "failed"
            wSummary += "\n\tasymmetry of W: " + str(W1) + " != " + str(W2)
            wFailed = True

        if W1 != expectedW[p2]:
            if not wFailed:
                wSummary += "failed"
            wSummary += "\n\tunexpected value of W: " + str(W1) + " instead of expected " + str(expectedW[p2])
            wFailed = True
if not wFailed:
    wSummary += "ok"
print(wSummary)

expectedWIn = 3518
expectedWOut = 24908
expectedNIn = 3675
expectedNOut = 7500
# The weight totals are accepted within a tolerance below 2; the pair counts must match exactly.
print("WIn:", "ok" if np_test.abs(np_test.round(getWIn(D_Iris_Test, C_Iris_Test)) - expectedWIn) < 2 else "failed")
print("WOut:", "ok" if np_test.abs(np_test.round(getWOut(D_Iris_Test, C_Iris_Test)) - expectedWOut) < 2 else "failed")
print("NIn:", "ok" if getNIn(C_Iris_Test) == expectedNIn else "failed")
print("NOut:", "ok" if getNOut(C_Iris_Test) == expectedNOut else "failed")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd_test


<class 'pandas.core.frame.DataFrame'>

Test Weight Measures
------------------
W: ok
WIn: ok
WOut: ok
NIn: ok
NOut: ok


In [11]:
import pandas as pd_test
import numpy as np_test

# Reload the Iris data so this cell does not rely on state from earlier test cells.
dfIrisTest = pd_test.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")
A_Iris_Test = dfIrisTest[dfIrisTest.columns[:4]].astype(float)
C_Iris_Test = dfIrisTest[dfIrisTest.columns[4]]
D_Iris_Test = getDistances(A_Iris_Test)

# Test metrics
# NOTE(review): getMetric is not defined in any cell above this one (the
# recorded run of this cell stops with a NameError), so it has to be defined
# before this cell is executed.
print ("\nTest Metrics\n------------------")
expected = {
    "beta": 0.2882861014913346,
    "cindex": 0.046803774122703735,
    "nc": 2.6150343040385264,
    "dunn": 0.05848053214719304,
    "db": 0.8445815484442534,
    "sil": 0.5032506980665507
}
# Each metric is compared to its reference value after rounding both to two
# decimals; a metric reported as None is shown as a failure.
for m, reference in expected.items():
    e = np_test.round(reference, 2)
    a = getMetric(A_Iris_Test, C_Iris_Test, m)
    if a is not None:
        a = np_test.round(a, 2)
    verdict = "ok" if e == a else "failed. Expected " + str(e) + " but saw " + str(a)
    print(m + ":\t", verdict)


Test Metrics
------------------


NameError: name 'getMetric' is not defined