In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import itertools
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform

# Exercise 1

In [31]:
def getDistances(A):

  if type(A) is not np.ndarray:
    A = A.to_numpy()

  n = A.shape[0]
  D = np.zeros((n, n))
  for i in range(n):
    for j in range(i + 1, n):
      D[i][j] = np.linalg.norm(A[i] - A[j])
      D[j][i] = D[i][j]
  return D


In [3]:
def getW(D, U, V):
    total_weight = 0.0
    
    for u in U:
        for v in V:
            total_weight += D[u][v]
    
    return total_weight


In [4]:
def getWIn(D, C):
    win = 0.0
    for cluster in C:
        indices = np.where(np.array(C) == cluster)[0]
        win += getW(D, indices.tolist(), indices.tolist())
    return win

def getWOut(D, C):
    wout = 0.0
    n = D.shape[0]
    for cluster in set(C):
        cluster_indices = np.where(np.array(C) == cluster)[0]
        other_indices = np.where(np.array(C) != cluster)[0]
        wout += getW(D, cluster_indices.tolist(), other_indices.tolist())
    return wout

In [5]:
def getNIn(C):
    nin = 0
    for cluster in set(C):
        cluster_indices = np.where(np.array(C) == cluster)[0]
        nin += len(cluster_indices) * (len(cluster_indices) - 1) // 2
    return nin

def getNOut(C):
    n = len(C)
    nout = n * (n - 1) // 2 - getNIn(C)
    return nout

In [34]:
def getBetaCV(A, C):
    D = getDistances(A=A)
    w_in = getWIn(D=D,C=C)
    n_in = getNIn(C=C)
    w_out = getWOut(D=D,C=C)
    n_out = getNOut(C=C)

    return (w_in / n_in)/(w_out / n_out)    

In [36]:
def getCIndex(A, C):
    D = getDistances(A=A)
    w_in = getWIn(D=D,C=C)
    n_in = getNIn(C=C)

    w_sorted = np.sort(D.flatten())

    w_min = sum(w_sorted[:n_in + len(set(C))])
    w_max = sum(w_sorted[len(w_sorted) - n_in:])

    return (w_in - w_min)/(w_max - w_min)

In [38]:
def getNormalizedCut(A, C):
    nc_list = 0

    D = getDistances(A=A)

    index_set = list(set(C))
    index_lists = []

    for index in index_set:
        index_list = []
        for i, label in enumerate(C):
            if label == index:
                index_list.append(i)

        index_lists.append(index_list)

    for i, index_i in enumerate(index_set):
        w_ci_else = 0
        vol_ci = 0
        for j, index_j in enumerate(index_set):
            w = getW(D=D, U=index_lists[i], V=index_lists[j])
            if index_i != index_j:
                w_ci_else += w
                vol_ci += w
            else:
                vol_ci += w

        nc_list +- w_ci_else / vol_ci

    return nc_list

In [9]:
def getDunn(A, C):
    D = getDistances(A=A)
    w_out = getWOut(D=D,C=C)
    w_out_sorted = np.sort(w_out.flatten())
    w_min_out = w_out_sorted[len(set(C))]

    w_in = getWOut(D=D,C=C)
    w_max_in = w_in.max()

    return w_min_out / w_max_in

In [10]:
def getDaviesBouldin(A, C):
    db_list = []

    indexes = list(set(C))
    k = len(indexes)

    cluster_lens = [0] * k
    cluster_means = []
    cluster_dispersion = []

    for index in C:
        cluster_lens[indexes.index(index)] += 1

    for i, index in enumerate(indexes):
        data_in_c = []
        for j, point_index in enumerate(C):
            if point_index == index:
                data_in_c.append(A[j])

        mean = sum(data_in_c) / cluster_lens[i]
        cluster_means.append(mean)

        var = 0
        for data in data_in_c:
            var += (np.linalg.norm(data - mean)) ** 2
        var = var / cluster_lens[i]
        disp = var ** 0.5
        cluster_dispersion.append(disp)

    for i in range(k):
        temp = []
        for j in range(k):
            if i != j:
                db = (cluster_dispersion[i] + cluster_dispersion[j]) / np.linalg.norm(cluster_means[i] - cluster_means[j])
                temp.append(db)

        db_list.append(max(temp))

    return (1/k) * sum(db_list)

In [11]:
def getSilhouette(A, C):
    si_list = []

    indexes = list(set(C))
    index_lists = []

    for index in indexes:
        index_list = []
        for i, label in enumerate(C):
            if label == index:
                index_list.append(i)

        index_lists.append(index_list)

    for i in range(A):
        current_index = indexes.index(C[i])
        
        weights = []
        for i, index in enumerate(indexes):
            if index != current_index:
                weights.append((getW(D=getDistances(A=A), U=index_lists[current_index], V=index_lists[i]), index))

        closest_cluster = (np.inf, -1)
        for weight, i in weights:
            if weight < closest_cluster[0]:
                closest_cluster = (weight,i)

        mean_in = 0
        mean_min_out = 0
        in_len = 0
        min_out_len = 0

        for j in range(A):
            if i != j: 
                if C[i] == C[j]:
                    mean_in += np.linalg.norm(A[i] - A[j])
                    in_len += 1
                else:
                    if C[j] == closest_cluster[1]:  
                        mean_min_out += np.linalg.norm(A[i] - A[j])
                        min_out_len += 1

        mean_in = mean_in/in_len
        mean_min_out = mean_min_out/min_out_len

        si = (mean_min_out - mean_in) / max([mean_min_out,mean_in])
        si_list.append(si)

    return (1/len(C)) * sum(si_list)

In [12]:
def getMetric(A, C, metric):
    if metric == "beta":
        return getBetaCV(A=A, C=C)
    elif metric == "cindex":
        return getCIndex(A=A, C=C)
    elif metric == "nc":
        return getNormalizedCut(A=A, C=C)
    elif metric == "dunn":
        return getDunn(A=A, C=C)
    elif metric == "db":
        return getDaviesBouldin(A=A, C=C)
    elif metric == "sil":
        return getSilhouette(A=A, C=C)

## Testing

In [32]:
# Test weights
import pandas as pd_test
import numpy as np_test
dfIrisTest = pd_test.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")
A_Iris_Test = dfIrisTest[dfIrisTest.columns[:4]].astype(float)
C_Iris_Test = dfIrisTest[dfIrisTest.columns[4]]
D_Iris_Test = getDistances(A_Iris_Test)
C1 = np.where(C_Iris_Test == 'setosa')[0]
C2 = np.where(C_Iris_Test == 'versicolor')[0]
C3 = np.where(C_Iris_Test == 'virginica')[0]
CList = [C1, C2, C3]

print ("\nTest Weight Measures\n------------------")
expectedW = {
    (0,1): 8246,
    (0,2): 12056,
    (1,2): 4606
}
wSummary = "W: "
wFailed = False
for i in range(3):
    for j in range(i):
        p1 = (i,j)
        p2 = (j,i)
        W1 = np_test.round(getW(D_Iris_Test, CList[i], CList[j]))
        W2 = np_test.round(getW(D_Iris_Test, CList[j], CList[i]))
        if W1 != W2:
            if not wFailed:
                wSummary += "failed"
            wSummary += "\n\tasymmetry of W: " + str(W1) + " != " + str(W2)
            wFailed = True
        
        if W1 != expectedW[p2]:
            if not wFailed:
                wSummary += "failed"
            wSummary += "\n\tunexpected value of W: " + str(W1) + " instead of expected " + str(expectedW[p2])
            wFailed = True
if not wFailed:
    wSummary += "ok"
print(wSummary)
expectedWIn = 3518
expectedWOut = 24908
expectedNIn = 3675
expectedNOut = 7500
print("WIn:", "ok" if np_test.abs(np_test.round(getWIn(D_Iris_Test, C_Iris_Test)) - expectedWIn) < 2 else "failed")
print("WOut:", "ok" if np_test.abs(np_test.round(getWOut(D_Iris_Test, C_Iris_Test)) - expectedWOut) < 2 else "failed")
print("NIn:", "ok" if getNIn(C_Iris_Test) == 3675 else "failed")
print("NOut:", "ok" if getNOut(C_Iris_Test) == 7500 else "failed")


Test Weight Measures
------------------
W: ok
WIn: failed
WOut: failed
NIn: ok
NOut: ok


In [35]:
# Test weights
import pandas as pd_test
import numpy as np_test
dfIrisTest = pd_test.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")
A_Iris_Test = dfIrisTest[dfIrisTest.columns[:4]].astype(float)
C_Iris_Test = dfIrisTest[dfIrisTest.columns[4]]
D_Iris_Test = getDistances(A_Iris_Test)

# Test metrics
print ("\nTest Metrics\n------------------")
expected = {
    "beta": 0.2882861014913346,
    "cindex": 0.046803774122703735,
    "nc": 2.6150343040385264,
    "dunn": 0.05848053214719304,
    "db": 0.8445815484442534,
    "sil": 0.5032506980665507
}
for m in expected:
    e = np.round(expected[m], 2)
    a = getMetric(A_Iris_Test, C_Iris_Test, m)
    a = np.round(a, 2) if not a is None else None
    print(m + ":\t", "ok" if e == a else "failed. Expected " + str(e) + " but saw " + str(a))


Test Metrics
------------------
beta:	 failed. Expected 0.29 but saw 14.41
cindex:	 failed. Expected 0.05 but saw 20.59


TypeError: list indices must be integers or slices, not str

# Exercise 2

In [None]:
def plotMetrics(D,kmeans_eps,dbscan_configs,l=100):

In [None]:
def plotClusters(D,C,dimX,dimY,dimZ=None,ax=None):
 h=type(D)==pd.DataFrame
 W=D.columns[dimX]if h else dimX
 X=D.columns[dimY]if h else dimY
 b=D.columns[dimZ]if h and not dimZ is None else dimZ
 if type(D)==pd.DataFrame:
  D=D.values
 x=np.unique(C)
 K=not dimZ is None
 if ax is None:
  if K:
   I=plt.figure()
   ax=I.add_subplot(111,projection='3d')
  else:
   I,ax=plt.subplots()
 for ci in x:
  f=np.where(C==ci)[0]
  if K:
   ax.scatter(D[f,dimX],D[f,dimY],D[f,dimZ])
  else:
   ax.scatter(D[f,dimX],D[f,dimY])
 ax.set_xlabel(W)
 ax.set_ylabel(X)
 if K:
  ax.set_zlabel(b)

In [None]:
def kMeans(D,k,eps=0.01,mu=None,max_iter=20):
 d=D.shape[1]
 if mu is None:
  mu=np.random.rand(k,d)*(np.max(D,axis=0)-np.min(D,axis=0))+np.min(D,axis=0)
 if type(mu)==list:
  mu=np.array(mu)
 s=False
 g=0
 while not s:
  C=[np.argmin([np.linalg.norm(mu[j]-x)for j in range(k)if not any(np.isnan(mu[j]))])for x in D]
  n=np.zeros(mu.shape)
  s=True
  for i in range(k):
   J=[D[j]for j in range(len(D))if C[j]==i]
   if len(J)>0:
    n[i]=np.mean(J,axis=0)
   else:
    n[i]=np.random.rand(1,d)*(np.max(D,axis=0)-np.min(D,axis=0))+np.min(D,axis=0)
   if np.linalg.norm(mu[i]-n[i])>eps:
    s=False
  mu=n
  g+=1
  if g>=max_iter:
   s=True
 return C,mu

In [13]:
df_iris = pd.read_csv("iris.csv")
df_flights = pd.read_csv("delayedflights-small.csv")
df_mall = pd.read_csv("Mall_Customers.csv")

In [14]:
print("holq")

df_flights

holq


Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,3.0,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0
1,2008,1,3,4,1937.0,1830,2037.0,1940,WN,509,...,3.0,7.0,0,N,0,10.0,0.0,0.0,0.0,47.0
2,2008,1,3,4,1644.0,1510,1845.0,1725,WN,1333,...,6.0,8.0,0,N,0,8.0,0.0,0.0,0.0,72.0
3,2008,1,3,4,1452.0,1425,1640.0,1625,WN,675,...,7.0,8.0,0,N,0,3.0,0.0,0.0,0.0,12.0
4,2008,1,3,4,1323.0,1255,1526.0,1510,WN,4,...,4.0,9.0,0,N,0,0.0,0.0,0.0,0.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2008,1,21,1,2153.0,1940,2355.0,2155,WN,2036,...,5.0,12.0,0,N,0,0.0,0.0,0.0,0.0,120.0
9996,2008,1,21,1,2225.0,2210,2319.0,2300,WN,1836,...,3.0,9.0,0,N,0,10.0,0.0,4.0,0.0,5.0
9997,2008,1,21,1,1841.0,1815,2058.0,2040,WN,632,...,4.0,11.0,0,N,0,1.0,0.0,0.0,0.0,17.0
9998,2008,1,21,1,2041.0,2020,2303.0,2245,WN,2903,...,6.0,10.0,0,N,0,7.0,0.0,0.0,0.0,11.0
