In [36]:
import numpy as np
import random

class Friedman_Rafsky:
    """Permutation test on the minimum spanning tree (MST) of labelled points.

    The observations are the columns of a 2-D array.  An MST is built over the
    Euclidean distances between columns, the number of "runs" is computed as
    one plus the number of MST edges whose endpoints carry different labels,
    and that count is compared with the counts obtained after shuffling the
    labels.

    Label shuffling uses the module-level ``random`` generator, so call
    ``random.seed(...)`` beforehand to make results reproducible.
    """

    class Graph:
        """Undirected weighted graph on vertices ``0 .. vertex - 1`` with a Kruskal solver."""

        def __init__(self, vertex):
            """Create a graph with ``vertex`` vertices and no edges."""
            self.V = vertex
            self.graph = []  # Edge list; every entry is [v1, v2, weight].

        def add_edge(self, v1, v2, w):
            """Append an edge between ``v1`` and ``v2`` with weight ``w``."""
            self.graph.append([v1, v2, w])

        def search(self, parent, i):
            """Return the root of the disjoint-set tree that contains ``i``.

            ``parent`` is the parent-pointer list of the union-find forest.
            The walk is iterative (no recursion-depth limit) and re-points
            every visited node directly at the root, which shortens later
            lookups without changing which root is returned.
            """
            root = i
            while parent[root] != root:
                root = parent[root]
            # Path compression: hang every node on the walked path off the root.
            while parent[i] != root:
                following = parent[i]
                parent[i] = root
                i = following
            return root

        def apply_union(self, parent, rank, x, y):
            """Merge the sets containing ``x`` and ``y`` (union by rank).

            The root with the smaller rank is attached under the other; on a
            tie the root of ``y`` is attached under the root of ``x`` and the
            rank of the latter grows by one.
            """
            xroot = self.search(parent, x)
            yroot = self.search(parent, y)
            if rank[xroot] < rank[yroot]:
                parent[xroot] = yroot
            elif rank[xroot] > rank[yroot]:
                parent[yroot] = xroot
            else:
                parent[yroot] = xroot
                rank[xroot] += 1

        def kruskal(self):
            """Return the MST edges as a list of ``[v1, v2]`` pairs.

            Edges are processed in order of increasing weight (ties keep their
            insertion order) and ``self.graph`` is left sorted.  If the graph
            is not connected, the edges of a minimum spanning forest are
            returned instead of raising ``IndexError``.
            """
            result = []
            self.graph = sorted(self.graph, key=lambda item: item[2])
            parent = list(range(self.V))
            rank = [0] * self.V
            for v1, v2, _weight in self.graph:
                # A spanning tree has exactly V - 1 edges; stop once complete.
                if len(result) >= self.V - 1:
                    break
                x = self.search(parent, v1)
                y = self.search(parent, v2)
                if x != y:
                    result.append([v1, v2])
                    self.apply_union(parent, rank, x, y)

            return result

    def MST(self, data):
        """Build the Euclidean MST over the columns of ``data``.

        Parameters
        ----------
        data : 2-D array-like
            Each column is one observation.

        Returns
        -------
        list of [int, int]
            Column-index pairs of the MST edges, as produced by
            ``Graph.kruskal``.
        """
        data = np.asarray(data)
        if data.dtype.kind in "bu":
            # Unsigned subtraction wraps around (and boolean subtraction is
            # rejected by NumPy), so switch to floats before taking differences.
            data = data.astype(float)

        n_points = data.shape[1]
        g = self.Graph(n_points)

        # Complete graph: one edge per unordered pair of columns.
        for i in range(n_points):
            for j in range(i + 1, n_points):
                weight = np.linalg.norm(data[:, i] - data[:, j])
                g.add_edge(i, j, weight)

        return g.kruskal()

    def num_runs(self, labels, MST_connections):
        """Return 1 + the number of edges whose two endpoints have different labels.

        ``labels`` is indexed by the vertex ids stored in ``MST_connections``.
        """
        run_count = 1
        for edge in MST_connections:
            if labels[edge[0]] != labels[edge[1]]:
                run_count += 1
        return run_count

    def permutation(self, nperm, labels, MST_connections):
        """Return the run counts for ``nperm`` random shufflings of ``labels``.

        ``labels`` may be any iterable (list, tuple, NumPy array, ...); it is
        copied into a list first because ``random.sample`` only accepts
        sequences.  The result is a list with one run count per shuffle.
        """
        label_pool = list(labels)
        runs = []
        for _ in np.arange(nperm):
            lab_shuffle = random.sample(label_pool, len(label_pool))
            runs.append(self.num_runs(lab_shuffle, MST_connections))
        return runs

    def test_stat(self, val):
        """Standardise ``val`` to zero mean and unit (population) standard deviation.

        When every entry of ``val`` is identical the standard deviation is
        zero; in that case an array of zeros is returned instead of NaNs.
        """
        val = np.asarray(val, dtype=float)
        centered = val - np.mean(val)
        sd_val = np.std(val)
        if sd_val == 0:
            return np.zeros_like(centered)
        return centered / sd_val

    def pval(self, perm_stat, true_stat):
        """Lower-tail permutation p-value.

        Returns ``(#{perm_stat <= true_stat} + 1) / (len(perm_stat) + 1)``.
        ``perm_stat`` may be a list or an array.
        """
        perm_stat = np.asarray(perm_stat)
        pvalue = (np.sum(perm_stat <= true_stat) + 1) / (len(perm_stat) + 1)
        return pvalue

    def statistics(self, data, labels, nperm):
        """Run the full permutation test.

        Parameters
        ----------
        data : 2-D array-like
            Observations stored column-wise.
        labels : sequence
            One label per column of ``data``.
        nperm : int
            Number of label permutations.

        Returns
        -------
        W_true : float
            Standardised run count of the observed labelling.
        W_perm : numpy.ndarray
            Standardised run counts of the permuted labellings.
        pvalue : float
            Lower-tail permutation p-value.
        """
        MST_connections = self.MST(data)
        runs_true = self.num_runs(labels, MST_connections)
        runs = np.asarray(
            self.permutation(nperm, labels, MST_connections), dtype=float
        )

        mu_runs = np.mean(runs)
        sd_runs = np.std(runs)

        W_perm = self.test_stat(runs)

        deviation = runs_true - mu_runs
        if sd_runs == 0:
            # Degenerate permutation distribution: avoid 0/0 and x/0.
            W_true = 0.0 if deviation == 0 else np.copysign(np.inf, deviation)
        else:
            W_true = deviation / sd_runs

        # Standardising is a monotone transform, so ranking the raw run counts
        # gives the same p-value while staying well defined when sd_runs is 0.
        pvalue = self.pval(runs, runs_true)

        return W_true, W_perm, pvalue

In [45]:
# Toy data set: 4 feature rows by 10 observation columns (rows 3 and 4 are identical).
data = np.array([
    [1, 5, 3, 6, 4, 2, 5, 3, 2, 7],
    [2, 5, 7, 1, 8, 3, 3, 7, 5, 2],
    [4, 5, 3, 3, 6, 5, 2, 4, 3, 7],
    [4, 5, 3, 3, 6, 5, 2, 4, 3, 7],
])

# One group label per column of `data`.
labels = [1, 2, 2, 1, 2, 1, 2, 2, 1, 1]

test = Friedman_Rafsky()

# Repeat the 10000-permutation test 50 times, keeping only the p-value
# (third element of the returned tuple) from each repetition.
vals = [test.statistics(data, labels, 10000)[2] for _rep in range(50)]

# Interval mean +/- 3.291 * standard error of the 50 p-values.
# NOTE(review): 3.291 is presumably the two-sided 99.9% normal critical value - confirm.
p_mean = np.mean(vals)
p_margin = 3.291 * (np.std(vals) / np.sqrt(len(vals)))
print(p_mean - p_margin, p_mean + p_margin)


0.3557227716484902 0.35981767430691425
