In [2]:
class Graph:
    """Undirected weighted graph kept as an edge list, with Kruskal's MST.

    Vertices are expected to be the integers ``0 .. vertex - 1`` because
    ``kruskal`` uses them as indices into its union-find arrays.
    """

    def __init__(self, vertex):
        """Create a graph with ``vertex`` vertices and no edges."""
        self.V = vertex
        self.graph = []  # edge list; every entry is [v1, v2, w]

    def add_edge(self, v1, v2, w):
        """Append an edge between ``v1`` and ``v2`` with weight ``w``."""
        self.graph.append([v1, v2, w])

    def search(self, parent, i):
        """Return the root of the union-find tree that contains ``i``.

        ``parent`` maps every vertex to its parent; a root is its own parent.
        Implemented as a loop (not recursion) so that a long parent chain
        cannot exhaust the interpreter's recursion limit. ``parent`` is not
        modified.
        """
        while parent[i] != i:
            i = parent[i]
        return i

    def apply_union(self, parent, rank, x, y):
        """Merge the sets containing ``x`` and ``y`` (union by rank).

        ``parent`` and ``rank`` are updated in place. The root with the lower
        rank is attached under the other one; on a tie the root of ``y`` is
        attached under the root of ``x`` and that root's rank grows by one.
        """
        xroot = self.search(parent, x)
        yroot = self.search(parent, y)
        if xroot == yroot:
            # Already in the same set: nothing to merge, and the rank must
            # not be inflated.
            return
        if rank[xroot] < rank[yroot]:
            parent[xroot] = yroot
        elif rank[xroot] > rank[yroot]:
            parent[yroot] = xroot
        else:
            parent[yroot] = xroot
            rank[xroot] += 1

    def kruskal(self):
        """Return the edges of a minimum spanning tree as ``[v1, v2]`` pairs.

        Edges are considered in order of increasing weight (ties keep their
        insertion order) and accepted when they join two different
        components. As a side effect ``self.graph`` is replaced by the
        weight-sorted edge list.

        If the edges do not connect all vertices, the result is a minimum
        spanning forest with fewer than ``V - 1`` edges rather than an error.
        """
        result = []
        self.graph = sorted(self.graph, key=lambda item: item[2])
        parent = list(range(self.V))
        rank = [0] * self.V
        for v1, v2, w in self.graph:
            if len(result) >= self.V - 1:
                break  # the tree is complete; remaining edges are irrelevant
            x = self.search(parent, v1)
            y = self.search(parent, v2)
            if x != y:
                result.append([v1, v2])
                self.apply_union(parent, rank, x, y)
        return result

In [17]:
import numpy as np
import random

x = np.array([[1,5,3,6,4,2,5,3,2,7],  #Array with rows as features, columns as samples
              [2,5,7,1,8,3,3,7,5,2],
              [4,5,3,3,6,5,2,4,3,7],
              [4,5,3,3,6,5,2,4,3,7]])

lab = [0,1,1,0,1,0,1,1,0,0] #Corresponding labels (in this case 2 samples but could be extended further)

n_samples = x.shape[1] #One graph vertex per sample (column of x)
assert len(lab) == n_samples, "need exactly one label per sample"

g = Graph(n_samples) #Initiate graph with one vertex per labelled sample

#Create an edge between every possible pair of vertices, weighted by the Euclidean distance between
#the two samples. This adds n choose 2 = n(n-1)/2 edges, which is inefficient and could be optimized.
for i in range(n_samples):
    for j in range(i + 1, n_samples):
        dist = np.linalg.norm(x[:, i] - x[:, j])
        g.add_edge(i, j, dist)

connects = g.kruskal() #Find pairs connected by MST

#Observed number of runs: start with 1 tree and add one for every MST edge that joins
#vertices with different labels (cutting those edges splits the tree into that many pieces).
#The loop unpacks into v1, v2 so that the data array x is not overwritten.
runs_true = 1
for v1, v2 in connects:
    if lab[v1] != lab[v2]:
        runs_true += 1

nperm = 100000
rng = random.Random(0) #Dedicated, seeded generator so the p-value is reproducible on re-run
runs = []

#Here we randomly rearrange the labels and determine the number of runs for each of the nperm permutations
for itr in range(nperm):
    lab_shuffle = rng.sample(lab, len(lab))
    run_val = 1
    for v1, v2 in connects:
        if lab_shuffle[v1] != lab_shuffle[v2]:
            run_val += 1
    runs.append(run_val)

#Standardize the observed and permuted run counts with the permutation mean and standard deviation
mean_runs = np.mean(runs)
sd_runs = np.std(runs)

W_perm = (np.asarray(runs) - mean_runs) / sd_runs

W_true = (runs_true - mean_runs) / sd_runs

#Lower-tail permutation p-value; the +1 terms count the observed labelling as one of the permutations
pvalue = (np.sum(W_perm <= W_true) + 1) / (len(W_perm) + 1)

print('Test Statistic:', W_true, 'Permutation Statistic:', W_perm, 'p-value:', pvalue)

#Here, with 100,000 permutations, the p-value of about 0.36 is far above 0.05, so we fail to reject the
#null hypothesis that the labels are unrelated to the data (which makes sense given they are
#random data points)


Test Statistic: -0.6945024814469438 Permutation Statistic: [-1.39051531  1.39353601  0.69752318 ... -0.69450248 -1.39051531
  0.69752318] p-value: 0.3586164138358616


In [18]:
#As opposed to data designed to be somewhat correlated

x = np.array([[1,5,2,7,6,3,1,8,9,3],  #Array with rows as features, columns as samples
              [1,5,2,7,6,3,1,8,9,3],
              [1,5,2,7,6,3,1,8,9,3],
              [1,5,2,7,6,3,1,8,9,3]])

lab = [0,1,0,1,1,0,0,1,1,0] #Corresponding labels (in this case 2 samples but could be extended further)

n_samples = x.shape[1] #One graph vertex per sample (column of x)
assert len(lab) == n_samples, "need exactly one label per sample"

g = Graph(n_samples) #Initiate graph with one vertex per labelled sample

#Create an edge between every possible pair of vertices, weighted by the Euclidean distance between
#the two samples. This adds n choose 2 = n(n-1)/2 edges, which is inefficient and could be optimized.
for i in range(n_samples):
    for j in range(i + 1, n_samples):
        dist = np.linalg.norm(x[:, i] - x[:, j])
        g.add_edge(i, j, dist)

connects = g.kruskal() #Find pairs connected by MST

#Observed number of runs: start with 1 tree and add one for every MST edge that joins
#vertices with different labels (cutting those edges splits the tree into that many pieces).
#The loop unpacks into v1, v2 so that the data array x is not overwritten.
runs_true = 1
for v1, v2 in connects:
    if lab[v1] != lab[v2]:
        runs_true += 1

nperm = 100000
rng = random.Random(0) #Dedicated, seeded generator so the p-value is reproducible on re-run
runs = []

#Here we randomly rearrange the labels and determine the number of runs for each of the nperm permutations
for itr in range(nperm):
    lab_shuffle = rng.sample(lab, len(lab))
    run_val = 1
    for v1, v2 in connects:
        if lab_shuffle[v1] != lab_shuffle[v2]:
            run_val += 1
    runs.append(run_val)

#Standardize the observed and permuted run counts with the permutation mean and standard deviation
mean_runs = np.mean(runs)
sd_runs = np.std(runs)

W_perm = (np.asarray(runs) - mean_runs) / sd_runs

W_true = (runs_true - mean_runs) / sd_runs

#Lower-tail permutation p-value; the +1 terms count the observed labelling as one of the permutations
pvalue = (np.sum(W_perm <= W_true) + 1) / (len(W_perm) + 1)

print('Test Statistic:', W_true, 'Permutation Statistic:', W_perm, 'p-value:', pvalue)

#And as this data was designed with the purpose of being obviously correlated, a p-value of less than .05 checks
#out, as we successfully reject the null hypothesis

Test Statistic: -2.7481626074018215 Permutation Statistic: [-0.68727765 -1.3742393  -0.68727765 ...  0.68664565  1.3736073
 -0.68727765] p-value: 0.007819921800781992
