In [2]:
!pip install hyppo

Collecting hyppo
  Downloading hyppo-0.3.2.tar.gz (84 kB)
[K     |████████████████████████████████| 84 kB 2.0 MB/s 
Building wheels for collected packages: hyppo
  Building wheel for hyppo (setup.py) ... [?25l[?25hdone
  Created wheel for hyppo: filename=hyppo-0.3.2-py3-none-any.whl size=134084 sha256=db3a8e27af47a6b7a68f7229da8bc7aa64d26372a41611bb04a5abdfa56d9886
  Stored in directory: /root/.cache/pip/wheels/a2/cf/83/86bab6230c80c120ba769dd0643db951ac795d93e5cb8ee6c5
Successfully built hyppo
Installing collected packages: hyppo
Successfully installed hyppo-0.3.2


In [6]:
import numpy as np
from sklearn.metrics import pairwise_distances, pairwise_kernels
from sklearn.utils import check_random_state
from hyppo.tools import compute_dist, check_perm_blocks, check_perm_block, contains_nan, check_reps
from joblib import Parallel, delayed
from math import ceil

In [15]:
def check_ndarray_xyz(x, y, z):
    """Validate that ``x``, ``y`` and ``z`` are all NumPy arrays.

    Only the container type is inspected; the element dtype is not checked.

    Raises
    ------
    TypeError
        If at least one of the three inputs is not an ``np.ndarray``.
    """
    all_arrays = all(isinstance(arr, np.ndarray) for arr in (x, y, z))
    if not all_arrays:
        raise TypeError("x, y, and z must be ndarrays")

def convert_xyz_float64(x, y, z):
    """Return ``np.float64`` versions of ``x``, ``y`` and ``z``.

    Each input is passed through ``np.asarray`` and then cast with
    ``astype(np.float64)``, so the returned arrays are new objects even when
    the input already had dtype ``float64``.

    Returns
    -------
    tuple of ndarray
        ``(x, y, z)`` converted to ``np.float64``, in that order.
    """
    converted = [np.asarray(arr).astype(np.float64) for arr in (x, y, z)]
    return tuple(converted)

                        
class _CheckInputs:
    """Checks inputs for all independence tests"""

    def __init__(self, x, y,z, reps=None):
        self.x = x
        self.y = y
        self.z = z
        self.reps = reps

    def __call__(self):
        check_ndarray_xyz(self.x, self.y, self.z)
        contains_nan(self.x)
        contains_nan(self.y)
        contains_nan(self.z)
        self.x, self.y, self.z = self.check_dim_xyz()
        self.x, self.y, self.z = convert_xyz_float64(self.x, self.y, self.z)
        self._check_min_samples()
        self._check_variance()

        if self.reps:
            check_reps(self.reps)

        return self.x, self.y, self.z
    def _check_min_samples(self):
        """Check if the number of samples is at least 3"""
        nx = self.x.shape[0]
        ny = self.y.shape[0]
        nz = self.z.shape[0]

        if nx <= 3 or ny <= 3 or nz <= 3:
            raise ValueError("Number of samples is too low")
    
    def check_dim_xyz(self):
        """Convert x and y and z to proper dimensions"""
        if self.x.ndim == 1:
            self.x = self.x[:, np.newaxis]
        elif self.x.ndim != 2:
            raise ValueError(
                "Expected a 2-D array `x`, found shape " "{}".format(self.x.shape)
            )
        if self.y.ndim == 1:
            self.y = self.y[:, np.newaxis]
        elif self.y.ndim != 2:
            raise ValueError(
                "Expected a 2-D array `y`, found shape " "{}".format(self.y.shape)
            )
        if self.z.ndim == 1:
            self.z = self.z[:, np.newaxis]
        elif self.z.ndim != 2:
            raise ValueError(
                "Expected a 2-D array `z`, found shape " "{}".format(self.z.shape)
            )
        return self.x, self.y, self.z
                
    def _check_variance(self):
        if np.var(self.x) == 0 or np.var(self.y) == 0 or np.var(self.z) == 0:
            raise ValueError("Test cannot be run, one of the inputs has 0 variance")
                

def conditional_dcorr(
    x,
    y,
    z,
    kernel_type,
    reps=1000,
    workers=1,
    is_distsim=True,
    perm_blocks=None,
    random_state=None,
):
    """Compute the conditional distance correlation statistic of ``x`` and ``y`` given ``z``.

    Parameters
    ----------
    x, y, z : ndarray
        Input data; validated and converted by ``_CheckInputs``.
    kernel_type : str or callable
        Passed as ``metric`` to ``sklearn.metrics.pairwise_kernels`` to build
        the kernel matrix of ``z``.
    reps : int, default 1000
        Only validated (through ``_CheckInputs``); no permutations are run here.
    workers, is_distsim, perm_blocks, random_state
        Accepted but currently unused, because the permutation test is not
        wired into this function.

    Returns
    -------
    stat
        The value returned by ``Statistic`` for the Euclidean distance matrices
        of ``x`` and ``y`` and the kernel matrix of ``z``.
    """
    x, y, z = _CheckInputs(x, y, z, reps=reps)()

    dist_x, dist_y = compute_dist(x, y, metric="euclidean")
    kernel_weights = pairwise_kernels(z, metric=kernel_type, n_jobs=1)

    # TODO: hook up perm_test(Statistic, dist_x, dist_y, kernel_weights, reps,
    # workers, is_distsim, perm_blocks, random_state) to also obtain a p-value
    # and the null distribution.
    return Statistic(dist_x, dist_y, kernel_weights)

def Statistic(distx, disty, kernel_density_estimation):
    """Test statistic used by the conditional test.

    Thin wrapper that forwards its three arguments unchanged to
    ``condition_distance_correlation_stats`` and returns its result.
    """
    statistic = condition_distance_correlation_stats(
        distx, disty, kernel_density_estimation
    )
    return statistic
                
def condition_distance_correlation_stats(distance_x, distance_y, kernel_density_estimation):
    """Average the per-sample conditional distance correlations.

    ``compute_condition_distance_correlation`` yields one value per sample;
    this function returns their arithmetic mean.
    """
    return np.mean(
        compute_condition_distance_correlation(
            distance_x, distance_y, kernel_density_estimation
        )
    )

def compute_condition_distance_correlation(distance_x, distance_y, kernel_density_estimation):
    """Compute one conditional distance correlation per sample.

    For every sample ``i`` the kernel row ``kernel_density_estimation[i]`` is
    used as a weight vector ``w``. Both distance matrices are centred with
    ``weight_distance_anova`` and the weighted sums

        sum_{k, j} A[k, j] * B[k, j] * w[k] * w[j]

    are formed for the pairs (x, y), (x, x) and (y, y). The result for sample
    ``i`` is the (x, y) sum divided by the square root of the product of the
    other two, or ``0.0`` when that product is not strictly positive.

    Parameters
    ----------
    distance_x, distance_y : ndarray of shape (n, n)
        Pairwise distance matrices.
    kernel_density_estimation : ndarray of shape (n, n)
        Kernel matrix of the conditioning variable; row ``i`` holds the
        weights used for sample ``i``.

    Returns
    -------
    ndarray of shape (n,)
        The per-sample conditional distance correlations.
    """
    num = distance_x.shape[0]
    local_correlation = np.zeros(num)

    for i in range(num):
        weights = kernel_density_estimation[i]
        anova_x = weight_distance_anova(distance_x, weights)
        anova_y = weight_distance_anova(distance_y, weights)

        # pair_weights[k, j] == w[k] * w[j]; one NumPy reduction replaces the
        # former pure-Python double loop over (k, j).
        pair_weights = np.outer(weights, weights)
        covariance_xy = np.sum(anova_x * anova_y * pair_weights)
        covariance_xx = np.sum(anova_x * anova_x * pair_weights)
        covariance_yy = np.sum(anova_y * anova_y * pair_weights)

        dcor_denominator = covariance_xx * covariance_yy
        # A non-positive (or NaN) denominator leaves the entry at 0.0.
        if dcor_denominator > 0.0:
            local_correlation[i] = covariance_xy / np.sqrt(dcor_denominator)

    return local_correlation

def weight_distance_anova(distance_matrix, weight):
    """Weighted double-centring of a distance matrix.

    With ``W = sum(weight)``, ``m[k] = sum_j d[k, j] * w[j] / W`` and
    ``s = sum_k m[k] * w[k] / W``, entry ``(k, j)`` of the result is
    ``d[k, j] - m[k] - m[j] + s``.

    Parameters
    ----------
    distance_matrix : ndarray of shape (n, n)
        Pairwise distances.
    weight : ndarray of shape (n,)
        Weight of every sample.

    Returns
    -------
    ndarray of shape (n, n)
        The centred matrix. It is always symmetric: the strict upper triangle
        is a mirror of the lower triangle, which only matters when
        ``distance_matrix`` itself is not symmetric.
    """
    distance_matrix = np.asarray(distance_matrix)
    weight = np.asarray(weight)
    weight_sum = np.sum(weight)

    # Weighted row sums, then the weighted grand mean, then the row means.
    marginal = distance_matrix @ weight
    grand_mean = (marginal @ weight) / (weight_sum * weight_sum)
    marginal = marginal / weight_sum

    centered = (
        distance_matrix
        - marginal[:, np.newaxis]
        - marginal[np.newaxis, :]
        + grand_mean
    )

    # The original element-wise loop ended up with the value computed from the
    # lower triangle in both (k, j) and (j, k); reproduce that by mirroring.
    return np.tril(centered) + np.tril(centered, -1).T

def vector_weight_sum(vector1, weight):
    """Return ``sum(vector1[i] * weight[i])`` over the entries of ``vector1``.

    The number of terms is ``vector1.shape[0]``; any extra trailing entries of
    ``weight`` are ignored. The accumulation starts from ``0.0``, which is also
    the result for an empty ``vector1``.
    """
    length = vector1.shape[0]
    products = (vector1[pos] * weight[pos] for pos in range(length))
    return sum(products, 0.0)


class _PermNode(object):
    """Helper class for nodes in _PermTree."""

    def __init__(self, parent, label=None, index=None):
        self.children = []
        self.parent = parent
        self.label = label
        self.index = index

    def get_leaf_indices(self):
        if len(self.children) == 0:
            return [self.index]
        else:
            indices = []
            for child in self.children:
                indices += child.get_leaf_indices()
            return indices

    def add_child(self, child):
        self.children.append(child)

    def get_children(self):
        return self.children



class _PermTree(object):
    """Tree representation of dependencies for restricted permutations"""

    def __init__(self, perm_blocks):
        perm_blocks = check_perm_blocks(perm_blocks)
        self.root = _PermNode(None)
        self._add_levels(self.root, perm_blocks, np.arange(perm_blocks.shape[0]))
        indices = self.root.get_leaf_indices()
        self._index_order = np.argsort(indices)

    def _add_levels(self, root: _PermNode, perm_blocks, indices):
        # Add new child node for each unique label, then recurse or end
        if perm_blocks.shape[1] == 0:
            for idx in indices:
                child_node = _PermNode(parent=root, label=1, index=idx)
                root.add_child(child_node)
        else:
            perm_block = check_perm_block(perm_blocks[:, 0])
            for label in np.unique(perm_block):
                idxs = np.where(perm_block == label)[0]
                child_node = _PermNode(parent=root, label=label)
                root.add_child(child_node)
                self._add_levels(child_node, perm_blocks[idxs, 1:], indices[idxs])

    def _permute_level(self, node, rng=None):
        if rng is None:
            rng = np.random
        if len(node.get_children()) == 0:
            return [node.index]
        else:
            indices, labels = zip(
                *[
                    (self._permute_level(child), child.label)
                    for child in node.get_children()
                ]
            )
            shuffle_children = [i for i, label in enumerate(labels) if label >= 0]
            indices = np.asarray(indices)
            if len(shuffle_children) > 1:
                indices[shuffle_children] = indices[rng.permutation(shuffle_children)]
            return np.concatenate(indices)

    def permute_indices(self, rng=None):
        return self._permute_level(self.root, rng)[self._index_order]

    def original_indices(self):
        return np.arange(len(self._index_order))




class _PermGroups(object):
    """Helper function to calculate parallel p-value."""

    def __init__(self, x, perm_blocks=None):
        self.n = x.shape[0]
        if perm_blocks is None:
            self.perm_tree = None
        else:
            self.perm_tree = _PermTree(perm_blocks)

    def __call__(self, rng=None):
        if rng is None:
            rng = np.random
        if self.perm_tree is None:
            order = rng.permutation(self.n)
        else:
            order = self.perm_tree.permute_indices(rng)

        return order


def _perm_stat(calc_stat, x, y, kernel_density_estimation, is_distsim=True, permuter=None, random_state=None):
    """Evaluate the test statistic on one permutation of ``x``.

    Parameters
    ----------
    calc_stat : callable
        Called as ``calc_stat(permuted_x, y, kernel_density_estimation)``.
    x, y : ndarray
        Inputs of the statistic; only ``x`` is permuted.
    kernel_density_estimation : ndarray
        Forwarded unchanged to ``calc_stat``.
    is_distsim : bool, default True
        If True, rows and columns of ``x`` are permuted together (``x`` is
        indexed as a square matrix); otherwise only the rows are permuted.
    permuter : callable, optional
        Called with the random generator to obtain the index order. When
        omitted, a plain permutation of ``range(x.shape[0])`` is drawn.
    random_state : int, RandomState or None
        Passed to ``sklearn.utils.check_random_state``.

    Returns
    -------
    The value returned by ``calc_stat`` for the permuted data.
    """
    rng = check_random_state(random_state)
    order = rng.permutation(x.shape[0]) if permuter is None else permuter(rng)
    permx = x[order][:, order] if is_distsim else x[order]
    return calc_stat(permx, y, kernel_density_estimation)

def perm_test(
    calc_stat,
    x,
    y,
    kernel_density_estimation,
    reps=1000,
    workers=1,
    is_distsim=True,
    perm_blocks=None,
    random_state=None,
):
    """Permutation test for ``calc_stat(x, y, kernel_density_estimation)``.

    The observed statistic is compared with ``reps`` statistics obtained by
    permuting ``x`` (see ``_perm_stat``). Every replication receives its own
    integer seed, drawn up front, and the replications are dispatched through
    ``joblib.Parallel`` with ``n_jobs=workers``.

    Parameters
    ----------
    calc_stat : callable
        Statistic to evaluate.
    x, y, kernel_density_estimation : ndarray
        Arguments of ``calc_stat``.
    reps : int, default 1000
        Number of permutations.
    workers : int, default 1
        Passed to ``Parallel`` as ``n_jobs``.
    is_distsim : bool, default True
        Forwarded to ``_perm_stat``.
    perm_blocks : array-like, optional
        Forwarded to ``_PermGroups`` to restrict the permutations.
    random_state : int, RandomState or None
        When given, seeds the generator that draws the per-replication seeds;
        otherwise the global ``np.random`` state is used.

    Returns
    -------
    stat
        Observed statistic.
    pvalue : float
        ``(1 + #{null >= stat}) / (1 + reps)``.
    null_dist : ndarray
        The ``reps`` permuted statistics.
    """
    # Observed statistic on the unpermuted data.
    observed = calc_stat(x, y, kernel_density_estimation)

    # One integer seed per replication.
    seed_bound = np.iinfo(np.int32).max
    if random_state is None:
        seeds = np.random.randint(seed_bound, size=reps)
    else:
        seeds = check_random_state(random_state).randint(seed_bound, size=reps)

    # Null distribution.
    permuter = _PermGroups(x, perm_blocks)
    jobs = [
        delayed(_perm_stat)(
            calc_stat, x, y, kernel_density_estimation, is_distsim, permuter, seed
        )
        for seed in seeds
    ]
    null_dist = np.array(Parallel(n_jobs=workers)(jobs))

    pvalue = (1 + (null_dist >= observed).sum()) / (1 + reps)
    return observed, pvalue, null_dist

In [40]:
def _indep_perm_stat(x, y, z):
    """Return the observed statistic and one permuted statistic.

    Both values come from ``conditional_dcorr`` with the ``"rbf"`` kernel. The
    permuted statistic is computed after shuffling ``x`` along its first axis
    with ``np.random.permutation`` while ``y`` and ``z`` are left untouched.

    Returns
    -------
    tuple
        ``(obs_stat, perm_stat)``.
    """
    options = dict(
        reps=1000,
        workers=1,
        is_distsim=True,
        perm_blocks=None,
        random_state=None,
    )
    obs_stat = conditional_dcorr(x, y, z, "rbf", **options)
    perm_stat = conditional_dcorr(np.random.permutation(x), y, z, "rbf", **options)
    return obs_stat, perm_stat
#Example 12
def power_depend(sample_size, reps=1000, alpha=0.05):
    """Estimate the rejection rate of the test on a simulated dependent example.

    In every replication four independent columns ``z1..z4`` are drawn from a
    Student-t distribution with 2 degrees of freedom, ``x`` is set equal to
    ``z`` and ``y`` has the two columns
    ``z1*z2 + z3**2 * z4**2`` and ``z1**3 + z2**2 * (z3*z4)``.
    ``_indep_perm_stat`` then supplies one observed and one permuted statistic.

    Parameters
    ----------
    sample_size : int
        Number of samples per simulated data set.
    reps : int, default 1000
        Number of simulated data sets.
    alpha : float, default 0.05
        Significance level used to pick the cutoff from the permuted statistics.

    Returns
    -------
    float
        ``(1 + #{observed >= cutoff}) / (1 + reps)``, where ``cutoff`` is the
        sorted permuted statistic at index ``ceil(reps * (1 - alpha))``.

    Raises
    ------
    ValueError
        If ``reps`` is smaller than 1.
    """
    if reps < 1:
        raise ValueError("reps must be at least 1")

    alt_dist = []
    null_dist = []
    for _ in range(reps):
        # Draw the columns one after another so the order in which random
        # numbers are consumed matches a column-by-column fill.
        columns = [np.random.standard_t(2, sample_size) for _col in range(4)]
        z1, z2, z3, z4 = columns
        z = np.column_stack(columns)
        x = z.copy()
        y = np.column_stack(
            [
                z1 * z2 + (z3 ** 2) * (z4 ** 2),
                z1 ** 3 + (z2 ** 2) * (z3 * z4),
            ]
        )

        obs_stat, perm_stat = _indep_perm_stat(x, y, z)
        alt_dist.append(obs_stat)
        null_dist.append(perm_stat)

    # Clamp so that a small `reps` cannot index past the end of the array.
    cutoff_index = min(ceil(reps * (1 - alpha)), reps - 1)
    cutoff = np.sort(np.array(null_dist))[cutoff_index]
    empirical_power = (1 + (np.array(alt_dist) >= cutoff).sum()) / (1 + reps)
    return empirical_power
#Example 3
def type_1_err_indep(sample_size, reps=1000, alpha=0.05):
    """Estimate the rejection rate of the test on a simulated example without direct dependence.

    In every replication ``x1``, ``y1``, ``z1`` and ``z2`` are independent
    Binomial(10, 0.5) draws, ``x = x1 + z1 + z2``, ``y = y1 + z1 + z2`` and
    ``z`` has the columns ``z1`` and ``z2``, so ``x`` and ``y`` are related only
    through ``z``. ``_indep_perm_stat`` then supplies one observed and one
    permuted statistic.

    Parameters
    ----------
    sample_size : int
        Number of samples per simulated data set.
    reps : int, default 1000
        Number of simulated data sets.
    alpha : float, default 0.05
        Significance level used to pick the cutoff from the permuted statistics.

    Returns
    -------
    float
        ``(1 + #{observed >= cutoff}) / (1 + reps)``, where ``cutoff`` is the
        sorted permuted statistic at index ``ceil(reps * (1 - alpha))``.

    Raises
    ------
    ValueError
        If ``reps`` is smaller than 1.
    """
    if reps < 1:
        raise ValueError("reps must be at least 1")

    alt_dist = []
    null_dist = []
    for _ in range(reps):
        x1 = np.random.binomial(10, 0.5, sample_size)
        y1 = np.random.binomial(10, 0.5, sample_size)
        z1 = np.random.binomial(10, 0.5, sample_size)
        z2 = np.random.binomial(10, 0.5, sample_size)

        x = x1 + z1 + z2
        y = y1 + z1 + z2
        z = np.column_stack([z1, z2]).astype(np.float64)

        obs_stat, perm_stat = _indep_perm_stat(x, y, z)
        alt_dist.append(obs_stat)
        null_dist.append(perm_stat)

    # Clamp so that a small `reps` cannot index past the end of the array.
    cutoff_index = min(ceil(reps * (1 - alpha)), reps - 1)
    cutoff = np.sort(np.array(null_dist))[cutoff_index]
    type_1_err = (1 + (np.array(alt_dist) >= cutoff).sum()) / (1 + reps)
    return type_1_err

In [37]:
# Run the dependent-example simulation with 50 samples per data set and print
# the resulting empirical rejection rate.
print(power_depend(50))

1.0


In [None]:
# Run the binomial simulation (x and y related only through z) with 50 samples
# per data set and print the resulting empirical rejection rate.
print(type_1_err_indep(50))