In [2]:
!pip install hyppo

Collecting hyppo
  Downloading hyppo-0.3.2.tar.gz (84 kB)
[?25l[K     |████                            | 10 kB 16.2 MB/s eta 0:00:01[K     |███████▉                        | 20 kB 12.8 MB/s eta 0:00:01[K     |███████████▊                    | 30 kB 6.3 MB/s eta 0:00:01[K     |███████████████▋                | 40 kB 5.6 MB/s eta 0:00:01[K     |███████████████████▌            | 51 kB 4.2 MB/s eta 0:00:01[K     |███████████████████████▍        | 61 kB 4.9 MB/s eta 0:00:01[K     |███████████████████████████▎    | 71 kB 4.9 MB/s eta 0:00:01[K     |███████████████████████████████▏| 81 kB 5.4 MB/s eta 0:00:01[K     |████████████████████████████████| 84 kB 1.9 MB/s 
Building wheels for collected packages: hyppo
  Building wheel for hyppo (setup.py) ... [?25l[?25hdone
  Created wheel for hyppo: filename=hyppo-0.3.2-py3-none-any.whl size=134084 sha256=8e67970381fa2dc9cde775cbae4770e34875b5977c4ecc4b1a429d2d6d818e42
  Stored in directory: /root/.cache/pip/wheels/a2/cf/83/86ba

In [7]:
import numpy as np
from sklearn.metrics import pairwise_distances, pairwise_kernels
from sklearn.utils import check_random_state
from hyppo.tools import compute_dist, check_perm_blocks, check_perm_block, contains_nan, check_reps
from joblib import Parallel, delayed
from math import ceil
from scipy import stats 

In [17]:

#all check functions taken from hyppo and adjusted for three inputs
def check_ndarray_xyz(x, y, z):
    """Raise ``TypeError`` unless x, y and z are all ``numpy.ndarray`` objects.

    Only the container type is inspected; the element dtype is not checked.
    """
    if all(isinstance(candidate, np.ndarray) for candidate in (x, y, z)):
        return
    raise TypeError("x, y, and z must be ndarrays")

def convert_xyz_float64(x, y, z):
    """Return copies of x, y and z cast to ``np.float64``, as a 3-tuple in that order."""
    as_float64 = tuple(
        np.asarray(candidate).astype(np.float64) for candidate in (x, y, z)
    )
    return as_float64

                        
class _CheckInputs:
    """Validate the x, y and z arrays of a conditional independence test.

    Calling the instance runs every check in turn and returns the three
    inputs as 2-D ``np.float64`` arrays.

    Parameters
    ----------
    x, y, z : ndarray
        Sample arrays, either 1-D (promoted to a single column) or 2-D.
    reps : int, optional
        Number of replications. When truthy it is handed to hyppo's
        ``check_reps``; ``None`` (and ``0``) skip that check.

    Raises
    ------
    TypeError
        If any input is not an ``ndarray``.
    ValueError
        If an input is not 1-D/2-D, has 3 or fewer rows, has a different
        number of rows than the others, or has zero variance. hyppo's
        ``contains_nan`` / ``check_reps`` may raise as well.
    """

    def __init__(self, x, y, z, reps=None):
        self.x = x
        self.y = y
        self.z = z
        self.reps = reps

    def __call__(self):
        check_ndarray_xyz(self.x, self.y, self.z)
        contains_nan(self.x)
        contains_nan(self.y)
        contains_nan(self.z)
        self.x, self.y, self.z = self.check_dim_xyz()
        self.x, self.y, self.z = convert_xyz_float64(self.x, self.y, self.z)
        self._check_min_samples()
        # The statistic pairs row i of x, y and z, so the row counts must agree.
        self._check_sample_count()
        self._check_variance()

        if self.reps:
            check_reps(self.reps)

        return self.x, self.y, self.z

    def _check_min_samples(self):
        """Require strictly more than 3 samples (rows) in each input."""
        nx = self.x.shape[0]
        ny = self.y.shape[0]
        nz = self.z.shape[0]

        if nx <= 3 or ny <= 3 or nz <= 3:
            raise ValueError("Number of samples is too low")

    def _check_sample_count(self):
        """Require x, y and z to contain the same number of samples (rows)."""
        nx = self.x.shape[0]
        ny = self.y.shape[0]
        nz = self.z.shape[0]

        if not (nx == ny == nz):
            raise ValueError(
                "Shape mismatch, x, y, and z must have the same number of "
                "samples; found {}, {}, and {}".format(nx, ny, nz)
            )

    @staticmethod
    def _as_2d(arr, name):
        """Return `arr` as a 2-D array, promoting 1-D input to one column."""
        if arr.ndim == 1:
            return arr[:, np.newaxis]
        if arr.ndim != 2:
            raise ValueError(
                "Expected a 2-D array `{}`, found shape {}".format(name, arr.shape)
            )
        return arr

    def check_dim_xyz(self):
        """Convert x, y and z to 2-D, store them on the instance and return them."""
        self.x = self._as_2d(self.x, "x")
        self.y = self._as_2d(self.y, "y")
        self.z = self._as_2d(self.z, "z")
        return self.x, self.y, self.z

    def _check_variance(self):
        """Reject inputs whose overall variance (over all entries) is zero."""
        if np.var(self.x) == 0 or np.var(self.y) == 0 or np.var(self.z) == 0:
            raise ValueError("Test cannot be run, one of the inputs has 0 variance")
                

def conditional_dcorr(x,y,z,kernel_type, reps = 1000, workers = 1,   is_distsim=True,
        perm_blocks=None,
        random_state=None):
    """Validate the inputs and return the conditional distance correlation statistic.

    Parameters
    ----------
    x, y, z : ndarray
        Sample arrays; validated and converted by ``_CheckInputs``.
    kernel_type, workers, is_distsim, perm_blocks, random_state
        Accepted for interface compatibility but not used by this function.
    reps : int, default 1000
        Only validated (through ``_CheckInputs``); no resampling happens here.

    Returns
    -------
    The value returned by ``Statistic(x, y, z)`` on the validated inputs.

    Notes
    -----
    No p-value is computed; ``conduct_cdc_test`` is not called from here.
    """
    x, y, z = _CheckInputs(x, y, z, reps=reps)()
    # Statistic builds the distance and kernel matrices itself, so they are
    # not precomputed (and thrown away) here.
    return Statistic(x, y, z)

#Base Statistic function
def Statistic(x,y,z):
    """Compute the conditional distance correlation statistic from raw samples.

    x and y are passed to hyppo's ``compute_dist`` with the Euclidean metric
    (presumably yielding their pairwise distance matrices -- confirm against
    hyppo), z is turned into an RBF kernel matrix, and the three matrices are
    handed to ``condition_distance_correlation_stats``.
    """
    dist_x, dist_y = compute_dist(x, y, metric="euclidean")
    rbf_kernel = pairwise_kernels(z, metric="rbf", n_jobs=1)
    rbf_kernel = np.asarray(rbf_kernel).astype('float64')
    return condition_distance_correlation_stats(dist_x, dist_y, rbf_kernel)

#primary method for statistic calculation
def condition_distance_correlation_stats(distance_x, distance_y, kernel_density_estimation):
    """Average the local (kernel-weighted) distance correlations over all samples.

    For every sample ``i`` the i-th kernel row is normalised to sum to one and
    used as weights: both distance matrices are double-centred under those
    weights (``weight_distance_anova``), the weighted covariance-type sums
    xy, xx and yy are formed, and the local correlation is
    ``xy / sqrt(xx * yy)`` (0 when the denominator is not positive).

    Parameters
    ----------
    distance_x, distance_y : ndarray of shape (n, n)
        Pairwise distance matrices.
    kernel_density_estimation : ndarray of shape (n, n)
        Kernel matrix of the conditioning variable. It is copied, so the
        caller's array is left untouched.

    Returns
    -------
    float
        Mean of the n local correlations.
    """
    # Work on a float copy: normalising in place would silently rewrite the
    # caller's kernel matrix (and truncate to zero for integer dtypes).
    weights = np.array(kernel_density_estimation, dtype=np.float64)
    weights /= weights.sum(axis=1, keepdims=True)

    num = distance_x.shape[0]
    local_correlation = np.zeros(num)

    for i in range(num):
        row_weight = weights[i]
        anova_x = weight_distance_anova(distance_x, row_weight)
        anova_y = weight_distance_anova(distance_y, row_weight)
        # pair_weight[k, j] = w[k] * w[j], the weight of the pair (k, j)
        pair_weight = np.outer(row_weight, row_weight)

        covariance_xy = np.sum(anova_x * anova_y * pair_weight)
        covariance_xx = np.sum(anova_x * anova_x * pair_weight)
        covariance_yy = np.sum(anova_y * anova_y * pair_weight)

        dcor_denominator = covariance_xx * covariance_yy
        # A NaN or non-positive denominator leaves the local value at 0.
        if dcor_denominator > 0.0:
            local_correlation[i] = covariance_xy / np.sqrt(dcor_denominator)

    return np.mean(local_correlation)


def weight_distance_anova(distance_matrix, weight):
    """Double-centre a distance matrix using weighted row means.

    With ``W = sum(weight)``, ``m[k] = sum_j d[k, j] * weight[j] / W`` and
    ``g = sum_k m[k] * weight[k] / W``, the result is
    ``d[k, j] - m[k] - m[j] + g``.

    Parameters
    ----------
    distance_matrix : ndarray of shape (n, n)
        Expected to be symmetric (as pairwise distance matrices are).
    weight : ndarray of shape (n,)
        Non-negative weights; they do not need to sum to one.

    Returns
    -------
    ndarray of shape (n, n)
        The weighted double-centred matrix (a new array).
    """
    distance_matrix = np.asarray(distance_matrix, dtype=np.float64)
    weight = np.asarray(weight, dtype=np.float64)
    weight_sum = weight.sum()

    # Matrix-vector products replace the element-by-element Python loops.
    marginal_weight_distance = distance_matrix @ weight
    weight_distance_sum = (marginal_weight_distance @ weight) / (weight_sum * weight_sum)
    marginal_weight_distance = marginal_weight_distance / weight_sum

    return (
        distance_matrix
        - marginal_weight_distance[:, np.newaxis]
        - marginal_weight_distance[np.newaxis, :]
        + weight_distance_sum
    )

def vector_weight_sum(vector1, weight):
    """Return ``sum_i vector1[i] * weight[i]`` over the length of ``vector1``.

    Terms are accumulated one at a time, in index order, starting from 0.0.
    """
    accumulated = 0.0
    for position in range(vector1.shape[0]):
        accumulated += weight[position] * vector1[position]
    return accumulated

#overarching function for calculating the bootstrap p-value of an observed statistic
def conduct_cdc_test(distance_x, distance_y, kernel, statistic,seed, num_bootstrap = 99):
    """Bootstrap p-value for an observed conditional distance correlation.

    For each replicate, every sample ``i`` draws a partner index from the
    distribution given by the normalised i-th kernel row; ``distance_x`` is
    rearranged by those indices and the statistic is recomputed against the
    unchanged ``distance_y``.

    Parameters
    ----------
    distance_x, distance_y : ndarray of shape (n, n)
        Pairwise distance matrices of x and y.
    kernel : ndarray of shape (n, n)
        Kernel matrix of the conditioning variable; rows are normalised on a
        copy, so the caller's array is not modified.
    statistic : float
        Observed value of the test statistic.
    seed : int or None
        Seed for ``np.random.default_rng``; ``None`` gives fresh entropy.
    num_bootstrap : int, default 99
        Number of bootstrap replicates; must be positive.

    Returns
    -------
    float
        ``(1 + #{replicates >= statistic}) / (1 + num_bootstrap)``.

    Raises
    ------
    ValueError
        If ``num_bootstrap`` is not positive.
    """
    if num_bootstrap <= 0:
        # Without replicates there is no null distribution to compare against.
        raise ValueError("num_bootstrap must be a positive integer")

    # default_rng(None) already seeds from OS entropy, so no branch is needed.
    rng = np.random.default_rng(seed)

    # multinomial sampling needs each row to be a probability vector.
    probability = np.array(kernel, dtype=np.float64)
    probability /= probability.sum(axis=1, keepdims=True)
    num = probability.shape[0]

    # Draw all resampling indices up front.
    random_sample_index = np.zeros((num_bootstrap, num), dtype=np.int64)
    for i in range(num):
        for j in range(num_bootstrap):
            drawn = sample_multinomial_distribution(probability[i], rng)
            # The sampler returns the drawn position wrapped in an array.
            random_sample_index[j, i] = np.ravel(drawn)[0]

    larger_num = 0
    for j in range(num_bootstrap):
        bootstrap_distance_x = rearrange_matrix(distance_x, random_sample_index[j])
        # The inputs are distance matrices already, so call the matrix-level
        # statistic rather than Statistic(), which expects raw samples.
        value = condition_distance_correlation_stats(
            bootstrap_distance_x, distance_y, probability
        )
        if value >= statistic:
            larger_num += 1

    p_value = (1.0 + larger_num) / (1.0 + num_bootstrap)
    return p_value

#bootstrapped matrix calculation
def rearrange_matrix(dist_matrix,rearrange_index):
    """Reorder rows and columns of a matrix by the same index vector.

    Parameters
    ----------
    dist_matrix : array_like of shape (n, n)
        Matrix to resample.
    rearrange_index : 1-D array_like of length m
        Positions to pick; float values are cast to ``int64``.

    Returns
    -------
    ndarray of shape (m, m), dtype float64
        New array with ``out[i, k] = dist_matrix[index[i], index[k]]``.
    """
    index = np.asarray(rearrange_index).astype('int64')
    dist_matrix = np.asarray(dist_matrix)
    # np.ix_ builds the row/column open mesh, replacing the two-pass copy loops;
    # astype returns a fresh float64 array, as the loop-filled buffer did.
    return dist_matrix[np.ix_(index, index)].astype(np.float64)

def generate_random_sample_index(replication_number, probability_matrix,random_number_generator):
    """Draw resampling indices for every replicate and every sample.

    Parameters
    ----------
    replication_number : int
        Number of replicates (rows of the result).
    probability_matrix : ndarray of shape (n, n)
        Row ``i`` holds non-negative weights for sample ``i``; each row is
        normalised to sum to one before sampling.
    random_number_generator : numpy.random.Generator
        Source of randomness passed on to ``sample_multinomial_distribution``.

    Returns
    -------
    ndarray of shape (replication_number, n), dtype int64
        ``out[j, i]`` is the index drawn for sample ``i`` in replicate ``j``.
    """
    probability_matrix = np.asarray(probability_matrix, dtype=np.float64)
    # One column per sample: shape[0], not the total element count `.size`.
    num = probability_matrix.shape[0]
    random_sample_index = np.zeros((replication_number, num), dtype=np.int64)

    for i in range(num):
        row_probability = probability_matrix[i] / probability_matrix[i].sum()
        for j in range(replication_number):
            drawn = sample_multinomial_distribution(row_probability, random_number_generator)
            # The sampler returns the drawn position wrapped in an array;
            # store the scalar explicitly.
            random_sample_index[j, i] = np.ravel(drawn)[0]
    return random_sample_index

def sample_multinomial_distribution(prob_array, random_number_generator):
    """Draw one category index from the distribution ``prob_array``.

    A single multinomial trial yields a one-hot count vector; the positions
    where that vector equals 1 are returned as an integer array (one entry
    for a 1-D ``prob_array``).
    """
    one_hot = random_number_generator.multinomial(1, prob_array)
    hit_positions = np.nonzero(one_hot == 1)
    return hit_positions[0]

In [69]:
# Scratch check: indexing a 2-D array with a single integer selects that row.
a = np.array([[0,1],[2,3]])
print(a[0])

[0 1]


In [20]:
#helper function for power calculation
def _indep_perm_stat(x,y,z, random_state=None):
    """Return the observed statistic and one locally-resampled statistic.

    Parameters
    ----------
    x, y, z : ndarray
        Sample arrays; validated and converted by ``_CheckInputs``.
    random_state : int, Generator or None, default None
        Passed to ``np.random.default_rng`` to drive the resampling;
        ``None`` draws fresh entropy on every call.

    Returns
    -------
    (obs_stat, perm_stat)
        ``obs_stat`` is ``conditional_dcorr`` on the data. ``perm_stat`` is
        the statistic after rearranging the x distance matrix, where each
        sample ``i`` draws its replacement index from the normalised i-th
        row of the RBF kernel matrix of z.
    """
    x, y, z = _CheckInputs(x, y, z, reps=1000)()
    obs_stat = conditional_dcorr(
        x, y, z, "rbf", reps=1000, workers=1, is_distsim=True,
        perm_blocks=None, random_state=None,
    )

    distx, disty = compute_dist(x, y, metric="euclidean")
    kernel_density_estimation = pairwise_kernels(z, metric="rbf", n_jobs=1)
    kernel_density_estimation = np.asarray(kernel_density_estimation).astype('float64')
    # Normalise each row so it can be used as a probability vector.
    kernel_density_estimation /= kernel_density_estimation.sum(axis=1, keepdims=True)

    rng = np.random.default_rng(random_state)
    num = kernel_density_estimation.shape[0]
    # Indices are integers; store them as such instead of in a float buffer.
    random_sample_index = np.zeros(num, dtype=np.int64)
    for i in range(num):
        drawn = sample_multinomial_distribution(kernel_density_estimation[i], rng)
        # The sampler returns the drawn position wrapped in an array; assigning
        # a size-1 array to a scalar slot is deprecated in NumPy.
        random_sample_index[i] = np.ravel(drawn)[0]

    # Bootstrap distance matrix of x, then the statistic on the resampled data.
    bootstrap_distance_x = rearrange_matrix(distx, random_sample_index)
    perm_stat = condition_distance_correlation_stats(
        bootstrap_distance_x, disty, kernel_density_estimation
    )
    return obs_stat, perm_stat
#Example 12
def power_depend(sample_size, reps=1000, alpha=0.05):
    """Estimate empirical power under the dependent alternative ("Example 12").

    Each simulation draws four independent Student-t (2 d.o.f.) columns for
    z, sets x equal to z, and builds y from products of the z columns. The
    observed statistics form the alternative distribution and the resampled
    statistics the null distribution.

    Parameters
    ----------
    sample_size : int
        Number of samples per simulated data set.
    reps : int, default 1000
        Number of simulated data sets.
    alpha : float, default 0.05
        Significance level used for the cutoff.

    Returns
    -------
    float
        ``(1 + #{observed >= cutoff}) / (1 + reps)``, where the cutoff is the
        sorted null statistic at index ``ceil(reps * (1 - alpha))``.

    Raises
    ------
    ValueError
        If ``reps`` is not positive or ``alpha`` is not in (0, 1).
    """
    if reps < 1:
        raise ValueError("reps must be a positive integer")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")

    alt_dist = np.empty(reps)
    null_dist = np.empty(reps)
    for rep in range(reps):
        # dependence distribution: z1..z4 are drawn in order, x duplicates z
        z = np.column_stack(
            [np.random.standard_t(2, sample_size) for _ in range(4)]
        )
        x = z.copy()
        z1, z2, z3, z4 = z.T
        y = np.column_stack((
            z1 * z2 + (z3 ** 2) * (z4 ** 2),
            z1 ** 3 + (z2 ** 2) * (z3 * z4),
        ))
        alt_dist[rep], null_dist[rep] = _indep_perm_stat(x, y, z)

    # Clamp so that a very small alpha cannot index past the last element.
    cutoff_index = min(ceil(reps * (1 - alpha)), reps - 1)
    cutoff = np.sort(null_dist)[cutoff_index]
    empirical_power = (1 + (alt_dist >= cutoff).sum()) / (1 + reps)
    return empirical_power
#Example 4
def type_1_err_indep(sample_size, reps=1000, alpha=0.05):
    """Estimate the type I error under conditional independence ("Example 4").

    Each simulation draws independent Binomial(10, 0.5) vectors x1, y1, z1
    and z2, sets ``x = x1 + z1 + z2`` and ``y = y1 + z1 + z2``, and uses
    ``(z1, z2)`` as the conditioning variable.

    Parameters
    ----------
    sample_size : int
        Number of samples per simulated data set.
    reps : int, default 1000
        Number of simulated data sets.
    alpha : float, default 0.05
        Significance level used for the cutoff.

    Returns
    -------
    float
        ``(1 + #{observed >= cutoff}) / (1 + reps)``, where the cutoff is the
        sorted resampled statistic at index ``ceil(reps * (1 - alpha))``.

    Raises
    ------
    ValueError
        If ``reps`` is not positive or ``alpha`` is not in (0, 1).
    """
    if reps < 1:
        raise ValueError("reps must be a positive integer")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")

    alt_dist = np.empty(reps)
    null_dist = np.empty(reps)
    for rep in range(reps):
        # independence distribution: draws happen in the order x1, y1, z1, z2
        x1 = np.random.binomial(10, 0.5, sample_size)
        y1 = np.random.binomial(10, 0.5, sample_size)
        z1 = np.random.binomial(10, 0.5, sample_size)
        z2 = np.random.binomial(10, 0.5, sample_size)
        x = x1 + z1 + z2
        y = y1 + z1 + z2
        z = np.column_stack((z1, z2)).astype(np.float64)
        alt_dist[rep], null_dist[rep] = _indep_perm_stat(x, y, z)

    # Clamp so that a very small alpha cannot index past the last element.
    cutoff_index = min(ceil(reps * (1 - alpha)), reps - 1)
    cutoff = np.sort(null_dist)[cutoff_index]
    type_1_err = (1 + (alt_dist >= cutoff).sum()) / (1 + reps)
    return type_1_err