In [1]:
import abc
import numpy
from analytic import SmoothCFTest, MeanEmbeddingTest
from unittest import TestCase
from numpy.random import seed


from warnings import warn
from numpy import mean, transpose, cov, cos, sin, shape, exp, newaxis, concatenate
from numpy.linalg import linalg, LinAlgError, solve
from scipy.stats import chi2


#testing purposes
from numpy.random import seed
import numpy 
import abc

In [2]:

# Size of each synthetic sample and the dimensionality of every observation.
num_samples = 500
dimensions = 10

# Seed NumPy's global RNG so that X and Y are identical on every run.
seed(120)
sample_shape = (num_samples, dimensions)

# X: standard-normal draws with coordinate 1 stretched by a factor of 3,
# so the two samples come from different distributions.
X = numpy.random.randn(*sample_shape)
X[:, 1] = 3 * X[:, 1]

# Y: plain standard-normal draws, taken from the same RNG stream after X.
Y = numpy.random.randn(*sample_shape)

## Benchmark: library p-values that the manual computations below must match

In [3]:
# Reference p-value from the library's SmoothCFTest on X vs. Y, using the
# scale 2**-5.
smooth_cf_test = SmoothCFTest(X, Y, scale=2.0 ** -5)
pvalue = smooth_cf_test.compute_pvalue()
pvalue

6.598161566320998e-21

In [4]:
# Reference p-value from the library's MeanEmbeddingTest on X vs. Y, using the
# scale 2**-5.
mean_embedding_test = MeanEmbeddingTest(X, Y, scale=2.0 ** -5)
pvalue = mean_embedding_test.compute_pvalue()
pvalue

2.9590753405197295e-34

## Test Mean Embedding

In [5]:

# Setup for the manual mean-embedding computation.
scale = 2 ** -5  # same scale that was passed to MeanEmbeddingTest
data_x = X * scale
data_y = Y * scale

# Number of random test points at which the two samples are compared; it is
# also the chi-squared degrees of freedom used when converting the statistic
# to a p-value.
number_of_frequencies = 5

_, dimension = data_x.shape

In [6]:
# Reseed before drawing the test points so the draw is reproducible
# (presumably mirrors the seed used inside MeanEmbeddingTest, since the
# resulting p-value matches -- verify against the library).
seed(111)
points = numpy.random.randn(number_of_frequencies, dimension)  # one test point per row

# Row j holds, for every sample i, the difference
#   exp(-||x_i - scale * t_j||^2 / 2) - exp(-||y_i - scale * t_j||^2 / 2)
# where t_j is the j-th test point.
embedding_diffs = numpy.zeros([points.shape[0], data_x.shape[0]])
for row, point in enumerate(points):
    scaled_point = scale * point
    sq_dist_x = numpy.linalg.norm(data_x - scaled_point, axis=1) ** 2
    sq_dist_y = numpy.linalg.norm(data_y - scaled_point, axis=1) ** 2
    embedding_diffs[row] = numpy.exp(-sq_dist_x / 2.0) - numpy.exp(-sq_dist_y / 2.0)

# Samples in rows, test points in columns.
obs = embedding_diffs.T

num_samples = shape(obs)[0]
sigma = cov(transpose(obs))  # covariance of the per-test-point differences
mu = mean(obs, 0)            # mean difference at each test point

# Test statistic n * mu^T Sigma^{-1} mu, compared against a chi-squared
# distribution with one degree of freedom per test point.
stat = num_samples * mu.dot(solve(sigma, mu.T))
pval = chi2.sf(stat, number_of_frequencies)

print(pval)  # matches the MeanEmbeddingTest p-value

2.9590753405197295e-34


## Test Smooth CF

In [8]:
def _gen_random(dimension, num_random_features):
    seed(111)
    return numpy.random.randn(dimension, num_random_features)

def smooth(data):
    """Return the Gaussian smoothing weight of every row of ``data``.

    For each row ``x`` the weight is ``exp(-||x||^2 / 2)``, where ``||x||`` is
    the Euclidean norm of the row.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one observation per row.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(n_rows, 1)`` so it broadcasts across the
        feature columns it is multiplied with.
    """
    # Use the public numpy.linalg.norm; the numpy.linalg.linalg submodule the
    # original went through is private/deprecated in recent NumPy releases.
    row_norms = numpy.linalg.norm(data, axis=1)
    weights = exp(-row_norms ** 2 / 2)
    return weights[:, newaxis]

def smooth_cf(data, w, random_frequencies):
    """Compute weighted sine/cosine features of ``data`` at random frequencies.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of shape ``(n, dim)``, one observation per row.
    w : numpy.ndarray
        Per-row weights that broadcast against an ``(n, k)`` array, e.g. a
        column vector of shape ``(n, 1)``.
    random_frequencies : numpy.ndarray
        2-D array of shape ``(dim, k)``, one frequency per column.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 2 * k)``: the ``k`` weighted sine features
        followed by the ``k`` weighted cosine features.

    Raises
    ------
    ValueError
        If ``data`` or ``random_frequencies`` is not 2-D.
    """
    # The original enforced 2-D inputs only implicitly, by unpacking `.shape`
    # into two otherwise unused locals; make that check explicit.
    if data.ndim != 2 or random_frequencies.ndim != 2:
        raise ValueError("data and random_frequencies must both be 2-D arrays")

    projections = data.dot(random_frequencies)
    return concatenate((sin(projections) * w, cos(projections) * w), 1)


# Manual Smooth CF test; the printed p-value should reproduce the one returned
# by SmoothCFTest.

#init
scale = 2**(-5)  # same scale that was passed to SmoothCFTest
data_x, data_y = scale*X, scale*Y
num_random_features = 5

# The test uses paired differences, so both samples must have the same number
# of observations as well as the same dimensionality.
assert data_x.shape == data_y.shape, "X and Y must have identical shapes"
_, dimension_x = numpy.shape(data_x)
random_frequencies = _gen_random(dimension_x, num_random_features)

# Smoothed characteristic-function features of each sample, and their
# per-observation difference.
x_smooth, y_smooth = smooth(data_x), smooth(data_y)
characteristic_fxn_x = smooth_cf(data_x, x_smooth, random_frequencies)
characteristic_fxn_y = smooth_cf(data_y, y_smooth, random_frequencies)
smooth_diff = characteristic_fxn_x - characteristic_fxn_y

# Take the sample count from the data used here instead of relying on a
# `num_samples` left behind by an earlier cell.
num_samples = smooth_diff.shape[0]
sigma = cov(transpose(smooth_diff))
mu = mean(smooth_diff, 0)
stat = num_samples * mu.dot(solve(sigma, mu.T)) #compute test statistic
# Each random frequency contributes a sine and a cosine feature, hence
# 2 * num_random_features degrees of freedom.
pval = chi2.sf(stat, 2*num_random_features) #convert to p-value

print(pval)

6.598161566320998e-21
