In [1]:
import sys
import cProfile
import time
import math

import numpy as np
from scipy.stats import pearsonr

sys.path.insert(0, '../../')
import ccal
from ccal.support import drop_nan_columns, add_jitter

%matplotlib inline
%config InlineBackend.figure_formats = {'svg',}

import rpy2.robjects as ro

from rpy2.robjects.numpy2ri import numpy2ri
ro.conversion.py2ri = numpy2ri


<17:22:54> Checking dependencies ...
<17:22:54> Using the following packages:
<17:22:54> 	matplotlib (v1.5.1)
<17:22:54> 	numpy (v1.10.4)
<17:22:54> 	pandas (v0.18.0)
<17:22:54> 	rpy2 (v2.7.9)
<17:22:54> 	scikit-learn (v0.17.1)
<17:22:54> 	scipy (v0.17.0)
<17:22:54> 	seaborn (v0.7.0)


In [12]:
n = 1
size = 1000
x = np.random.random_sample(size)
y = np.random.random_sample(size)

# Time KDE functions

In [3]:
from rpy2.robjects.packages import importr
mass = importr('MASS')

t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    kde = mass.kde2d(x, y, np.array([0.1, 0.1]), n=np.array([25]))
    kde[2]
time.time() - t0

0.022496938705444336

In [4]:
from statsmodels.nonparametric.kernel_density import KDEMultivariate

t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    kde = KDEMultivariate([x, y], var_type='cc', bw=[0.1, 0.1])
    kde.pdf()
time.time() - t0

0.07916998863220215

In [5]:
from scipy.stats import gaussian_kde

t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    kde = gaussian_kde([x, y], bw_method='silverman')
    kde.pdf(np.random.random_sample((2, 25)))
time.time() - t0

0.046447038650512695

from fastkde import fastKDE
t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    pdf, axes = fastKDE.pdf(x, y)
time.time() - t0


# Time information_coefficient functions

In [43]:
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
r ='''
information_coefficient <-  function(x, y, n.grid=25)
{
    x.set <- !is.na(x)
    y.set <- !is.na(y)
    overlap <- x.set & y.set

    x <- x[overlap] +  0.000000001*runif(length(overlap))
    y <- y[overlap] +  0.000000001*runif(length(overlap))

    if (length(x) > 2) {
        delta = c(bcv(x), bcv(y))
        rho <- cor(x, y)
        delta <- delta*(1 + (-0.75)*abs(rho))
        
        kde2d.xy <- kde2d(x, y, n = n.grid, h = delta)
        FXY <- kde2d.xy$z + .Machine$double.eps
        dx <- kde2d.xy$x[2] - kde2d.xy$x[1]
        dy <- kde2d.xy$y[2] - kde2d.xy$y[1]
        
        PXY <- FXY/(sum(FXY)*dx*dy)
        PX <- rowSums(PXY)*dy
        PY <- colSums(PXY)*dx
        
        HXY <- -sum(PXY * log(PXY))*dx*dy
        HX <- -sum(PX * log(PX))*dx
        HY <- -sum(PY * log(PY))*dy
        
        PX <- matrix(PX, nrow=n.grid, ncol=n.grid)
        PY <- matrix(PY, byrow = TRUE, nrow=n.grid, ncol=n.grid)
        
        MI <- HX + HY - HXY
        print(sign(rho) * sqrt(1 - exp(- 2 * MI)))
        
        MI <- sum(PXY * log(PXY/(PX*PY)))*dx*dy
        print(sign(rho) * sqrt(1 - exp(- 2 * MI)))
        IC = 9
        if (is.na(IC)) {
            IC <- 0
        }
    }
    else {
        IC <- 0
    }
    return(IC)
}
'''
r = SignatureTranslatedAnonymousPackage(r, 'r')

t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    r.information_coefficient(x, y)
time.time() - t0

[1]
 0.1441506


[1]
 0.1441506




0.00895380973815918

def information_coefficient(x, y, z=None, n_grid=25, vector_data_types=None, n_perm=0, adaptive=True, alpha=0.05,
                            perm_alpha=0.05):
    x, y = add_jitter(drop_nan_columns([x, y]))

    if len(x) <= len(vector_data_types):
        return 0

    rho, p = pearsonr(x, y)
    bandwidth_scaling = 1 + (-0.75) * abs(rho)

    vectors = [x, y]
    grids = [np.linspace(v.min(), v.max(), n_grid) for v in vectors]
    mesh_grids = np.meshgrid(*grids)
    grid_shape = tuple([n_grid] * len(vectors))
    grid = np.vstack([mesh_grid.flatten() for mesh_grid in mesh_grids])
    delta = np.array([rbcv(q) for q in vectors]).reshape((len(vectors),)) / 4
    if bandwidth_scaling:
        delta *= bandwidth_scaling
    kde = KDEMultivariate(vectors, bw=delta, var_type=vector_data_types)
    pxy = kde.pdf(grid).reshape(grid_shape) + np.finfo(float).eps
    ds = [grid[1] - grid[0] for grid in grids]
    ds_prod = np.prod(ds)
    pxy /= (pxy.sum() * ds_prod)
    h_joint = - np.sum(pxy * np.log(pxy)) * ds_prod
    dx = ds[0]
    dy = ds[1]
    dx = ds[0]
    dy = ds[1]
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx
    hx = -np.sum(px * np.log(px)) * dx
    hy = -np.sum(py * np.log(py)) * dy
    mi = hx + hy - h_joint

    ic_sign = np.sign(rho)
    ic = ic_sign * np.sqrt(1 - np.exp(- 2 * mi))
    return ic


def rbcv(x):
    bandwidth = np.array(mass.bcv(x))[0]
    return bandwidth


t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    information_coefficient(x, y, vector_data_types='cc')
time.time() - t0

In [45]:
def information_coefficient2(x, y, data_types, ngrid=25):
    # Filter columsn and add jitter
    x, y = add_jitter(drop_nan_columns([x, y]))

    # Can't calculate MI if length of a vector is smaller than or equal to the number of vectors
    if len(x) <= 2:
        return 0

    cor, p = pearsonr(x, y)    
    
    bandwidth_x = np.asarray(mass.bcv(x)[0]) / 4 * (1 + (-0.75) * abs(cor))
    bandwidth_y = np.asarray(mass.bcv(y)[0]) / 4 * (1 + (-0.75) * abs(cor))
    
    kde = KDEMultivariate([x, y], bw=[bandwidth_x, bandwidth_y], var_type=data_types)
    grid_x = np.linspace(x.min(), x.max(), ngrid)
    grid_y = np.linspace(y.min(), y.max(), ngrid)
    dx = grid_x[1] - grid_x[0]
    dy = grid_y[1] - grid_y[0]
    meshgrid_x, meshgrid_y = np.meshgrid(grid_x, grid_y)
    pxy = kde.pdf(np.vstack([meshgrid_x.flatten(), meshgrid_y.flatten()])).reshape((ngrid, ngrid)) + np.finfo(float).eps

    pxy /= (pxy.sum() * dx * dy)
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx   
    return px
    hxy = - np.sum(pxy * np.log(pxy)) * dx * dy
    hx = -np.sum(px * np.log(px)) * dx
    hy = -np.sum(py * np.log(py)) * dy
    
    mi = hx + hy - hxy
    ic_sign = np.sign(cor)
    ic = ic_sign * np.sqrt(1 - np.exp(- 2 * mi))
    print('H', ic)
    
    mi = np.sum(pxy * np.log(pxy/(px * py))) * dx * dy
    ic_sign = np.sign(cor)
    ic = ic_sign * np.sqrt(1 - np.exp(- 2 * mi))
    print('P', ic)


t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    information_coefficient2(x, y, data_types='cc')
time.time() - t0

0.05499100685119629

In [46]:
x = np.random.random_sample(size)
y = np.random.random_sample(size)
a = information_coefficient2(x, y, data_types='cc')

In [48]:
a * a

array([ 0.19513879,  0.4959941 ,  0.80329806,  0.99249291,  1.0679133 ,
        1.06451103,  1.00593402,  0.94158745,  0.9042184 ,  0.88999245,
        0.91667045,  0.99322138,  1.06693336,  1.11371109,  1.17043999,
        1.21304707,  1.21374143,  1.1924651 ,  1.13442087,  1.05372987,
        1.02316613,  1.05582141,  1.0381755 ,  0.78191404,  0.34980949])

In [13]:
#print(information_coefficient(x, y, vector_data_types='cc'))
print(r.information_coefficient(x, y))
print(information_coefficient2(x, y, 'cc'))

[1] -0.09795917

H -0.0979591679704
P -0.118486260171
None


In [None]:
def information_coefficient_clean(x, y, data_types, ngrid=25):
    # Filter columsn and add jitter
    x, y = add_jitter(drop_nan_columns([x, y]))

    # Can't calculate MI if length of a vector is smaller than or equal to the number of vectors
    if len(x) <= 2:
        return 0

    # Get bandwidths
    cor, p = pearsonr(x, y)

    bandwidth_x = np.asarray(mass.bcv(x)[0]) / 4 * (1 + (-0.75) * abs(cor))
    bandwidth_y = np.asarray(mass.bcv(y)[0]) / 4 * (1 + (-0.75) * abs(cor))

    # Kernel density estimate
    kde = KDEMultivariate([x, y], data_types, bw=[bandwidth_x, bandwidth_y])
    grid_x = np.linspace(x.min(), x.max(), ngrid)
    grid_y = np.linspace(y.min(), y.max(), ngrid)
    dx = grid_x[1] - grid_x[0]
    dy = grid_y[1] - grid_y[0]
    meshgrid_x, meshgrid_y = np.meshgrid(grid_x, grid_y)
    pxy = kde.pdf(data_predict=np.vstack((meshgrid_x.flatten(), meshgrid_y.flatten()))).reshape((ngrid, ngrid)) + np.finfo(float).eps
    pxy /= (np.sum(pxy) * dx * dy)
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx
    hx = - np.sum(px * np.log(px)) * dx
    hy = - np.sum(py * np.log(py)) * dy
    hxy = - np.sum(pxy * np.log(pxy)) * dx * dy
    mi = np.sum(pxy * np.log(pxy/(px * py))) * dx * dy
    mi = hx + hy - hxy
    
    ic = np.sign(cor) * np.sqrt(1 - np.exp(- 2 * mi))
    return ic

t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    information_coefficient_clean(x, y, 'cc')
time.time() - t0

In [None]:
def information_coefficient_scipy(x, y, data_types, ngrid=25):
    # Filter columsn and add jitter
    x, y = add_jitter(drop_nan_columns([x, y]))

    # Can't calcualte MI if length of a vector is smaller than or equal to the number of vectors
    if len(x) <= 2:
        return 0
    
    # Get bandwidths
    cor, p = pearsonr(x, y)

    bandwidth_x = np.asarray(mass.bcv(x)[0]) / 4 * (1 + (-0.75) * abs(cor))
    bandwidth_y = np.asarray(mass.bcv(y)[0]) / 4 * (1 + (-0.75) * abs(cor))

    # Kernel density estimate
    kde = gaussian_kde([x, y], bw_method='silverman')
    grid_x = np.linspace(x.min(), x.max(), ngrid)
    grid_y = np.linspace(y.min(), y.max(), ngrid)
    dx = grid_x[1] - grid_x[0]
    dy = grid_y[1] - grid_y[0]
    meshgrid_x, meshgrid_y = np.meshgrid(grid_x, grid_y)
    pxy = kde.pdf(np.vstack((meshgrid_x.flatten(), meshgrid_y.flatten()))).reshape((ngrid, ngrid)) + np.finfo(float).eps

    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx
    mi = np.sum(pxy * np.log(pxy/(px * py))) * dx * dy
    ic = np.sign(cor) * np.sqrt(1 - np.exp(- 2 * mi))
    return ic

t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    information_coefficient_scipy(x, y, 'cc')
time.time() - t0

In [None]:
def information_coefficient_kde2d(x, y, ngrid=25):
    # Filter columsn and add jitter
    x, y = add_jitter(drop_nan_columns([x, y]))

    # Can't calcualte MI if length of a vector is smaller than or equal to the number of vectors
    if len(x) <= 2:
        return 0
    
    # Get bandwidth scalar
    bandwidth = np.array([mass.bcv(v)[0] for v in [x, y]]) / 4
    cor, p = pearsonr(x, y)
    bandwidth_scaling = 1 + (-0.75) * abs(cor)
    if bandwidth_scaling:
        bandwidth *= bandwidth_scaling
        
    # KDE
    grid_x, grid_y, density = [np.asarray(i) for i in mass.kde2d(x, y, bandwidth, n=np.array([ngrid]))]
    density += np.finfo(float).eps

    dx = grid_x[1] = grid_x[0]
    dy = grid_y[1] = grid_y[0]
    
    pxy = density / (density.sum() * dx * dy)
    
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx
    
    hxy = - (pxy * np.log(pxy)).sum() * dx * dy
    hx = -np.sum(px * np.log(px)) * dx
    hy = -np.sum(py * np.log(py)) * dy
    
    mi = hx + hy - hxy
    ic = np.sign(cor) * np.sqrt(1 - np.exp(- 2 * mi))
    
    return ic

t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    information_coefficient_kde2d(x, y)
time.time() - t0

In [None]:
def information_coefficient_kde2d_clean(x, y, data_types, ngrid=25):
    # Filter columsn and add jitter
    x, y = add_jitter(drop_nan_columns([x, y]))

    # Can't calcualte MI if length of a vector is smaller than or equal to the number of vectors
    if len(x) <= 2:
        return 0

    # Get bandwidths
    cor, p = pearsonr(x, y)

    bandwidth_x = np.asarray(mass.bcv(x)[0]) / 4 * (1 + (-0.75) * abs(cor))
    bandwidth_y = np.asarray(mass.bcv(y)[0]) / 4 * (1 + (-0.75) * abs(cor))

    # Kernel density estimate
    grid_x, grid_y, density = [np.asarray(i) for i in mass.kde2d(x, y, np.array([bandwidth_x, bandwidth_y]), n=np.array([ngrid]))]
    dx = grid_x[1] - grid_x[0]
    dy = grid_y[1] - grid_y[0]
    density += np.finfo(float).eps
    pxy = density / (density.sum() * dx * dy)

    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx
    mi = np.sum(pxy * np.log(pxy/(px * py))) * dx * dy
    ic = np.sign(cor) * np.sqrt(1 - np.exp(- 2 * mi))
    return ic

t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    information_coefficient_kde2d_clean(x, y, 'cc')
time.time() - t0

In [None]:
print(information_coefficient(x, y, vector_data_types='cc'))
print(information_coefficient_clean(x, y, 'cc'))
print(information_coefficient_scipy(x, y, 'cc'))
print(r.information_coefficient(x, y))
print(information_coefficient_kde2d(x, y))
print(information_coefficient_kde2d_clean(x, y, 'cc'))

# Test data type cases

x = np.random.random_sample(10)
y = np.random.random_sample(10)
print(ccal.information.information_coefficient(x, y))

x = np.random.random_sample(10)
y = np.random.random_sample(11)
try:
    ccal.information.information_coefficient(x, y)
except ValueError as e:
    print(e)

x = np.random.random_sample(10)
x[1] = None
y = np.random.random_sample(10)
y[2] = None
print(ccal.information.information_coefficient(x, y))

x = np.random.random_sample(10)
x[1] = None
y = np.random.random_sample(10)
y[2] = None
y[6] = None
print(ccal.information.information_coefficient(x, y))

x = np.random.random_sample(10)
x[1] = None
x[3] = None
x[5] = None
y = np.random.random_sample(10)
y[2] = None
y[4] = None
print(ccal.information.information_coefficient(x, y))

x = np.array([12.517, 14.706, np.nan, 14.12, np.nan, np.nan, np.nan, 12.255])
y = np.array([0.98246356, 0.97525171, 0.77744759, 0.64084311, 0.4405853, 0.43827196, 0.12447757, 0.08116039])
print(ccal.information.information_coefficient(x, y))