In [None]:
import sys
import cProfile
import time
import math

import numpy as np
from scipy.stats import pearsonr

sys.path.insert(0, '../../')
import ccal
from ccal.support import drop_nan_columns, add_jitter

%matplotlib inline
%config InlineBackend.figure_formats = {'svg',}

# rpy2 bridge to R; used below to run the R reference code and MASS routines.
import rpy2.robjects as ro

from rpy2.robjects.numpy2ri import numpy2ri
# Install numpy2ri as rpy2's Python-to-R converter.
# NOTE(review): presumably this lets NumPy arrays be passed straight to R
# functions; the attribute name depends on the rpy2 version -- confirm.
ro.conversion.py2ri = numpy2ri
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
# Handle on R's MASS package; later cells call mass.bcv and mass.kde2d through it.
mass = importr('MASS')

# Time functions

In [None]:
# Benchmark settings shared by every timing cell below:
# `n` is the number of repetitions, `size` the length of each random vector.
n, size = 100, 700

In [None]:
# Baseline: wall-clock seconds for n calls of the packaged ccal implementation,
# each on a fresh pair of uniform random vectors (x is drawn before y).
t0 = time.time()
for i in range(n):
    x, y = np.random.random_sample(size), np.random.random_sample(size)
    ccal.information.information_coefficient(x, y)
# Last expression of the cell: elapsed seconds, shown as the cell output.
time.time() - t0

In [None]:
# R reference implementation of the information coefficient, kept as source text
# and compiled through rpy2 so it can be timed against the Python versions.
# After compilation `r` is rebound to the resulting package object, whose
# `information_coefficient` attribute is the R function defined here.
r = '''
information_coefficient <-  function(x, y, n.grid=25)
{
    # Keep only the positions where both x and y are observed.
    x.set <- !is.na(x)
    y.set <- !is.na(y)
    overlap <- x.set & y.set

    # Add a tiny jitter. The noise vector must have one entry per RETAINED
    # sample: sizing it with length(overlap) (the mask length) would make R
    # recycle the shorter filtered vector whenever an NA was dropped.
    n.overlap <- sum(overlap)
    x <- x[overlap] +  0.000000001*runif(n.overlap)
    y <- y[overlap] +  0.000000001*runif(n.overlap)

    if (length(x) > 2) {
        # Bandwidths from biased cross-validation, shrunk as |rho| grows.
        delta = c(bcv(x), bcv(y))
        rho <- cor(x, y)
        delta <- delta*(1 + (-0.75)*abs(rho))

        # Joint density on an n.grid x n.grid grid (rows follow x, columns follow y).
        kde2d.xy <- kde2d(x, y, n = n.grid, h = delta)
        FXY <- kde2d.xy$z + .Machine$double.eps
        dx <- kde2d.xy$x[2] - kde2d.xy$x[1]
        dy <- kde2d.xy$y[2] - kde2d.xy$y[1]

        # Normalise the joint density and derive the marginals.
        PXY <- FXY/(sum(FXY)*dx*dy)
        PX <- rowSums(PXY)*dy
        PY <- colSums(PXY)*dx

        # Entropies (computed but not used for the returned value).
        HXY <- -sum(PXY * log(PXY))*dx*dy
        HX <- -sum(PX * log(PX))*dx
        HY <- -sum(PY * log(PY))*dy

        # Broadcast the marginals to the grid shape for the MI integrand.
        PX <- matrix(PX, nrow=n.grid, ncol=n.grid)
        PY <- matrix(PY, byrow = TRUE, nrow=n.grid, ncol=n.grid)

        MI <- sum(PXY * log(PXY/(PX*PY)))*dx*dy

        IC <- sign(rho) * sqrt(1 - exp(- 2 * MI))

        # A non-finite result (e.g. sqrt of a negative number) is reported as 0.
        if (is.na(IC)) {
            IC <- 0
        }
    }
    else {
        # Too few paired observations to estimate anything.
        IC <- 0
    }
    return(IC)
}
'''
r = SignatureTranslatedAnonymousPackage(r, 'r')

# Wall-clock seconds for n calls of the R reference implementation through rpy2,
# each on a fresh pair of uniform random vectors (x is drawn before y).
t0 = time.time()
for i in range(n):
    x, y = np.random.random_sample(size), np.random.random_sample(size)
    r.information_coefficient(x, y)
# Last expression of the cell: elapsed seconds, shown as the cell output.
time.time() - t0

In [None]:
def information_coefficient_kde2d(x, y, ngrid=25):
    """
    Compute the information coefficient (IC) between x and y, estimating the
    joint density with R's MASS::kde2d (called through rpy2).

    :param x: 1D numeric vector
    :param y: 1D numeric vector paired with x
    :param ngrid: int; number of grid points per axis passed to kde2d
    :return: 0 if at most 2 observations remain after filtering; otherwise
             sign(cor) * sqrt(1 - exp(-2 * MI)), with MI = H(x) + H(y) - H(x, y)
    """
    # Filter columns and add jitter (helpers from ccal.support)
    x, y = add_jitter(drop_nan_columns([x, y]))

    # Can't calculate MI if length of a vector is smaller than or equal to the number of vectors
    if len(x) <= 2:
        return 0

    # Get bandwidths: MASS::bcv per vector, shrunk as |correlation| grows
    # NOTE(review): MASS::kde2d is documented to rescale h by 1/4 internally and the
    # R reference in this notebook passes bcv undivided -- confirm this / 4 is intended.
    bandwidth = np.array([mass.bcv(v)[0] for v in [x, y]]) / 4
    cor, _ = pearsonr(x, y)
    bandwidth_scaling = 1 + (-0.75) * abs(cor)
    if bandwidth_scaling:
        bandwidth *= bandwidth_scaling

    # KDE: kde2d returns the x grid, the y grid and the density matrix
    grid_x, grid_y, density = [np.asarray(i) for i in mass.kde2d(x, y, bandwidth, n=np.array([ngrid]))]
    # Keep every cell strictly positive so the logs below are finite
    density += np.finfo(float).eps

    # Grid spacing along each axis. This must be a difference: the previous chained
    # assignment (`dx = grid_x[1] = grid_x[0]`) set the spacing to the first grid
    # coordinate and overwrote the second grid point.
    dx = grid_x[1] - grid_x[0]
    dy = grid_y[1] - grid_y[0]

    # Normalise the joint density so it integrates to 1 over the grid
    pxy = density / (density.sum() * dx * dy)

    # Marginals (rows of the density follow x, columns follow y)
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx

    # Joint and marginal entropies
    hxy = - (pxy * np.log(pxy)).sum() * dx * dy
    hx = -np.sum(px * np.log(px)) * dx
    hy = -np.sum(py * np.log(py)) * dy

    # Mutual information, mapped to a signed coefficient
    mi = hx + hy - hxy
    ic = np.sign(cor) * np.sqrt(1 - np.exp(- 2 * mi))

    return ic

# Wall-clock seconds for n calls of the kde2d-based Python implementation,
# each on a fresh pair of uniform random vectors.
t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    # Call the function defined above; the previous name `information_coefficient_2`
    # is not defined anywhere in this notebook and raised NameError on a fresh kernel.
    information_coefficient_kde2d(x, y)
# Last expression of the cell: elapsed seconds, shown as the cell output.
time.time() - t0

In [None]:
def information_coefficient(x, y, data_types, ngrid=25):
    """
    Compute the information coefficient (IC) between x and y, estimating the
    joint density with statsmodels' KDEMultivariate (via kde_KDEMultivariate).

    :param x: 1D numeric vector
    :param y: 1D numeric vector paired with x
    :param data_types: forwarded to kde_KDEMultivariate as the variable types
                       (the timing cell passes 'cc')
    :param ngrid: int; number of grid points per axis
    :return: 0 if at most 2 observations remain after filtering; otherwise
             sign(cor) * sqrt(1 - exp(-2 * MI)), with MI = H(x) + H(y) - H(x, y)
    """
    # Filter columns and add jitter (helpers from ccal.support)
    x, y = add_jitter(drop_nan_columns([x, y]))

    # Can't calculate MI if length of a vector is smaller than or equal to the number of vectors
    if len(x) <= 2:
        return 0

    # Get bandwidths: MASS::bcv per vector (scaled by 1/4), shrunk as |correlation| grows
    bandwidth = np.array([mass.bcv(v)[0] for v in [x, y]]) / 4
    cor, _ = pearsonr(x, y)
    bandwidth_scaling = 1 + (-0.75) * abs(cor)
    if bandwidth_scaling:
        bandwidth *= bandwidth_scaling

    # Kernel density estimate with the fixed bandwidths
    kde = kde_KDEMultivariate(x, y, data_types, bandwidth)

    # Evaluation grid spanning the observed range of each vector
    grid_x = np.linspace(x.min(), x.max(), ngrid)
    grid_y = np.linspace(y.min(), y.max(), ngrid)
    dx = grid_x[1] - grid_x[0]
    dy = grid_y[1] - grid_y[0]

    # indexing='ij' makes axis 0 of the reshaped density follow x and axis 1 follow y.
    # With meshgrid's default 'xy' indexing the axes are swapped, so the marginals
    # below were mislabelled and scaled by the wrong spacing (the errors only
    # cancelled in the final MI).
    mesh_x, mesh_y = np.meshgrid(grid_x, grid_y, indexing='ij')
    points = np.vstack([mesh_x.flatten(), mesh_y.flatten()])
    # eps keeps every cell strictly positive so the logs below are finite
    p_joint = kde.pdf(data_predict=points).reshape((ngrid, ngrid)) + np.finfo(float).eps

    # Normalise the joint density so it integrates to 1 over the grid
    cell_area = dx * dy
    p_joint /= p_joint.sum() * cell_area
    h_joint = - np.sum(p_joint * np.log(p_joint)) * cell_area

    # Marginals and their entropies
    px = p_joint.sum(axis=1) * dy
    py = p_joint.sum(axis=0) * dx
    hx = -np.sum(px * np.log(px)) * dx
    hy = -np.sum(py * np.log(py)) * dy

    # Mutual information, mapped to a signed coefficient
    mi = hx + hy - h_joint
    ic = np.sign(cor) * np.sqrt(1 - np.exp(- 2 * mi))
    return ic

from statsmodels.nonparametric.kernel_density import KDEMultivariate
def kde_KDEMultivariate(x, y, data_types, bandwidth):
    """
    Build a statsmodels KDEMultivariate over the pair [x, y].

    :param x: first variable
    :param y: second variable
    :param data_types: forwarded as `var_type`
    :param bandwidth: forwarded as `bw`
    :return: the constructed KDEMultivariate object
    """
    variables = [x, y]
    estimator = KDEMultivariate(variables, var_type=data_types, bw=bandwidth)
    return estimator

# Wall-clock seconds for n calls of the KDEMultivariate-based implementation with
# both variables typed 'cc', each on a fresh pair of uniform random vectors
# (x is drawn before y).
t0 = time.time()
for i in range(n):
    x, y = np.random.random_sample(size), np.random.random_sample(size)
    information_coefficient(x, y, 'cc')
# Last expression of the cell: elapsed seconds, shown as the cell output.
time.time() - t0

# Test input edge cases (mismatched lengths, missing values)

In [None]:
# Smoke checks of ccal.information.information_coefficient on awkward inputs.

# Case 1: two complete vectors of equal length.
x = np.random.random_sample(10)
y = np.random.random_sample(10)
print(ccal.information.information_coefficient(x, y))

# Case 2: vectors of different lengths. If a ValueError is raised, its message
# is printed instead of stopping the notebook.
x = np.random.random_sample(10)
y = np.random.random_sample(11)
try:
    ccal.information.information_coefficient(x, y)
except ValueError as error:
    print(error)

# Cases 3-5: missing values at different positions in x and y, with progressively
# fewer positions observed in both vectors.
for x_missing, y_missing in (([1], [2]),
                             ([1], [2, 6]),
                             ([1, 3, 5], [2, 4])):
    x = np.random.random_sample(10)
    x[x_missing] = np.nan
    y = np.random.random_sample(10)
    y[y_missing] = np.nan
    print(ccal.information.information_coefficient(x, y))

# Case 6: fixed example in which half of x is missing.
x = np.array([12.517, 14.706, np.nan, 14.12, np.nan, np.nan, np.nan, 12.255])
y = np.array([0.98246356, 0.97525171, 0.77744759, 0.64084311,
              0.4405853, 0.43827196, 0.12447757, 0.08116039])
print(ccal.information.information_coefficient(x, y))