In [None]:
# Render inline figures as SVG (a vector format) so plots stay sharp at any
# zoom level.
%config InlineBackend.figure_formats = {'svg',}

In [1]:
import sys
import time
import math

import numpy as np
from scipy.stats import pearsonr

sys.path.insert(0, '../../')
import ccal
from ccal.support import drop_nan_columns, add_jitter

%matplotlib inline
%config InlineBackend.figure_formats = {'svg',}

<14:10:10> Checking dependencies ...
<14:10:10> Using the following packages:
<14:10:10> 	matplotlib (v1.5.1)
<14:10:10> 	numpy (v1.10.4)
<14:10:10> 	pandas (v0.18.0)
<14:10:10> 	rpy2 (v2.7.9)
<14:10:10> 	scikit-learn (v0.17.1)
<14:10:10> 	scipy (v0.17.0)
<14:10:10> 	seaborn (v0.7.0)


In [2]:
# Benchmark parameters used by the timing cells in this notebook.
SEED = 0      # fixed seed so a fresh Restart & Run All draws the same samples
n = 100       # number of timed repetitions per implementation
size = 1000   # number of observations in each random sample

# Seed NumPy's global generator before the first draw; every later
# np.random.random_sample call in the notebook continues this same stream.
np.random.seed(SEED)
x = np.random.random_sample(size)
y = np.random.random_sample(size)

# Time information_coefficient implementations

## Wrapped R using P and H

In [3]:
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage


# TODO: refactor
#
# R reference implementation of the information coefficient (IC), compiled into
# an anonymous rpy2 package so it can be called as r.information_coefficient.
# The R code calls bcv() and kde2d(), which must already be available in the
# embedded R session (presumably provided by MASS - confirm that it is attached
# before this cell runs).
#
# Fix: the jitter vectors are sized with sum(overlap), i.e. the number of
# observations that survive the NA filter. The previous length(overlap) is the
# length *before* filtering; whenever an NA was dropped, R recycled the shorter
# filtered vector against the longer jitter vector, so x and y came out at the
# original length with repeated values.
r_source = '''
    information_coefficient <- function(x, y, n.grid = 25) {
        x.set <- !is.na(x)
        y.set <- !is.na(y)
        overlap <- x.set & y.set

        x <- x[overlap] + 1e-09 * runif(sum(overlap))
        y <- y[overlap] + 1e-09 * runif(sum(overlap))

        if (length(x) > 2) {
            delta <- c(bcv(x), bcv(y))
            cor <- cor(x, y)

            bandwidths <- delta * (1 + (-0.75) * abs(cor))
            
            kde2d.xy <- kde2d(x, y, bandwidths, n = n.grid)
            fxy <- kde2d.xy$z + .Machine$double.eps
            dx <- kde2d.xy$x[2] - kde2d.xy$x[1]
            dy <- kde2d.xy$y[2] - kde2d.xy$y[1]
            pxy <- fxy/(sum(fxy) * dx * dy)
            px <- rowSums(pxy) * dy
            py <- colSums(pxy) * dx

            # hxy <- -sum(pxy * log(pxy)) * dx * dy
            # hx <- -sum(px * log(px)) * dx
            # hy <- -sum(py * log(py)) * dy
            # mi <- hx + hy - hxy

            px <- matrix(px, nrow = n.grid, ncol = n.grid)
            py <- matrix(py, byrow = TRUE, nrow = n.grid, ncol = n.grid)
            mi <- sum(pxy * log(pxy/(px * py))) * dx * dy

            ic <- sign(cor) * sqrt(1 - exp(-2 * mi))

            if (is.na(ic)) {
                ic <- 0
            }
        } else {
            ic <- 0
        }
        return(ic)
    }
    '''
# Keep the compiled package under the name `r`; later cells call
# r.information_coefficient. The source text lives in its own variable so the
# name `r` is never bound to two different kinds of object.
r = SignatureTranslatedAnonymousPackage(r_source, 'r')

# Time n calls of the wrapped R implementation. Fresh samples are drawn inside
# the loop, so sample generation is part of the measured time.
started_at = time.time()
for repetition in range(n):
    x, y = np.random.random_sample(size), np.random.random_sample(size)
    r.information_coefficient(x, y)
elapsed = time.time() - started_at
print(elapsed)

0.9430339336395264


## Using R's kde2d

In [4]:
import rpy2.robjects as ro
from rpy2.robjects.numpy2ri import numpy2ri
# Install numpy2ri as rpy2's Python-to-R conversion function; the functions
# defined below pass numpy arrays directly to MASS routines.
ro.conversion.py2ri = numpy2ri
from rpy2.robjects.packages import importr
# Handle to R's MASS package, used below for bcv() and kde2d().
mass = importr('MASS')


def information_coefficient_kde2d(x, y, ngrid=25):
    """
    Compute the information coefficient (IC) of x and y using R's MASS::kde2d.

    The joint density is estimated on an ngrid x ngrid grid, normalized to
    integrate to 1 over the grid, and turned into a mutual-information estimate
    mi. The result is sign(cor) * sqrt(1 - exp(-2 * mi)), where cor is the
    Pearson correlation of x and y.

    :param x: array-like; first variable
    :param y: array-like; second variable, paired element-wise with x
    :param ngrid: int; number of grid points per axis
    :return: float; the IC, or 0 when at most 2 paired observations remain or
        when the IC is undefined (NaN), matching the R implementation in this
        notebook, which also reports 0 in both situations
    """
    # Helpers from ccal.support; presumably they drop positions where either
    # vector is NaN and add tiny noise to break ties - confirm in ccal.support.
    x, y = add_jitter(drop_nan_columns([x, y]))

    if len(x) <= 2:
        return 0

    cor, _ = pearsonr(x, y)

    # Shrink the cross-validated bandwidths as |cor| grows.
    shrink = 1 + (-0.75) * abs(cor)
    bandwidth_x = np.asarray(mass.bcv(x)[0]) * shrink
    bandwidth_y = np.asarray(mass.bcv(y)[0]) * shrink

    # Element [2] of kde2d's result is the density matrix (x, y, z in order).
    kde2d_result = mass.kde2d(x, y,
                              np.asarray([bandwidth_x, bandwidth_y]),
                              n=np.asarray([ngrid]))
    # Add machine epsilon so the logarithm below never sees an exact zero.
    fxy = np.asarray(kde2d_result[2]) + np.finfo(float).eps

    # Grid spacing along each axis, then normalize the joint density.
    dx = (x.max() - x.min()) / (ngrid - 1)
    dy = (y.max() - y.min()) / (ngrid - 1)
    pxy = fxy / (fxy.sum() * dx * dy)
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx

    # np.outer(px, py)[i, j] == px[i] * py[j]: the product of the marginals.
    mi = np.sum(pxy * np.log(pxy / np.outer(px, py))) * dx * dy

    # hxy = - np.sum(pxy * np.log(pxy)) * dx * dy
    # hx = -np.sum(px * np.log(px)) * dx
    # hy = -np.sum(py * np.log(py)) * dy
    # mi = hx + hy - hxy

    # Clamp at 0: rounding can push 1 - exp(-2 * mi) marginally below zero,
    # which would make the square root NaN.
    ic = np.sign(cor) * np.sqrt(max(0.0, 1 - np.exp(-2 * mi)))

    # An undefined correlation (e.g. constant input) still yields NaN here.
    if np.isnan(ic):
        return 0
    return ic

# Time n calls of the kde2d-based implementation on fresh random samples.
start = time.time()
for round_number in range(n):
    x, y = np.random.random_sample(size), np.random.random_sample(size)
    information_coefficient_kde2d(x, y)
elapsed = time.time() - start
# Bare last expression: shown as the cell's output value.
elapsed

0.9052920341491699

## Using KDEMultivariate

In [5]:
from statsmodels.nonparametric.kernel_density import KDEMultivariate


def information_coefficient(x, y, data_types, ngrid=25):
    """
    Compute the information coefficient (IC) of x and y using statsmodels'
    KDEMultivariate for the joint density estimate.

    The joint density is evaluated on an ngrid x ngrid grid spanning the data
    range, normalized to integrate to 1 over the grid, and turned into a
    mutual-information estimate mi. The result is
    sign(cor) * sqrt(1 - exp(-2 * mi)), where cor is the Pearson correlation.

    Relies on `mass` (R's MASS package handle created in an earlier cell) for
    the bcv bandwidth selector.

    :param x: array-like; first variable
    :param y: array-like; second variable, paired element-wise with x
    :param data_types: str; passed to KDEMultivariate as var_type, one
        character per variable (this notebook uses 'cc')
    :param ngrid: int; number of grid points per axis
    :return: float; the IC, or 0 when at most 2 paired observations remain or
        when the IC is undefined (NaN), matching the R implementation in this
        notebook, which also reports 0 in both situations
    """
    # Helpers from ccal.support; presumably they drop positions where either
    # vector is NaN and add tiny noise to break ties - confirm in ccal.support.
    x, y = add_jitter(drop_nan_columns([x, y]))

    if len(x) <= 2:
        return 0

    cor, _ = pearsonr(x, y)

    # Same bandwidth rule as the kde2d variant, additionally divided by 4.
    shrink = 1 + (-0.75) * abs(cor)
    bandwidth_x = np.asarray(mass.bcv(x)[0]) * shrink / 4
    bandwidth_y = np.asarray(mass.bcv(y)[0]) * shrink / 4

    kde = KDEMultivariate([x, y], bw=[bandwidth_x, bandwidth_y], var_type=data_types)

    # np.meshgrid defaults to 'xy' indexing, so axis 0 of the grid follows y
    # and axis 1 follows x. The marginals named px/py below are therefore
    # swapped, but mi is unchanged: the dx and dy factors cancel in px * py.
    meshgrid_x, meshgrid_y = np.meshgrid(np.linspace(x.min(), x.max(), ngrid),
                                         np.linspace(y.min(), y.max(), ngrid))
    grid_points = np.vstack([meshgrid_x.flatten(), meshgrid_y.flatten()])
    # Add machine epsilon so the logarithm below never sees an exact zero.
    fxy = kde.pdf(grid_points).reshape((ngrid, ngrid)) + np.finfo(float).eps

    # Grid spacing along each axis, then normalize the joint density.
    dx = (x.max() - x.min()) / (ngrid - 1)
    dy = (y.max() - y.min()) / (ngrid - 1)
    pxy = fxy / (fxy.sum() * dx * dy)
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx

    # np.outer(px, py)[i, j] == px[i] * py[j]: the product of the marginals.
    mi = np.sum(pxy * np.log(pxy / np.outer(px, py))) * dx * dy

    # hxy = - np.sum(pxy * np.log(pxy)) * dx * dy
    # hx = -np.sum(px * np.log(px)) * dx
    # hy = -np.sum(py * np.log(py)) * dy
    # mi = hx + hy - hxy

    # Clamp at 0: rounding can push 1 - exp(-2 * mi) marginally below zero,
    # which would make the square root NaN.
    ic = np.sign(cor) * np.sqrt(max(0.0, 1 - np.exp(-2 * mi)))

    # An undefined correlation (e.g. constant input) still yields NaN here.
    if np.isnan(ic):
        return 0
    return ic

# Time n calls of the KDEMultivariate-based implementation on fresh samples.
begin = time.time()
for trial in range(n):
    x, y = np.random.random_sample(size), np.random.random_sample(size)
    information_coefficient(x, y, data_types='cc')
duration = time.time() - begin
print(duration)

4.958044052124023


In [6]:
# Print the IC from each implementation for the same (x, y) pair so the values
# can be compared side by side. Each estimate is computed right before it is
# printed, in the order listed.
estimators = (
    lambda: r.information_coefficient(x, y),
    lambda: information_coefficient_kde2d(x, y),
    lambda: information_coefficient(x, y, data_types='cc'),
)
for estimate in estimators:
    print(estimate())
#print(information_coefficient_scipy(x, y, 'cc'))

[1] 0.1311764

0.131176387253
0.131176387245


## Using gaussian_kde

In [7]:
from scipy.stats import gaussian_kde


def information_coefficient_scipy(x, y, data_types, ngrid=25):
    """
    Compute the information coefficient (IC) of x and y using
    scipy.stats.gaussian_kde (Silverman bandwidth) for the joint density.

    The joint density is evaluated on an ngrid x ngrid grid spanning the data
    range, normalized to integrate to 1 over the grid, and turned into a
    mutual-information estimate mi. The result is
    sign(cor) * sqrt(1 - exp(-2 * mi)), where cor is the Pearson correlation.

    Changes from the previous version:
    - the product of the marginals is now the ngrid x ngrid outer product;
      `px * py` on two 1-D arrays multiplied them element-wise instead;
    - the gridded density is renormalized, as the other implementations in
      this notebook do, so mi is computed from a proper distribution;
    - the bcv bandwidths were computed through R but never passed to
      gaussian_kde, so that unused work is removed;
    - an undefined IC (NaN) is reported as 0, as in the R implementation.

    :param x: array-like; first variable
    :param y: array-like; second variable, paired element-wise with x
    :param data_types: unused; kept so the signature matches the
        KDEMultivariate-based implementation
    :param ngrid: int; number of grid points per axis
    :return: float; the IC, or 0 when at most 2 paired observations remain or
        when the IC is undefined (NaN)
    """
    # Helpers from ccal.support; presumably they drop positions where either
    # vector is NaN and add tiny noise to break ties - confirm in ccal.support.
    x, y = add_jitter(drop_nan_columns([x, y]))

    if len(x) <= 2:
        return 0

    cor, _ = pearsonr(x, y)

    # Kernel density estimate
    kde = gaussian_kde([x, y], bw_method='silverman')

    # indexing='ij' makes axis 0 of the grid follow x and axis 1 follow y.
    grid_x = np.linspace(x.min(), x.max(), ngrid)
    grid_y = np.linspace(y.min(), y.max(), ngrid)
    meshgrid_x, meshgrid_y = np.meshgrid(grid_x, grid_y, indexing='ij')
    dx = (x.max() - x.min()) / (ngrid - 1)
    dy = (y.max() - y.min()) / (ngrid - 1)

    grid_points = np.vstack((meshgrid_x.flatten(), meshgrid_y.flatten()))
    # Add machine epsilon so the logarithm below never sees an exact zero.
    fxy = kde.pdf(grid_points).reshape((ngrid, ngrid)) + np.finfo(float).eps

    # Normalize on the grid, then integrate out one axis for each marginal.
    pxy = fxy / (fxy.sum() * dx * dy)
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx

    # np.outer(px, py)[i, j] == px[i] * py[j]: the product of the marginals.
    mi = np.sum(pxy * np.log(pxy / np.outer(px, py))) * dx * dy

    # Clamp at 0: rounding can push 1 - exp(-2 * mi) marginally below zero,
    # which would make the square root NaN.
    ic = np.sign(cor) * np.sqrt(max(0.0, 1 - np.exp(-2 * mi)))

    # An undefined correlation (e.g. constant input) still yields NaN here.
    if np.isnan(ic):
        return 0
    return ic

# Time n calls of the gaussian_kde-based implementation on fresh samples.
clock_start = time.time()
for attempt in range(n):
    x, y = np.random.random_sample(size), np.random.random_sample(size)
    information_coefficient_scipy(x, y, 'cc')
seconds = time.time() - clock_start
# Bare last expression: shown as the cell's output value.
seconds

2.533968925476074

# Test input cases (mismatched lengths and missing values)

In [8]:
def _uniform_with_gaps(length, missing=()):
    """Return `length` uniform draws from [0, 1) with NaN written at each index in `missing`."""
    values = np.random.random_sample(length)
    for position in missing:
        values[position] = np.nan
    return values


compute_ic = ccal.information.information_coefficient

# Complete inputs of equal length.
x = _uniform_with_gaps(10)
y = _uniform_with_gaps(10)
print(compute_ic(x, y))

# Inputs of different lengths: the ValueError message is printed rather than
# left to propagate.
x = _uniform_with_gaps(10)
y = _uniform_with_gaps(11)
try:
    compute_ic(x, y)
except ValueError as error:
    print(error)

# One missing value in each input, at different positions.
x = _uniform_with_gaps(10, missing=(1,))
y = _uniform_with_gaps(10, missing=(2,))
print(compute_ic(x, y))

# One missing value in x, two in y.
x = _uniform_with_gaps(10, missing=(1,))
y = _uniform_with_gaps(10, missing=(2, 6))
print(compute_ic(x, y))

# Three missing values in x, two in y, with no position shared.
x = _uniform_with_gaps(10, missing=(1, 3, 5))
y = _uniform_with_gaps(10, missing=(2, 4))
print(compute_ic(x, y))

# Fixed example in which half of x is missing.
x = np.array([12.517, 14.706, np.nan, 14.12, np.nan, np.nan, np.nan, 12.255])
y = np.array([0.98246356, 0.97525171, 0.77744759, 0.64084311,
              0.4405853, 0.43827196, 0.12447757, 0.08116039])
print(compute_ic(x, y))

0.494202437232
Input arrays have different lengths: 11 & 10.
-0.193143869531
0.26435120236
0.191147978571
0.355918908655
