In [1]:
import time
import math
import numpy as np
import sys
sys.path.insert(0, '..')
import ccal
%matplotlib inline

<18:15:41> Checking library dependencies ...
<18:15:41> Using the following libraries (in addition to the Anaconda libraries):
<18:15:41> 	scipy (v0.18.0)
<18:15:41> 	statsmodels (v0.6.1)
<18:15:41> 	scikit-learn (v0.17.1)
<18:15:41> 	seaborn (v0.7.1)
<18:15:41> 	rpy2 (v2.8.2)
<18:15:41> 	pandas (v0.18.1)
<18:15:41> 	matplotlib (v1.5.1)
<18:15:41> 	numpy (v1.11.1)
<18:15:41> Planted a random seed 20121020.


# Test data type cases (mismatched lengths and missing values)

In [2]:
# Case 1: two complete vectors of equal length.
x, y = np.random.random_sample(10), np.random.random_sample(10)
print(ccal.information.information_coefficient(x, y))

# Case 2: vectors of different lengths; a ValueError is caught and its message printed.
x, y = np.random.random_sample(10), np.random.random_sample(11)
try:
    ccal.information.information_coefficient(x, y)
except ValueError as error:
    print(error)

# Cases 3-5: random vectors with missing values (NaN) at the listed positions of x and y.
for missing_in_x, missing_in_y in (([1], [2]), ([1], [2, 6]), ([1, 3, 5], [2, 4])):
    x = np.random.random_sample(10)
    x[missing_in_x] = np.nan
    y = np.random.random_sample(10)
    y[missing_in_y] = np.nan
    print(ccal.information.information_coefficient(x, y))

# Case 6: fixed vectors where half of x is missing.
x = np.array([12.517, 14.706, np.nan, 14.12, np.nan, np.nan, np.nan, 12.255])
y = np.array([0.98246356, 0.97525171, 0.77744759, 0.64084311, 0.4405853, 0.43827196, 0.12447757, 0.08116039])
print(ccal.information.information_coefficient(x, y))

0.672782394796
operands could not be broadcast together with shapes (10,) (11,) (10,) 
0.189996912609
-0.292898352695
0.25017956679
0.355918908665


# Time KDE implementations

In [3]:
# Settings shared by the timing cells:
#   n    - number of repetitions inside each timing loop
#   size - number of samples drawn for each random vector
n, size = 10, 1000

## R's bcv & kde2d

In [4]:
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects.numpy2ri import numpy2ri

# Install numpy2ri as rpy2's Python-to-R conversion hook and import R's MASS
# package, whose bcv and kde2d functions are timed below.
ro.conversion.py2ri = numpy2ri
mass = importr('MASS')

# For each grid size 2**p + 1 (p = 2, 3, 4), time n rounds of: draw two random
# vectors, select one bandwidth per vector with bcv, run kde2d on that grid.
for p in (2, 3, 4):
    n_grids = math.pow(2, p) + 1
    t0 = time.time()
    for i in range(n):
        x, y = np.random.random_sample(size), np.random.random_sample(size)
        bandwidths = np.asarray([np.asarray(mass.bcv(x)[0]),
                                 np.asarray(mass.bcv(y)[0])])
        kde = mass.kde2d(x, y, bandwidths, n=np.asarray([n_grids]))
    elapsed = time.time() - t0
    print(int(n_grids), '\t', elapsed)

5 	 0.035396575927734375
9 	 0.04148983955383301
17 	 0.05177450180053711


## fastKDE

In [5]:
# from fastkde import fastKDE

# for p in range(2, 10):
#     n_grids = math.pow(2, p) + 1
#     t0 = time.time()
#     for i in range(n):
#         x = np.random.random_sample(size)
#         y = np.random.random_sample(size)
#         pdf, axes = fastKDE.pdf(x, y, numPoints=n_grids)
#     print(int(n_grids), '\t', time.time() - t0)

## KDEMultivariate
### Higher overhead. Can specify bandwidth. Slowest.

In [6]:
from statsmodels.nonparametric.kernel_density import KDEMultivariate

# Time n rounds of: draw two random vectors, build a KDEMultivariate with fixed
# bandwidths of 0.1 for two continuous variables, and call pdf() without arguments.
t0 = time.time()
for i in range(n):
    x, y = (np.random.random_sample(size) for _ in range(2))
    kde = KDEMultivariate([x, y], bw=[0.1, 0.1], var_type='cc')
    kde.pdf()
time.time() - t0

0.9100980758666992

## gaussian_kde
### Faster. Can't specify per-dimension bandwidths (`bw_method` only takes a rule name, a single scalar factor, or a callable).

In [7]:
from scipy.stats import gaussian_kde

# Time n rounds of: draw two random vectors, fit gaussian_kde with Silverman's
# rule, and evaluate the density on a 2 x 25 array of random points.
t0 = time.time()
for i in range(n):
    x, y = np.random.random_sample(size), np.random.random_sample(size)
    kde = gaussian_kde([x, y], bw_method='silverman')
    query_points = np.random.random_sample((2, 25))
    kde.pdf(query_points)
time.time() - t0

0.02182602882385254

# Time information_coefficient implementations

## Wrapped R using P and H

In [8]:
# Wall-clock cost of n calls to ccal's information_coefficient, each on a fresh
# pair of random vectors of length `size`.
t0 = time.time()
for i in range(n):
    x, y = np.random.random_sample(size), np.random.random_sample(size)
    ccal.information.information_coefficient(x, y)
elapsed = time.time() - t0
print(elapsed)

0.10862421989440918


In [9]:
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage


# TODO: refactor
# R source for information_coefficient(x, y, n.grid). It calls bcv() and kde2d(),
# which this notebook otherwise reaches through the MASS package (mass.bcv /
# mass.kde2d); the snippet does not load MASS itself, so it presumably relies on
# an earlier importr('MASS') - confirm before running this cell on a fresh kernel.
#
# The source text is kept under its own name so that `r` only ever refers to the
# compiled rpy2 package.
r_source = '''
    information_coefficient <- function(x, y, n.grid = 25) {
        # Keep only the positions where both x and y are observed.
        x.set <- !is.na(x)
        y.set <- !is.na(y)
        overlap <- x.set & y.set
        x <- x[overlap]
        y <- y[overlap]

        # Tiny jitter to break ties. The noise vector has to be exactly as long
        # as the filtered data: a longer one (e.g. length(overlap)) would make R
        # recycle x and y back up to the unfiltered length.
        x <- x + 1e-09 * runif(length(x))
        y <- y + 1e-09 * runif(length(y))

        if (length(x) > 2) {
            delta <- c(bcv(x), bcv(y))
            cor <- cor(x, y)

            # Shrink the bandwidths as the absolute correlation grows.
            bandwidths <- delta * (1 + (-0.75) * abs(cor))

            kde2d.xy <- kde2d(x, y, bandwidths, n = n.grid)
            fxy <- kde2d.xy$z + .Machine$double.eps
            dx <- kde2d.xy$x[2] - kde2d.xy$x[1]
            dy <- kde2d.xy$y[2] - kde2d.xy$y[1]
            pxy <- fxy/(sum(fxy) * dx * dy)
            px <- rowSums(pxy) * dy
            py <- colSums(pxy) * dx

            # hxy <- -sum(pxy * log(pxy)) * dx * dy
            # hx <- -sum(px * log(px)) * dx
            # hy <- -sum(py * log(py)) * dy
            # mi <- hx + hy - hxy

            px <- matrix(px, nrow = n.grid, ncol = n.grid)
            py <- matrix(py, byrow = TRUE, nrow = n.grid, ncol = n.grid)
            mi <- sum(pxy * log(pxy/(px * py))) * dx * dy

            ic <- sign(cor) * sqrt(1 - exp(-2 * mi))

            if (is.na(ic)) {
                ic <- 0
            }
        } else {
            ic <- 0
        }
        return(ic)
    }
    '''
r = SignatureTranslatedAnonymousPackage(r_source, 'r')

# Wall-clock cost of n calls to the R information_coefficient through rpy2,
# each on a fresh pair of random vectors of length `size`.
t0 = time.time()
for i in range(n):
    x, y = (np.random.random_sample(size) for _ in range(2))
    r.information_coefficient(x, y)
elapsed = time.time() - t0
print(elapsed)

0.08505058288574219


## Using KDEMultivariate

In [10]:
from numpy import asarray
from numpy.random import random_sample
from statsmodels.nonparametric.kernel_density import KDEMultivariate
from scipy.stats import pearsonr

# Short alias for ccal's helper. It is called with [x, y] and its result is
# unpacked back into two vectors; presumably it removes the positions where
# either vector is NaN - confirm in ccal.support.
drop_nan_columns = ccal.support.drop_nan_columns

def information_coefficient_statsmodels(x, y, data_types='cc', n_grids=25, jitter=1E-10):
    """
    Information coefficient of x and y, using statsmodels' KDEMultivariate for the
    joint density estimate.

    The joint density is evaluated on an n_grids x n_grids grid spanning the range
    of the data, renormalised on that grid, and turned into a mutual information
    (MI) estimate; the result is sign(correlation) * sqrt(1 - exp(-2 * MI)).

    :param x: vector of numbers
    :param y: vector of numbers, paired with x
    :param data_types: str; passed to KDEMultivariate as `var_type`
    :param n_grids: int; number of grid points along each axis
    :param jitter: number; scale of the uniform noise added to x and y
    :return: number; 0 if fewer than 3 values remain after `drop_nan_columns`, or if
             the result is not a number (same convention as the R implementation)
    """
    x, y = drop_nan_columns([x, y])
    if len(x) < 3 or len(y) < 3:
        return 0

    # Jitter out of place: `asarray` may return the very array it was given, and
    # an in-place `+=` would then write the noise into the caller's data.
    x = asarray(x, dtype=float)
    x = x + random_sample(x.size) * jitter
    y = asarray(y, dtype=float)
    y = y + random_sample(y.size) * jitter

    # Bandwidths: R's bcv estimate, shrunk as the absolute correlation grows.
    # NOTE(review): the / 4 presumably converts MASS's bandwidth scale to the
    # kernel standard deviation statsmodels expects - confirm.
    cor, _ = pearsonr(x, y)
    shrink = (1 + (-0.75) * abs(cor)) / 4
    bandwidth_x = np.asarray(mass.bcv(x)[0]) * shrink
    bandwidth_y = np.asarray(mass.bcv(y)[0]) * shrink

    kde = KDEMultivariate([x, y], bw=[bandwidth_x, bandwidth_y], var_type=data_types)

    # Evaluate the joint density on a regular grid over the data range.
    x_min, x_max = x.min(), x.max()
    y_min, y_max = y.min(), y.max()
    meshgrid_x, meshgrid_y = np.meshgrid(np.linspace(x_min, x_max, n_grids),
                                         np.linspace(y_min, y_max, n_grids))
    grid_points = np.vstack([meshgrid_x.flatten(), meshgrid_y.flatten()])
    fxy = kde.pdf(grid_points).reshape((n_grids, n_grids)) + np.finfo(float).eps
    dx = (x_max - x_min) / (n_grids - 1)
    dy = (y_max - y_min) / (n_grids - 1)

    # Renormalise so the density integrates to 1 over the grid.
    pxy = fxy / (fxy.sum() * dx * dy)

    # With np.meshgrid's default 'xy' indexing, axis 0 of pxy follows y and axis 1
    # follows x, so `px`/`py` carry swapped names and spacings. Only their outer
    # product is used below, and the swap cancels in it.
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx

    mi = np.sum(pxy * np.log(pxy / np.outer(px, py))) * dx * dy

    # Equivalent entropy formulation:
    # hxy = - np.sum(pxy * np.log(pxy)) * dx * dy
    # hx = -np.sum(px * np.log(px)) * dx
    # hy = -np.sum(py * np.log(py)) * dy
    # mi = hx + hy - hxy

    # Rounding can push the MI estimate slightly below 0; clamp the radicand so
    # the square root stays real.
    ic = np.sign(cor) * np.sqrt(max(1 - np.exp(-2 * mi), 0))
    if np.isnan(ic):
        return 0
    return ic

# Wall-clock cost of n calls to information_coefficient_statsmodels, each on a
# fresh pair of random vectors of length `size`.
t0 = time.time()
for i in range(n):
    x, y = np.random.random_sample(size), np.random.random_sample(size)
    information_coefficient_statsmodels(x, y)
elapsed = time.time() - t0
print(elapsed)

0.6361842155456543


## Using gaussian_kde

In [11]:
from scipy.stats import gaussian_kde


def information_coefficient_scipy(x, y, data_types='cc', n_grids=25, jitter=1E-10):
    """
    Information coefficient of x and y, using scipy's gaussian_kde (Silverman
    bandwidth) for the joint density estimate.

    The joint density is evaluated on an n_grids x n_grids grid spanning the range
    of the data, renormalised on that grid, and turned into a mutual information
    (MI) estimate; the result is sign(correlation) * sqrt(1 - exp(-2 * MI)).

    :param x: vector of numbers
    :param y: vector of numbers, paired with x
    :param data_types: str; unused, kept so the signature matches
                       information_coefficient_statsmodels
    :param n_grids: int; number of grid points along each axis
    :param jitter: number; scale of the uniform noise added to x and y
    :return: number; 0 if fewer than 3 values remain after `drop_nan_columns`, or if
             the result is not a number
    """
    x, y = drop_nan_columns([x, y])
    if len(x) < 3 or len(y) < 3:
        return 0

    # Jitter out of place: `asarray` may return the very array it was given, and
    # an in-place `+=` would then write the noise into the caller's data.
    x = asarray(x, dtype=float)
    x = x + random_sample(x.size) * jitter
    y = asarray(y, dtype=float)
    y = y + random_sample(y.size) * jitter

    # Only the sign of the correlation is needed. The bandwidth comes from
    # Silverman's rule, so no separate bandwidth estimates are computed here.
    cor, _ = pearsonr(x, y)

    # Kernel density estimate
    kde = gaussian_kde([x, y], bw_method='silverman')

    # Evaluate the joint density on a regular grid over the data range.
    x_min, x_max = x.min(), x.max()
    y_min, y_max = y.min(), y.max()
    meshgrid_x, meshgrid_y = np.meshgrid(np.linspace(x_min, x_max, n_grids),
                                         np.linspace(y_min, y_max, n_grids))
    dx = (x_max - x_min) / (n_grids - 1)
    dy = (y_max - y_min) / (n_grids - 1)
    grid_points = np.vstack((meshgrid_x.flatten(), meshgrid_y.flatten()))
    fxy = kde.pdf(grid_points).reshape((n_grids, n_grids)) + np.finfo(float).eps

    # Renormalise so the density integrates to 1 over the grid; the raw KDE does
    # not, because the grid only covers the range of the data.
    pxy = fxy / (fxy.sum() * dx * dy)

    # With np.meshgrid's default 'xy' indexing, axis 0 of pxy follows y and axis 1
    # follows x, so `px`/`py` carry swapped names and spacings. Only their outer
    # product is used below, and the swap cancels in it.
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx

    # The independence model is the OUTER product of the marginals; a plain
    # `px * py` multiplies the two vectors element by element instead.
    mi = np.sum(pxy * np.log(pxy / np.outer(px, py))) * dx * dy

    # Rounding can push the MI estimate slightly below 0; clamp the radicand so
    # the square root stays real.
    ic = np.sign(cor) * np.sqrt(max(1 - np.exp(-2 * mi), 0))
    if np.isnan(ic):
        return 0
    return ic

# Wall-clock cost of n calls to information_coefficient_scipy, each on a fresh
# pair of random vectors of length `size`; the elapsed time is the cell's output.
t0 = time.time()
for i in range(n):
    x, y = (np.random.random_sample(size) for _ in range(2))
    information_coefficient_scipy(x, y)
time.time() - t0

0.2732663154602051