In [1]:
import sys
sys.path.insert(0, '../../')
import ccal
%matplotlib inline
%config InlineBackend.figure_formats = {'svg',}

<17:10:30> Checking dependencies ...
<17:10:30> Using the following packages:
<17:10:30> 	matplotlib (v1.5.1)
<17:10:30> 	numpy (v1.10.4)
<17:10:30> 	pandas (v0.18.0)
<17:10:30> 	rpy2 (v2.7.9)
<17:10:30> 	scikit-learn (v0.17.1)
<17:10:30> 	scipy (v0.17.0)
<17:10:30> 	seaborn (v0.7.0)


In [2]:
import cProfile
import math

import numpy as np
from scipy.stats import pearsonr
from statsmodels.nonparametric.kernel_density import KDEMultivariate
from scipy.stats import binom_test

# TODO pythonize bcv
import rpy2.robjects as ro
from rpy2.robjects.numpy2ri import numpy2ri

ro.conversion.py2ri = numpy2ri
from rpy2.robjects.packages import importr

mass = importr('MASS')

from ccal.support import drop_nan_columns, add_jitter

In [25]:
def information_coefficient(x, y, data_types, ngrid=25):
    """
    Compute the information coefficient (IC) between two vectors.

    The IC is sign(pearson_r) * sqrt(1 - exp(-2 * MI)), where MI is the mutual
    information estimated from a 2D kernel density evaluated on an
    ngrid x ngrid grid spanning the observed ranges of x and y.

    :param x: vector of observations
    :param y: vector of observations paired with x
    :param data_types: passed to statsmodels' KDEMultivariate as `var_type`,
        one character per variable (e.g. 'cc')
    :param ngrid: number of grid points per axis used to evaluate the density
    :return: the information coefficient; 0 if 2 or fewer elements remain
        after filtering
    """
    # Filter columns and add jitter
    # (presumably drops positions where either vector is NaN -- see ccal.support)
    x, y = add_jitter(drop_nan_columns([x, y]))

    # Can't calculate MI if the length of a vector is smaller than or equal to the number of vectors
    if len(x) <= 2:
        return 0

    # Bandwidth per variable: a quarter of R's bcv estimate, then shrunk as the
    # linear correlation grows. The scaling factor always lies in [0.25, 1]
    # (or is NaN), so it is applied unconditionally.
    bandwidth = np.array([rbcv(v) for v in [x, y]]) / 4
    cor, _ = pearsonr(x, y)
    bandwidth *= 1 - 0.75 * abs(cor)

    # Kernel density estimate
    kde = KDEMultivariate([x, y], var_type=data_types, bw=bandwidth)

    # Evaluation grid and its spacing along each axis
    grids = [np.linspace(v.min(), v.max(), ngrid) for v in [x, y]]
    dx, dy = [grid[1] - grid[0] for grid in grids]
    ds_prod = dx * dy

    # indexing='ij' makes axis 0 of the reshaped density run over x and axis 1
    # over y, which is what the marginals below assume. (The default 'xy'
    # indexing swaps the axes; MI is symmetric, so the result is unaffected,
    # but px/py would be mislabelled.)
    mesh_x, mesh_y = np.meshgrid(grids[0], grids[1], indexing='ij')
    grid_points = np.vstack([mesh_x.flatten(), mesh_y.flatten()])

    # Joint density; eps keeps log() finite where the density underflows to 0
    p_joint = kde.pdf(data_predict=grid_points).reshape((ngrid, ngrid)) + np.finfo(float).eps
    # Normalize so the density integrates to 1 over the grid
    p_joint /= p_joint.sum() * ds_prod

    # Joint and marginal entropies
    h_joint = -np.sum(p_joint * np.log(p_joint)) * ds_prod
    px = p_joint.sum(axis=1) * dy
    py = p_joint.sum(axis=0) * dx
    hx = -np.sum(px * np.log(px)) * dx
    hy = -np.sum(py * np.log(py)) * dy

    # MI is non-negative in exact arithmetic; clamp round-off so the square
    # root below cannot receive a (tiny) negative argument and return NaN.
    # A NaN MI is left untouched by max() and still propagates.
    mi = max(hx + hy - h_joint, 0)

    ic = np.sign(cor) * np.sqrt(1 - np.exp(-2 * mi))
    return ic


def rbcv(x):
    """
    Select a bandwidth for x with `bcv` from R's MASS package (called through rpy2).

    :param x: vector handed to `mass.bcv`
    :return: first element of the R result after conversion to a numpy array
    """
    r_result = mass.bcv(x)
    return np.array(r_result)[0]

In [22]:
# Rough wall-clock benchmark of the packaged ccal implementation:
# 100 calls, each on a fresh pair of 700 random samples.
# NOTE: x and y remain in the kernel namespace after the loop; the cell that
# runs cProfile on information_coefficient(x, y, 'cc') appears to rely on them.
import time
t0 = time.time()
for i in range(100):
    x = np.random.random_sample(700)
    y = np.random.random_sample(700)
    ccal.information.information_coefficient(x, y)
# Elapsed seconds; as the last expression this is the cell's displayed output
time.time() - t0

4.179605007171631

In [32]:
# Profile one call of the notebook-local information_coefficient, passing 'cc'
# as data_types (forwarded to the KDE as var_type). sort=1 orders the report
# by internal time. Requires x and y to already exist in the kernel namespace.
cProfile.run("information_coefficient(x, y, 'cc')", sort=1)

         10253 function calls in 0.051 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1250    0.021    0.000    0.021    0.000 kernels.py:109(gaussian)
     1895    0.010    0.000    0.010    0.000 {method 'reduce' of 'numpy.ufunc' objects}
      625    0.009    0.000    0.044    0.000 _kernel_base.py:456(gpke)
        2    0.003    0.001    0.003    0.002 functions.py:101(__call__)
        1    0.002    0.002    0.046    0.046 kernel_density.py:165(pdf)
      643    0.001    0.000    0.001    0.000 {built-in method numpy.core.multiarray.array}
      626    0.001    0.000    0.002    0.000 fromnumeric.py:2390(prod)
        1    0.000    0.000    0.001    0.001 support.py:49(drop_nan_columns)
      627    0.000    0.000    0.000    0.000 {built-in method numpy.core.multiarray.empty}
     1251    0.000    0.000    0.009    0.000 _methods.py:34(_prod)
      625    0.000    0.000    0.008    0.000 {method 'prod' of 'numpy

# Test data type cases

In [None]:
# Case 1: two complete vectors of equal length
x = np.random.random_sample(10)
y = np.random.random_sample(10)
print(ccal.information.information_coefficient(x, y))

# Case 2: mismatched lengths (10 vs 11); a ValueError, if raised, is printed
# instead of stopping the cell
x = np.random.random_sample(10)
y = np.random.random_sample(11)
try:
    ccal.information.information_coefficient(x, y)
except ValueError as e:
    print(e)

# Case 3: one missing value in each vector, at different positions
# (assigning None into a float array stores NaN)
x = np.random.random_sample(10)
x[1] = None
y = np.random.random_sample(10)
y[2] = None
print(ccal.information.information_coefficient(x, y))

# Case 4: one missing value in x, two in y, no shared positions
x = np.random.random_sample(10)
x[1] = None
y = np.random.random_sample(10)
y[2] = None
y[6] = None
print(ccal.information.information_coefficient(x, y))

# Case 5: three missing values in x, two in y, no shared positions
x = np.random.random_sample(10)
x[1] = None
x[3] = None
x[5] = None
y = np.random.random_sample(10)
y[2] = None
y[4] = None
print(ccal.information.information_coefficient(x, y))

# Case 6: fixed data where half of x (4 of 8 values) is NaN and y is complete
x = np.array([12.517, 14.706, np.nan, 14.12, np.nan, np.nan, np.nan, 12.255])
y = np.array([0.98246356, 0.97525171, 0.77744759, 0.64084311, 0.4405853, 0.43827196, 0.12447757, 0.08116039])
print(ccal.information.information_coefficient(x, y))