In [None]:
# Use higher resolution
%config InlineBackend.figure_formats = {'svg',}

In [None]:
import sys
sys.path.insert(0, '../../')
import ccal
%matplotlib inline
%config InlineBackend.figure_formats = {'svg',}

# Make simulation matrix

def add_value(df, inVal, outVal):
    """
    Overwrite the in-cluster and/or out-of-cluster cells of df in place.

    A cell is in-cluster when its row label equals its column label.

    :param df: pandas DataFrame; modified in place
    :param inVal: value written to in-cluster cells, or None to leave them untouched
    :param outVal: value written to out-of-cluster cells, or None to leave them untouched
    :return: None
    """
    row_labels = list(df.index)
    col_labels = list(df.columns)

    for i, row_label in enumerate(row_labels):
        for j, col_label in enumerate(col_labels):
            if row_label == col_label:
                # Compare against None (not truthiness) so a value of 0 is still written
                if inVal is not None:
                    df.iloc[i, j] = inVal
            elif outVal is not None:
                df.iloc[i, j] = outVal

def add_noise(df,inMu,inSigma,outMu,outSigma):
    """
    Add Gaussian noise to the cells of df in place.

    A cell is in-cluster when its row label equals its column label.
    Noise is skipped for a cell group when both its mu and sigma are None or 0.
    When only one of the two is given, the other is treated as 0.

    :param df: pandas DataFrame; modified in place
    :param inMu: mean of the noise added to in-cluster cells (None or 0 for no shift)
    :param inSigma: standard deviation of the in-cluster noise (None or 0 for no spread)
    :param outMu: mean of the noise added to out-of-cluster cells
    :param outSigma: standard deviation of the out-of-cluster noise
    :return: None
    """
    noisy_in = bool(inMu or inSigma)
    noisy_out = bool(outMu or outSigma)

    row_labels = list(df.index)
    col_labels = list(df.columns)

    for i, row_label in enumerate(row_labels):
        for j, col_label in enumerate(col_labels):
            if row_label == col_label:
                if noisy_in:
                    # `or 0` turns a missing (None) parameter into 0 so random.gauss does not fail
                    df.iloc[i, j] += random.gauss(inMu or 0, inSigma or 0)
            elif noisy_out:
                df.iloc[i, j] += random.gauss(outMu or 0, outSigma or 0)

def mix(df,k,mix):
    """
    Swap randomly chosen in-cluster cells with randomly chosen out-of-cluster cells, in place.

    A cell is in-cluster when its row label equals its column label.
    Swaps are repeated until the swap count reaches mix * (number of cells / k).

    :param df: pandas DataFrame; modified in place
    :param k: number of clusters; must not be 0
    :param mix: fraction of one cluster's worth of cells to swap
    :return: None
    :raises AssertionError: if k is 0
    :raises ValueError: if swaps are requested but df has no in-cluster or no out-of-cluster cell
    """
    assert k!=0, 'k cannot be 0'

    row_labels = list(df.index)
    col_labels = list(df.columns)
    n_rows = len(row_labels)
    n_cols = len(col_labels)

    # Number of values in one cluster, and the number of swaps that corresponds to `mix`
    n_k_val = n_cols * n_rows / k
    n_target = mix * n_k_val
    if n_target <= 0:
        return

    # The sampling below rejects until it finds one cell of each kind.
    # Fail fast when either kind does not exist, instead of looping forever.
    distinct_rows = set(row_labels)
    distinct_cols = set(col_labels)
    has_in_cluster = bool(distinct_rows & distinct_cols)
    has_out_cluster = len(distinct_rows | distinct_cols) > 1
    if not (has_in_cluster and has_out_cluster):
        raise ValueError('df needs at least one in-cluster and one out-of-cluster cell to mix')

    swapped = 0
    while swapped < n_target:

        # Pick a candidate in-cluster cell; retry if it is not in a cluster
        in_row = random.randint(0, n_rows - 1)
        in_col = random.randint(0, n_cols - 1)
        if row_labels[in_row] != col_labels[in_col]:
            continue

        # Pick a candidate out-of-cluster cell; retry both picks if it is in a cluster
        out_row = random.randint(0, n_rows - 1)
        out_col = random.randint(0, n_cols - 1)
        if row_labels[out_row] == col_labels[out_col]:
            continue

        # Swap the two cell values
        in_value = df.iloc[in_row, in_col]
        out_value = df.iloc[out_row, out_col]
        df.iloc[in_row, in_col] = out_value
        df.iloc[out_row, out_col] = in_value

        swapped += 1

def initialize_simulation_df(df,inVal,inMu,inSigma,outVal,outMu,outSigma,mix):
    """
    Fill every cell of df in place with a cluster value plus optional Gaussian noise.

    A cell is in-cluster when its row label equals its column label.
    Normally in-cluster cells get inVal and the rest get outVal.
    With probability `mix`, a cell gets the value and noise of the opposite group.
    Prints a progress line per row and the total elapsed time.

    :param df: pandas DataFrame; modified in place
    :param inVal: base value for in-cluster cells
    :param inMu: mean of the noise added to in-values (None or 0 for no shift)
    :param inSigma: standard deviation of the noise added to in-values
    :param outVal: base value for out-of-cluster cells
    :param outMu: mean of the noise added to out-values
    :param outSigma: standard deviation of the noise added to out-values
    :param mix: probability that a cell is flipped to the opposite group; None or 0 disables mixing
    :return: None
    """
    t0=time()

    def _draw(base, mu, sigma):
        # Noise is added only when mu or sigma is non-zero.
        # A missing (None) parameter counts as 0 so random.gauss does not fail.
        if mu or sigma:
            return base + random.gauss(mu or 0, sigma or 0)
        return base

    row_labels = list(df.index)
    col_labels = list(df.columns)

    for i, row_label in enumerate(row_labels):
        print('initialize_simulation_df:',i)

        for j, col_label in enumerate(col_labels):
            # One uniform draw per cell decides whether this cell is flipped
            r = random.random()
            flipped = bool(mix) and r <= mix
            in_cluster = bool(row_label == col_label)

            # The cell gets the in-value when exactly one of (in_cluster, flipped) holds
            if in_cluster != flipped:
                df.iloc[i, j] = _draw(inVal, inMu, inSigma)
            else:
                df.iloc[i, j] = _draw(outVal, outMu, outSigma)
    print('initialize_simulation_df: done in %0.3fs.'%(time()-t0))

def plot_mtrx(mtrx):
    """
    Show mtrx as a heat map with a colour bar.

    :param mtrx: 2-D matrix-like object accepted by plt.imshow
    :return: None
    """
    colour_map = plt.cm.ocean
    # 'nearest' keeps each cell a solid block instead of smoothing between cells
    plt.imshow(mtrx, cmap=colour_map, interpolation='nearest')
    plt.colorbar()
    plt.show()
    
def make_mtrx_sample_x_variable_simulation(n_sample,
                                           n_var,
                                           k,
                                           val_in,
                                           val_out,
                                           noise_in_mu=None,
                                           noise_in_sigma=None,
                                           noise_out_mu=None,
                                           noise_out_sigma=None,
                                           noise_mix=None,
                                           prefix_out_f=None,
                                           suffix_out_f=None,
                                           plot=False):
    """
    Make a simulated sample x variable matrix with k clusters.

    Rows and columns are relabelled with the id of the slice they fall in.
    A cell whose row label equals its column label is therefore in-cluster.
    The cells are then filled by initialize_simulation_df.

    :param n_sample: number of rows (samples)
    :param n_var: number of columns (variables)
    :param k: number of slices (clusters); must not be 0 and must not exceed n_var
    :param val_in: base value of in-cluster cells
    :param val_out: base value of out-of-cluster cells
    :param noise_in_mu: mean of the noise added to in-values
    :param noise_in_sigma: standard deviation of the noise added to in-values
    :param noise_out_mu: mean of the noise added to out-values
    :param noise_out_sigma: standard deviation of the noise added to out-values
    :param noise_mix: probability that a cell gets the opposite group's value
    :param prefix_out_f: output path prefix; when set, the matrix is saved tab-separated
    :param suffix_out_f: suffix appended to the output file name
    :param plot: whether to plot the matrix
    :return: the simulated DataFrame (earlier versions returned None, so existing callers are unaffected)
    """
    assert k != 0, 'k cannot be 0'
    assert k <= n_var,'k cannot be greater than n_var'

    # Make an empty sample x variable matrix filled with 0
    mtrx_sample_x_var = pd.DataFrame(index=range(n_sample), columns=range(n_var)).fillna(0)

    def _label_by_slice(labels):
        # Replace each positional label with the id of the slice containing it.
        # NOTE(review): assumes toolK.slice_list(labels, k) yields k groups of the original labels - confirm
        relabelled = list(labels)
        for slice_id, members in enumerate(toolK.slice_list(labels, k)):
            for position in members:
                relabelled[position] = slice_id
        return relabelled

    # Compute both label lists before assigning either, so both are derived from the 0..n-1 labels
    index = _label_by_slice(mtrx_sample_x_var.index)
    columns = _label_by_slice(mtrx_sample_x_var.columns)
    mtrx_sample_x_var.index = index
    mtrx_sample_x_var.columns = columns

    # Initialize simulation matrix
    initialize_simulation_df(mtrx_sample_x_var,val_in,noise_in_mu,noise_in_sigma,val_out,noise_out_mu,noise_out_sigma,noise_mix)

    # Save
    if prefix_out_f:
        mtrx_sample_x_var.to_csv(prefix_out_f+'_sample_%s_var_%s_k_%s_mix_%s_%s'%(n_sample,n_var,k,noise_mix,suffix_out_f),sep='\t')

    # Plot
    if plot:
        plot_mtrx(mtrx_sample_x_var)

    return mtrx_sample_x_var

# Make simulation matrix

# Set number of samples
list_n_sample = [100, 500, 1000, 5000]
# Set number of variables
list_n_var = [100, 500, 1000, 5000]
# Set values in clusters
val_in = 1
# Set values out of clusters
val_out = 0
# Set Ks
list_k = [1, 2, 3, 4, 5, 6, 10, 15, 20, 25]
# Set the fractions of cluster values to be swapped between clusters and nonclusters
list_noise_mix = [0, 0.05, 0.1, 0.2]
# Set noise in clusters (sigma is defined as 10% of mu, so mu = 0 means no noise at all)
noise_in_mu = 0
noise_in_sigma = 0.1 * noise_in_mu
# Set noise out of clusters
noise_out_mu = 0
noise_out_sigma = 0.1*noise_out_mu

# Simulate every combination of sample count, variable count, k and mix fraction
for sample in list_n_sample:
    print('sample:',sample)
    
    for var in list_n_var:
        print('\tvar:',var)
        
        for k in list_k:
            print('\t\tk:',k)
            
            for noise_mix in list_noise_mix:
                print('\t\t\tnoise_mix:',noise_mix)
                
                # Pass the noise settings through; they were previously configured above but never used
                make_mtrx_sample_x_variable_simulation(sample,
                                                       var,
                                                       k,
                                                       val_in,
                                                       val_out,
                                                       noise_in_mu=noise_in_mu,
                                                       noise_in_sigma=noise_in_sigma,
                                                       noise_out_mu=noise_out_mu,
                                                       noise_out_sigma=noise_out_sigma,
                                                       noise_mix=noise_mix,
                                                       prefix_out_f='/cellar/users/hyeerna/aLL/mtrx_sample_x_var_simulation/test/',
                                                       suffix_out_f='sfx',
                                                       plot=True)

In [None]:
import time
import math

import numpy as np


# Benchmark settings: n timed repetitions per configuration, each on `size` random points
n = 100
size = 1000
# Initial samples from np.random.random_sample; the timing loops in this cell redraw x and y each repetition
x = np.random.random_sample(size)
y = np.random.random_sample(size)
#ngrid = 33

## R's bcv & kde2d

import rpy2.robjects as ro
from rpy2.robjects.numpy2ri import numpy2ri
ro.conversion.py2ri = numpy2ri
from rpy2.robjects.packages import importr
mass = importr('MASS')

# Time n rounds of MASS::bcv bandwidth selection plus MASS::kde2d density estimation
# for each grid size 2**p + 1 (p = 2..9); prints the grid size and the elapsed seconds
for p in range(2, 10):
    # NOTE(review): math.pow returns a float, so ngrid is e.g. 5.0; it is cast to int only when printed
    ngrid = math.pow(2, p) + 1
    t0 = time.time()
    for i in range(n):
        # Fresh random data each round, so data generation is included in the timing
        x = np.random.random_sample(size)
        y = np.random.random_sample(size)
        bandwidth_x = np.asarray(mass.bcv(x)[0])
        bandwidth_y = np.asarray(mass.bcv(y)[0])
        kde = mass.kde2d(x, y, np.asarray([bandwidth_x, bandwidth_y]), n=np.asarray([ngrid]))
    print(int(ngrid), '\t', time.time() - t0)

## fastKDE

from fastkde import fastKDE

# Time n rounds of fastKDE.pdf for each grid size 2**p + 1 (p = 2..9);
# prints the grid size and the elapsed seconds
for p in range(2, 10):
    # NOTE(review): math.pow returns a float, so numPoints receives e.g. 5.0 - confirm fastKDE accepts a float
    ngrid = math.pow(2, p) + 1
    t0 = time.time()
    for i in range(n):
        # Fresh random data each round, so data generation is included in the timing
        x = np.random.random_sample(size)
        y = np.random.random_sample(size)
        pdf, axes = fastKDE.pdf(x, y, numPoints=ngrid)
    print(int(ngrid), '\t', time.time() - t0)

## KDEMultivariate
Higher overhead, but the bandwidth can be specified explicitly. Slowest of the estimators timed here.

from statsmodels.nonparametric.kernel_density import KDEMultivariate

# Time n rounds of building a KDEMultivariate with fixed bandwidths and evaluating its pdf
t0 = time.time()
for i in range(n):
    # Fresh random data each round, so data generation is included in the timing
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    kde = KDEMultivariate([x, y], var_type='cc', bw=[0.1, 0.1])
    # Called without evaluation points - presumably evaluates at the training data; confirm
    kde.pdf()
# Elapsed seconds; as the last expression it is displayed as the cell output
time.time() - t0

## gaussian_kde
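Faster than KDEMultivariate, but a separate bandwidth per dimension cannot be specified (`bw_method` takes only a rule name, a scalar factor or a callable).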

from scipy.stats import gaussian_kde

# Time n rounds of fitting a gaussian_kde (Silverman bandwidth rule) and evaluating it at 25 random 2-D points
t0 = time.time()
for i in range(n):
    # Fresh random data each round, so data generation is included in the timing
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    kde = gaussian_kde([x, y], bw_method='silverman')
    kde.pdf(np.random.random_sample((2, 25)))
# Elapsed seconds; as the last expression it is displayed as the cell output
time.time() - t0

In [1]:
import sys
import time
import math

import numpy as np
from scipy.stats import pearsonr

sys.path.insert(0, '../../')
import ccal
from ccal.support import drop_nan_columns, add_jitter

%matplotlib inline
%config InlineBackend.figure_formats = {'svg',}

<14:10:10> Checking dependencies ...
<14:10:10> Using the following packages:
<14:10:10> 	matplotlib (v1.5.1)
<14:10:10> 	numpy (v1.10.4)
<14:10:10> 	pandas (v0.18.0)
<14:10:10> 	rpy2 (v2.7.9)
<14:10:10> 	scikit-learn (v0.17.1)
<14:10:10> 	scipy (v0.17.0)
<14:10:10> 	seaborn (v0.7.0)


In [2]:
# Benchmark settings: n timed repetitions per implementation, each on `size` random points
n = 100
size = 1000
# Initial samples from np.random.random_sample; the timing loops below redraw x and y each repetition
x = np.random.random_sample(size)
y = np.random.random_sample(size)

# Time information_coefficient implementations

## Wrapped R using P and H

In [3]:
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage


# TODO: refactor
# R reference implementation of the information coefficient, wrapped as an anonymous R package.
# The jitter vectors are sized by the number of overlapping (non-NA) observations.
# Sizing them by length(overlap) made R recycle x[overlap] and y[overlap] up to the
# full input length whenever NAs were present, duplicating observations.
r ='''
    information_coefficient <- function(x, y, n.grid = 25) {
        x.set <- !is.na(x)
        y.set <- !is.na(y)
        overlap <- x.set & y.set
        n.overlap <- sum(overlap)

        x <- x[overlap] + 1e-09 * runif(n.overlap)
        y <- y[overlap] + 1e-09 * runif(n.overlap)

        if (length(x) > 2) {
            delta <- c(bcv(x), bcv(y))
            cor <- cor(x, y)

            bandwidths <- delta * (1 + (-0.75) * abs(cor))
            
            kde2d.xy <- kde2d(x, y, bandwidths, n = n.grid)
            fxy <- kde2d.xy$z + .Machine$double.eps
            dx <- kde2d.xy$x[2] - kde2d.xy$x[1]
            dy <- kde2d.xy$y[2] - kde2d.xy$y[1]
            pxy <- fxy/(sum(fxy) * dx * dy)
            px <- rowSums(pxy) * dy
            py <- colSums(pxy) * dx

            # hxy <- -sum(pxy * log(pxy)) * dx * dy
            # hx <- -sum(px * log(px)) * dx
            # hy <- -sum(py * log(py)) * dy
            # mi <- hx + hy - hxy

            px <- matrix(px, nrow = n.grid, ncol = n.grid)
            py <- matrix(py, byrow = TRUE, nrow = n.grid, ncol = n.grid)
            mi <- sum(pxy * log(pxy/(px * py))) * dx * dy

            ic <- sign(cor) * sqrt(1 - exp(-2 * mi))

            if (is.na(ic)) {
                ic <- 0
            }
        } else {
            ic <- 0
        }
        return(ic)
    }
    '''
# Rebind `r` from the R source string to the package object that exposes information_coefficient
r = SignatureTranslatedAnonymousPackage(r, 'r')

# Time n calls of the wrapped R implementation, each on freshly drawn random data
t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    r.information_coefficient(x, y)
# Elapsed seconds for all n calls (includes drawing the random data)
print(time.time() - t0)

0.9430339336395264


## Using R's kde2d

In [4]:
import rpy2.robjects as ro
from rpy2.robjects.numpy2ri import numpy2ri
ro.conversion.py2ri = numpy2ri
from rpy2.robjects.packages import importr
mass = importr('MASS')


def information_coefficient_kde2d(x, y, ngrid=25):
    """
    Compute the information coefficient (IC) between x and y using R's MASS::kde2d.

    IC = sign(cor) * sqrt(1 - exp(-2 * MI)), where cor is the Pearson correlation.
    MI is the mutual information estimated from a kernel density on an ngrid x ngrid grid.

    :param x: 1-D numeric array
    :param y: 1-D numeric array
    :param ngrid: number of grid points per axis
    :return: IC; 0 when at most 2 observations remain or the IC is not a number
    """
    # NOTE(review): presumably drops positions where either input is NaN, then adds a tiny jitter - confirm in ccal.support
    x, y = add_jitter(drop_nan_columns([x, y]))

    if len(x) <= 2:
        return 0

    cor, _ = pearsonr(x, y)

    # Bandwidths from MASS::bcv, shrunk as the absolute correlation grows
    shrink = 1 + (-0.75) * abs(cor)
    bandwidth_x = np.asarray(mass.bcv(x)[0]) * shrink
    bandwidth_y = np.asarray(mass.bcv(y)[0]) * shrink

    # Joint density on the grid; eps keeps the logarithm below finite
    fxy = np.asarray(mass.kde2d(x, y, np.asarray([bandwidth_x, bandwidth_y]), n=np.asarray([ngrid]))[2]) + np.finfo(float).eps
    dx = (x.max() - x.min()) / (ngrid - 1)
    dy = (y.max() - y.min()) / (ngrid - 1)

    # Normalise so the density integrates to 1 over the grid, then take the marginals
    pxy = fxy / (fxy.sum() * dx * dy)
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx

    # Mutual information; np.outer(px, py)[i, j] == px[i] * py[j]
    mi = np.sum(pxy * np.log(pxy / np.outer(px, py))) * dx * dy

    # hxy = - np.sum(pxy * np.log(pxy)) * dx * dy
    # hx = -np.sum(px * np.log(px)) * dx
    # hy = -np.sum(py * np.log(py)) * dy
    # mi = hx + hy - hxy

    # A slightly negative MI estimate makes the square root NaN.
    # Return 0 in that case, as the R reference implementation in this notebook does.
    with np.errstate(invalid='ignore'):
        ic = np.sign(cor) * np.sqrt(1 - np.exp(- 2 * mi))
    if np.isnan(ic):
        return 0

    return ic

# Time n calls of information_coefficient_kde2d, each on freshly drawn random data
t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    information_coefficient_kde2d(x, y)
# Elapsed seconds; as the last expression it is displayed as the cell output
time.time() - t0

0.9052920341491699

## Using KDEMultivariate

In [5]:
from statsmodels.nonparametric.kernel_density import KDEMultivariate


def information_coefficient(x, y, data_types, ngrid=25):
    """
    Compute the information coefficient (IC) between x and y using statsmodels' KDEMultivariate.

    IC = sign(cor) * sqrt(1 - exp(-2 * MI)), where cor is the Pearson correlation.
    MI is the mutual information estimated from a kernel density on an ngrid x ngrid grid.

    :param x: 1-D numeric array
    :param y: 1-D numeric array
    :param data_types: var_type string passed to KDEMultivariate (e.g. 'cc')
    :param ngrid: number of grid points per axis
    :return: IC; 0 when at most 2 observations remain or the IC is not a number
    """
    # NOTE(review): presumably drops positions where either input is NaN, then adds a tiny jitter - confirm in ccal.support
    x, y = add_jitter(drop_nan_columns([x, y]))

    if len(x) <= 2:
        return 0

    cor, _ = pearsonr(x, y)

    # Bandwidths from MASS::bcv, shrunk as the absolute correlation grows.
    # NOTE(review): the / 4 presumably mirrors the bandwidth scaling inside MASS::kde2d - confirm
    shrink = 1 + (-0.75) * abs(cor)
    bandwidth_x = np.asarray(mass.bcv(x)[0]) * shrink / 4
    bandwidth_y = np.asarray(mass.bcv(y)[0]) * shrink / 4

    # Evaluate the joint density on a regular grid spanning the data range; eps keeps the logarithm finite
    kde = KDEMultivariate([x, y], bw=[bandwidth_x, bandwidth_y], var_type=data_types)
    meshgrid_x, meshgrid_y = np.meshgrid(np.linspace(x.min(), x.max(), ngrid), np.linspace(y.min(), y.max(), ngrid))
    fxy = kde.pdf(np.vstack([meshgrid_x.flatten(), meshgrid_y.flatten()])).reshape((ngrid, ngrid)) + np.finfo(float).eps
    dx = (x.max() - x.min()) / (ngrid - 1)
    dy = (y.max() - y.min()) / (ngrid - 1)

    # Normalise so the density integrates to 1 over the grid, then take the marginals
    pxy = fxy / (fxy.sum() * dx * dy)
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx

    # Mutual information; np.outer(px, py)[i, j] == px[i] * py[j]
    mi = np.sum(pxy * np.log(pxy / np.outer(px, py))) * dx * dy

    # hxy = - np.sum(pxy * np.log(pxy)) * dx * dy
    # hx = -np.sum(px * np.log(px)) * dx
    # hy = -np.sum(py * np.log(py)) * dy
    # mi = hx + hy - hxy

    # A slightly negative MI estimate makes the square root NaN.
    # Return 0 in that case, as the R reference implementation in this notebook does.
    with np.errstate(invalid='ignore'):
        ic = np.sign(cor) * np.sqrt(1 - np.exp(- 2 * mi))
    if np.isnan(ic):
        return 0

    return ic

# Time n calls of the KDEMultivariate-based information_coefficient, each on freshly drawn random data
t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    information_coefficient(x, y, data_types='cc')
# Elapsed seconds for all n calls (includes drawing the random data)
print(time.time() - t0)

4.958044052124023


In [6]:
# Cross-check the implementations on the same data: x and y are the last pair drawn by the previous timing loop
print(r.information_coefficient(x, y))
print(information_coefficient_kde2d(x, y))
print(information_coefficient(x, y, data_types='cc'))
# The gaussian_kde variant is left out of the comparison
#print(information_coefficient_scipy(x, y, 'cc'))

[1] 0.1311764

0.131176387253
0.131176387245


## Using gaussian_kde

In [7]:
from scipy.stats import gaussian_kde


def information_coefficient_scipy(x, y, data_types, ngrid=25):
    """
    Compute the information coefficient (IC) between x and y using scipy's gaussian_kde.

    IC = sign(cor) * sqrt(1 - exp(-2 * MI)), where cor is the Pearson correlation.
    MI is the mutual information estimated from a kernel density on an ngrid x ngrid grid.
    The kernel bandwidth is chosen by Silverman's rule.

    :param x: 1-D numeric array
    :param y: 1-D numeric array
    :param data_types: unused; kept so the signature matches information_coefficient
    :param ngrid: number of grid points per axis
    :return: IC; 0 when at most 2 observations remain or the IC is not a number
    """
    # NOTE(review): presumably drops positions where either input is NaN, then adds a tiny jitter - confirm in ccal.support
    x, y = add_jitter(drop_nan_columns([x, y]))

    if len(x) <= 2:
        return 0

    # Only the sign of the correlation is needed.
    # gaussian_kde picks its own bandwidth, so no R bandwidth is computed here.
    cor, _ = pearsonr(x, y)

    # Kernel density estimate
    kde = gaussian_kde([x, y], bw_method='silverman')

    meshgrid_x, meshgrid_y = np.meshgrid(np.linspace(x.min(), x.max(), ngrid), np.linspace(y.min(), y.max(), ngrid))
    dx = (x.max() - x.min()) / (ngrid - 1)
    dy = (y.max() - y.min()) / (ngrid - 1)

    # Joint density on the grid; eps keeps the logarithm below finite
    fxy = kde.pdf(np.vstack((meshgrid_x.flatten(), meshgrid_y.flatten()))).reshape((ngrid, ngrid)) + np.finfo(float).eps

    # Normalise so the density integrates to 1 over the grid, as the other implementations do
    pxy = fxy / (fxy.sum() * dx * dy)
    px = pxy.sum(axis=1) * dy
    py = pxy.sum(axis=0) * dx

    # The product of marginals must be the outer product, np.outer(px, py)[i, j] == px[i] * py[j].
    # A plain px * py multiplies element by element and broadcasts the wrong values across rows.
    mi = np.sum(pxy * np.log(pxy / np.outer(px, py))) * dx * dy

    # A slightly negative MI estimate makes the square root NaN.
    # Return 0 in that case, as the R reference implementation in this notebook does.
    with np.errstate(invalid='ignore'):
        ic = np.sign(cor) * np.sqrt(1 - np.exp(- 2 * mi))
    if np.isnan(ic):
        return 0
    return ic

# Time n calls of information_coefficient_scipy, each on freshly drawn random data
t0 = time.time()
for i in range(n):
    x = np.random.random_sample(size)
    y = np.random.random_sample(size)
    information_coefficient_scipy(x, y, 'cc')
# Elapsed seconds; as the last expression it is displayed as the cell output
time.time() - t0

2.533968925476074

# Test data type cases

In [8]:
# Case 1: two complete arrays of equal length
x = np.random.random_sample(10)
y = np.random.random_sample(10)
print(ccal.information.information_coefficient(x, y))

# Case 2: arrays of different lengths; the ValueError message is printed instead of a coefficient
x = np.random.random_sample(10)
y = np.random.random_sample(11)
try:
    ccal.information.information_coefficient(x, y)
except ValueError as e:
    print(e)

# Case 3: one missing value in each array, at different positions
# (assigning None into a float array stores NaN)
x = np.random.random_sample(10)
x[1] = None
y = np.random.random_sample(10)
y[2] = None
print(ccal.information.information_coefficient(x, y))

# Case 4: one missing value in x, two in y
x = np.random.random_sample(10)
x[1] = None
y = np.random.random_sample(10)
y[2] = None
y[6] = None
print(ccal.information.information_coefficient(x, y))

# Case 5: three missing values in x, two in y, all at different positions
x = np.random.random_sample(10)
x[1] = None
x[3] = None
x[5] = None
y = np.random.random_sample(10)
y[2] = None
y[4] = None
print(ccal.information.information_coefficient(x, y))

# Case 6: fixed data in which x is NaN at four of its eight positions and y is complete
x = np.array([12.517, 14.706, np.nan, 14.12, np.nan, np.nan, np.nan, 12.255])
y = np.array([0.98246356, 0.97525171, 0.77744759, 0.64084311, 0.4405853, 0.43827196, 0.12447757, 0.08116039])
print(ccal.information.information_coefficient(x, y))

0.494202437232
Input arrays have different lengths: 11 & 10.
-0.193143869531
0.26435120236
0.191147978571
0.355918908655
