In [1]:
import os
import sys
import math
import random

import numpy as np
import pandas as pd
import scipy.stats as stats
from statsmodels.sandbox.stats.multicomp import multipletests
from scipy.spatial import distance
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, cophenet
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt
import seaborn as sns

# Put the directory two levels above the working directory first on sys.path so that
# `import ccal` resolves to that checkout before any other copy on the path.
sys.path.insert(0, '../../')
import ccal
# IPython magics: draw matplotlib figures inline and request SVG as the inline figure format.
%matplotlib inline
%config InlineBackend.figure_formats = {'svg',}

<22:19:01.560207> Checking dependencies ...
<22:19:01.562632> Using the following packages:
<22:19:01.564378> 	matplotlib (v1.5.1)
<22:19:01.564396> 	numpy (v1.10.4)
<22:19:01.564405> 	pandas (v0.18.0)
<22:19:01.564426> 	rpy2 (v2.7.9)
<22:19:01.564435> 	scikit-learn (v0.17.1)
<22:19:01.564442> 	scipy (v0.17.0)
<22:19:01.564449> 	seaborn (v0.7.0)


SyntaxError: invalid syntax (information.py, line 50)

In [None]:
# Directory holding the input .gct files. This is a machine-specific location; edit it to
# point at your own copy of the data.
DATA_DIR = '/Users/Kwat/Downloads'

features = ccal.support.read_gct(os.path.join(DATA_DIR, 'v22_pub.gct'))

component_x_cellline = ccal.support.read_gct(os.path.join(DATA_DIR, 'CCLE.rpkm.v2.SELECTED_SIGNATURES.v2.gct'))
# Select the reference row by label. `.loc` is the label-based indexer; the `.ix` indexer used
# previously was deprecated in pandas 0.20 and removed in pandas 1.0.
ref = component_x_cellline.loc['KRAS_SALE_Late_Comp_C1_9', :]

In [None]:
# Display `features` (read from v22_pub.gct) as this cell's output.
features

In [None]:
# Display `ref`, the 'KRAS_SALE_Late_Comp_C1_9' row taken from component_x_cellline.
ref

In [None]:
# Synthetic inputs built with ccal.support.make_random_features.
# NOTE(review): the meaning of the two integer arguments is not visible in this notebook —
# presumably (number of features, number of samples); confirm against ccal.support.
features_random = ccal.support.make_random_features(10, 10)
ref_random = ccal.support.make_random_features(1, 10)

In [None]:
# Relabel ref_random with `ref_random.size` labels drawn without replacement from ref's index.
# NOTE(review): `random` is not seeded anywhere visible, so the sampled labels change between
# runs; this also mutates ref_random in place, so re-running the cell draws new labels.
ref_random.index = random.sample(ref.index.tolist(), ref_random.size)

In [None]:
# Display ref_random with its current index labels.
ref_random

In [None]:
# ccal.analyze.rank_features_against_reference(features, ref_random, n_sampling=1, n_perm=1)

In [None]:
import math

import numpy as np
from scipy.stats import pearsonr
from statsmodels.nonparametric.kernel_density import KDEMultivariate
from scipy.stats import binom_test
# TODO: pythonize bcv
# rpy2 gives access to R; in this cell it is only used to reach the MASS package's bcv()
# bandwidth selector, which rbcv() wraps.
import rpy2.robjects as ro
from rpy2.robjects.numpy2ri import numpy2ri

# Use numpy2ri as rpy2's Python-to-R conversion function — presumably so numpy arrays can be
# handed directly to R functions such as bcv (rpy2 2.x API; confirm for newer rpy2 versions).
ro.conversion.py2ri = numpy2ri
from rpy2.robjects.packages import importr

# Handle to the R package MASS; rbcv() calls mass.bcv through it.
mass = importr('MASS')


def information_coefficient(x, y, z=None, n_grid=25, vector_data_type=None, n_perm=0, adaptive=True, alpha=0.05,
                            perm_alpha=0.05):
    """
    Compute the information coefficient (IC) between x and y, optionally conditioned on z.

    IC = sign(rho) * sqrt(1 - exp(-2 * MI)), where rho is the Pearson correlation of x and y and MI is the value
    returned by mutual_information(). Positions that are NaN in any input vector are dropped first.
    :param x: array-like, (n_samples,)
    :param y: array-like, (n_samples,)
    :param z: array-like, (n_samples,), optional, variable on which to condition
    :param n_grid: int, number of grid points at which to evaluate kernel density
    :param vector_data_type: str, one char per vector (2, or 3 when z is given): 'c' (continuous),
                'u' (unordered discrete), or 'o' (ordered discrete); None means all continuous
    :param n_perm: int, >0 will return a p-value in addition to the information coefficient
    :param adaptive: bool, quit permutations after achieving a specified confidence that the p-value is above (or below) alpha
    :param alpha: float, threshold empirical p-value for significance of IC
    :param perm_alpha: float, threshold probability for terminating adaptive permutation
    :return: float, the information coefficient, if n_perm is 0 (or not positive);
                otherwise (float, float), the information coefficient and the empirical p-value.
                Note that if adaptive, the accuracy of the empirical p-value will vary: values closer to alpha will be estimated
                more precisely, while values obviously greater or less than alpha will be estimated less precisely.
    :raises ValueError: if the input vectors have different lengths (raised by drop_nan_columns)
    """
    # `if z:` raises "truth value of an array is ambiguous" for multi-element arrays, so compare with None.
    if z is not None:
        x, y, z = drop_nan_columns([x, y, z])
    else:
        x, y = drop_nan_columns([x, y])

    ic = _signed_information_coefficient(x, y, z, n_grid, vector_data_type)

    if not n_perm or n_perm < 0:
        return ic

    n_more_extreme = 0
    trials = 0
    for trials in range(1, n_perm + 1):
        # The question is whether I want to have
        # a certain width of confidence interval around my estimate of the pval
        # or just a certain confidence that the pval is greater than 0.05 (current solution)
        pm_x = np.random.permutation(x)
        pm_ic = _signed_information_coefficient(pm_x, y, z, n_grid, vector_data_type)

        # A permutation counts as "at least as extreme" when it lies on the same side as the observed IC and
        # is at least as far from 0.
        if ic < 0:
            n_more_extreme += int(pm_ic <= ic)
        elif ic > 0:
            n_more_extreme += int(pm_ic >= ic)
        else:
            # An observed IC of exactly 0 (or an undefined one) carries no evidence of association, so every
            # permutation is at least as extreme; otherwise the p-value would come out as 0.
            n_more_extreme += 1

        if adaptive:
            ge_binom_p = binom_test(n_more_extreme, trials, alpha, alternative='greater')
            # * 2 because what I'm doing is two-sided testing in both directions
            if ge_binom_p * 2 < perm_alpha:
                break
            le_binom_p = binom_test(n_more_extreme, trials, alpha, alternative='less')
            if le_binom_p * 2 < perm_alpha:
                break

    p_value = n_more_extreme / float(trials)
    return ic, p_value


def _signed_information_coefficient(x, y, z, n_grid, vector_data_type):
    """
    Return sign(rho) * sqrt(1 - exp(-2 * MI)) for one set of NaN-free vectors (shared by the observed and permuted cases).
    """
    rho, _ = pearsonr(x, y)
    # The kernel bandwidth is scaled linearly from 1 (|rho| = 0) down to 0.25 (|rho| = 1).
    bandwidth_scaling = 1 - 0.75 * abs(rho)
    mi = mutual_information(x, y, z=z, n_grid=n_grid,
                            vector_data_types=vector_data_type, bandwidth_scaling=bandwidth_scaling)
    # Rounding can push the estimate marginally below 0, which would make the square root NaN.
    mi = max(mi, 0)
    return np.sign(rho) * np.sqrt(1 - np.exp(-2 * mi))


# TODO: understand the math
def mutual_information(x, y, z=None, n_grid=25, vector_data_types=None, bandwidth_scaling=None):
    """
    Estimate the mutual information of x and y (or, when z is given, the mutual information conditioned on z)
    from a kernel density evaluated on an n_grid-per-axis grid spanning the data range.
    :param x: array-like, (n_samples,)
    :param y: array-like, (n_samples,)
    :param z: array-like, (n_samples,), optional, variable on which to condition
    :param n_grid: int, number of grid points at which to evaluate kernel density
    :param vector_data_types: str, one char per vector (2, or 3 when z is given): 'c' (continuous),
                'u' (unordered discrete), or 'o' (ordered discrete); None means all continuous
    :param bandwidth_scaling: float, optional factor applied to the per-vector bandwidths (rbcv(v) / 4)
    :return: float, mutual information (conditional mutual information when z is given);
                0 when fewer than 3 positions are non-NaN in every vector
    :raises ValueError: if the vectors differ in length or vector_data_types has the wrong number of chars
    """
    # `if z:` is ambiguous for multi-element numpy arrays, so decide once by comparing with None.
    conditional = z is not None
    vectors = [x, y, z] if conditional else [x, y]

    # Keep only the positions that are not NaN in every vector.
    vectors = drop_nan_columns(vectors)

    if not vector_data_types:
        # TODO: guess variable types
        vector_data_types = 'c' * len(vectors)
    elif len(vector_data_types) != len(vectors):
        raise ValueError('Number of specified variable types does not match number of vectors.')

    if len(vectors[0]) < 3:
        return 0

    # Add a tiny jitter to float copies of the vectors; the caller's arrays are never modified and
    # integer input no longer fails on an in-place float addition.
    vectors = [np.asarray(v, dtype=float) + np.random.random_sample(len(v)) * 1E-10 for v in vectors]

    # Evaluation grid: n_grid evenly spaced points per vector, combined into every grid coordinate.
    grids = [np.linspace(v.min(), v.max(), n_grid) for v in vectors]
    mesh_grids = np.meshgrid(*grids)
    grid_shape = (n_grid,) * len(vectors)
    grid = np.vstack([mesh_grid.flatten() for mesh_grid in mesh_grids])

    # One bandwidth per vector, optionally rescaled by the caller.
    delta = np.array([rbcv(v) for v in vectors]).reshape((len(vectors),)) / 4
    if bandwidth_scaling:
        delta *= bandwidth_scaling

    kde = KDEMultivariate(vectors, bw=delta, var_type=vector_data_types)
    # eps keeps every cell strictly positive so the logarithms below are finite.
    p_joint = kde.pdf(grid).reshape(grid_shape) + np.finfo(float).eps

    # Grid spacing per axis; normalize the density so that it integrates to 1 over the grid.
    ds = [g[1] - g[0] for g in grids]
    ds_prod = np.prod(ds)
    p_joint /= (p_joint.sum() * ds_prod)
    h_joint = -np.sum(p_joint * np.log(p_joint)) * ds_prod

    dx, dy = ds[0], ds[1]
    # NOTE(review): np.meshgrid defaults to indexing='xy', so axis 0 of p_joint appears to follow y and
    # axis 1 to follow x. The marginal names below are kept from the original code; the dx/dy factors look
    # like they cancel in the final sums — confirm before refactoring the axes.
    if conditional:
        dz = ds[2]
        pxz = p_joint.sum(axis=1) * dy
        pyz = p_joint.sum(axis=0) * dx
        pz = p_joint.sum(axis=(0, 1)) * dx * dy
        hxz = -np.sum(pxz * np.log(pxz)) * dx * dz
        hyz = -np.sum(pyz * np.log(pyz)) * dy * dz
        hz = -np.sum(pz * np.log(pz)) * dz
        return hxz + hyz - h_joint - hz

    px = p_joint.sum(axis=1) * dy
    py = p_joint.sum(axis=0) * dx
    hx = -np.sum(px * np.log(px)) * dx
    hy = -np.sum(py * np.log(py)) * dy
    return hx + hy - h_joint


def rbcv(x):
    """
    Bandwidth for x as computed by the bcv function of the R package MASS.
    :param x: array-like, (n_samples,)
    :return: float, bandwidth
    """
    # bcv is called through rpy2; the first element of the converted result is the bandwidth.
    r_result = mass.bcv(x)
    return np.array(r_result)[0]


def drop_nan_columns(vectors):
    """
    Keep only column positions that are not nan in all vectors.
    :param vectors: list of numpy arrays (plain lists/tuples of numbers are accepted too), all of the same length
    :return: list of filtered vectors, in the same order as the input; an empty list for an empty input
    :raises ValueError: if the vectors do not all have the same length
    """
    vectors = list(vectors)
    if not vectors:
        return []

    n_columns = len(vectors[0])
    for v in vectors[1:]:
        if len(v) != n_columns:
            raise ValueError('Input arrays have different lengths.')

    # Start from a real boolean array: a Python list of True values becomes a float64 array when it is
    # empty, and the bitwise AND with a boolean mask then raises TypeError for zero-length vectors.
    not_nan_filter = np.ones(n_columns, dtype=bool)
    for v in vectors:
        not_nan_filter &= ~np.isnan(np.asarray(v, dtype=float))

    only_not_nan = []
    for v in vectors:
        # Plain sequences cannot be indexed with a boolean mask; convert those, leave everything else as is.
        if isinstance(v, (list, tuple)):
            v = np.asarray(v)
        only_not_nan.append(v[not_nan_filter])
    return only_not_nan
    
    
def add_jitter(vectors, inplace=False):
    """
    Add a tiny uniform jitter, drawn from [0, 1E-10), to every element of every vector.

    The vectors are modified in place, as before. The jitter scale now matches the 1E-10 used when
    mutual_information jitters its inputs; the earlier scale of 1E-0 added noise of up to 1.0 per element.
    :param vectors: list of numpy float arrays; each array is updated in place
    :param inplace: bool, accepted for signature compatibility but not consulted; the jitter is always in place
    :return: None
    """
    for i in range(len(vectors)):
        vectors[i] += np.random.random_sample(vectors[i].size) * 1E-10

In [None]:
# Toy inputs for exercising the NaN handling: x is missing at four positions, y is complete.
x = np.array([12.517, 14.706, np.nan, 14.12, np.nan, np.nan, np.nan, 12.255])
y = np.array([
    0.98246356, 0.97525171, 0.77744759, 0.64084311,
    0.4405853, 0.43827196, 0.12447757, 0.08116039,
])

# Two random vectors with one missing value each; they are not passed to the call below.
xx = np.random.random_sample(7)
xx[3] = np.nan
yy = np.random.random_sample(7)
yy[5] = np.nan

information_coefficient(x, y)