In [2]:
import sys
sys.path.insert(0, '../../')
import ccal

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Name of the association metric; it selects the scoring function below and is
# also used as the label of the score column in the results table.
metric = 'information_coef'
# Short local aliases for the ccal helpers used throughout this notebook.
information_coefficient = ccal.information.information_coefficient
print_log = ccal.support.print_log

In [5]:
# Seed NumPy's global RNG so the bootstrap resampling (np.random.choice) and the
# permutation shuffles (np.random.shuffle) in later cells are repeatable under
# Restart & Run All.
# NOTE(review): presumably make_random_features also draws from np.random; if it
# uses a different generator the toy data itself will still vary between runs — confirm.
random_seed = 0
np.random.seed(random_seed)

# Toy inputs: a 10-row feature table and a single reference profile.
# NOTE(review): assumes the arguments are (number of rows, number of columns) and
# that a single-row request yields a 1-D object aligned to the feature columns — confirm.
features = ccal.support.make_random_features(10, 10)
ref = ccal.support.make_random_features(1, 10)

In [28]:
# Resolve the scoring function for the selected metric.
# Strings are compared with `==`: `is` tests object identity and only appears to
# work when CPython happens to intern both literals (newer Pythons warn about it).
if metric == 'information_coef':
    function = information_coefficient
elif metric == 'information_cmi_diff':
    # NOTE(review): cmi_diff is not defined in any visible cell; selecting this
    # metric raises NameError unless it is defined or imported elsewhere.
    function = cmi_diff
elif metric == 'information_cmi_ratio':
    # NOTE(review): cmi_ratio is likewise not defined in any visible cell.
    function = cmi_ratio
else:
    raise ValueError('Unknown metric {}.'.format(metric))

# Score every feature row against the reference, logging progress every 100 rows.
print_log('Computing scores using {} metric ...'.format(metric))
n_rows = features.shape[0]
scores = np.empty(n_rows)
for i, (idx, s) in enumerate(features.iterrows()):
    if i % 100 == 0:
        print_log('\t{}/{} ...'.format(i, n_rows))
    scores[i] = function(s, ref)

# One-column table labelled by feature, sorted in ascending order of the metric.
scores = pd.DataFrame(scores, index=features.index, columns=[metric]).sort_values(metric)

<09:27:02> Computing scores using information_coef metric ...
<09:27:02> 	0/10 ...


In [29]:
# Show the per-feature scores, sorted in ascending order of the metric.
scores

Unnamed: 0,information_coef
Feature 5,-0.182883
Feature 6,-0.131461
Feature 9,-0.101985
Feature 1,0.088753
Feature 3,0.139911
Feature 2,0.174277
Feature 0,0.21288
Feature 4,0.343922
Feature 8,0.365568
Feature 7,0.420255


In [30]:
# Number of bootstrap draws (one column of resampled scores per draw).
nsampling = 5
# Which features to bootstrap: a value < 1 is treated as a quantile cut-off,
# otherwise it is the number of features taken from each end of the ranking.
nfeatures = 3
# Confidence level used to derive the z critical value and to label the MoE column.
confidence = 0.95
import math
from scipy import stats

In [31]:
print_log('Bootstrapping to get {} confidence interval ...'.format(confidence))
# Each bootstrap draw resamples (with replacement) about 63.2% of the columns.
nsample = math.ceil(0.632 * features.shape[1])
if nsampling < 2:
    # A standard deviation across draws needs at least two draws; the message now
    # states the same threshold the condition actually checks.
    print_log('Not bootstrapping because number of sampling < 2.')
elif nsample < 3:
    print_log('Not bootstrapping because 0.632 * number of sample < 3.')
else:
    moe_column = '{} MoE'.format(confidence)

    # Re-running this cell must not stack a second MoE column onto `scores`.
    if moe_column in scores.columns:
        scores = scores.drop(moe_column, axis=1)

    # Rank explicitly instead of trusting the current row order of `scores`: the
    # outer merge at the end of this cell re-sorts rows by label, so on a re-run
    # the first/last rows are no longer the lowest/highest scores.
    # mergesort is stable, so an already-sorted table keeps its exact order.
    ranked = scores[metric].sort_values(kind='mergesort')

    # Limit features to be bootstrapped
    if nfeatures < 1:
        # nfeatures is a quantile: keep both tails of the score distribution.
        above_quantile = ranked >= ranked.quantile(nfeatures)
        print_log('Bootstrapping {} features vs. reference > {} quantile ...'.format(above_quantile.sum(),
                                                                                     nfeatures))
        below_quantile = ranked <= ranked.quantile(1 - nfeatures)
        print_log('Bootstrapping {} features vs. reference < {} quantile ...'.format(below_quantile.sum(),
                                                                                     1 - nfeatures))
        indices_to_bootstrap = ranked.index[above_quantile | below_quantile].tolist()
    else:
        # nfeatures is a count: take that many from each end, without listing a
        # feature twice when the two ends overlap (2 * nfeatures > number of features).
        lowest = ranked.index[:nfeatures].tolist()
        highest = [label for label in ranked.index[-nfeatures:] if label not in lowest]
        indices_to_bootstrap = lowest + highest
        print_log('Bootstrapping top & bottom {} features vs. reference ...'.format(len(indices_to_bootstrap)))

    # Random sample columns and compute scores using the sampled columns
    sampled_scores = pd.DataFrame(index=indices_to_bootstrap, columns=range(nsampling), dtype=float)
    for c in sampled_scores:
        sample_indices = np.random.choice(features.columns.tolist(), int(nsample)).tolist()
        # .loc replaces the removed .ix indexer; all lookups here are label-based.
        sampled_features = features.loc[indices_to_bootstrap, sample_indices]
        # NOTE(review): assumes ref is 1-D and labelled like the feature columns — confirm.
        sampled_ref = ref.loc[sample_indices]
        for idx, s in sampled_features.iterrows():
            sampled_scores.loc[idx, c] = function(s, sampled_ref)

    # Get confidence intervals
    # NOTE(review): norm.ppf(confidence) is the one-sided critical value (about 1.645
    # for 0.95); a two-sided interval would use norm.ppf((1 + confidence) / 2)
    # (about 1.960). Left unchanged here — confirm which is intended.
    z_critical = stats.norm.ppf(q=confidence)
    # Margin of error per feature: z * (std across draws / sqrt(number of draws)).
    margins = z_critical * sampled_scores.std(axis=1) / math.sqrt(nsampling)
    confidence_intervals = margins.to_frame(moe_column)

    # Outer merge keeps features that were not bootstrapped (their MoE is NaN).
    # right_index must be the boolean True, not the string 'True'.
    scores = pd.merge(scores, confidence_intervals, how='outer', left_index=True, right_index=True)

<09:27:03> Bootstrapping to get 0.95 confidence interval ...
<09:27:03> Bootstrapping top & bottom 6 features vs. reference ...


In [32]:
# Feature labels selected for bootstrapping (only defined when the bootstrap branch ran).
indices_to_bootstrap

['Feature 5', 'Feature 6', 'Feature 9', 'Feature 4', 'Feature 8', 'Feature 7']

In [33]:
# Scores after the outer merge with the margin-of-error column; features that were
# not bootstrapped have no MoE value.
scores

Unnamed: 0,information_coef,0.95 MoE
Feature 0,0.21288,
Feature 1,0.088753,
Feature 2,0.174277,
Feature 3,0.139911,
Feature 4,0.343922,0.0624322
Feature 5,-0.182883,0.130076
Feature 6,-0.131461,0.0828297
Feature 7,0.420255,0.132864
Feature 8,0.365568,0.401646
Feature 9,-0.101985,0.290488


In [34]:
# Number of random shuffles of the reference used to build the permutation scores.
nperm = 5

In [35]:
from statsmodels.sandbox.stats.multicomp import multipletests

In [36]:
# Sort direction for the final score table (True = lowest score first).
ascending = True

In [37]:
print_log('Performing permutation test with {} permutations ...'.format(nperm))
pval_columns = ['Local P-value', 'Global P-value', 'FDR (BH)']

# Re-running this cell must not leave suffixed duplicate columns after the merge below.
stale_columns = [c for c in pval_columns if c in scores.columns]
if stale_columns:
    scores = scores.drop(stale_columns, axis=1)

# Compute scores using permuted ref
permutation_scores = np.empty((features.shape[0], nperm))
shuffled_ref = np.array(ref)  # np.array copies, so `ref` itself is never shuffled
for i in range(nperm):
    np.random.shuffle(shuffled_ref)
    for j, (idx, s) in enumerate(features.iterrows()):
        permutation_scores[j, i] = function(s, shuffled_ref)

# Rows of permutation_scores follow features.index, whereas `scores` may be in a
# different order (e.g. sorted by metric when bootstrapping was skipped). Look each
# feature's row up by label instead of assuming the two orders match.
row_of = {label: j for j, label in enumerate(features.index)}

# Compute permutation P-values and FDRs
# P-values are one-sided: the fraction of permuted scores strictly greater than the
# observed score, floored at one over the number of permuted scores considered.
all_permutation_scores = permutation_scores.flatten()
n_total = all_permutation_scores.size
local_pvals = []
global_pvals = []
for idx, f in scores.iterrows():
    observed = float(f[metric])

    local_pval = float((permutation_scores[row_of[idx], :] > observed).sum()) / nperm
    if not local_pval:
        local_pval = 1.0 / nperm
    local_pvals.append(local_pval)

    global_pval = float((all_permutation_scores > observed).sum()) / n_total
    if not global_pval:
        global_pval = 1.0 / n_total
    global_pvals.append(global_pval)

# Build the table from float lists so multipletests receives numeric input rather
# than an object-dtype column.
permutation_pvals_and_fdrs = pd.DataFrame({'Local P-value': local_pvals,
                                           'Global P-value': global_pvals},
                                          index=scores.index)
permutation_pvals_and_fdrs['FDR (BH)'] = multipletests(global_pvals, method='fdr_bh')[1]
permutation_pvals_and_fdrs = permutation_pvals_and_fdrs[pval_columns]
scores = pd.merge(scores, permutation_pvals_and_fdrs, left_index=True, right_index=True)

scores.sort_values(metric, ascending=ascending)

<09:27:05> Performing permutation test with 5 permutations ...


Unnamed: 0,information_coef,0.95 MoE,Local P-value,Global P-value,FDR (BH)
Feature 5,-0.182883,0.130076,0.6,0.74,0.74
Feature 6,-0.131461,0.0828297,0.6,0.66,0.733333
Feature 9,-0.101985,0.290488,0.4,0.52,0.65
Feature 1,0.088753,,0.8,0.46,0.65
Feature 3,0.139911,,0.4,0.4,0.65
Feature 2,0.174277,,0.2,0.32,0.64
Feature 0,0.21288,,0.2,0.28,0.64
Feature 4,0.343922,0.0624322,0.2,0.12,0.4
Feature 8,0.365568,0.401646,0.2,0.12,0.4
Feature 7,0.420255,0.132864,0.2,0.1,0.4


In [38]:
# Run the packaged ccal implementation with the same inputs and settings used in the
# step-by-step cells, so its table can be compared with the one built by hand.
# NOTE(review): presumably it resamples and permutes randomly as well, so MoE and
# P-value columns are not expected to match the manual run exactly — confirm.
ccal.analyze.compute_against_reference(features, ref, metric, nfeatures, ascending, nsampling, confidence, nperm)

<09:27:10> Computing scores using information_coef metric ...
<09:27:10> 	0/10 ...
<09:27:10> Bootstrapping to get 0.95 confidence interval ...
<09:27:10> Bootstrapping top & bottom 6 features vs. reference ...
<09:27:10> Performing permutation test with 5 permutations ...


Unnamed: 0,information_coef,0.95 MoE,Local P-value,Global P-value,FDR (BH)
Feature 5,-0.182883,0.449749,0.4,0.66,0.66
Feature 6,-0.131461,0.166379,0.4,0.56,0.622222
Feature 9,-0.101985,0.173724,0.8,0.52,0.622222
Feature 1,0.088753,,0.4,0.4,0.571429
Feature 3,0.139911,,0.2,0.3,0.5
Feature 2,0.174277,,0.6,0.26,0.5
Feature 0,0.21288,,0.2,0.24,0.5
Feature 4,0.343922,0.431597,0.4,0.12,0.4
Feature 8,0.365568,0.303669,0.2,0.12,0.4
Feature 7,0.420255,0.365209,0.2,0.08,0.4
