## Correcting local biases in sampling

In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import scale

from scipy.stats.stats import pearsonr 

from scipy.stats import invgamma 
from scipy.stats import beta
import matplotlib.pyplot as plt

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

init_notebook_mode(connected=True)

In our first post we examined how to use frequency vectors to generate haplotypes and populations. We then proceeded to generate a universe of frequency vectors, whose distance in feature space allowed us to chose the relative differentiation of the populations we would simulate.

What i didn't touch on in that post was the importance of sampling in principal component analysis. In the last section, i chose vectors close to one another, together with vectors far distant, in order to produce differentiated populations. If you tweeked the population sizes, you might have noticed that if some of the close together populations largely outweighed the rest, the distances to the more differentiated clusters would be reduced.

- see [McVean 2009](http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1000686) for a study case.

Let's do that now. As in the first post, we will start by generating a space of vectors. Chose at least two to be far appart, and the rest to be closer together. Give at least one of the latter an innordinate size difference to the rest.

In [2]:
# Simulate frequency vectors. 
# We must first define the number of populations, the length of the haplotypes desired, and their respective population sizes
L= 200

import itertools as it
n= 10

# Vary a (beta distribution parameter).
a_range= np.linspace(1,2,11)
a_set= [i for i in a_range for _ in range(n)]

# vary b.
b_range= np.linspace(0.1,.4,11)
b_set= [i for i in b_range for _ in range(n)]

## length of haplotypes to extract.
L_set= [L] * n * 11


background= np.array([a_set,b_set,L_set]).T

vector_lib= []
for k in range(background.shape[0]):
    
    probs= beta.rvs(background[k,0], background[k,1], size=int(background[k,2]))
    probs[(probs > 1)]= 1
    
    
    vector_lib.append(probs)

vector_lib= np.array(vector_lib)

In [3]:
vector_lib.shape

(110, 200)

In [4]:
## PCA on vectors simulated
n_comp = 3

pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
features = pca.fit_transform(vector_lib) * pca.explained_variance_ratio_

print("; ".join(['PC{0}: {1}'.format(x+1,round(pca.explained_variance_ratio_[x],3)) for x in range(n_comp)]))
print('features shape: {}'.format(features.shape))

PC1: 0.029; PC2: 0.026; PC3: 0.025
features shape: (110, 3)


In [5]:
## Plot vector PCA
fig_data= [go.Scatter3d(
        x = features[:,0],
        y = features[:,1],
        z = features[:,2],
        type='scatter3d',
        mode= "markers",
        text= ['a: {}; b: {}, L: {}; index = {}'.format(background[k,0],background[k,1],background[k,2], k) for k in range(background.shape[0])],
        marker= {
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
      "opacity": .8
      }
    )]


layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)

fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)


Peruse frequency vector space, chose populations and biased sizes below:
    

In [6]:
### Select frequency vectors and draw haplotypes.
## Pops selected by Indicies.
Pops= [99,95,109,17,55]
N_pops= len(Pops)

## Population Sizes and labels
Sizes_bias= [130,80,300,35,50]
labels_bias= np.repeat(np.array([x for x in range(N_pops)]),Sizes_bias)

## Number of pops

data_ex= []

for k in range(N_pops):
    
    probs= vector_lib[Pops[k],:]
    
    m= Sizes_bias[k]
    Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(L)] for acc in range(m)]
    
    data_ex.extend(Haps)

data_ex= np.array(data_ex)
print(data_ex.shape)


(595, 200)


In [7]:
### Calculate individual pairwise distances for biased sampling.
def pairwise_gen(x,y):
    miss= 0
    same= 0
    if len(x) != len(y):
        return 'vector lengths differ'
    else:
        for n in range(len(x)):
            if x[n] == y[n]:
                same += 1
        return 1 - same / (len(x) - miss)

bias_gen_diffs= pairwise_distances(data_ex,metric= pairwise_gen)
bias_gen_diffs= np.array(bias_gen_diffs)

iugen= np.triu_indices(bias_gen_diffs.shape[0],1)
bias_gen_diffs= bias_gen_diffs[iugen]

In [8]:
### Calculate pairwise Fst based on frequency vectors selected.

def return_fsts2(freq_array):
    pops= range(freq_array.shape[0])
    H= {pop: [1-(freq_array[pop,x]**2 + (1 - freq_array[pop,x])**2) for x in range(freq_array.shape[1])] for pop in range(freq_array.shape[0])}
    Store= []

    for comb in it.combinations(H.keys(),2):
        P= [sum([freq_array[x,i] for x in comb]) / len(comb) for i in range(freq_array.shape[1])]
        HT= [2 * P[x] * (1 - P[x]) for x in range(len(P))]
        per_locus_fst= [[(HT[x] - np.mean([H[p][x] for p in comb])) / HT[x],0][int(HT[x] == 0)] for x in range(len(P))]
        per_locus_fst= np.nan_to_num(per_locus_fst)
        Fst= np.mean(per_locus_fst)

        Store.append([comb,Fst])
    
    
    ### total fst:
    P= [sum([freq_array[x,i] for x in pops]) / len(pops) for i in range(freq_array.shape[1])]
    HT= [2 * P[x] * (1 - P[x]) for x in range(len(P))]
    FST= np.mean([(HT[x] - np.mean([H[p][x] for p in pops])) / HT[x] for x in range(len(P))])
    
    return pd.DataFrame(Store,columns= ['pops','fst'])

freqs_selected= vector_lib[Pops,:]
Pairwise= return_fsts2(new_freqs)

176715

In [9]:
### PCA on haplotypes drawn.
n_comp = 5

pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data_ex)

bias_features= pca.transform(data_ex)# * pca.explained_variance_ratio_

var_comps= pca.explained_variance_ratio_
print("; ".join(['PC{0}: {1}'.format(x+1,round(var_comps[x],3)) for x in range(n_comp)]))
print(bias_features.shape)

PC1: 0.09; PC2: 0.055; PC3: 0.044; PC4: 0.032; PC5: 0.013
(595, 5)


In [10]:
bias_features

array([[ 1.63066524, -1.24821692, -0.4304864 , -0.5438733 ,  0.55690094],
       [ 1.45946731, -1.03955142, -0.02247404,  0.66149383, -0.81039362],
       [ 1.50338495, -1.45622796, -0.50252437,  0.01990838, -0.49154693],
       ..., 
       [ 1.42108019,  0.92587723,  2.47750247, -1.46056228,  0.48323426],
       [ 1.2530336 ,  0.73650674,  1.7405405 , -0.6780822 , -0.29527825],
       [ 1.41761654,  1.10177155,  2.43932499, -1.18151578,  0.21460148]])

In [11]:
bias_centroids= [np.mean(bias_features[[y for y in range(bias_features.shape[0]) if labels_bias[y] == z],:],axis= 0) for z in range(N_pops)]
bias_centroids= np.array(bias_centroids)

fig_data= [go.Scatter(
        x = bias_features[[x for x in range(sum(Sizes_bias)) if labels_bias[x] == i],0],
        y = bias_features[[x for x in range(sum(Sizes_bias)) if labels_bias[x] == i],1],       
        type='scatter',
        mode= "markers",
        marker= {
        'line': {'width': 0},
        'size': 8,
        'symbol': 'circle',
      "opacity": .8
      },
      name= str(i)
    ) for i in range(N_pops)]


fig_data.append(
    go.Scatter(
        x= bias_centroids[:,0],
        y= bias_centroids[:,1],
        type= 'scatter',
        mode= 'markers',
        name= 'centres',
        marker= {
        'line': {'width': 1},
        'size': 10,
        'symbol': 'cross'
        }
    )
)

layout = go.Layout(
    title= 'Biased sampling; eigenvalues factored in',
    yaxis=dict(
        title='PC2: {}'.format(round(var_comps[1],3))),
    xaxis=dict(
    title= 'PC1: {}'.format(round(var_comps[0],3))),
)


fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)

In [12]:
## centroid distances
iu1= np.triu_indices(N_pops,1)
bias_pair_dist= pairwise_distances(bias_centroids,metric= 'euclidean')
bias_pair_dist= bias_pair_dist[iu1]
bias_pair_dist= scale(bias_pair_dist)

## Individual distances:
bias_feat_dist= pairwise_distances(bias_features, metric= 'euclidean')
bias_feat_dist= bias_feat_dist[iugen]
bias_feat_dist= scale(bias_feat_dist)

bias_feat_Pearson= pearsonr(bias_feat_dist,bias_gen_diffs)
print(bias_feat_Pearson)

(0.74593077436096722, 0.0)


In [13]:
bias_centroids

array([[ 1.74939258, -1.4534051 , -0.22499351,  0.0500502 ,  0.00686856],
       [ 1.19260103,  1.87149551, -1.4600996 , -0.25515112, -0.01500881],
       [-1.34434977, -0.17508668, -0.06028981, -0.06802296, -0.00304901],
       [ 0.60982777,  1.11457456,  1.26415359,  2.81376971, -0.03437314],
       [ 1.18263681,  1.05477832,  2.39797386, -1.28338976,  0.04851109]])

I chose the first two populations to play the outliers, the rest to be a close pack. To two of these i gave population sizes of 300 and 180, six and 3.6 times the size of the largest outlying population. 

The distortion can be seen in that our outlying populations don't appear as far as we would have expected them to given their vectors alone. They tend to appear in the center because of their reduced impact on variance components.

As remarked by McVean, this can be a problem when deriving conclusions from relative distances in feature space.

My approach here isn't very elegant. MeanShift allows us to identify clusters in feature space, i just resample those clusters equally, inverse transform their coordinates and perform the PCA anew. The actual data is transposed onto the resulting space.

In [14]:
def local_sampling_correct(data_now,n_comp):
    pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data_now)
    feats= pca.transform(data_now)
    
    N= 50
    bandwidth = estimate_bandwidth(feats, quantile=0.2)
    params = {'bandwidth': np.linspace(np.min(feats), np.max(feats),30)}
    grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
    
    ## perform MeanShift clustering.
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False, min_bin_freq=5)
    ms.fit(feats)
    labels1 = ms.labels_
    label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1))) if y != -1}

    ## Extract the KDE of each cluster identified by MS.
    Proxy_data= []

    for lab in label_select.keys():
        if len(label_select[lab]) < 3:
            continue
            
        Quanted_set= feats[label_select[lab],:]
        grid.fit(Quanted_set)

        kde = grid.best_estimator_
        Extract= kde.sample(N)
        Return= pca.inverse_transform(Extract)
        
        #Return= data_now[np.random.choice(label_select[lab],N),:]
        Proxy_data.extend(Return)
    
    Proxy_data= np.array(Proxy_data)
    
    print([len(x) for x in label_select.values()])
    pca2 = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(Proxy_data)
    var_comp= pca2.explained_variance_ratio_
    
    New_features= pca2.transform(data_now)# * var_comp
    return New_features, var_comp


New_features,var_comp= local_sampling_correct(data_ex,5)

[300, 130, 80, 50, 35]


In [15]:
New_features.shape

(595, 5)

Plotting our original samples onto our re-computed feature space:

In [16]:
corr_centroids= [np.mean(New_features[[y for y in range(New_features.shape[0]) if labels_bias[y] == z],:3],axis= 0) for z in range(N_pops)]
corr_centroids= np.array(corr_centroids)

fig_data= [go.Scatter(
        x = New_features[[x for x in range(sum(Sizes_bias)) if labels_bias[x] == i],0],
        y = New_features[[x for x in range(sum(Sizes_bias)) if labels_bias[x] == i],1],
        type='scatter',
        mode= "markers",
        marker= {
        'line': {'width': 0},
        'size': 8,
        'symbol': 'circle',
      "opacity": .8
      },
      name= str(i)
    ) for i in range(N_pops)]

fig_data.append(
    go.Scatter(
        x= corr_centroids[:,0],
        y= corr_centroids[:,1],
        type= 'scatter',
        mode= 'markers',
        name= 'centres',
        marker= {
        'line': {'width': 1},
        'size': 10,
        'symbol': 'cross'
        }
    )
)


layout = go.Layout(
    title= 'Biased corrected, eigenvalues factored in',
    yaxis=dict(
        title='PC2: {}'.format(round(var_comp[1],3))),
    xaxis=dict(
    title= 'PC1: {}'.format(round(var_comp[0],3))),
)

fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)

In [17]:
iu1= np.triu_indices(N_pops,1)
corrected_pair_dist= pairwise_distances(corr_centroids,metric= 'euclidean')
corrected_pair_dist= corrected_pair_dist[iu1]
corrected_pair_dist= scale(corrected_pair_dist)

## Individual distances:
corrected_feat_dist= pairwise_distances(New_features, metric= 'euclidean')
corrected_feat_dist= corrected_feat_dist[iugen]
corrected_feat_dist= scale(corrected_feat_dist)

corrected_gen_pearson= pearsonr(corrected_feat_dist,bias_gen_diffs)

print(corrected_gen_pearson)

(0.74542555638845631, 0.0)


In [18]:
iu1

(array([0, 0, 0, 0, 1, 1, 1, 2, 2, 3], dtype=int32),
 array([1, 2, 3, 4, 2, 3, 4, 3, 4, 4], dtype=int32))

We can compare this output with what we would have gotten from sampling equally across our selected vectors:

In [19]:
#### Selecting new, equal sample sizes but derive haplotypes from the same frequency vectors.

Sizes= [50,50,50,50,50]
labels= np.repeat(np.array([x for x in range(N_pops)]),Sizes)

data= []

for k in range(N_pops):
    
    probs= vector_lib[Pops[k],:]
    
    m= Sizes[k]
    Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(L)] for acc in range(m)]
    
    data.extend(Haps)

data= np.array(data)


#### clalculate pairwise genetic distances
iugen_unbiased= np.triu_indices(data.shape[0],1)

unbias_gen_diffs= pairwise_distances(data,metric= pairwise_gen)
unbias_gen_diffs= np.array(unbias_gen_diffs)

unbias_gen_diffs= unbias_gen_diffs[iugen_unbiased]

### perform PCA

n_comp = 3

pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data)

features= pca.transform(data)# * pca.explained_variance_ratio_

var_comps= pca.explained_variance_ratio_
print("; ".join(['PC{0}: {1}'.format(x+1,round(var_comps[x],3)) for x in range(n_comp)]))
print(features.shape)


#### Calculate centroids of labelled data in feature space.
unbias_centroids= [np.mean(features[[y for y in range(features.shape[0]) if labels[y] == z],:],axis= 0) for z in range(N_pops)]
unbias_centroids= np.array(unbias_centroids)


#### Plot projections + Centroids.

fig_data= [go.Scatter(
        x = features[[x for x in range(sum(Sizes)) if labels[x] == i],0],
        y = features[[x for x in range(sum(Sizes)) if labels[x] == i],1],
        type='scatter',
        mode= "markers",
        marker= {
        'line': {'width': 0},
        'size': 8,
        'symbol': 'circle',
      "opacity": .8
      },
      name= str(i)
    ) for i in range(N_pops)]


fig_data.append(
    go.Scatter(
        x= unbias_centroids[:,0],
        y= unbias_centroids[:,1],
        type= 'scatter',
        mode= 'markers',
        name= 'centres',
        marker= {
        'line': {'width': 1},
        'size': 10,
        'symbol': 'cross'
        }
    )
)

layout = go.Layout(
    title= 'Unbiased sampling; eigenvalues factored in',
    yaxis=dict(
        title='PC2: {}'.format(round(var_comps[1],3))),
    xaxis=dict(
    title= 'PC1: {}'.format(round(var_comps[0],3))),
)

fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)

PC1: 0.09; PC2: 0.088; PC3: 0.062
(250, 3)


In [20]:
unbias_pair_dist= pairwise_distances(unbias_centroids,metric= 'euclidean')
unbias_pair_dist= unbias_pair_dist[iu1]
unbias_pair_dist= scale(unbias_pair_dist)

## Individual distances:
unbiased_feat_dist= pairwise_distances(features, metric= 'euclidean')
print(unbiased_feat_dist.shape)
unbiased_feat_dist= unbiased_feat_dist[iugen_unbiased]
unbiased_feat_dist= scale(unbiased_feat_dist)

unbiased_gen_pearson= pearsonr(unbiased_feat_dist,unbias_gen_diffs)

print(corrected_gen_pearson)
print(unbias_pair_dist)

(250, 250)
(0.74542555638845631, 0.0)
[ 0.09345265 -2.43817943  0.62531922  0.37673136 -1.24087934  0.74804943
  0.6247184  -0.06315428  0.31408406  0.95985791]


In [21]:
t= np.array([
    unbias_pair_dist,
    bias_pair_dist,
    corrected_pair_dist
]).T

fig_data= [go.Scatter(
    x= t[:,0],
    y= t[:,i],
    mode= 'markers',
    marker= dict(
        color= i,
        opacity= .6
    ),
    name= ['bias','corrected'][i-1]
    ) for i in [1,2]
]

layout = go.Layout(
    title= 'MS correction distances',
    yaxis=dict(
        title='biased and corrected distances'),
    xaxis=dict(
        title='unbiased distances')
)

fig= go.Figure(data=fig_data, layout=layout)
iplot(fig)

We can see these last two plots resemble each other more than the first one.

Let's now repeat this process sequentially, to get an idea of how much this method actually corrects distances between pops.

At each repetition we will choose a fixed number of frequency vectors from the _Vector Universe_ created at the top of this page. We then perform a biased and an unbiased sampling of each, and perform PCA on both. For each scenario we calculate the pairwise eucledian distances between the centroids of populations and normalize them. 

We then apply the MScorrection to the feature space of the biased scenario and recalculate pairwise centroid distances and normalize them.

This will allow us to compare the unbiased distances to biased and corrected distances. Hopefully, we will have reduced the distortion produced by the biases in sampling.

In [117]:
### Select frequency vectors and draw haplotypes.


N_pops= 5 # Number of pops

n_comp= 5 # components to keep following PCA

Iter= 20 # repeats

N_sims= 100 # number of haplotypes to generate from each pop in the unbiased scenario.


## Population Sizes and labels
bias_scheme= [130,43,200,40,60]
unbiased_sheme= np.repeat(N_sims,N_pops)

bias_labels= np.repeat(np.array([x for x in range(N_pops)]),bias_scheme)
unbias_labels= np.repeat(np.array([x for x in range(N_pops)]),unbiased_sheme)

### store distances between centroids
biased_pairwise= []
unbiased_pairwise= []
corrected_pairwise= []

### store Pearson's r comparing gen_diffs and feature space diffs across scenarios
biased_pears= []
corrected_pears= []
unbiased_pears= []

### triangular matrices extract.
iu1= np.triu_indices(N_pops,1) # for centroid comparison

iu_unbias= np.triu_indices(sum(unbiased_sheme),1)
iu_bias= np.triu_indices(sum(bias_scheme),1)

### proceed.

for rep in range(Iter):
    Pops= np.random.choice(vector_lib.shape[0],N_pops,replace= False)
    print(Pops)
    #########################################################
    ############# unbiased sample
    
    #### generate data and perform PCA.
    data= []

    for k in range(N_pops):

        probs= vector_lib[Pops[k],:]
        
        m= unbiased_sheme[k]
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(L)] for acc in range(m)]

        data.extend(Haps)
    
    data1= np.array(data)
    pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data1)
    feat_unbias= pca.transform(data1)# * pca.explained_variance_ratio_
    
    ####### centroid comparison
    unbias_centroids= [np.mean(feat_unbias[[y for y in range(feat_unbias.shape[0]) if unbias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    unbias_centroids= np.array(unbias_centroids)
    
    unbias_pair_dist= pairwise_distances(unbias_centroids,metric= 'euclidean')
    unbias_pair_dist= unbias_pair_dist[iu1]
    
    unbias_pair_dist= scale(unbias_pair_dist)
    unbiased_pairwise.extend(unbias_pair_dist)
    
    ######## ind distances
    ### genetic data
    unbias_gen_diffs= pairwise_distances(data1,metric= pairwise_gen)
    unbias_gen_diffs= np.array(unbias_gen_diffs)
    unbias_gen_diffs= unbias_gen_diffs[iu_unbias]
    
    ## feature space
    unbiased_feat_dist= pairwise_distances(feat_unbias, metric= 'euclidean')
    unbiased_feat_dist= unbiased_feat_dist[iu_unbias]
    unbiased_feat_dist= scale(unbiased_feat_dist)

    unbiased_gen_pearson= pearsonr(unbiased_feat_dist,unbias_gen_diffs)
    
    unbiased_pears.append(unbiased_gen_pearson[0])
    
    #################################################
    ############## biased sample
    
    #### generate data and perform PCA
    data= []

    for k in range(N_pops):

        probs= vector_lib[Pops[k],:]

        m= bias_scheme[k]
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(L)] for acc in range(m)]

        data.extend(Haps)

    data2= np.array(data)
    
    pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data2)
    feat_bias= pca.transform(data2)# * pca.explained_variance_ratio_
    
    #### Centroid distances
    bias_centroids= [np.mean(feat_bias[[y for y in range(feat_bias.shape[0]) if bias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    bias_centroids= np.array(bias_centroids)
    
    bias_pair_dist= pairwise_distances(bias_centroids,metric= 'euclidean')
    bias_pair_dist= bias_pair_dist[iu1]
    bias_pair_dist= scale(bias_pair_dist)
    biased_pairwise.extend(bias_pair_dist)

    ######## Ind distances
    ### genetic data
    bias_gen_diffs= pairwise_distances(data2,metric= pairwise_gen)
    bias_gen_diffs= np.array(bias_gen_diffs)
    bias_gen_diffs= bias_gen_diffs[iu_bias]
    
    ## feature space
    biased_feat_dist= pairwise_distances(feat_bias, metric= 'euclidean')
    biased_feat_dist= biased_feat_dist[iu_bias]
    biased_feat_dist= scale(biased_feat_dist)

    biased_gen_pearson= pearsonr(biased_feat_dist,bias_gen_diffs)
    
    biased_pears.append(biased_gen_pearson[0])
    
    ###############################################################"
    ################## bias correct
    ### perform MS correction on biased samples
    feat_correct,var_comp= local_sampling_correct(data2,n_comp)
    
    ### centroid Distances
    centroids= [np.mean(feat_correct[[y for y in range(feat_correct.shape[0]) if bias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    centroids= np.array(centroids)
    pair_dist= pairwise_distances(centroids,metric= 'euclidean')
    pair_dist= pair_dist[iu1]
    pair_dist= scale(pair_dist)
    corrected_pairwise.extend(pair_dist)
    
    ######## Ind distances
    
    ## feature space
    corrected_feat_dist= pairwise_distances(feat_correct, metric= 'euclidean')
    corrected_feat_dist= corrected_feat_dist[iu_bias]
    corrected_feat_dist= scale(corrected_feat_dist)

    corr_gen_pearson= pearsonr(corrected_feat_dist,bias_gen_diffs)
    
    corrected_pears.append(corr_gen_pearson[0])
    print([unbiased_gen_pearson[0],biased_gen_pearson[0],corr_gen_pearson[0]])
    
    t= np.array([
    unbias_pair_dist,
    bias_pair_dist,
    pair_dist
    ]).T
    print(t)


[36 25 10 92 21]
[200, 130, 60, 43, 40]
[0.8116471785140954, 0.88240619161795653, 0.88242435766198257]
[[ 1.25168851  1.17878852  1.17891747]
 [ 0.72900391  0.50228112  0.50221853]
 [ 1.35348719  1.63510572  1.63507145]
 [ 0.73885184  0.67255158  0.67248624]
 [-1.59846571 -1.59620437 -1.59609632]
 [-0.32716569 -0.41969472 -0.41969862]
 [ 0.27318857  0.10174855  0.10185256]
 [-1.17778398 -1.2022727  -1.20233124]
 [-1.18166153 -1.01580417 -1.01587832]
 [-0.06114312  0.14350046  0.14345825]]
[20 71 73 81 32]
[199, 130, 60, 43, 40]
[0.86812391549116907, 0.87470537528156866, 0.87447607811125638]
[[ 0.05104371 -0.39129796 -0.39133536]
 [-0.92121791 -0.90768552 -0.90769091]
 [ 1.17862683  1.02863909  1.02866644]
 [ 1.70010541  1.61152462  1.61150857]
 [-1.95276123 -2.01799779 -2.01798312]
 [-0.16107481 -0.22886975 -0.22889228]
 [-0.84581568 -0.58566292 -0.58565945]
 [ 0.22427408  0.18699888  0.18703542]
 [ 0.37275317  0.56688016  0.56688999]
 [ 0.35406644  0.73747119  0.7374607 ]]
[ 71  95 10

[200, 130, 60, 43, 40]
[0.85567943450397199, 0.87362310599525783, 0.87321965052898187]
[[-0.87070506 -0.35647223 -0.35627661]
 [ 0.01985689 -0.14706469 -0.14702703]
 [-0.38897278 -0.03128885 -0.03125169]
 [-0.98242553 -1.07412526 -1.07421788]
 [ 0.83469223  0.68476153  0.68485484]
 [ 2.3255103   2.41142464  2.41143375]
 [ 0.8599371   0.67248952  0.67236424]
 [-0.87151365 -1.09143473 -1.09135613]
 [-0.67788956 -0.84927564 -0.84929137]
 [-0.24848993 -0.2190143  -0.21923212]]
[ 96 107  31  68  26]
[200, 130, 60, 43, 40]
[0.83113254613503318, 0.85976582940615287, 0.86070947316696966]
[[ 0.59883919  0.64740182  0.64750651]
 [ 1.18370482  1.23487321  1.23510109]
 [ 0.61338028  0.95526468  0.95540312]
 [ 1.96452615  1.65136131  1.65149407]
 [-1.17808801 -1.09341378 -1.09321808]
 [-0.67148239 -0.23485766 -0.23618838]
 [-0.36115664 -0.51502303 -0.51466116]
 [-0.78269713 -0.89151896 -0.89127509]
 [-1.17769269 -1.40961118 -1.40921468]
 [-0.18933358 -0.34447641 -0.34494741]]
[ 27  71  95  67 107]


In [107]:
t= np.array([
    unbiased_pairwise,
    biased_pairwise,
    corrected_pairwise
]).T


In [108]:
pear_bias= pearsonr(t[:,0],t[:,1])

pear_corr= pearsonr(t[:,0],t[:,2])


fig_data= [go.Scatter(
    x= t[:,0],
    y= t[:,i],
    mode= 'markers',
    marker= dict(
        color= i,
        opacity= .6
    ),
    name= ['bias','corrected'][i-1]
    ) for i in [1,2]
]

layout = go.Layout(
    title= 'MS correction test. Pearson r= {}'.format(round(pear_bias[0],3)),
    yaxis=dict(
        title='biased distances'),
    xaxis=dict(
        title='unbiased distances')
)

fig= go.Figure(data=fig_data, layout=layout)
iplot(fig)

In [109]:


fig_data= [go.Scatter(
x= t[:,0],
y= t[:,2],
mode= 'markers'
) 
]

layout = go.Layout(
    title= 'MS correction test. Pearson r= {}'.format(round(pear_corr[0],3)),
    yaxis=dict(
        title='corrected distances'),
    xaxis=dict(
        title='unbiased distances')
)

fig= go.Figure(data=fig_data, layout=layout)
iplot(fig)

In [119]:
P_stare= np.vstack([P_store_VARexc,P_store_VARinc])
P_stare.shape

Labels= np.repeat(['VARexc','VARinc'],Iter)

In [120]:
### Compare p-values of genetic to feature space comparisons across biased, unbiased and corrected schemes.
#####

P_store_VARexc= np.array([
    biased_pears,
    corrected_pears,
    unbiased_pears,
]
).T

box_names= ['biased','corrected','unbiased']

box_data= [go.Box(
    y= P_stare[:,i],
    x= Labels,
    name= box_names[i]
) for i in range(P_store.shape[1])]

layout= go.Layout(
    title= 'Genetic to Feature space Pearson p-values across settings',
    boxmode= 'group'
)

fig= go.Figure(data=box_data,layout= layout)
iplot(fig)

In [76]:
P_store

array([[ 0.9066842 ,  0.90642306,  0.84978766],
       [ 0.91646873,  0.91651775,  0.89216522],
       [ 0.82422035,  0.82427635,  0.81342105],
       [ 0.85966271,  0.85940631,  0.84218832],
       [ 0.87593998,  0.87585345,  0.87466112],
       [ 0.83718243,  0.83751185,  0.80077815],
       [ 0.87428352,  0.87421538,  0.81090356],
       [ 0.85216546,  0.85201827,  0.82712856],
       [ 0.78669288,  0.78627581,  0.76326608],
       [ 0.79210207,  0.79288072,  0.81821463],
       [ 0.9024305 ,  0.90235752,  0.84843801],
       [ 0.82825217,  0.82850497,  0.85464635],
       [ 0.87876132,  0.87861839,  0.87470816],
       [ 0.87214629,  0.87238737,  0.83172366],
       [ 0.70247813,  0.70238113,  0.77301631],
       [ 0.90982315,  0.90950979,  0.88639891],
       [ 0.85466483,  0.85501604,  0.81767274],
       [ 0.87811593,  0.87862594,  0.87703194],
       [ 0.78018934,  0.77911594,  0.80858132],
       [ 0.88986585,  0.89006211,  0.82496898]])