In [1]:
import scipy
import numpy as np
import pandas as pd
import itertools as it

from math import sin
import collections

def recursively_default_dict():
        """Build an auto-nesting dictionary.

        Any missing key is filled with another dictionary of the same kind, so
        values can be assigned at arbitrary depth without creating the
        intermediate levels first.
        """
        nested = collections.defaultdict(recursively_default_dict)
        return nested

from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import scale

from scipy.stats.stats import pearsonr 

from scipy.stats import invgamma 
from scipy.stats import beta
import matplotlib.pyplot as plt

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff

from sklearn.metrics import silhouette_samples, silhouette_score


from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import StructE_tools as Ste
from Generate_freq_vectors import generate_vectors_Beta

In [2]:

def generate_vectors_Beta(L,n,rangeA= [1,2],rangeB = [.1,4],steps= 20,n_comp = 100):
    '''
    Simulate a library of frequency vectors drawn from Beta distributions.

    The two Beta shape parameters are varied jointly: the i-th of `steps`
    evenly spaced values of a (across rangeA) is paired with the i-th of
    `steps` evenly spaced values of b (across rangeB). For every (a, b) pair,
    `n` vectors of length `L` are drawn.

    Parameters
    ----------
    L : int
        Number of entries in each frequency vector.
    n : int
        Number of vectors drawn for every (a, b) pair.
    rangeA, rangeB : sequence of two floats
        [first, last] values of the a and b parameters respectively.
    steps : int
        Number of (a, b) pairs.
    n_comp : int
        Not used by this function; kept so existing callers that pass it
        keep working.

    Returns
    -------
    numpy.ndarray
        Array of shape (n * steps, L) with every value inside [0, 1].
        Rows are ordered by (a, b) pair, `n` consecutive rows per pair.
    '''
    a_range = np.linspace(rangeA[0], rangeA[1], steps)
    b_range = np.linspace(rangeB[0], rangeB[1], steps)

    # Keep the vector length an integer instead of routing it through a float
    # parameter table and casting it back.
    L = int(L)

    vector_lib = []
    for a_par, b_par in zip(a_range, b_range):
        for _ in range(n):
            # One draw per vector, in the same order as before, so a seeded
            # run reproduces the same library.
            probs = beta.rvs(a_par, b_par, size=L)
            # Beta variates already lie in [0, 1]; the clip is only a guard.
            vector_lib.append(np.clip(probs, 0, 1))

    return np.array(vector_lib)

    
def generate_Branches_Beta(Nbranches,density,L,n,rangeA= [1,2],rangeB = [.1,4],steps= 20,n_comp = 100):
    '''
    Build frequency vectors laid out along straight branches in PCA space.

    A Beta-simulated vector library is generated and projected with PCA. One
    library vector is drawn as a shared anchor (MRCA). For each branch a
    second vector is drawn, and `density` points are placed on the line
    through the pair, at multiples from -2 to 2 of the (anchor - partner)
    difference added to the partner. Every point is mapped back to frequency
    space and clipped to [0, 1].

    Parameters
    ----------
    Nbranches : int
        Number of branches to generate.
    density : int
        Number of points generated along each branch.
    L, n, rangeA, rangeB, steps, n_comp
        Passed to generate_vectors_Beta; n_comp is also the number of PCA
        components kept.

    Returns
    -------
    (features, vector_lib) : tuple of numpy.ndarray
        The PCA-space coordinates of the generated points, and the matching
        frequency vectors, each with Nbranches * density rows.
    '''
    vector_lib = generate_vectors_Beta(L,n,rangeA,rangeB,steps,n_comp)

    pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(vector_lib)
    features = pca.transform(vector_lib)

    n_vectors = vector_lib.shape[0]

    # Take the anchor as a scalar index: assigning a length-1 array into a
    # single array slot relies on array-to-scalar conversion, which recent
    # NumPy releases deprecate.
    MRCA = np.random.choice(range(n_vectors),1)[0]

    feat = []
    calypso = []

    for _ in range(Nbranches):
        Pair = np.random.choice(range(n_vectors),2,replace= False)
        Pair[1] = MRCA
        print(Pair)

        coords = features[Pair,:]
        origin = coords[0]
        direction = coords[1] - coords[0]

        for angle in np.linspace(-20,20,density):
            new_guy = origin + (angle / 10) * direction
            feat.append(new_guy)

            # Back to frequency space; reconstructed values can fall outside
            # [0, 1], so clip them.
            calypso.append(np.clip(pca.inverse_transform(new_guy), 0, 1))

    return np.array(feat), np.array(calypso)



In [3]:
# Simulation settings shared by the cells below.
Nbranches = 4
L = 150
n = 100
rangeA = [1, 2.5]
rangeB = [.1, .6]
steps = 20
n_comp = 100
density = 50

# Library of Beta-distributed frequency vectors.
vector_lib = generate_vectors_Beta(
    L, n, rangeA, rangeB, steps, n_comp
)

# Project the library onto its principal components.
pca = PCA(n_components=n_comp, whiten=False, svd_solver='randomized')
pca.fit(vector_lib)
features = pca.transform(vector_lib)

# Alternative library laid out along branches:
#features, vector_lib= generate_Branches_Beta(4,50,L,n,rangeA,rangeB,steps,n_comp)
for _arr in (features, vector_lib):
    print(_arr.shape)

(2000, 100)
(2000, 150)


In [4]:
### Getting Pops

def Gen_samples(Pops,Sizes,vector_lib,return_pca= False,n_comp= 100,prior= 'sinusoid',range_diff= [-10,10],steps= 100):
    '''
    Simulate haplotype data sets while moving one population through PCA space.

    The vector library is projected with PCA. For each of `steps` evenly
    spaced values of `angle` across `range_diff`, the first selected
    population (row 0 of Pops) is displaced according to `prior`, all
    populations are mapped back to frequency space and clipped to [0, 1],
    and binary haplotypes are sampled from those frequencies.

    Parameters
    ----------
    Pops : sequence of int
        Row indices into vector_lib, one per population. With prior 'alien'
        the last entry is the donor and may be an extra population.
    Sizes : sequence of int
        Number of haplotypes to sample for each population; its length sets
        the number of populations simulated.
    vector_lib : numpy.ndarray
        Library of frequency vectors (one per row).
    return_pca : bool
        If True each data set is stored as its PCA projection (n_comp
        components) instead of the raw haplotypes.
    n_comp : int
        Number of PCA components.
    prior : str
        'sinusoid': move population 0 by sin(angle) times the 0-1 difference.
        'linear': move population 0 by angle / 10 times the 0-1 difference.
        'introgression': for -5 <= angle <= 0 population 0 takes the
            coordinates of population 1.
        'alien': for -5 <= angle <= 0 population 0 takes the coordinates of
            the last selected population.
        Any other value leaves the coordinates unchanged.
    range_diff : sequence of two floats
        First and last value of `angle`.
    steps : int
        Number of `angle` values.

    Returns
    -------
    (Windows, Fst_windows) : tuple of dict
        Both are {1: {int(angle * 1000): value}}. Windows holds the data
        sets; Fst_windows holds the table returned by Ste.return_fsts2 with
        an added 'angle' column.
    '''
    print('...')

    Npops = len(Sizes)
    pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(vector_lib)
    features = pca.transform(vector_lib)

    target = [0,1]

    Windows = recursively_default_dict()
    Fst_windows = recursively_default_dict()

    for angle in np.linspace(range_diff[0],range_diff[1],steps):
        # Indexing with a list/array of rows yields a fresh copy, so every
        # step starts again from the unmodified coordinates.
        coords = features[Pops,:]
        vector2 = coords[target[0]] - coords[target[1]]

        if prior == 'sinusoid':
            coords[target[0]] = coords[target[0]] + sin(angle) * vector2
        elif prior == 'linear':
            coords[target[0]] = coords[target[0]] + (angle / 10) * vector2
        elif prior == 'introgression':
            if angle >= -5 and angle <= 0:
                coords[target[0]] = coords[1]
        elif prior == 'alien':
            if angle >= -5 and angle <= 0:
                coords[target[0]] = coords[len(coords) - 1]

        # Only the first Npops rows are simulated; reconstructed frequencies
        # can leave [0, 1], so clip before sampling and before the Fst call.
        new_freqs = np.clip(pca.inverse_transform(coords[:Npops]), 0, 1)

        # Take the haplotype length from the data rather than from a
        # notebook-level global.
        n_loci = new_freqs.shape[1]

        data = []
        for k in range(Npops):
            probs = new_freqs[k,:]
            m = Sizes[k]
            # probs[x] is the probability of drawing 0 at position x.
            Haps = [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(n_loci)] for acc in range(m)]
            data.extend(Haps)

        data = np.array(data)

        if return_pca:
            pca2 = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
            data = pca2.fit_transform(data)

        ##### FSTs
        Pairwise = Ste.return_fsts2(new_freqs)
        Pairwise['angle'] = [angle] * Pairwise.shape[0]

        ### store stuff.
        Fst_windows[int(angle*1000)] = Pairwise
        Windows[int(angle*1000)] = data

    Windows = {1:Windows}
    Fst_windows = {1:Fst_windows}
    print('Done.')
    return Windows, Fst_windows



def Check_Path(Npops,vector_lib,Pops= [], random= True,n_comp= 100,prior= 'sinusoid',range_diff= [-10,10],steps= 100):
    '''
    Preview pairwise Fst along the path that Gen_samples would simulate.

    The vector library is projected with PCA (n_comp components). For each of
    `steps` evenly spaced values of `angle` across `range_diff`, population 0
    is displaced according to `prior` (same rules as Gen_samples), the first
    Npops populations are mapped back to frequency space, clipped to [0, 1],
    and passed to Ste.return_fsts2. One scatter trace is built per pair of
    populations.

    Parameters
    ----------
    Npops : int
        Number of populations compared.
    vector_lib : numpy.ndarray
        Library of frequency vectors (one per row).
    Pops : sequence of int
        Row indices into vector_lib; ignored when `random` is True.
    random : bool
        If True, draw the populations at random without replacement
        (Npops + 1 of them for prior 'alien', Npops otherwise).
    n_comp : int
        Number of PCA components.
    prior : str
        'sinusoid', 'linear', 'introgression' or 'alien'.
    range_diff : sequence of two floats
        First and last value of `angle`.
    steps : int
        Number of `angle` values.

    Returns
    -------
    fig, Pops when `random` is True; fig alone otherwise.
    '''
    # Honour the n_comp argument instead of a hard-coded component count.
    pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(vector_lib)
    features = pca.transform(vector_lib)

    target = [0,1]

    if random:
        # The 'alien' prior needs one extra population to act as donor.
        n_draw = Npops + 1 if prior == 'alien' else Npops
        Pops = np.random.choice(vector_lib.shape[0],n_draw,replace= False)

    Fst_windows = recursively_default_dict()

    for angle in np.linspace(range_diff[0],range_diff[1],steps):
        coords = features[Pops,:]
        vector2 = coords[target[0]] - coords[target[1]]

        if prior == 'sinusoid':
            coords[target[0]] = coords[target[0]] + sin(angle) * vector2
        elif prior == 'linear':
            coords[target[0]] = coords[target[0]] + (angle / 10) * vector2
        elif prior == 'introgression':
            if angle >= -5 and angle <= 0:
                coords[target[0]] = coords[1]
        elif prior == 'alien':
            if angle >= -5 and angle <= 0:
                coords[target[0]] = coords[len(coords) - 1]

        # As in Gen_samples, only the first Npops rows enter the comparison,
        # so the Fst table has one row per pair labelled below.
        # NOTE(review): assumes Ste.return_fsts2 orders its rows like
        # itertools.combinations over the input rows - confirm.
        new_freqs = np.clip(pca.inverse_transform(coords[:Npops]), 0, 1)

        Pairwise = Ste.return_fsts2(new_freqs)
        Pairwise['angle'] = [angle] * Pairwise.shape[0]

        Fst_windows[int(angle*1000)] = Pairwise

    Fst_windows = {1:Fst_windows}

    # Build the pair labels and the x values once rather than per trace.
    pair_names = list(it.combinations(range(Npops),2))
    window_keys = list(Fst_windows[1].keys())

    fig_data = [go.Scatter(
        x= window_keys,
        y= [Fst_windows[1][w].fst[i] for w in window_keys],
        mode= 'markers',
        name= '{}'.format(pair_names[i])
    ) for i in range(len(pair_names))]

    layout = go.Layout(
        title= 'Fst across sets. prior: {}'.format(prior),
        yaxis=dict(
            title='fsts',
            range= [0,.5]),
        xaxis=dict(
            title='eucledian distance in feature space')
    )

    fig = go.Figure(data=fig_data, layout=layout)

    if random:
        return fig, Pops
    else:
        return fig



def plot_GenFst(Fst_lib,Npops,Chr):
    '''
    Plot pairwise Fst across the simulated data sets of one chromosome.

    Parameters
    ----------
    Fst_lib : dict
        {chromosome: {window key: table}} as returned by Gen_samples; each
        table exposes an `fst` column with one entry per population pair.
    Npops : int
        Number of populations; sets the pair labels of the traces.
    Chr : hashable
        Key of the chromosome in Fst_lib to plot.

    The figure is displayed with iplot; nothing is returned.
    '''
    # Read from the arguments rather than from notebook globals, so the
    # function plots whatever library and chromosome the caller passes.
    windows = Fst_lib[Chr]
    window_keys = list(windows.keys())
    pair_names = list(it.combinations(range(Npops),2))

    fig_data = [go.Scatter(
        x= window_keys,
        y= [windows[w].fst[i] for w in window_keys],
        mode= 'markers',
        name= '{}'.format(pair_names[i])
    ) for i in range(len(pair_names))]

    layout = go.Layout(
        title= 'Fst vs. distance in vector feature space',
        yaxis=dict(
            title='fsts',
            range= [0,.5]),
        xaxis=dict(
            title='eucledian distance in feature space')
    )

    fig = go.Figure(data=fig_data, layout=layout)

    iplot(fig)



In [7]:
# Haplotypes to sample per population; one entry per population.
Sizes = [100] * 4
Npops = len(Sizes)

range_diff = [-10, 10]

# To fix the populations instead of drawing them, set Pops by hand and pass
# random= False, e.g.:
#Pops= np.random.choice(vector_lib.shape[0],Npops,replace= False)
#Pops= [1685,543,1165]

# Preview the Fst trajectory for randomly drawn populations.
fig, Pops = Check_Path(
    Npops,
    vector_lib,
    Pops=[],
    random=True,
    n_comp=100,
    prior='sinusoid',
    range_diff=range_diff,
    steps=100,
)

iplot(fig)


invalid value encountered in double_scalars



In [8]:
# Gen_samples signature:
# (Pops,Sizes,vector_lib,return_pca= False,n_comp= 100,prior= 'sinusoid',range_diff= [-10,10],steps= 100)
# Simulate the haplotype data sets for the populations drawn above.
SequenceStore, Fst_windows = Gen_samples(
    Pops,
    Sizes,
    vector_lib,
    prior='sinusoid',
    range_diff=[-10, 10],
    steps=40,
)

...



invalid value encountered in double_scalars



Done.


In [9]:
### Sanity check: plot the Fst values of the simulated data sets.
plot_GenFst(Fst_windows, Npops, Chr=1)

## Statistics on simulated data sets.

### i. AMOVA

In [2]:

### Perform Distance and association analysis on the data sets generated

### Define reference and admixed associations:
### for the purpose of this analysis exploration will be driven by
### structure distance profiles. Since KDE will be used for exploration, 
### a set of accessions can be analysed that do not contribute to 
### distance profiles.

# NOTE(review): this cell reads Sizes and SequenceStore from earlier cells;
# the captured NameError below shows it was run before they existed.
admx_N= 50
# Random sample of haplotype indices drawn from all populations.
admx_indx= np.random.choice(range(sum(Sizes)),admx_N,replace= False)

# Indices that were not drawn above (not used again in this cell).
ref_indx= [x for x in range(sum(Sizes)) if x not in admx_indx]

# Population label of every haplotype, in the order Gen_samples stacks them.
labels_pops= np.repeat(range(len(Sizes)),Sizes)

##

# Population label -> indices of all its haplotypes.
refs_lib= {x:[i for i in range(sum(Sizes)) if labels_pops[i] == x] for x in range(len(Sizes))}

# The randomly drawn indices get the next free label.
admx_lib= {(max(refs_lib.keys()) + 1): admx_indx}


admx_lib.update(refs_lib)
Geneo= admx_lib

### Define parameters and libraries of analyses.
DIMr = 'PCA'
Bandwidth_split= 30
ncomp_local= 4
clsize= 15

# One result container per chromosome key of SequenceStore.
Results = {x:recursively_default_dict() for x in SequenceStore.keys()}

Clover= []
Coordinates= []
Clusters_coords= []

Dist_vectors= []
Struct_dist_vectors= recursively_default_dict()

Construct = recursively_default_dict()
Distances_vectors= recursively_default_dict()
PC_var= recursively_default_dict()
center_distances= []

Ref_stats= []


for CHR in SequenceStore.keys():
    print('going on CHR: '+ str(CHR))
    for c in SequenceStore[CHR].keys():
        
        print('data set: {}'.format(c))
        ### PCA and MeanShift of information from each window copied from *FM36_Galaxy.py.
        Sequences = SequenceStore[CHR][c]
               
        Sequences= np.nan_to_num(Sequences)
        
        print(Sequences.shape)
        
        #### Dimensionality reduction
        
        if DIMr == 'PCA':
            pca = PCA(n_components=ncomp_local, whiten=False,svd_solver='randomized').fit(Sequences)
            data = pca.transform(Sequences)
            PC_var[CHR][c]= [x for x in pca.explained_variance_]
        
        if DIMr == 'NMF':
            from sklearn.decomposition import NMF
            data = NMF(n_components=ncomp_local, init='random', random_state=0).fit_transform(Sequences)
        
        Accurate = []
        
        # NOTE(review): the bandwidth grid runs from min(data) to max(data);
        # if the projected data contain non-positive values, so does the grid.
        # Confirm KernelDensity accepts every candidate.
        params = {'bandwidth': np.linspace(np.min(data), np.max(data),Bandwidth_split)}
        grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
        
        ######################################
        ####### TEST global Likelihood #######
        ######################################
        # All haplotype indices listed in refs_lib, population after population.
        Focus_labels = [z for z in it.chain(*refs_lib.values())]
        
        #### Mean Shift approach
        ## from sklearn.cluster import MeanShift, estimate_bandwidth

        bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=len(Focus_labels))
        # Fall back to a fixed bandwidth when the estimate is close to zero.
        if bandwidth <= 1e-3:
            bandwidth = 0.1

        ms = MeanShift(bandwidth=bandwidth, cluster_all=False, min_bin_freq=15)
        ms.fit(data[Focus_labels,:])
        labels = ms.labels_
        

        # Cluster label -> haplotype indices; label -1 is skipped.
        Tree = {x:[Focus_labels[y] for y in range(len(labels)) if labels[y] == x] for x in [g for g in list(set(labels)) if g != -1]}
        # Keep only clusters with more than clsize members.
        Keep= [x for x in Tree.keys() if len(Tree[x]) > clsize]
        
        Tree= {x:Tree[x] for x in Keep}
        
        print('N clusters: {}, Sizes: {}'.format(len(Tree),[len(x) for x in Tree.values()]))
        SpaceX = {x:data[Tree[x],:] for x in Tree.keys()}
        
        ### Extract MScluster likelihood by sample
        
        for hill in SpaceX.keys():
            
            grid.fit(data[Tree[hill],:])
            
            # use the best estimator to compute the kernel density estimate
            kde = grid.best_estimator_
            
            # Log-density of the cluster's own members, and of every sample.
            P_dist = kde.score_samples(data[Tree[hill],:])
            Dist = kde.score_samples(data)
            P_dist= np.nan_to_num(P_dist)
            Dist= np.nan_to_num(Dist)
            # With no spread among the members' scores, flag the samples whose
            # score occurs among them; otherwise map each score through the
            # normal CDF fitted to the members' scores.
            if np.std(P_dist) == 0:
                Dist= np.array([int(Dist[x] in P_dist) for x in range(len(Dist))])
            else:
                Dist = scipy.stats.norm(np.mean(P_dist),np.std(P_dist)).cdf(Dist)
            Dist= np.nan_to_num(Dist)
            Construct[CHR][c][hill] = Dist
        
            ######################################### 
        ############# AMOVA ################
            #########################################
        
        # Positions of the samples assigned to a retained cluster, their
        # cluster labels, and the matching haplotype indices.
        Who = [x for x in range(len(labels)) if labels[x] != -1 and labels[x] in Keep]
        labels = [labels[x] for x in Who]
        Who= [Focus_labels[x] for x in Who]
        
        # NOTE(review): findPhiPT is defined in StructE_tools; presumably
        # returns the AMOVA statistic first - verify against that module.
        AMOVA,Cig = Ste.findPhiPT(Sequences[Who,:],labels,n_boot=0)
        
        ########################################################################
        ############## Distances ###############################################
        ########################################################################
        
        Dist_vectors= Ste.Distance_profiles(data,Tree,1000,Bandwidth_split)
        
        # Stored only when the helper returns something truthy.
        if Dist_vectors:
            Distances_vectors[CHR][c]= Dist_vectors
            
        Structures= Ste.Structure_profiles(data,Tree,1000,Bandwidth_split)
        if Structures:
            Struct_dist_vectors[CHR][c]= Structures
        
        ################ Other stats
        SIL = silhouette_score(data[Who,:],labels)
        
        df_between = len(Tree) - 1
        df_within = sum([len(Tree[x]) for x in Tree.keys()]) - len(Tree)
        df_total = sum([len(Tree[x]) for x in Tree.keys()]) - 1
        
        # Standardise the retained samples with the overall mean and standard
        # deviation (taken over all entries, not per component).
        Total = data[Who,:]
        mu = np.mean(Total)
        sd = np.std(Total)
        Total = (Total - mu) / sd
        
        # T: squared sum of the distances to the overall centroid, divided by
        # the number of samples. Y: sum of the squared distances.
        T = np.sqrt(((Total - Total.mean(axis=0))**2).sum(axis=1)).sum()**2 / len(Total)
        Y = (np.sqrt(((Total - Total.mean(axis=0))**2).sum(axis = 1))**2).sum()
        
        SS_between = -T
        SS_within = Y        
        
        t_s = np.empty([len(Total),0])
        
        for gp in Tree.keys():
            # Rows of Total whose cluster label is gp.
            Gp = Total[[x for x in range(Total.shape[0]) if labels[x] == gp],:]
            # NOTE(review): Total was already standardised above, so this
            # applies (x - mu) / sd a second time - confirm this is intended.
            Gp = (Gp - mu) / sd
            SS_between += (np.sqrt(((Gp - Total.mean(axis=0))**2).sum(axis = 1)).sum()**2 / len(Gp))
            SS_within -= (np.sqrt(((Gp - Total.mean(axis=0))**2).sum(axis = 1)**2).sum() / len(Gp))
            t_s = np.append(t_s,np.sqrt(((Gp - Gp.mean(axis=0))**2)).sum(axis = 1))
            
        
        SS_total = Y - T
        MS_between = SS_between / df_between
        MS_within = SS_within / df_within
        
        sqrt_center_dists= np.sqrt(((Total - Total.mean(axis=0))**2)).sum(axis = 1)
        
        # Paired t-test between the per-sample deviations from the cluster
        # centroids (t_s, appended cluster by cluster) and from the overall
        # centroid (in row order of Total).
        t_stat,p_val = scipy.stats.ttest_rel(t_s,sqrt_center_dists)
        
        F = MS_between / float(MS_within)
        N_PC= len(Tree)
        
        eta_sqrd = SS_between / SS_total
        om_sqrd = (SS_between - (df_between * MS_within))/(SS_total + MS_within)
        
        # Order matters: later cells index these entries by position.
        Results[CHR][c] = [N_PC,F,om_sqrd,MS_between,t_stat,AMOVA,SIL,len(Tree),]



NameError: name 'Sizes' is not defined

In [14]:
# Every pair of populations, in itertools.combinations order.
Fst_pairs = list(it.combinations(range(len(Sizes)), 2))

# One list of Fst values per pair, running over every chromosome and every
# data set within it.
Fst_lib = []
for pair in range(len(Fst_pairs)):
    Fst_crawl = [
        windows[key].fst[pair]
        for windows in Fst_windows.values()
        for key in windows.keys()
    ]
    Fst_lib.append(Fst_crawl)

# One row per data set: chromosome, data set key, then the entries stored in
# Results for it.
Other_stats = np.array([
    [Chr, wind, *Results[Chr][wind]]
    for Chr in Results.keys()
    for wind in Results[Chr].keys()
])

print(Other_stats.shape)

# Leading columns: [CHR,window,N_PC,F,om_sqrd,MS_between,t_stat,AMOVA,SIL]
# so column 7 is the AMOVA statistic and column 8 the silhouette score.

fig_data = []
for i in range(len(Fst_pairs)):
    fig_data.append(go.Scatter(
        x=Fst_lib[i],
        y=Other_stats[:, 7],
        mode='markers',
        name=str(Fst_pairs[i]),
    ))

layout = go.Layout(
    title='Stats',
    yaxis=dict(title='AMOVA'),
    xaxis=dict(title='Manipulated Fst'),
)

fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)

(40, 9)


In [15]:
# Scatter of column 8 of Other_stats (x, labelled SIL) against column 7
# (y, labelled AMOVA), one point per data set.
results_trace = go.Scatter(
    x=Other_stats[:, 8],
    y=Other_stats[:, 7],
    mode='markers',
    name='Results',
)
fig_data = [results_trace]

layout = go.Layout(
    title='Stats',
    yaxis=dict(title='AMOVA'),
    xaxis=dict(title='SIL'),
)

fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)

In [16]:
##### Cluster the per-cluster KDE profiles gathered in Construct.
# One row per (chromosome, data set, cluster); data sets and clusters are
# visited in sorted key order.
KDE_stats = np.array([
    Construct[Chr][wind][gp]
    for Chr in Construct.keys()
    for wind in sorted(Construct[Chr].keys())
    for gp in sorted(Construct[Chr][wind].keys())
])

n_comp = 4

pca1 = PCA(n_components=n_comp, whiten=False, svd_solver='randomized')
likelihood_feats = pca1.fit_transform(KDE_stats)
var_comps = pca1.explained_variance_ratio_

pc_report = [
    'PC{0}: {1}'.format(x + 1, round(pca1.explained_variance_ratio_[x], 3))
    for x in range(n_comp)
]
print("; ".join(pc_report))
print('features shape: {}'.format(likelihood_feats.shape))

## perform MeanShift clustering.
iu1 = np.triu_indices(KDE_stats.shape[0])

bandwidth = estimate_bandwidth(likelihood_feats, quantile=0.20)
params = {
    'bandwidth': np.linspace(np.min(likelihood_feats), np.max(likelihood_feats), 30)
}
grid = GridSearchCV(
    KernelDensity(algorithm="ball_tree", breadth_first=False),
    params,
    verbose=0,
)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=15)
ms.fit(likelihood_feats)
labels1 = ms.labels_

# Cluster label -> row positions; the label -1 is left out.
label_select = {
    y: [x for x, lab in enumerate(labels1) if lab == y]
    for y in sorted(set(labels1))
    if y != -1
}

# Centroid of every cluster and the distances between centroids.
MS_centroids = [
    likelihood_feats[members, :].mean(axis=0)
    for members in label_select.values()
]
MS_pair_dist = pairwise_distances(MS_centroids, metric='euclidean')
#MS_pair_dist= MS_pair_dist[iu_control]

# One 3D trace per cluster, drawn on the first three components.
fig_data = []
for i in label_select.keys():
    members = label_select[i]
    fig_data.append(go.Scatter3d(
        x=likelihood_feats[members, 0],
        y=likelihood_feats[members, 1],
        z=likelihood_feats[members, 2],
        type='scatter3d',
        mode="markers",
        name='Cl: {}'.format(i),
        text=['gp: {}'.format(i)] * len(members),
        marker={
            'line': {'width': 0},
            'size': 4,
            'symbol': 'circle',
            "opacity": 1,
        },
    ))

# Axis titles show the explained-variance ratio of each component.
layout = go.Layout(
    title='PCA red + MS clustering KDE profiles across windows',
    scene=Scene(
        yaxis=dict(title='{}'.format(round(var_comps[1], 3))),
        xaxis=dict(title='{}'.format(round(var_comps[0], 3))),
        zaxis=dict(title='{}'.format(round(var_comps[2], 3))),
    ),
)

fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)



PC1: 0.3; PC2: 0.24; PC3: 0.017; PC4: 0.013
features shape: (110, 4)


## Distances

In [17]:
### Converting distances to fst's: settings and accumulators.

### Pre- and post-processing switches.
Eigen = False    # weight the PCA coordinates by the explained-variance ratio
Scale = False    # scale the haplotype matrix before PCA
Center = True

MixL = False               # draw the haplotype length from length_range at every repeat
Length_increment = False   # grow the haplotype length by L_step every L_step repeats
L_step = 5
MixP = True                # draw the number of populations at every repeat
Pairs = False              # compare ratios between pairs of values instead of the values
Control_inc = True         # include the control sample when fitting the PCA
Predict = False            # predict Fst from distances with a previously fitted m_coeff, b

length_haps = vector_lib.shape[1]
length_range = [75, vector_lib.shape[1]]
length_step = 10

pop_max = 8    # upper bound on the number of populations

n_comp = 10    # components kept after PCA

Iter = 20      # repeats

N_sims = 100   # haplotypes generated per population in the unbiased scenario

#### Predictions (filled only when Predict is True).
predicted = []

#def controled_fsts(vector_lib,Eigen,length_haps,Scale,Center,N_pops,n_comp,Iter,N_sims,MixL,MixP,Pairs):
lengths_vector = []

### Control set included in the transformation: two library vectors, drawn
### with replacement, and the label of every control haplotype.
control_vecs = np.random.choice(range(vector_lib.shape[0]), 2)
control_labels = np.repeat([0, 1], N_sims)

### Control set distances.
control_even_distances = []
control_bias_distances = []

### Distances between centroids.
biased_pairwise = []
unbiased_pairwise = []
corrected_pairwise = []

### Per-component projections.
dist_PC_even = {pc: [] for pc in range(n_comp)}
dist_PC_bias = {pc: [] for pc in range(n_comp)}
dist_PC_corrected = {pc: [] for pc in range(n_comp)}

### Incremental per-component distances.
dist_increment_even = {pc: [] for pc in range(1, n_comp)}
dist_increment_bias = {pc: [] for pc in range(1, n_comp)}

### Fst values.
fst_store = []


### proceed.

# Each repeat draws a set of populations, records their pairwise Fst, and
# measures centroid distances in PCA space for an evenly sized sample and for
# an unevenly sized sample of the same populations.
for rep in range(Iter):
    
    # Number of populations: random in [3, pop_max) when MixP, else pop_max.
    if MixP:
        N_pops= np.random.choice(range(3,pop_max),1,replace= False)[0]
    else: 
        N_pops= pop_max
    
    # Haplotype length: one of the two entries of length_range when MixL.
    if MixL:
        length_haps= np.random.choice(length_range,1)[0]
    
    if Length_increment:
        length_haps= int(length_range[0] + L_step * np.floor(rep / L_step))
    
    ## Population Sizes and labels
    # Uneven sizes: distinct values in [25, 200); even sizes: N_sims each.
    bias_scheme= np.random.choice(range(25,200),N_pops,replace= False)
    unbiased_sheme= np.repeat(N_sims,N_pops)

    bias_labels= np.repeat(np.array([x for x in range(N_pops)]),bias_scheme)
    unbias_labels= np.repeat(np.array([x for x in range(N_pops)]),unbiased_sheme)

    ### triangular matrices extract.
    iu1= np.triu_indices(N_pops,1) # for centroid comparison

    iu_unbias= np.triu_indices(sum(unbiased_sheme),1)
    iu_bias= np.triu_indices(sum(bias_scheme),1)
    
    iu_control= np.triu_indices(2,1)
    
    Pops= np.random.choice(vector_lib.shape[0],N_pops,replace= False)
    print('Iter: {}, vectors selected: {}, hap length: {}'.format(rep,Pops,length_haps))
    ########## FST

    freqs_selected= vector_lib[Pops,:length_haps]
    Pairwise= Ste.return_fsts2(freqs_selected)

    #fsts_compare = scale(Pairwise.fst)
    fsts_compare= Pairwise.fst
    # With Pairs, replace the values by min/max ratios over every pair of values.
    if Pairs:
        t= fsts_compare
        fsts_compare= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]
    
    fst_store.extend(fsts_compare)

    ## lengths
    lengths_vector.extend([length_haps] * len(fsts_compare))
    
    #########################################################
    ########### PCA ####################################
    #########################################################
    ### control sample
    
    control_data= []

    for k in range(2):

        # NOTE(review): this slice is a view into vector_lib, so the two
        # clipping assignments below write into vector_lib itself.
        probs= vector_lib[control_vecs[k],:length_haps]
        probs[probs < 0] = 0
        probs[probs > 1] = 1
        m= unbiased_sheme[k]
        # probs[x] is the probability of drawing 0 at position x.
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]
        
        control_data.extend(Haps)

    control_data= np.array(control_data)

    #### generate data and perform PCA.
    data= []

    for k in range(N_pops):

        # NOTE(review): view into vector_lib, as for the control sample above.
        probs= vector_lib[Pops[k],:length_haps]
        probs[probs < 0] = 0
        probs[probs > 1] = 1
        m= unbiased_sheme[k]
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]

        data.extend(Haps)

    data1= np.array(data)

    if Scale:
        data1= scale(data1)
    
    # With Control_inc the PCA is fitted on the control and population
    # samples stacked together.
    if Control_inc:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(np.vstack([control_data,data1]))
        control_unbias_feat= pca.transform(control_data)
    else:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data1)
    
    feat_unbias= pca.transform(data1)

    if Eigen:
        feat_unbias= feat_unbias * pca.explained_variance_ratio_

    ####### centroid comparison
    #### Controls
    if Control_inc:
        control_centroids= [np.mean(control_unbias_feat[[y for y in range(control_unbias_feat.shape[0]) if control_labels[y] == z],:],axis= 0) for z in range(2)]
        control_centroids= np.array(control_centroids)

        unbias_control_dist= pairwise_distances(control_centroids,metric= 'euclidean')
        unbias_control_dist= unbias_control_dist[iu_control]

        control_even_distances.extend(unbias_control_dist)

    ####
    # Centroid of every population, then the upper-triangle pairwise distances.
    unbias_centroids= [np.mean(feat_unbias[[y for y in range(feat_unbias.shape[0]) if unbias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    unbias_centroids= np.array(unbias_centroids)

    unbias_pair_dist= pairwise_distances(unbias_centroids,metric= 'euclidean')
    unbias_pair_dist= unbias_pair_dist[iu1]
    
    
    # NOTE(review): m_coeff and b are only assigned after this loop, so
    # Predict = True needs them from an earlier run of the cell.
    if Predict:
        fst_pred= [np.exp(m_coeff*np.log(x) + b) for x in unbias_pair_dist]
        predicted.extend(fst_pred)
        #print(np.array([fst_pred,fsts_compare]).T)
    
    #unbias_pair_dist= scale(unbias_pair_dist)
    unbiased_pairwise.extend(unbias_pair_dist)
    
    #################################################
    ############## biased sample

    #### generate data and perform PCA
    data= []

    for k in range(N_pops):

        # Unlike the even sample, the row is neither sliced to length_haps
        # nor clipped here; only its first length_haps entries are read.
        probs= vector_lib[Pops[k],:]

        m= bias_scheme[k]
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]

        data.extend(Haps)

    data2= np.array(data)

    if Scale:
        data2= scale(data2)
    
    if Control_inc:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(np.vstack([control_data,data2]))
        control_bias_feat= pca.transform(control_data)
    else:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data2)
    
    feat_bias= pca.transform(data2)

    if Eigen:
        feat_bias= feat_bias * pca.explained_variance_ratio_

    #### Centroid distances
    #### Controls
    if Control_inc:
        control_centroids= [np.mean(control_bias_feat[[y for y in range(control_bias_feat.shape[0]) if control_labels[y] == z],:],axis= 0) for z in range(2)]
        control_centroids= np.array(control_centroids)

        bias_control_dist= pairwise_distances(control_centroids,metric= 'euclidean')
        bias_control_dist= bias_control_dist[iu_control]

        control_bias_distances.extend(bias_control_dist)
    
    bias_centroids= [np.mean(feat_bias[[y for y in range(feat_bias.shape[0]) if bias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    bias_centroids= np.array(bias_centroids)

    bias_pair_dist= pairwise_distances(bias_centroids,metric= 'euclidean')
    bias_pair_dist= bias_pair_dist[iu1]
    #bias_pair_dist= scale(bias_pair_dist)
    if Pairs:
        t= bias_pair_dist
        bias_pair_dist= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]
    
    biased_pairwise.extend(bias_pair_dist)

    
    

# Fst and centroid distances of the last repeat, one column each.
t = np.array([fsts_compare, unbias_pair_dist, bias_pair_dist]).T

Size = length_haps
fst_lm_range = [.02, .3]

# Entries recorded at haplotype length Size whose Fst lies inside fst_lm_range.
Lindexes = [
    x for x in range(len(lengths_vector))
    if lengths_vector[x] == Size
    and fst_store[x] >= fst_lm_range[0]
    and fst_store[x] <= fst_lm_range[1]
]

# Straight-line fit of log(Fst) on log(distance from the unevenly sized sample).
y_true = [np.log(biased_pairwise[x]) for x in Lindexes]
m_coeff, b = np.polyfit(
    y_true,
    [np.log(fst_store[x]) for x in Lindexes],
    1,
)





Iter: 0, vectors selected: [ 80  72 108  94], hap length: 150



invalid value encountered in double_scalars



Iter: 1, vectors selected: [159 138 146   6 158 118], hap length: 150
Iter: 2, vectors selected: [132 134  42  72], hap length: 150
Iter: 3, vectors selected: [ 26  72 181  93 177 165], hap length: 150
Iter: 4, vectors selected: [ 71  39 120], hap length: 150
Iter: 5, vectors selected: [ 22 116 123 184 111], hap length: 150
Iter: 6, vectors selected: [ 84  90 172  78 137], hap length: 150
Iter: 7, vectors selected: [ 46 118  23  19  27 127], hap length: 150
Iter: 8, vectors selected: [194 143 170  13  17   0  14], hap length: 150
Iter: 9, vectors selected: [168 120 177   8], hap length: 150
Iter: 10, vectors selected: [189  14  73  31  85 123  33], hap length: 150
Iter: 11, vectors selected: [ 88 123  83 129 142], hap length: 150
Iter: 12, vectors selected: [ 57  74 171  39  93], hap length: 150
Iter: 13, vectors selected: [ 45  94 167 107 199 142 143], hap length: 150
Iter: 14, vectors selected: [ 94  77 104], hap length: 150
Iter: 15, vectors selected: [163  19 172], hap length: 150


### Structure distance profiles

In [26]:
# Group sizes (number of members in the group key) to keep.
Structure_vertice = [2]

# [chromosome, data set, group] for every retained group, in dictionary order.
Combis = []
for chromo in Struct_dist_vectors:
    for winds in Struct_dist_vectors[chromo]:
        for grp in Struct_dist_vectors[chromo][winds]:
            if len(grp) not in Structure_vertice:
                continue
            Combis.append([chromo, winds, grp])

#Struct_distance_stats= np.array([y for y in it.chain(*Struct_distance_stats)])
# One profile per retained group; data sets and groups are visited in sorted
# key order, which can differ from the order of Combis.
Struct_distance_stats = np.array([
    Struct_dist_vectors[Chr][wind][gp]
    for Chr in Struct_dist_vectors.keys()
    for wind in sorted(Struct_dist_vectors[Chr].keys())
    for gp in sorted(Struct_dist_vectors[Chr][wind].keys())
    if len(gp) in Structure_vertice
])

In [27]:

n_comp = 4

# Reduce the structure distance profiles to n_comp principal components.
pcaS = PCA(n_components=n_comp, whiten=False, svd_solver='randomized')
Struct_dist_feats = pcaS.fit_transform(Struct_distance_stats)
var_compsQ = pcaS.explained_variance_ratio_

pc_report = [
    'PC{0}: {1}'.format(x + 1, round(pcaS.explained_variance_ratio_[x], 3))
    for x in range(n_comp)
]
print("; ".join(pc_report))
print('features shape: {}'.format(Struct_dist_feats.shape))

bandwidth = estimate_bandwidth(Struct_dist_feats, quantile=0.2)
params = {
    'bandwidth': np.linspace(np.min(Struct_dist_feats), np.max(Struct_dist_feats), 30)
}
grid = GridSearchCV(
    KernelDensity(algorithm="ball_tree", breadth_first=False),
    params,
    verbose=0,
)

# MeanShift clustering of the reduced profiles.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=25)
ms.fit(Struct_dist_feats)
labels1 = ms.labels_
#labels1= [int(1 in x[2]) for x in Combis]
#labels1= [len(x[2]) for x in Combis]
#print(Combis[:10])

# Cluster label -> row positions; the label -1 is left out.
label_select = {
    y: [x for x, lab in enumerate(labels1) if lab == y]
    for y in sorted(set(labels1))
    if y != -1
}

#### Plot the first 3 coordinates, one trace per cluster.
fig_data = []
for i in label_select.keys():
    members = label_select[i]
    fig_data.append(go.Scatter3d(
        x=Struct_dist_feats[members, 0],
        y=Struct_dist_feats[members, 1],
        z=Struct_dist_feats[members, 2],
        type='scatter3d',
        mode="markers",
        name='Cl: {}'.format(i),
        text=['gp: {}'.format(i)] * len(members),
        marker={
            'line': {'width': 0},
            'size': 4,
            'symbol': 'circle',
            "opacity": 1,
        },
    ))

# Axis titles show the explained-variance ratio of each component.
layout = go.Layout(
    title='PCA red + MS clustering PCA dist profiles across windows',
    scene=Scene(
        yaxis=dict(title='{}'.format(round(var_compsQ[1], 3))),
        xaxis=dict(title='{}'.format(round(var_compsQ[0], 3))),
        zaxis=dict(title='{}'.format(round(var_compsQ[2], 3))),
    ),
)

fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)



PC1: 0.894; PC2: 0.06; PC3: 0.014; PC4: 0.011
features shape: (100, 4)


In [22]:

def plot_density(selected_group):
    '''
    Plot the mean distance profile of one mean shift cluster.

    selected_group: key of label_select. The row indices stored under it
    select the profiles of Struct_distance_stats that are averaged column-wise.
    The x positions are an even grid over [0, 8], one point per profile
    column, passed through exp(m_coeff * log(x) + b); m_coeff and b are read
    from the notebook namespace.
    '''
    members= label_select[selected_group]
    Distances= Struct_distance_stats[members,:]
    print(Distances.shape)

    # Column-wise average over the profiles of the cluster.
    sum_profile= Distances.mean(axis=0)

    # Grid positions mapped onto the axis scale used for plotting.
    grid_points= np.linspace(0,8,Distances.shape[1])
    x_values= [np.exp(m_coeff * np.log(pt) + b) for pt in grid_points]

    trace= go.Scatter(x= x_values, y= sum_profile, mode= 'markers', name= 'Results')

    layout = go.Layout(
        title= 'Stats',
        yaxis= dict(title= 'density'),
        xaxis= dict(title= 'Fst')
    )

    iplot(go.Figure(data=[trace], layout=layout))


# Dropdown over the cluster labels found by mean shift.
interact(plot_density,selected_group= label_select.keys())



<function __main__.plot_density>

In [23]:
# Transposed copies of both statistics matrices. Only Novel feeds the PCA
# below; Test is not used again inside this cell.
Test= Struct_distance_stats.T
Novel= KDE_stats.T

# Number of principal components kept for the KDE profiles.
n_comp = 4

# PCA across the rows of Novel (the columns of KDE_stats).
pca1 = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
likelihood_feats = pca1.fit_transform(Novel)
var_comps= pca1.explained_variance_ratio_

print("; ".join(
    'PC{0}: {1}'.format(rank,round(ratio,3))
    for rank,ratio in enumerate(var_comps[:n_comp],1)
))
print('features shape: {}'.format(likelihood_feats.shape))


def kde_distances(gp):
    '''
    3D scatter of the KDE-profile PCA coordinates (likelihood_feats).

    gp == -1: one trace per key of Geneo, each drawn from the rows of
        likelihood_feats listed under that key.
    otherwise: gp is a key of label_select. Every row of likelihood_feats is
        drawn once and coloured by the mean of the likelihood vectors read
        from Construct for the combinations (Combis entries) of that cluster.

    Reads likelihood_feats, var_comps, Geneo, label_select, Combis, Construct
    and Sizes from the notebook namespace. Displays the figure; returns None.
    '''
    # Marker styling shared by both display modes.
    base_marker= {
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
        "opacity": 1
    }

    if gp == -1:
        apparel= Geneo
        fig_data= [go.Scatter3d(
                x = likelihood_feats[apparel[i],0],
                y = likelihood_feats[apparel[i],1],
                z = likelihood_feats[apparel[i],2],
                mode= "markers",
                name= 'Cl: {}'.format(i),
                text= ['gp: {}'.format(i) for x in apparel[i]],
                marker= dict(base_marker)
            ) for i in apparel.keys()]
    else:
        sought_after= label_select[gp]
        print(len(sought_after))
        sought_after= [Combis[x] for x in sought_after]

        # One likelihood vector per member of each selected group key.
        Likes= [Construct[el[0]][el[1]][y] for el in sought_after for y in el[2]]

        # Keep only the vectors whose length equals sum(Sizes).
        Likes= np.array([x for x in Likes if len(x) == sum(Sizes)])
        print(Likes.shape)

        if Likes.shape[0] == 0:
            # Averaging nothing would give NaN and an uncoloured plot.
            print('No likelihood vectors of length {} for group {}; nothing to plot.'.format(sum(Sizes), gp))
            return

        # Mean over the retained vectors: one colour value per position.
        Likes= np.mean(Likes, axis= 0)
        print(Likes.shape)

        # Plain dicts replace go.ColorBar, which newer plotly releases deprecate.
        fig_data= [go.Scatter3d(
                x = likelihood_feats[:,0],
                y = likelihood_feats[:,1],
                z = likelihood_feats[:,2],
                mode= "markers",
                marker= dict(
                    base_marker,
                    color= Likes,
                    colorbar= dict(title= 'ColorBar'),
                    colorscale= 'Viridis'
                )
            )]

    # Axis titles carry the explained variance ratio of the matching
    # component. The scene is a plain dict instead of the star-imported Scene.
    layout = go.Layout(
        title= 'PCA red + MS clustering KDE profiles across windows',
        scene= dict(
            yaxis= dict(title= '{}'.format(round(var_comps[1],3))),
            xaxis= dict(title= '{}'.format(round(var_comps[0],3))),
            zaxis= dict(title= '{}'.format(round(var_comps[2],3)))
        )
    )

    fig = go.Figure(data=fig_data,layout= layout)
    iplot(fig)

# -1 selects the Geneo view; label_select never holds -1 as a key.
interact(kde_distances,gp= [-1,*label_select.keys()])

PC1: 0.299; PC2: 0.242; PC3: 0.013; PC4: 0.012
features shape: (300, 4)


<function __main__.kde_distances>

In [36]:

def plot_density(selected_group):
    '''
    Distribution plot of the mean likelihood vector of one mean shift cluster.

    NOTE: an earlier cell of this notebook also defines plot_density; running
    this cell rebinds the name to the function below.

    selected_group: key of label_select. Its row indices are mapped through
    Combis to [chromosome, window, group] entries, the matching likelihood
    vectors are read from Construct, and the vectors whose length equals
    sum(Sizes) are averaged before plotting.

    Displays the figure; returns None.
    '''
    sought_after= [Combis[x] for x in label_select[selected_group]]
    Cl_number= len(sought_after)

    # One likelihood vector per member of each selected group key.
    Likes= [Construct[el[0]][el[1]][y] for el in sought_after for y in el[2]]

    # Keep only the vectors whose length equals sum(Sizes).
    Likes= np.array([x for x in Likes if len(x) == sum(Sizes)])

    if Likes.shape[0] == 0:
        # Averaging nothing gives a NaN scalar, which create_distplot rejects.
        print('No likelihood vectors of length {} for group {}; nothing to plot.'.format(sum(Sizes), selected_group))
        return

    Likes= np.mean(Likes, axis= 0)

    # group_labels is documented as a list of strings, while the keys of
    # label_select are numpy integers taken from ms.labels_.
    dense_plot=  ff.create_distplot([Likes], [str(selected_group)])
    dense_plot['layout'].update(title='<b>likelihood density. N vertices: {} .N_clusters: {}</b>'.format(Structure_vertice,Cl_number))

    iplot(dense_plot)

interact(plot_density,selected_group= list(label_select.keys()))

<function __main__.plot_density>