In [1]:
import scipy
import numpy as np
import pandas as pd
import itertools as it

from math import sin
import collections

def recursively_default_dict():
        return collections.defaultdict(recursively_default_dict)

from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import scale

from scipy.stats.stats import pearsonr 

from scipy.stats import invgamma 
from scipy.stats import beta
import matplotlib.pyplot as plt

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

init_notebook_mode(connected=True)

### Predicting Fsts, summarizing genetic structure.

In post _8. Controling for size_, we described the relation between genetic and feature space euclidian distances as a function of haplotype length and population structure.

We will now consider the immediate application of the insight produced in that post. 

The reasoning is this: we learned that for a given haplotype lengthh the relation between Fsts and euclidian genetic distances is stable. We saw that they are in fact linearly related on the logarithmic scale.

In this context, we are going to use a simple linear regression on the logarithm of euclidian distances to predict new genetic distances.



_We begin by defining some functions_

In [2]:
def return_fsts2(freq_array):
    pops= range(freq_array.shape[0])
    H= {pop: [1-(freq_array[pop,x]**2 + (1 - freq_array[pop,x])**2) for x in range(freq_array.shape[1])] for pop in range(freq_array.shape[0])}
    Store= []

    for comb in it.combinations(H.keys(),2):
        P= [sum([freq_array[x,i] for x in comb]) / len(comb) for i in range(freq_array.shape[1])]
        HT= [2 * P[x] * (1 - P[x]) for x in range(len(P))]
        per_locus_fst= [[(HT[x] - np.mean([H[p][x] for p in comb])) / HT[x],0][int(HT[x] == 0)] for x in range(len(P))]
        per_locus_fst= np.nan_to_num(per_locus_fst)
        Fst= np.mean(per_locus_fst)

        Store.append([comb,Fst])
        
    return pd.DataFrame(Store,columns= ['pops','fst'])


def local_sampling_correct(data_now,n_comp):
    pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data_now)
    feats= pca.transform(data_now)
    
    N= 50
    bandwidth = estimate_bandwidth(feats, quantile=0.15)
    params = {'bandwidth': np.linspace(np.min(feats), np.max(feats),30)}
    grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
    
    ## perform MeanShift clustering.
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=5)
    ms.fit(feats)
    labels1 = ms.labels_
    label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1))) if y != -1}

    ## Extract the KDE of each cluster identified by MS.
    Proxy_data= []

    for lab in label_select.keys():
        if len(label_select[lab]) < 3:
            continue
            
        Quanted_set= feats[label_select[lab],:]
        grid.fit(Quanted_set)

        kde = grid.best_estimator_
        Extract= kde.sample(N)
        Return= pca.inverse_transform(Extract)
        
        Proxy_data.extend(Return)
    
    Proxy_data= np.array(Proxy_data)
    
    print([len(x) for x in label_select.values()])
    pca2 = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(Proxy_data)
    var_comp= pca2.explained_variance_ratio_
    
    New_features= pca2.transform(data_now)# * var_comp
    return New_features, var_comp


##### fun with real data 
# load data


def read_refs(index_file):
    indxs = recursively_default_dict()
    
    Input = open(index_file,'r')
    for line in Input:
        line = line.split()
        indxs[int(line[0])][line[1]] = []
    
    Input.close()
    
    indxs = {gop:[x for x in indxs[gop].keys()] for gop in indxs.keys()}
    
    return indxs, [x for x in sorted(indxs.keys())]



def read_Darwin(darwin_file):
    d= 0
    Input= open(darwin_file,'r')
    Names= []
    gen= []
    for line in Input:
        line= line.rstrip().split('\t')
        
        if d== 0:
            d+= 1
            Nsnps= max([int(x) for x in line[1:]])
            continue

        art= line[1:]
        art= [[x,'0'][int(x == ' ')] for x in art]

        Names.append(line[0])

        art= [int(x) for x in art]
        #art= [[x,np.nan][int(x == 9)] for x in art]
        
        if len(art) < Nsnps:
                        art.extend([0] * (Nsnps - len(art)))
        
        art= [x / 2 for x in art]
        gen.append(art)
        d += 1
    Input.close()
    
    
    return gen, Names


### I. Generating Frequency Vectors.

We will start by generating a space of vectors. 

For each vector we will extract _L_ random samples from the **beta** distribution. We will repeat this process as we vary the mean and variance (parameters _a_ and _b_) of the distribution. The ranges along which to vary these, the number of steps and the number of vectors to generate from each combination of _a_ and _b_ can be specified at the beginning of the next block of code.

In [3]:
# Simulate frequency vectors. 
# We must first define the number of populations, the length of the haplotypes desired, and their respective population sizes
L= 300

import itertools as it
n= 200

# Vary a (beta distribution parameter).
a_range= np.linspace(1,2,20)
a_set= [i for i in a_range for _ in range(n)]

# vary b.
b_range= np.linspace(0.1,.4,20)
b_set= [i for i in b_range for _ in range(n)]

## length of haplotypes to extract.
L_set= [L] * n * 20


background_1= np.array([a_set,b_set,L_set]).T

vector_lib= []
for k in range(background_1.shape[0]):
    
    probs= beta.rvs(background_1[k,0], background_1[k,1], size=int(background_1[k,2]))
    probs[(probs > 1)]= 1
    
    
    vector_lib.append(probs)

vector_lib= np.array(vector_lib)

In [4]:
print('Number of frequency vectors of size {} generated: {}'.format(vector_lib.shape[1],vector_lib.shape[0]))


Number of frequency vectors of size 300 generated: 4000


In [5]:
## PCA on vectors simulated
n_comp = 100

pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(vector_lib)
features = pca.transform(vector_lib)# * pca.explained_variance_ratio_

print("; ".join(['PC{0}: {1}'.format(x+1,round(pca.explained_variance_ratio_[x],3)) for x in range(n_comp)]))
print('features shape: {}'.format(features.shape))

PC1: 0.015; PC2: 0.005; PC3: 0.005; PC4: 0.005; PC5: 0.005; PC6: 0.005; PC7: 0.005; PC8: 0.005; PC9: 0.005; PC10: 0.005; PC11: 0.005; PC12: 0.005; PC13: 0.005; PC14: 0.005; PC15: 0.005; PC16: 0.005; PC17: 0.005; PC18: 0.005; PC19: 0.005; PC20: 0.005; PC21: 0.005; PC22: 0.005; PC23: 0.005; PC24: 0.005; PC25: 0.005; PC26: 0.005; PC27: 0.005; PC28: 0.005; PC29: 0.005; PC30: 0.005; PC31: 0.005; PC32: 0.004; PC33: 0.004; PC34: 0.004; PC35: 0.004; PC36: 0.004; PC37: 0.004; PC38: 0.004; PC39: 0.004; PC40: 0.004; PC41: 0.004; PC42: 0.004; PC43: 0.004; PC44: 0.004; PC45: 0.004; PC46: 0.004; PC47: 0.004; PC48: 0.004; PC49: 0.004; PC50: 0.004; PC51: 0.004; PC52: 0.004; PC53: 0.004; PC54: 0.004; PC55: 0.004; PC56: 0.004; PC57: 0.004; PC58: 0.004; PC59: 0.004; PC60: 0.004; PC61: 0.004; PC62: 0.004; PC63: 0.004; PC64: 0.004; PC65: 0.004; PC66: 0.004; PC67: 0.004; PC68: 0.004; PC69: 0.004; PC70: 0.004; PC71: 0.004; PC72: 0.004; PC73: 0.004; PC74: 0.004; PC75: 0.004; PC76: 0.004; PC77: 0.004; PC78: 0.

### MRCA - Most Recent Common Ancestor.

The following block serves to tie all the populations in the vector data set together.

The random generation of frequency vectors creates vectors distinct along, assymptotically, all possible directions.

Here, we limit the number of possible directions, by creating a data set made entirely of vectors generated as described for the manipulation of genetic distances, i.e. from equally distant coordinates between two initial projections. We continue to rely on pairs of initial projections. However, here, only one projection is made to vary, while the other is chosen beforehand and remains the same. 

The result is the starshaped distribution observed in the next graph.


In [6]:
Iter= 50
target= [0,1]
stairs= 4

MRCA= np.random.choice(range(vector_lib.shape[0]),1)
calypso= []
feat= []

for inter in range(stairs):
    Pair= np.random.choice(range(vector_lib.shape[0]),2,replace= False)
    Pair[1]= MRCA
    print(Pair)
    
    coords= features[Pair,:]
    
    vector2= coords[target[1]] - coords[target[0]]
    for angle in np.linspace(-20,20,Iter):
        new_guy = coords[target[0]] + [angle / 10 * x for x in vector2]
        
        feat.append(new_guy)
        
        new_guy= pca.inverse_transform(new_guy)
        new_guy[new_guy < 0]= 0
        new_guy[new_guy > 1]= 1
        
        calypso.append(new_guy)

features= np.array(feat)
vector_lib= np.array(calypso)

[1191 3853]
[1385 3853]
[ 448 3853]
[ 701 3853]


In [7]:
## Plot vector PCA
fig_data= [go.Scatter3d(
        x = features[:,0],
        y = features[:,1],
        z = features[:,2],
        type='scatter3d',
        mode= "markers",
        text= ['a: {}; b: {}, L: {}; index = {}'.format(background_1[k,0],background_1[k,1],background_1[k,2], k) for k in range(background_1.shape[0])],
        marker= {
        'line': {'width': 0},
        'size': 2,
        'symbol': 'circle',
      "opacity": .6
      }
    )]


layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)

fig = go.Figure(data=fig_data)
iplot(fig)


## II. Load data.

The simulations performed in this post will be adjusted to the sequence length of real life examples. We extracted SNP variation at a number of loci and stored it in DataMatrix format (Darwin software).

We first load and organise our data. All the information available is in the DataMatrix file, which is useful.


In [8]:
## load real data
## This has to stay here as long as the trainning is not made a function.
DM_filename= 'Dmatrices/RC_DataMatrix.txt'
DM_filename= 'Dmatrices/qSH1_DataMatrix.txt'
DM_filename= 'Dmatrices/random1_DataMatrix.txt'
DM_filename= 'Dmatrices/bh4_DataMatrix.txt'
DM_filename= 'Dmatrices/Osc1_DataMatrix.txt'
DM_filename= 'Dmatrices/Waxy_DataMatrix.txt'

data, Names= read_Darwin(DM_filename)

ref_file= 'Complementary_data/refs_CORE.txt'
ref_lib, codes= read_refs(ref_file)

## read accession data
order_core= pd.read_csv('Complementary_data/Order_core.txt')


gen_data= np.array(data)

ref_names= [z for z in it.chain(*[ref_lib[r] for r in ref_lib.keys()])]

ref_codes= [z for z in it.chain(*[[r]*len(ref_lib[r]) for r in ref_lib.keys()])]

present= [x for x in range(gen_data.shape[0]) if Names[x] in ref_names]

color_code= [ref_codes[ref_names.index(Names[x])] for x in present]

color_indexes= {z:[present[x] for x in range(len(color_code)) if color_code[x] == z] for z in list(set(color_code))}

colors= ['blanc','red','blanc','yellow','blue','grey']

ref_names= ['blanc','Indica','black','cAus','Japonica','control']

print(gen_data.shape)


(948, 248)


In [11]:
list(set(order_core.K9_cluster))

['GJ-adm',
 'XI-1B',
 'GJ-trp',
 'GJ-sbtrp',
 'XI-adm',
 'admix',
 'GJ-tmp',
 'XI-1A',
 'XI-2',
 'cB_(Bas)',
 'cA_(Aus)',
 'XI-3']

##  III. Simulations. 


The next block of code has to be run twice. The first run will serve to extract the relation between Fsts and genetic distances for the number of SNPs at our locus of choice. We will generate a number of data sets with a varying number of populations, at varying genetic distances. For each data set we will extract euclidian distances in feature space, calculated between the centroids of the populations we are studying. The second run will make use of the distances calculated in the first run to correct our prediction of genetic distance from their euclidian equivalents.

Many of the parameters to tweak at the beginning of the next block are interesting for the study of the relation of Fst to genetic distances, but quite useless here. 

In Summary, for each iteration, the next blocks will:
- select population vectors from the base data set created in **I.**.
- calculate pairwise Fsts between the selected vectors.
- create two sets of haplotype populations from the selected vectors: one biased, where sampling number by population is a random draw between 25 and 300; one unbiased, where 50 haplotypes will be created by population.
- perform PCA on each of the generated data sets.
- Calculate pairwise centroid distances between the PCA projections of different haploid populations produced.

Refer to post _8. Controlling for Size_ for a step by step construction of the following block.

`Predict = False` for the first run.

`Predict = True` for a second run

**One of the possibilities here is to include a control pair of populations (`Control_inc= True`). These will be used to produce haplotypes that will be included in every PCA. In each new set their euclidian distances will be calculated and stored. The point is simply to provide a referencial, to confirm that the distance between the projections of two populations remains the same independently of context (the number of remaining populations will change continuously, as will sampling number).**

**Further on, these populations provide a control for the PCA transformation of real data. They will be included in that analysis, and their euclidian distance compared to their predicted distance - estimated following simulations.**


In [10]:
### Select pre and post processing measures. 
Eigen = False
Scale= False
Center= True

MixL= False # select if to mix N_Pops or not.
Length_increment= False
L_step= 5
MixP= True # select if to mix lengths or not. 
Pairs= False # select if comparing Pairs of distances or the distances themselves
Control_inc= True
Predict= True

length_haps= gen_data.shape[1]
length_range= [75,vector_lib.shape[1]]
length_step= 10

pop_max= 8 # Number of pops

n_comp= 10 # components to keep following PCA

Iter= 20 # repeats

N_sims= 100 # number of haplotypes to generate from each pop in the unbiased scenario.

#### Predict
predicted= []

#def controled_fsts(vector_lib,Eigen,length_haps,Scale,Center,N_pops,n_comp,Iter,N_sims,MixL,MixP,Pairs):
lengths_vector= []

### Control set to include in the transformation:
control_vecs= np.random.choice(range(vector_lib.shape[0]),2)
control_labels= np.repeat([0,1],N_sims)
### Control set distances
control_even_distances= []
control_bias_distances= []

### store distances between centroids
biased_pairwise= []
unbiased_pairwise= []
corrected_pairwise= []

### store PC projection:
dist_PC_even= {x:[] for x in range(n_comp)}
dist_PC_bias= {x:[] for x in range(n_comp)}
dist_PC_corrected= {x:[] for x in range(n_comp)}

### store increemental PC distances
dist_increment_even= {x:[] for x in range(1,n_comp)}
dist_increment_bias= {x:[] for x in range(1,n_comp)}    

### store fsts
fst_store= []


### proceed.

for rep in range(Iter):
    
    if MixP:
        N_pops= np.random.choice(range(3,pop_max),1,replace= False)[0]
    else: 
        N_pops= pop_max
    
    if MixL:
        length_haps= np.random.choice(length_range,1)[0]
    
    if Length_increment:
        length_haps= int(length_range[0] + L_step * np.floor(rep / L_step))
    
    ## Population Sizes and labels
    bias_scheme= np.random.choice(range(25,200),N_pops,replace= False)
    unbiased_sheme= np.repeat(N_sims,N_pops)

    bias_labels= np.repeat(np.array([x for x in range(N_pops)]),bias_scheme)
    unbias_labels= np.repeat(np.array([x for x in range(N_pops)]),unbiased_sheme)

    ### triangular matrices extract.
    iu1= np.triu_indices(N_pops,1) # for centroid comparison

    iu_unbias= np.triu_indices(sum(unbiased_sheme),1)
    iu_bias= np.triu_indices(sum(bias_scheme),1)
    
    iu_control= np.triu_indices(2,1)
    
    Pops= np.random.choice(vector_lib.shape[0],N_pops,replace= False)
    print('Iter: {}, vectors selected: {}, hap length: {}'.format(rep,Pops,length_haps))
    ########## FST

    freqs_selected= vector_lib[Pops,:length_haps]
    Pairwise= return_fsts2(freqs_selected)

    #fsts_compare = scale(Pairwise.fst)
    fsts_compare= Pairwise.fst
    if Pairs:
        t= fsts_compare
        fsts_compare= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]
    
    fst_store.extend(fsts_compare)

    ## lengths
    lengths_vector.extend([length_haps] * len(fsts_compare))
    
    #########################################################
    ########### PCA ####################################
    #########################################################
    ### control sample
    
    control_data= []

    for k in range(2):

        probs= vector_lib[control_vecs[k],:length_haps]
        probs[probs < 0] = 0
        probs[probs > 1] = 1
        m= unbiased_sheme[k]
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]
        
        control_data.extend(Haps)

    control_data= np.array(control_data)

    #### generate data and perform PCA.
    data= []

    for k in range(N_pops):

        probs= vector_lib[Pops[k],:length_haps]
        probs[probs < 0] = 0
        probs[probs > 1] = 1
        m= unbiased_sheme[k]
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]

        data.extend(Haps)

    data1= np.array(data)

    if Scale:
        data1= scale(data1)
    
    if Control_inc:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(np.vstack([control_data,data1]))
        control_unbias_feat= pca.transform(control_data)
    else:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data1)
    
    feat_unbias= pca.transform(data1)

    if Eigen:
        feat_unbias= feat_unbias * pca.explained_variance_ratio_

    ####### centroid comparison
    #### Controls
    if Control_inc:
        control_centroids= [np.mean(control_unbias_feat[[y for y in range(control_unbias_feat.shape[0]) if control_labels[y] == z],:],axis= 0) for z in range(2)]
        control_centroids= np.array(control_centroids)

        unbias_control_dist= pairwise_distances(control_centroids,metric= 'euclidean')
        unbias_control_dist= unbias_control_dist[iu_control]

        control_even_distances.extend(unbias_control_dist)

    ####
    unbias_centroids= [np.mean(feat_unbias[[y for y in range(feat_unbias.shape[0]) if unbias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    unbias_centroids= np.array(unbias_centroids)

    unbias_pair_dist= pairwise_distances(unbias_centroids,metric= 'euclidean')
    unbias_pair_dist= unbias_pair_dist[iu1]
    
    if Pairs:
        t= unbias_pair_dist
        unbias_pair_dist= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]
    
    if Predict:
        fst_pred= [np.exp(m_coeff*np.log(x) + b) for x in unbias_pair_dist]
        predicted.extend(fst_pred)
        #print(np.array([fst_pred,fsts_compare]).T)
    
    #unbias_pair_dist= scale(unbias_pair_dist)
    unbiased_pairwise.extend(unbias_pair_dist)

    ## PC-wise centroid comparison
    for PC in range(unbias_centroids.shape[1]):
        unbias_PC_dist= pairwise_distances(unbias_centroids[:,PC].reshape(-1,1),metric= 'euclidean')
        unbias_PC_dist= unbias_PC_dist[iu1]
        if Pairs:
            t= unbias_PC_dist
            unbias_PC_dist= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]
        
        dist_PC_even[PC].extend(unbias_PC_dist)
        if  PC > 0:
            unbias_increment_dist= pairwise_distances(unbias_centroids[:,:PC],metric= 'euclidean')
            unbias_increment_dist= unbias_increment_dist[iu1]
            dist_increment_even[PC].extend(unbias_increment_dist)            

    #################################################
    ############## biased sample

    #### generate data and perform PCA
    data= []

    for k in range(N_pops):

        probs= vector_lib[Pops[k],:]

        m= bias_scheme[k]
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]

        data.extend(Haps)

    data2= np.array(data)

    if Scale:
        data2= scale(data2)
    
    if Control_inc:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(np.vstack([control_data,data2]))
        control_bias_feat= pca.transform(control_data)
    else:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data2)
    
    feat_bias= pca.transform(data2)

    if Eigen:
        feat_bias= feat_bias * pca.explained_variance_ratio_

    #### Centroid distances
    #### Controls
    if Control_inc:
        control_centroids= [np.mean(control_bias_feat[[y for y in range(control_bias_feat.shape[0]) if control_labels[y] == z],:],axis= 0) for z in range(2)]
        control_centroids= np.array(control_centroids)

        bias_control_dist= pairwise_distances(control_centroids,metric= 'euclidean')
        bias_control_dist= bias_control_dist[iu_control]

        control_bias_distances.extend(bias_control_dist)
    
    bias_centroids= [np.mean(feat_bias[[y for y in range(feat_bias.shape[0]) if bias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    bias_centroids= np.array(bias_centroids)

    bias_pair_dist= pairwise_distances(bias_centroids,metric= 'euclidean')
    bias_pair_dist= bias_pair_dist[iu1]
    #bias_pair_dist= scale(bias_pair_dist)
    if Pairs:
        t= bias_pair_dist
        bias_pair_dist= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]
    
    biased_pairwise.extend(bias_pair_dist)

    ### PC-wise centroid comparison
    for PC in range(bias_centroids.shape[1]):
        bias_PC_dist= pairwise_distances(bias_centroids[:,PC].reshape(-1,1),metric= 'euclidean')
        bias_PC_dist= bias_PC_dist[iu1]
        if Pairs:
            t= bias_PC_dist
            bias_PC_dist= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]
        #bias_PC_dist= scale(bias_PC_dist)
        dist_PC_bias[PC].extend(bias_PC_dist)
        if PC > 0:
            bias_increment_dist= pairwise_distances(bias_centroids[:,:PC],metric= 'euclidean')
            bias_increment_dist= bias_increment_dist[iu1]
            #bias_PC_dist= scale(bias_PC_dist)
            dist_increment_bias[PC].extend(bias_increment_dist)

    ###############################################################"
    ################## bias correct
    ### perform MS correction on biased samples
    feat_correct,var_comp= local_sampling_correct(data2,n_comp)

    ### centroid Distances
    centroids= [np.mean(feat_correct[[y for y in range(feat_correct.shape[0]) if bias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    centroids= np.array(centroids)
    pair_dist= pairwise_distances(centroids,metric= 'euclidean')
    pair_dist= pair_dist[iu1]
    #pair_dist= scale(pair_dist)
    if Pairs:
        t= pair_dist
        pair_dist= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]

    corrected_pairwise.extend(pair_dist)

    ### PC-wise centroid comparison
    for PC in range(centroids.shape[1]):
        corrected_PC_dist= pairwise_distances(centroids[:,PC].reshape(-1,1),metric= 'euclidean')
        corrected_PC_dist= corrected_PC_dist[iu1]
        #corrected_PC_dist= scale(corrected_PC_dist)
        if Pairs:
            t= corrected_PC_dist
            corrected_PC_dist= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]

        dist_PC_corrected[PC].extend(corrected_PC_dist)
    
    

t= np.array([
fsts_compare,
unbias_pair_dist,
bias_pair_dist,
pair_dist
]).T



####

Size= gen_data.shape[1]
fst_lm_range= [.02,.3]

Lindexes= [x for x in range(len(lengths_vector)) if lengths_vector[x] == Size and fst_store[x] >= fst_lm_range[0] and fst_store[x] <= fst_lm_range[1]]
y_true= [np.log(biased_pairwise[x]) for x in Lindexes]
m_coeff,b= np.polyfit(y_true,[np.log(fst_store[x]) for x in Lindexes],1)



Iter: 0, vectors selected: [ 21 163  31], hap length: 248



invalid value encountered in double_scalars



[255, 109]
Iter: 1, vectors selected: [133 153  58 195  72  36], hap length: 248



invalid value encountered in double_scalars



[238, 165, 160]
Iter: 2, vectors selected: [  5 196 146  45 102  92 152], hap length: 248



invalid value encountered in double_scalars



[231, 176, 152, 104]
Iter: 3, vectors selected: [139  28 110  12], hap length: 248



invalid value encountered in double_scalars



[181, 176, 68]
Iter: 4, vectors selected: [  0 105 175 101 181 138], hap length: 248



invalid value encountered in double_scalars



[238, 215, 146]
Iter: 5, vectors selected: [136 156 162 147  16], hap length: 248



invalid value encountered in double_scalars



[266, 228, 177]
Iter: 6, vectors selected: [117  38 189  19 102  41 103], hap length: 248



invalid value encountered in double_scalars



[354, 342, 83]
Iter: 7, vectors selected: [198  59 147  24 130], hap length: 248



invalid value encountered in double_scalars



[142, 122, 60]
Iter: 8, vectors selected: [ 42  58 180  17  63  76], hap length: 248



invalid value encountered in double_scalars



[285, 217, 141]
Iter: 9, vectors selected: [137  85   8  23  74 144  18], hap length: 248



invalid value encountered in double_scalars



[353, 273]
Iter: 10, vectors selected: [ 67  40 133  70  92 135], hap length: 248



invalid value encountered in double_scalars



[495, 187]
Iter: 11, vectors selected: [142 187 189], hap length: 248



invalid value encountered in double_scalars



[281, 1, 1]
Iter: 12, vectors selected: [  2 138  95 131 170], hap length: 248



invalid value encountered in double_scalars



[330, 191, 88]
Iter: 13, vectors selected: [157 142 168 106   5  31 109], hap length: 248



invalid value encountered in double_scalars



[317, 195, 175, 104]
Iter: 14, vectors selected: [110   6  59  63], hap length: 248



invalid value encountered in double_scalars



[220, 157, 126]
Iter: 15, vectors selected: [196  37 120  45 101], hap length: 248



invalid value encountered in double_scalars



[170, 153]
Iter: 16, vectors selected: [ 63  93 168 186], hap length: 248



invalid value encountered in double_scalars



[192, 163, 42]
Iter: 17, vectors selected: [ 85  77 115 170 114], hap length: 248



invalid value encountered in double_scalars



[279, 252, 48]
Iter: 18, vectors selected: [136  34 184 148 186], hap length: 248



invalid value encountered in double_scalars



[410]
Iter: 19, vectors selected: [118 145  99], hap length: 248



invalid value encountered in double_scalars



[208, 54]


## IV. Data analysis


We will first confirm that our prediction of genetic distances from PCA euclidian distances is reasonable.

In [11]:
## lm fit
## lm_fit
Size= gen_data.shape[1]
fst_lm_range= [.02,.3]

Lindexes= [x for x in range(len(lengths_vector)) if lengths_vector[x] == Size and fst_store[x] >= fst_lm_range[0] and fst_store[x] <= fst_lm_range[1]]
y_true= [np.log(biased_pairwise[x]) for x in Lindexes]
m_coeff,b= np.polyfit(y_true,[np.log(fst_store[x]) for x in Lindexes],1)



if Predict:
    fig_data= [go.Scatter(
        x= [x for x in fst_store],
        y= [x for x in predicted],
        mode= 'markers'
        )
    ]

    layout = go.Layout(
        title= 'Predicted versus True Fsts, based on euclidian distances',
        yaxis=dict(
            title='predicted fst'),
        xaxis=dict(
            title='true fst')
    )

    fig= go.Figure(data=fig_data, layout=layout)
    iplot(fig)
    
    m_coeff_II,b_II= np.polyfit([x for x in predicted],[x for x in fst_store],1)


In [13]:
if Predict:
    fig_data= [go.Scatter(
        x= [m_coeff_II * np.exp(m_coeff*np.log(x) + b) + b_II for x in unbiased_pairwise],
        y= [x for x in fst_store],
        mode= 'markers'
        )
    ]

    layout = go.Layout(
        title= 'Predicted versus True Fsts, based on euclidian distances',
        yaxis=dict(
            title='predicted fst'),
        xaxis=dict(
            title='true fst')
    )

    fig= go.Figure(data=fig_data, layout=layout)
    iplot(fig)
    

**Fig. True and predicted Fsts.**

### Control verification

In [14]:
### Distribution of feature space distances between control populations for even and biased scenarios
from sklearn.neighbors import KernelDensity

###
lengths_pallette= [150]
include= [x for x in range(len(fst_store)) if fst_store[x] <= 1 and fst_store[x] >= 0 and lengths_vector[x] in lengths_pallette]
###

X_plot = np.linspace(min(control_bias_distances) - 2, max(control_bias_distances) + 2, 1000)

kde = KernelDensity(kernel='gaussian', bandwidth=0.05).fit(np.array(control_bias_distances).reshape(-1,1))

log_dens = kde.score_samples(X_plot.reshape(-1,1))

fig_roost_dens= [go.Scatter(x=X_plot, y=np.exp(log_dens), 
                            mode='lines', fill='tozeroy', name= 'Biased senarios',
                            line=dict(color='blue', width=2))]
##
X_plot = np.linspace(min(control_even_distances) - 2, max(control_even_distances) + 2, 1000)

kde = KernelDensity(kernel='gaussian', bandwidth=0.05).fit(np.array(control_even_distances).reshape(-1,1))

log_dens = kde.score_samples(X_plot.reshape(-1,1))

fig_roost_dens.append(go.Scatter(x=X_plot, y=np.exp(log_dens), 
                            mode='lines', fill='tozeroy', name= 'even scenarios (n= {})'.format(N_sims),
                            line=dict(color='red', width=2)))

##

layout= go.Layout(
    title= 'PCA distances between control populations across iterations.'
)

fig = go.Figure(data=fig_roost_dens, layout= layout)
iplot(fig)

**Fig. Control distances.** Euclidian distances between control populations across data sets with varying population number and sampling schemes.

### Fst and euclidian genetic distances

In [30]:
t= np.array([
    unbiased_pairwise,
    biased_pairwise,
    corrected_pairwise
]).T

pearsons= [pearsonr(fst_store,t[:,x])[0] for x in range(t.shape[1])]

lengths_indexes= {z:[x for x in range(len(lengths_vector)) if lengths_vector[x] == z] for z in list(set(lengths_vector))}

lengthy= lengths_indexes.keys()
fig_data= [go.Scatter(
    x= [fst_store[x] for x in lengths_indexes[i]],
    y= [t[x,0] for x in lengths_indexes[i]],
    mode= 'markers',
    marker= dict(
        color= i,
        opacity= .6
    ),
    name= 'L: {}, r: {}'.format(str(i),round(pearsonr([fst_store[x] for x in lengths_indexes[i]],[t[x,0] for x in lengths_indexes[i]])[0],3))
    ) for i in lengthy
]

layout = go.Layout(
    title= 'PCA distances against fst across sampling scenarios; Npops= {},(mixed={})'.format(pop_max,str(MixP)),
    yaxis=dict(
        #range= [0,9],
        title='feature space distances'),
    xaxis=dict(
        #range= [0,.4],
        title='Fst')
)

fig= go.Figure(data=fig_data, layout=layout)
iplot(fig)

**Fig. Genetic versus euclidian distances.** Fsts versus euclidian distances calculated in PCA feature space across sampling schemes and population number for the number of bi-allelic markers found at the selected locus.

## Real data

In this section we apply the relation learned during the preceding simulation step to the study of genetic structure at the chosen locus.

i. We will first perform PCA on the data matrix loaded, together with haplotypes from the two control populations. The data matrix came with labels and individual IDs. These are displayed in the first graph.

ii. We then perform Mean Shift unsupervised clustering to the real data in feature space. MeanShift is a clustering algorithm that relies on the KDE estimate of observed data to identify peaks, to which it assigns observations based on proximity ( ** ). The output of this clusterisation is displayed on the second graph.

iii. For the third part the user is first asked to chose a set of MeanShift clusters. The pairwise distances between the centroids of the chosen clusters will be calculated and their respective Fsts inferred from the relation shown in the above plot. The clusters chosen and the vectors connecting are plotted, annotated with their respective distances.



In [31]:
### Build control data set with apropriate distances.
length_haps= gen_data.shape[1]

Control_inc= True
### control sample
control_n= 50

control_data= []
for k in range(2):

    probs= vector_lib[control_vecs[k],:length_haps]
    probs[probs < 0] = 0
    probs[probs > 1] = 1
    m= control_n
    Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]

    control_data.extend(Haps)

control_data= np.array(control_data)

## Perform PCA with or without including control populations.


if Control_inc:
    pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(np.vstack([gen_data,control_data]))
    feat_unbias= pca.transform(np.vstack([gen_data,control_data]))
    color_indexes[-1]= range(gen_data.shape[0], gen_data.shape[0] + control_n * 2)
    var_comps= pca.explained_variance_ratio_
else:
    pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(np.vstack([gen_data,control_data]))
    feat_unbias= pca.transform(np.vstack([gen_data,control_data]))
    color_indexes[-1]= range(gen_data.shape[0], gen_data.shape[0] + control_n * 2)
    var_comps= pca.explained_variance_ratio_


fig_data= [go.Scatter3d(
        x = feat_unbias[color_indexes[i],0],
        y = feat_unbias[color_indexes[i],1],
        z = feat_unbias[color_indexes[i],2],
        type='scatter3d',
        mode= "markers",
        name= ref_names[i],
        text= ['ID: {}, gp: {}'.format(Names[x], ref_names[i]) for x in color_indexes[i]],
        marker= {
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
        'color': colors[i],
        "opacity": 1
      }
    ) for i in [x for x in color_indexes.keys() if x >= 0]]

fig_data.append(go.Scatter3d(
    x = feat_unbias[color_indexes[-1],0],
    y = feat_unbias[color_indexes[-1],1],
    z = feat_unbias[color_indexes[-1],2],
    type='scatter3d',
    mode= "markers",
    name= ref_names[-1],
    #text= ['a: {}; b: {}, L: {}; index = {}'.format(background_1[k,0],background_1[k,1],background_1[k,2], k) for k in range(background_1.shape[0])],
    marker= {
    'line': {'width': 0},
    'size': 4,
    'symbol': 'circle',
    'color': colors[-1],
    "opacity": .3
  }
)
)


layout = go.Layout(
    title= 'Waxy locus',
    scene= Scene(
    yaxis=dict(
        title='{}'.format(round(var_comps[1],3))),
    xaxis=dict(
    title= '{}'.format(round(var_comps[0],3))),
    zaxis=dict(
    title= '{}'.format(round(var_comps[2],3))))
)


fig = go.Figure(data=fig_data,layout= layout)
iplot(fig)


**Fig. Genetic structure** at locus of choice.

In [17]:
## Distance between controls:


control_labels= {0: range(gen_data.shape[0],gen_data.shape[0] + control_n),
                1: range(gen_data.shape[0] + control_n,gen_data.shape[0] + control_n * 2)}
control_centroids= [np.mean(feat_unbias[control_labels[z],:],axis= 0) for z in control_labels.keys()]
control_centroids= np.array(control_centroids)

unbias_control_dist= pairwise_distances(control_centroids,metric= 'euclidean')
unbias_control_dist= round(unbias_control_dist[iu_control][0],2)
unbias_control_fst= m_coeff_II * (m_coeff* np.exp(np.log(unbias_control_dist) + b)) + b_II
print(unbias_control_dist)


#### MeanShift clusters

N= 50
bandwidth = estimate_bandwidth(feat_unbias, quantile=0.15)
params = {'bandwidth': np.linspace(np.min(feat_unbias), np.max(feat_unbias),30)}
grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)

## perform MeanShift clustering.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=25)
ms.fit(feat_unbias[present,:])
labels1 = ms.labels_
label_select = {y:[present[x] for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1))) if y != -1}

MS_centroids= [np.mean(feat_unbias[label_select[z],:],axis= 0) for z in label_select.keys()]
MS_pair_dist= pairwise_distances(MS_centroids,metric= 'euclidean')
MS_pair_dist= MS_pair_dist[iu_control]



fig_data= [go.Scatter3d(
        x = feat_unbias[label_select[i],0],
        y = feat_unbias[label_select[i],1],
        z = feat_unbias[label_select[i],2],
        type='scatter3d',
        mode= "markers",
        name= 'Cl: {}'.format(i),
        text= ['ID: {}, gp: {}'.format(Names[x], i) for x in label_select[i]],
        marker= {
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
        "opacity": 1
      }
    ) for i in label_select.keys()]

fig_data.append(go.Scatter3d(
    x = feat_unbias[color_indexes[-1],0],
    y = feat_unbias[color_indexes[-1],1],
    z = feat_unbias[color_indexes[-1],2],
    type='scatter3d',
    mode= "markers",
    name= ref_names[-1],
    #text= ['a: {}; b: {}, L: {}; index = {}'.format(background_1[k,0],background_1[k,1],background_1[k,2], k) for k in range(background_1.shape[0])],
    marker= {
    'line': {'width': 0},
    'size': 4,
    'symbol': 'circle',
    'color': colors[-1],
    "opacity": .3
  }
)
)


layout = go.Layout(
    title= 'Control dist: {}, Exp. dist: {}, pred. Fst: {}'.format(unbias_control_dist, 
                                                                   round(np.mean(control_bias_distances),2),
                                                                   round(unbias_control_fst,2)),
    scene= Scene(
    yaxis=dict(
        title='{}'.format(round(var_comps[1],3))),
    xaxis=dict(
    title= '{}'.format(round(var_comps[0],3))),
    zaxis=dict(
    title= '{}'.format(round(var_comps[2],3))))
)


fig = go.Figure(data=fig_data,layout= layout)
iplot(fig)



1.27


**Fig. Mean Shift clustering of PCA projections** of real data at locus of choice.

Identify accessions at MS cluster of choice.

In [28]:
[len(x) for x in label_select.values()] 

OC= 3
voices= [Names[x] for x in label_select[OC]]
focus_core= [x for x in range(order_core.shape[0]) if order_core.ID[x] in voices]

order_target= order_core.loc[focus_core,:]
order_target


Unnamed: 0,ID,NAME,COUNTRY,REGION,sNMF_K3,Jap_K4,K9_cluster,Initial_subpop,genoIndex
233,IRIS_313-8305,URAIBOOL,India,As2a,0,1,XI-2,ind2,768
282,IRIS_313-9384,BARIK_KUDI,India,As2a,0,1,XI-2,ind2,1100
413,IRIS_313-11310,ARC_15929,India,As2e,0,1,XI-2,ind2,2192
440,IRIS_313-11638,NCS_331,India,As2a,0,1,XI-2,ind2,2468
443,IRIS_313-11646,NCS_771_A,India,As2a,0,1,XI-2,ind2,2476
563,CX368,N22,India,As2a,1,1,cA_(Aus),aus,495
564,IRIS_313-8321,BHADOIA_303,Bangladesh,As2c,1,1,cA_(Aus),aus,774
566,IRIS_313-8410,JABOR_SAIL,Bangladesh,As2c,1,1,cA_(Aus),aus,802
570,IRIS_313-8963,BATHURI,Bangladesh,As2c,1,1,cA_(Aus),aus,973
573,IRIS_313-9449,JAMBALI,Pakistan,As1,1,1,cA_(Aus),aus,1123


At the top of the next block, select the clusters between which to calculate distances.

In [32]:
#### Making it interesting:
Vertices= [0,1,2]

fig_vertix= []
d= 0

for pair in it.combinations(Vertices,2):
    coordinates= [np.mean(feat_unbias[label_select[z],:],axis= 0) for z in pair]
    coordinates= np.array(coordinates)
    
    MS_pair_dist= pairwise_distances(coordinates,metric= 'euclidean')
    MS_pair_dist= MS_pair_dist[iu_control][0]
    MS_pair_fst= m_coeff_II * np.exp(m_coeff * np.log(MS_pair_dist) + b) + b_II
    
    fig_vertix.append(go.Scatter3d(
    x= coordinates[:,0],
    y= coordinates[:,1], 
    z= coordinates[:,2],
    text= ['group: {}'.format(x) for x in pair], 
    name= 'pred Fst {}: {}'.format(pair, round(MS_pair_fst,2)),
    marker=dict(
        size=5,
        color= '#1f77b4'
    ),
    line=dict(
        color= d,
        width= 2,
    )))
    
    d += 1


fig_vertix.extend([go.Scatter3d(
x= feat_unbias[label_select[i],0],
y= feat_unbias[label_select[i],1], 
z= feat_unbias[label_select[i],2],
mode= 'markers',
name= 'Cl: {}'.format(i),
marker=dict(
    size=5,
    color= '#1f77b4'
)
) for i in Vertices])


layout = go.Layout(
    title= 'Waxy. control dist: {}, pred. Fst: {}'.format(round(unbias_control_dist,2),round(unbias_control_fst,2)
),
    scene= Scene(
    yaxis=dict(
        showgrid= False,
        title='{}'.format(round(var_comps[1],3))),
    xaxis=dict(
        showgrid= False,
        title= '{}'.format(round(var_comps[0],3))),
    zaxis=dict(
        showgrid= False,
        title= '{}'.format(round(var_comps[2],3))))
)


fig = go.Figure(data=fig_vertix,layout= layout)
iplot(fig)


**Fig. Genetic structure summary** Genetic distances between clusters selected are represented as edges.

## Under construction..

In [29]:
#Osc1_coords
#Osc1_clusters
#Waxy_coords
#Waxy_clusters

Seneca= {'Waxy':Waxy_coords,'Osc1':Osc1_coords}

series_gen= ['Waxy','Osc1']

fig_scale= []

for gen in series_gen:
    
    for gp in color_indexes.keys():
        if gp != -1:
            fig_scale.append(go.Scatter3d(
                    x = Seneca[gen][color_indexes[gp],0] + 6 * series_gen.index(gen),
                    y = Seneca[gen][color_indexes[gp],1],
                    z = Seneca[gen][color_indexes[gp],2],
                    type='scatter3d',
                    mode= "markers",
                    name= gp,
                    #text= ['ID: {}, gp: {}'.format(Names[x], ref_names[gp]) for x in color_indexes[gp]],
                    marker= {
                    'line': {'width': 0},
                    'size': 4,
                    'symbol': 'circle',
                    'color': ['blue','red'][series_gen.index(gen)],
                    "opacity": 1
                  }
                ))

layout = go.Layout(
    title= 'Control dist: {}, Exp. dist: {}, pred. Fst: {}'.format(unbias_control_dist, 
                                                                   round(np.mean(control_bias_distances),2),
                                                                   round(unbias_control_fst,2)),
    scene= Scene(
    yaxis=dict(
        title='{}'.format(round(var_comps[1],3))),
    xaxis=dict(
    title= '{}'.format(round(var_comps[0],3))),
    zaxis=dict(
    title= '{}'.format(round(var_comps[2],3))))
)


fig = go.Figure(data=fig_scale)#,layout= layout)
iplot(fig)


NameError: name 'Waxy_coords' is not defined

### gp