In [1]:
import numpy as np
import pandas as pd
import allel
import math

In [2]:
encoded = pd.read_csv("encoded.csv", header=0, names=['rsID', 'sequence'])
encoded.head()

Unnamed: 0,rsID,sequence
0,rs577013928,ttttttttttttttttttttttttttttttttttttttttttttt...
1,rs565082899,ttttttttttttttttttttttttttttttttttttttttttttt...
2,rs540382744,ttttttttttttttttttttttttttttttttttttttttttttt...
3,rs573244332,ttttttttttttttttttttttttttttttttttttttttttttt...
4,rs539162497,ttttttttttttttttttttttttttttttttttttttttttttt...


In [3]:
def makeArray(strings):
    
    # initialize sample array with empty list
    bebG, cheG, esnG, gbrG, pelG = [], [], [], [], []
    bebH, cheH, esnH, gbrH, pelH = [], [], [], [], []
    
    # swap the keys and values in the dictionary below
    nested_dictionary = {'GBR':{'e':'a', 'f':'b', 'g':'c', 'h': 'd'},
                         'PEL':{'i':'a', 'j':'b', 'k':'c', 'l': 'd'},
                         'ESN':{'m':'a', 'n':'b', 'o':'c', 'p': 'd'},
                         'BEB':{'q':'a', 'r':'b', 's':'c', 't': 'd'},
                         'CHE':{'u':'a', 'v':'b', 'w':'c', 'x': 'd'}}

    ginfo = {'a':'1|0', 'b':'1|1', 'c':'0|1', 'd':'0|0'}

    dic = {'e':'GBR', 'f':'GBR', 'g':'GBR', 'h':'GBR',
           'i':'PEL', 'j':'PEL', 'k':'PEL', 'l':'PEL',
           'm':'ESN', 'n':'ESN', 'o':'ESN', 'p':'ESN',
           'q':'BEB', 'r':'BEB', 's':'BEB', 't':'BEB',
           'u':'CHE', 'v':'CHE', 'w':'CHE', 'x':'CHE'}

    for string in strings:
        if isinstance(string, str):
            string = string.strip()
        genotypes = {}
        N = len(string)
         # initialize haplotype dictionary with 462 zeros for each sample
        haplotypes = {'BEB':[0]*N, 'CHE':[0]*N, 'ESN':[0]*N, 'GBR':[0]*N, 'PEL':[0]*N}

        for i, c in enumerate(string):
            # take sample as key and 0|1 etc as value
            genotypes.setdefault(dic[c], []).append(ginfo[nested_dictionary[dic[c]][c]])
            # set 1 in which position sample present
            haplotypes[dic[c]][i] = 1

        # sort dictionary alphabetically
        genotypes = dict(sorted(genotypes.items(), key = lambda x:x[0].lower()))
        # split each 0|1 etc by | and make a list,,,thus apply on each sample
        genotype_array = [[list(map(int, v.split('|'))) for v in val] for val in genotypes.values()]
        # create haplotype array
        haplotype_array = [val for val in haplotypes.values()]

        # extracting genotype array samples
        bebG.append(genotype_array[0])
        cheG.append(genotype_array[1])
        esnG.append(genotype_array[2])
        gbrG.append(genotype_array[3])
        pelG.append(genotype_array[4])
        
        # extracting haplotype array samples
        bebH.append(haplotype_array[0])
        cheH.append(haplotype_array[1])
        esnH.append(haplotype_array[2])
        gbrH.append(haplotype_array[3])
        pelH.append(haplotype_array[4])
        
    return (bebG, cheG, esnG, gbrG, pelG)

In [4]:
# passing first 3 sequences into makeArray function
g = makeArray(encoded['sequence'].iloc[0:1]) # we can pass a list inside the function as [snp1, snp2, snp3, ...]

# extract genotype array into samples
bebG, cheG, esnG, gbrG, pelG = g

# extract haplotype array into samples
# bebH, cheH, esnH, gbrH, pelH = h

In [5]:
from itertools import combinations

def calc_hudson_fst(strings):
    # passing sequences into makeArray function
    g = makeArray(strings)
    # extract genotype array into samples
    bebG, cheG, esnG, gbrG, pelG = g
    
    FSTs = {}
    
    for pair,val in zip( combinations(['bebG','cheG','esnG','gbrG','pelG'],2), combinations([bebG,cheG,esnG,gbrG,pelG],2)):
        ac1 = allel.GenotypeArray(val[0]).count_alleles()
        ac2 = allel.GenotypeArray(val[1]).count_alleles()
        num, den = allel.hudson_fst(ac1, ac2)
        fst = np.sum(num)/np.sum(den)
        FSTs.update({pair : fst})
    
    return FSTs

In [6]:
11/3

3.6666666666666665

In [7]:
11//3

3

In [8]:
# floor, ceil

math.floor(11/3)

3

In [9]:
math.ceil(11/3)

4

### Here  the FUN begins :)

In [10]:
import math
import numpy as np

numbers = list(range(20000))
# creating list of 100 integers by randomly choosing value from 0 to 20000
positions = [np.random.choice(numbers) for _ in range(100)]
print(sorted(positions))

position_dict = {}
indices = {}

for i, num in enumerate(sorted(positions)):
    # take upper integer value of num
    n = math.ceil(num/1000)
    # add the indices to the corresponding key as n
    indices.setdefault(n, []).append(i)
    # add the positions to the corresponding key as n
    position_dict.setdefault(n, []).append(num)

# sort the dictionaries
indices = dict(sorted(indices.items(), key=lambda x:x[0]))
print(indices)
position_dict = dict(sorted(position_dict.items(), key=lambda x:x[0]))
print(position_dict)

[205, 444, 598, 763, 1186, 1531, 1587, 1651, 1823, 1828, 2031, 2097, 2271, 2306, 2740, 3556, 4122, 4145, 4216, 4277, 4499, 4576, 4611, 4774, 4827, 4933, 4988, 5356, 5569, 5869, 6202, 6329, 6470, 6586, 6603, 6660, 6714, 6745, 6843, 6927, 7055, 7426, 7452, 7464, 7658, 7970, 7977, 7988, 8340, 8386, 8387, 8535, 8740, 8943, 9009, 9166, 9689, 9703, 9747, 9753, 9824, 10438, 11045, 11322, 11452, 11506, 12009, 12148, 12149, 12926, 12939, 13059, 13105, 13110, 13486, 13491, 13698, 13866, 13946, 14064, 14199, 14653, 14912, 15138, 15240, 15707, 15785, 15993, 16446, 16907, 17180, 17323, 17396, 18014, 18152, 18464, 18472, 18497, 19326, 19861]
{1: [0, 1, 2, 3], 2: [4, 5, 6, 7, 8, 9], 3: [10, 11, 12, 13, 14], 4: [15], 5: [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26], 6: [27, 28, 29], 7: [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], 8: [40, 41, 42, 43, 44, 45, 46, 47], 9: [48, 49, 50, 51, 52, 53], 10: [54, 55, 56, 57, 58, 59, 60], 11: [61], 12: [62, 63, 64, 65], 13: [66, 67, 68, 69, 70], 14: [71, 72, 73, 74,

In [11]:
fst_dict1 = {}
fst_dict2 = {}
index_positions = {}

for i, val in indices.items():
    results = calc_hudson_fst(encoded.iloc[val]['sequence'])
    #print(results)
    
    # update index_positions dictionary as {i : range} pair
    index_positions.update({i : str(val[0])+':'+str(val[-1])})
    
    # update fst_dict2 dictionary as {i : results} pair
    fst_dict2.update({i : results})
    
    for k, v in results.items():
        # nested dictionary as {pops : {index : fst_value}}
        fst_dict1.setdefault(k, {}).update({i : v})
        
print(fst_dict1)

{('bebG', 'cheG'): {1: nan, 2: nan, 3: 0.03543337016195934, 4: nan, 5: 0.03173478728221308, 6: 0.0, 7: 0.009218226361340315, 8: 0.012315589154460021, 9: 0.03920352486186563, 10: 0.07317073170731694, 11: nan, 12: 0.03225512282548355, 13: 0.0, 14: 0.02583256702309786, 15: 0.02548853462188253, 16: 0.024153937500207296, 17: nan, 18: nan, 19: 0.0, 20: 0.0012092489093636133}, ('bebG', 'esnG'): {1: nan, 2: 0.0, 3: 0.021540710717167916, 4: nan, 5: 0.055822692659454644, 6: nan, 7: 0.11583692065065801, 8: 0.0023597201262175915, 9: 0.3369898338571044, 10: 0.008121827411167525, 11: nan, 12: 0.09372912770504943, 13: 0.0, 14: 0.17329725387952882, 15: 0.19952230362401444, 16: 0.12555595702285982, 17: 0.0, 18: 0.04060913705583763, 19: -0.004189232540146793, 20: 0.11111111111111115}, ('bebG', 'gbrG'): {1: nan, 2: nan, 3: 0.06790303197805556, 4: nan, 5: 0.008564171274548556, 6: nan, 7: 0.02786516593329824, 8: -0.0003413530222385175, 9: 0.005124760878514103, 10: nan, 11: nan, 12: 0.0018063795563675869, 1

  fst = np.sum(num)/np.sum(den)


In [12]:
print(fst_dict2)

{1: {('bebG', 'cheG'): nan, ('bebG', 'esnG'): nan, ('bebG', 'gbrG'): nan, ('bebG', 'pelG'): nan, ('cheG', 'esnG'): nan, ('cheG', 'gbrG'): nan, ('cheG', 'pelG'): nan, ('esnG', 'gbrG'): nan, ('esnG', 'pelG'): nan, ('gbrG', 'pelG'): nan}, 2: {('bebG', 'cheG'): nan, ('bebG', 'esnG'): 0.0, ('bebG', 'gbrG'): nan, ('bebG', 'pelG'): nan, ('cheG', 'esnG'): 0.0, ('cheG', 'gbrG'): nan, ('cheG', 'pelG'): nan, ('esnG', 'gbrG'): 0.0, ('esnG', 'pelG'): 0.0, ('gbrG', 'pelG'): nan}, 3: {('bebG', 'cheG'): 0.03543337016195934, ('bebG', 'esnG'): 0.021540710717167916, ('bebG', 'gbrG'): 0.06790303197805556, ('bebG', 'pelG'): 0.017330127948078904, ('cheG', 'esnG'): 0.05836813036299754, ('cheG', 'gbrG'): 0.19165261895798888, ('cheG', 'pelG'): 0.0036668097083743024, ('esnG', 'gbrG'): 0.09675356717141077, ('esnG', 'pelG'): 0.02078511113580761, ('gbrG', 'pelG'): 0.14944938295224527}, 4: {('bebG', 'cheG'): nan, ('bebG', 'esnG'): nan, ('bebG', 'gbrG'): nan, ('bebG', 'pelG'): nan, ('cheG', 'esnG'): nan, ('cheG', 'g

In [13]:
print(index_positions)

{1: '0:3', 2: '4:9', 3: '10:14', 4: '15:15', 5: '16:26', 6: '27:29', 7: '30:39', 8: '40:47', 9: '48:53', 10: '54:60', 11: '61:61', 12: '62:65', 13: '66:70', 14: '71:78', 15: '79:82', 16: '83:87', 17: '88:89', 18: '90:92', 19: '93:97', 20: '98:99'}


# Function to implement, we could easily change step size here

In [14]:
def fst_dict_calc(positions, dividend=1000): 
    
    indices = {}

    for i, num in enumerate(sorted(positions)):
        
        # take upper integer value of num
        n = math.ceil(num/dividend)
        
        # add the indices to the corresponding key as n
        indices.setdefault(n, []).append(i)
    
    # sort the dictionariy
    indices = dict(sorted(indices.items(), key=lambda x:x[0]))
    
    fst_dict1 = {}
    fst_dict2 = {}
    index_positions = {}

    for i, val in indices.items():
        results = calc_hudson_fst(encoded.iloc[val]['sequence'])
        #print(results)
        
        
        # update index_positions dictionary as {i : range} pair
        index_positions.update({i : str(val[0])+':'+str(val[-1])})
        
        
        # update fst_dict2 dictionary as {i : results} pair
        fst_dict2.update({i : results})

        
        for k, v in results.items():
            
            # nested dictionary as {pops : {index : fst_value}}
            fst_dict1.setdefault(k, {}).update({i : v})

    return fst_dict1, fst_dict2, index_positions

In [15]:
# calling the function
f1, f2, ind = fst_dict_calc(positions, 5000)

In [16]:
print(f1)

{('bebG', 'cheG'): {1: 0.033965142017862875, 2: 0.020338952458864853, 3: 0.02829640374557564, 4: 0.011969472066051177}, ('bebG', 'esnG'): {1: 0.033950165902009506, 2: 0.22964449202370138, 3: 0.1439432092931027, 4: 0.104780270647522}, ('bebG', 'gbrG'): {1: 0.04513583112102886, 2: 0.019293514982147947, 3: 0.043925149596432, 4: 0.06649401774038718}, ('bebG', 'pelG'): {1: 0.022848016101287363, 2: 0.05748160747314586, 3: 0.1200093297802129, 4: 0.1675742115967348}, ('cheG', 'esnG'): {1: 0.06016570098124014, 2: 0.2416076185889834, 3: 0.14795362128785383, 4: 0.0929974027699578}, ('cheG', 'gbrG'): {1: 0.14513217793633923, 2: 0.07257541173870458, 3: 0.052244525622411726, 4: 0.029221633500032658}, ('cheG', 'pelG'): {1: 0.010662917429055646, 2: 0.08879712970015427, 3: 0.13107618191745593, 4: 0.12901670484988106}, ('esnG', 'gbrG'): {1: 0.0773255971059701, 2: 0.1920806796852942, 3: 0.09010871811293061, 4: 0.050164036520277365}, ('esnG', 'pelG'): {1: 0.030874994446445793, 2: 0.18016576094794665, 3: 0

In [17]:
print(f2)

{1: {('bebG', 'cheG'): 0.033965142017862875, ('bebG', 'esnG'): 0.033950165902009506, ('bebG', 'gbrG'): 0.04513583112102886, ('bebG', 'pelG'): 0.022848016101287363, ('cheG', 'esnG'): 0.06016570098124014, ('cheG', 'gbrG'): 0.14513217793633923, ('cheG', 'pelG'): 0.010662917429055646, ('esnG', 'gbrG'): 0.0773255971059701, ('esnG', 'pelG'): 0.030874994446445793, ('gbrG', 'pelG'): 0.10799152555853336}, 2: {('bebG', 'cheG'): 0.020338952458864853, ('bebG', 'esnG'): 0.22964449202370138, ('bebG', 'gbrG'): 0.019293514982147947, ('bebG', 'pelG'): 0.05748160747314586, ('cheG', 'esnG'): 0.2416076185889834, ('cheG', 'gbrG'): 0.07257541173870458, ('cheG', 'pelG'): 0.08879712970015427, ('esnG', 'gbrG'): 0.1920806796852942, ('esnG', 'pelG'): 0.18016576094794665, ('gbrG', 'pelG'): 0.04385200742540727}, 3: {('bebG', 'cheG'): 0.02829640374557564, ('bebG', 'esnG'): 0.1439432092931027, ('bebG', 'gbrG'): 0.043925149596432, ('bebG', 'pelG'): 0.1200093297802129, ('cheG', 'esnG'): 0.14795362128785383, ('cheG', '

In [18]:
print(ind)

{1: '0:26', 2: '27:60', 3: '61:82', 4: '83:99'}


In [19]:
encoded.iloc[[35, 36, 37]]['sequence']

35     ttttttttttttttttttttttttttttttttttttttttttttt...
36     ttttttttttttttttttttttttttttttttttttttttttttt...
37     ttttttttttttttttttttttttttttttttttttttttttttt...
Name: sequence, dtype: object