In [4]:
import allel
import numpy as np
import pandas as pd
from itertools import combinations

In [5]:
# Load the encoded SNP table. header=0 consumes the file's own header row and
# names= supplies the column labels used throughout the notebook.
# NOTE(review): the preview shows an extra unnamed leading column; presumably
# the file has three columns and pandas uses the first as the index - confirm.
encoded = pd.read_csv(
    "encoded.csv",
    header=0,
    names=['rsID', 'sequence'],
)
encoded.head()

Unnamed: 0,rsID,sequence
0,rs577013928,ttttttttttttttttttttttttttttttttttttttttttttt...
1,rs565082899,ttttttttttttttttttttttttttttttttttttttttttttt...
2,rs540382744,ttttttttttttttttttttttttttttttttttttttttttttt...
3,rs573244332,ttttttttttttttttttttttttttttttttttttttttttttt...
4,rs539162497,ttttttttttttttttttttttttttttttttttttttttttttt...


In [6]:
def makeArray(strings):
    """Decode encoded variant strings into per-population genotype lists.

    Every character of an encoded string stands for one sample and carries
    both the sample's population and its genotype:

        GBR: e f g h    PEL: i j k l    ESN: m n o p
        BEB: q r s t    CHE: u v w x

    Within each population the four letters mean, in order,
    '1|0', '1|1', '0|1' and '0|0'.

    Parameters
    ----------
    strings : iterable of str
        One encoded string per variant. Leading and trailing whitespace is
        stripped before decoding.

    Returns
    -------
    tuple of five lists
        ``(bebG, cheG, esnG, gbrG, pelG)``. Each list holds one entry per
        input string; an entry is a list of ``[allele1, allele2]`` integer
        pairs, one per sample of that population, in the order the samples
        occur in the string. A population with no samples in a given string
        contributes an empty list for that variant, so the five lists always
        stay aligned with the input.

    Raises
    ------
    KeyError
        If a string contains a character outside ``e``..``x``.
    """
    # Letters of each population, listed in the genotype order of allele_pairs.
    population_letters = {
        'BEB': 'qrst',
        'CHE': 'uvwx',
        'ESN': 'mnop',
        'GBR': 'efgh',
        'PEL': 'ijkl',
    }
    allele_pairs = ((1, 0), (1, 1), (0, 1), (0, 0))

    # Single lookup: letter -> (population, allele pair).
    decode = {
        letter: (population, pair)
        for population, letters in population_letters.items()
        for letter, pair in zip(letters, allele_pairs)
    }

    genotypes = {population: [] for population in population_letters}

    for string in strings:
        # Start every variant with all five populations so that a population
        # absent from this string cannot shift the others out of position.
        variant = {population: [] for population in population_letters}
        for c in string.strip():
            population, pair = decode[c]
            # Fresh list per sample so callers can mutate results safely.
            variant[population].append(list(pair))
        for population, calls in variant.items():
            genotypes[population].append(calls)

    return (genotypes['BEB'], genotypes['CHE'], genotypes['ESN'],
            genotypes['GBR'], genotypes['PEL'])

In [7]:
# Decode rows 100-199 (100 variants) of the encoded table into per-population genotype lists.
# makeArray iterates over its argument, so a plain list [snp1, snp2, snp3, ...] works as well.
g = makeArray(encoded['sequence'].iloc[100:200])

# unpack the genotype lists, one per population
bebG, cheG, esnG, gbrG, pelG = g

# haplotype unpacking is disabled: makeArray returns only the genotype lists, so no `h` exists here
# bebH, cheH, esnH, gbrH, pelH = h

In [15]:
def calc_moving_hudson_fst(strings, size=15, step=15):
    """Compute windowed Hudson's Fst for every pair of the five populations.

    Parameters
    ----------
    strings : iterable of str
        Encoded variant strings, passed unchanged to ``makeArray``.
    size : int, optional
        Window size in number of variants, forwarded to
        ``allel.moving_hudson_fst``. Defaults to 15.
    step : int, optional
        Number of variants between window starts, forwarded to
        ``allel.moving_hudson_fst``. Defaults to 15.

    Returns
    -------
    dict
        Maps each population pair, e.g. ``('bebG', 'cheG')``, to the array
        returned by ``allel.moving_hudson_fst`` for that pair. Pairs appear
        in the order beb, che, esn, gbr, pel.
    """
    labels = ('bebG', 'cheG', 'esnG', 'gbrG', 'pelG')

    # Count alleles once per population; every population takes part in four
    # pairs, so counting inside the pair loop would repeat the same work.
    allele_counts = {
        label: allel.GenotypeArray(calls).count_alleles()
        for label, calls in zip(labels, makeArray(strings))
    }

    FSTs = {}
    for first, second in combinations(labels, 2):
        FSTs[(first, second)] = allel.moving_hudson_fst(
            allele_counts[first], allele_counts[second], size, step=step
        )

    return FSTs

In [16]:
# Windowed Fst for rows 0-100 (101 variants); the result dict is displayed as the cell output.
calc_moving_hudson_fst(encoded['sequence'].iloc[0:101])

{('bebG',
  'cheG'): array([0.03543337, 0.0315438 , 0.00944336, 0.04456595, 0.02946088,
        0.02491669]),
 ('bebG',
  'esnG'): array([0.02142849, 0.05582269, 0.11426458, 0.32566496, 0.13148831,
        0.15959616]),
 ('bebG',
  'gbrG'): array([0.06790303, 0.00856417, 0.02699473, 0.00512476, 0.00028977,
        0.16291702]),
 ('bebG',
  'pelG'): array([0.01733013, 0.03025453, 0.02291235, 0.10282672, 0.12648646,
        0.07740511]),
 ('cheG',
  'esnG'): array([0.05810838, 0.06303388, 0.19060241, 0.29045066, 0.15537565,
        0.07898431]),
 ('cheG',
  'gbrG'): array([0.19165262, 0.07118688, 0.07546849, 0.06820209, 0.04661426,
        0.07510459]),
 ('cheG',
  'pelG'): array([0.00366681, 0.01947445, 0.07142241, 0.11278983, 0.15059607,
        0.01258583]),
 ('esnG',
  'gbrG'): array([0.09610099, 0.0440949 , 0.03436135, 0.28341565, 0.09397178,
        0.01500246]),
 ('esnG',
  'pelG'): array([0.02068164, 0.04520733, 0.04398022, 0.25958801, 0.24068202,
        0.0321381 ]),
 ('gbrG',
