In [1]:
import allel
import numpy as np
import pandas as pd
from itertools import combinations

In [2]:
# Load the encoded SNP table. header=0 makes pandas discard the file's own
# header row and use the column names given here instead.
# NOTE(review): the rendered preview shows an extra leading column, so the CSV
# appears to hold three columns with the first one consumed as the index -
# confirm against the file.
encoded = pd.read_csv("encoded.csv", header=0, names=['rsID', 'sequence'])
# Preview the first rows to check the parse.
encoded.head()

Unnamed: 0,rsID,sequence
0,rs577013928,ttttttttttttttttttttttttttttttttttttttttttttt...
1,rs565082899,ttttttttttttttttttttttttttttttttttttttttttttt...
2,rs540382744,ttttttttttttttttttttttttttttttttttttttttttttt...
3,rs573244332,ttttttttttttttttttttttttttttttttttttttttttttt...
4,rs539162497,ttttttttttttttttttttttttttttttttttttttttttttt...


In [3]:
def makeArray(strings, return_haplotypes=False):
    """Decode letter-encoded SNP strings into per-population genotype lists.

    Every character of a string stands for one sample: the letter identifies
    both the sample's population and its genotype call.  Each population owns
    four consecutive letters which map, in order, to the calls
    1|0, 1|1, 0|1 and 0|0:

        GBR: e f g h    PEL: i j k l    ESN: m n o p
        BEB: q r s t    CHE: u v w x

    Parameters
    ----------
    strings : iterable of str
        One encoded string per variant.  Leading/trailing whitespace is
        stripped before decoding.
    return_haplotypes : bool, optional
        When True, additionally return the per-population presence masks
        (see Returns).  Defaults to False, which keeps the original
        five-tuple return value.

    Returns
    -------
    tuple
        ``(bebG, cheG, esnG, gbrG, pelG)``.  Each element holds one entry per
        input string; an entry is the list of ``[allele1, allele2]`` integer
        pairs of that population's samples, in order of appearance.  A
        population that does not occur in a string contributes an empty list
        for that string, so the five outputs always stay aligned.

        With ``return_haplotypes=True`` the result is
        ``((bebG, ...), (bebH, cheH, esnH, gbrH, pelH))`` where each ``*H``
        entry is a 0/1 list as long as the stripped string, holding 1 at the
        positions whose sample belongs to that population.

    Raises
    ------
    KeyError
        If a string contains a character outside ``e``-``x``.
    """
    # Fixed output order; never derived from which populations happen to occur.
    populations = ('BEB', 'CHE', 'ESN', 'GBR', 'PEL')
    letters = {'GBR': 'efgh', 'PEL': 'ijkl', 'ESN': 'mnop',
               'BEB': 'qrst', 'CHE': 'uvwx'}
    calls = ((1, 0), (1, 1), (0, 1), (0, 0))

    # Single lookup: letter -> (population, allele pair).
    decode = {letter: (population, call)
              for population, block in letters.items()
              for letter, call in zip(block, calls)}

    genotype_rows = {population: [] for population in populations}
    haplotype_rows = {population: [] for population in populations}

    for string in strings:
        string = string.strip()
        genotypes = {population: [] for population in populations}
        # The presence masks are only built when the caller asked for them.
        haplotypes = None
        if return_haplotypes:
            haplotypes = {population: [0] * len(string) for population in populations}

        for i, c in enumerate(string):
            population, call = decode[c]
            # Fresh list per sample so callers can mutate results safely.
            genotypes[population].append(list(call))
            if haplotypes is not None:
                haplotypes[population][i] = 1

        for population in populations:
            genotype_rows[population].append(genotypes[population])
            if haplotypes is not None:
                haplotype_rows[population].append(haplotypes[population])

    genotype_result = tuple(genotype_rows[population] for population in populations)
    if return_haplotypes:
        haplotype_result = tuple(haplotype_rows[population] for population in populations)
        return genotype_result, haplotype_result
    return genotype_result

In [4]:
# Decode the sequences at positional rows 100-199 (100 rows) with makeArray.
g = makeArray(encoded['sequence'].iloc[100:200]) # any iterable of encoded strings works, e.g. [snp1, snp2, snp3, ...]

# Unpack the per-population genotype lists (order: BEB, CHE, ESN, GBR, PEL).
bebG, cheG, esnG, gbrG, pelG = g

# NOTE(review): as called here makeArray returns only the genotype lists, so
# there is no `h` to unpack; the line below is kept for reference only.
# bebH, cheH, esnH, gbrH, pelH = h

In [18]:
def moving_tajimas_d(strings, size=50, step=1):
    """Compute Tajima's D in moving windows for each population.

    The encoded strings are decoded with ``makeArray``; each population's
    genotypes are converted to allele counts and passed to
    ``allel.moving_tajima_d``.

    Parameters
    ----------
    strings : iterable of str
        Encoded variant strings, one per variant, as accepted by ``makeArray``.
    size : int, optional
        Number of variants per window.  Defaults to 50.
    step : int, optional
        Number of variants to advance between consecutive windows.
        Defaults to 1.

    Returns
    -------
    dict
        Maps a one-element tuple holding the population label
        (``('bebG',)``, ``('cheG',)``, ``('esnG',)``, ``('gbrG',)``,
        ``('pelG',)``) to the array of windowed Tajima's D values for that
        population.
    """
    labels = ('bebG', 'cheG', 'esnG', 'gbrG', 'pelG')
    # makeArray returns the genotype lists in the same order as `labels`.
    genotypes_by_population = makeArray(strings)

    moving_Tajima_D = {}
    for label, genotypes in zip(labels, genotypes_by_population):
        allele_counts = allel.GenotypeArray(genotypes).count_alleles()
        # Keys stay one-element tuples for compatibility with existing callers.
        moving_Tajima_D[(label,)] = allel.moving_tajima_d(allele_counts, size, step=step)

    return moving_Tajima_D

In [19]:
# Windowed Tajima's D per population over positional rows 0-300 (301 rows).
moving_tajimas_d(encoded['sequence'].iloc[0:301])

{('bebG',): array([ 0.32500514,  0.35637416,  0.35637416,  0.35637416,  0.35953126,
         0.35953126,  0.35953126,  0.35953126,  0.35953126,  0.35953126,
         0.35953126,  0.2886782 ,  0.2886782 ,  0.12957927, -0.09206151,
        -0.09206151, -0.03804883, -0.03804883, -0.03804883, -0.03804883,
        -0.23072711, -0.27733768, -0.27733768, -0.23072711, -0.23072711,
        -0.23072711, -0.23072711, -0.29580138, -0.29580138, -0.29580138,
        -0.29580138, -0.41522199, -0.32652915, -0.32652915, -0.45584501,
        -0.51684477, -0.51684477, -0.36552765, -0.36552765, -0.36552765,
        -0.21394918, -0.21394918, -0.21394918, -0.00838703,  0.23119207,
         0.23119207,  0.23119207,  0.23119207, -0.00838703,  0.03257775,
         0.25690725,  0.2051532 ,  0.11552177,  0.11552177,  0.09138336,
         0.09138336,  0.09138336,  0.09138336,  0.09138336,  0.09138336,
         0.09138336,  0.09138336,  0.09138336,  0.00608213,  0.00608213,
         0.00608213, -0.33595906, -0.335