In [1]:
import numpy as np
import pandas as pd
import allel

In [2]:
# Load the encoded variants. header=0 consumes the file's own header row and
# `names` relabels the columns as 'rsID' and 'sequence'.
# NOTE(review): the preview shows an extra unnamed leading column, so the CSV
# presumably has three columns and the first one ends up as the index — confirm.
encoded = pd.read_csv("encoded.csv", header=0, names=['rsID', 'sequence'])
encoded.head()

Unnamed: 0,rsID,sequence
0,rs577013928,ttttttttttttttttttttttttttttttttttttttttttttt...
1,rs565082899,ttttttttttttttttttttttttttttttttttttttttttttt...
2,rs540382744,ttttttttttttttttttttttttttttttttttttttttttttt...
3,rs573244332,ttttttttttttttttttttttttttttttttttttttttttttt...
4,rs539162497,ttttttttttttttttttttttttttttttttttttttttttttt...


In [3]:
def makeArray(strings):
    """Decode encoded variant strings into per-population genotype and mask lists.

    Each character of a string stands for one sample. The letter identifies the
    population (e-h GBR, i-l PEL, m-p ESN, q-t BEB, u-x CHE) and, by its rank
    inside that group of four, the phased genotype (1|0, 1|1, 0|1, 0|0).

    Parameters
    ----------
    strings : iterable of str
        One encoded string per variant; surrounding whitespace is stripped.

    Returns
    -------
    tuple
        ``(genotypes, masks)``, each a 5-tuple ordered BEB, CHE, ESN, GBR, PEL.
        ``genotypes[k]`` holds, per variant, a list of ``[allele1, allele2]``
        pairs for that population's samples in order of appearance.
        ``masks[k]`` holds, per variant, a 0/1 list as long as the string with
        a 1 at every position whose character belongs to that population.

    Raises
    ------
    KeyError
        If a string contains a character outside ``e``-``x``.
    """
    # Fixed output order. Keying by population (instead of indexing a sorted
    # dict of only the populations that happened to occur) keeps every
    # population in its own slot even when a variant has no sample from it.
    populations = ('BEB', 'CHE', 'ESN', 'GBR', 'PEL')

    letters_by_population = {'GBR': 'efgh', 'PEL': 'ijkl', 'ESN': 'mnop',
                             'BEB': 'qrst', 'CHE': 'uvwx'}
    # Genotype encoded by the 1st..4th letter of each population's group.
    allele_pairs = ((1, 0), (1, 1), (0, 1), (0, 0))
    decode = {
        letter: (population, pair)
        for population, letters in letters_by_population.items()
        for letter, pair in zip(letters, allele_pairs)
    }

    genotype_rows = {population: [] for population in populations}
    mask_rows = {population: [] for population in populations}

    for string in strings:
        string = string.strip()
        genotypes = {population: [] for population in populations}
        masks = {population: [0] * len(string) for population in populations}

        for i, c in enumerate(string):
            population, pair = decode[c]
            # list(pair) gives every sample its own mutable [a1, a2] list
            genotypes[population].append(list(pair))
            masks[population][i] = 1

        for population in populations:
            genotype_rows[population].append(genotypes[population])
            mask_rows[population].append(masks[population])

    return (tuple(genotype_rows[population] for population in populations),
            tuple(mask_rows[population] for population in populations))

In [120]:
# decode rows 1..35 of `encoded` (iloc[1:36] skips row 0 and yields 35 strings)
g, h = makeArray(encoded['sequence'].iloc[1:36]) # any iterable of encoded strings works, e.g. [snp1, snp2, snp3, ...]

# unpack the per-population genotype lists (order fixed by makeArray's return)
bebG, cheG, esnG, gbrG, pelG = g

# unpack the per-population 0/1 lists returned alongside the genotypes
bebH, cheH, esnH, gbrH, pelH = h

### Hudson_Fst

###### Allele count GT Arrays

- ac1 GBR
- ac2 BEB
- ac3 PEL
- ac4 CHB
- ac5 ESN

In [124]:
def calc_hudson_fst(g1, g2):
    """Return Hudson's Fst between two populations as a ratio of sums.

    ``g1`` and ``g2`` are passed to ``allel.GenotypeArray``; their allele
    counts feed ``allel.hudson_fst`` and the per-variant numerators and
    denominators are summed before dividing.
    """
    counts_a, counts_b = (
        allel.GenotypeArray(population).count_alleles() for population in (g1, g2)
    )
    numerator, denominator = allel.hudson_fst(counts_a, counts_b)
    return np.sum(numerator) / np.sum(denominator)
     
        

In [122]:
# GBR vs BEB using the two-argument helper defined just above
calc_hudson_fst(gbrG, bebG)

0.038761426036041435

In [138]:
def calc_hudson_fst(g):
    """Return Hudson's Fst for every unordered pair of populations in ``g``.

    Parameters
    ----------
    g : sequence
        One genotype list per population, each accepted by
        ``allel.GenotypeArray`` (e.g. the first tuple returned by makeArray).

    Returns
    -------
    tuple of float
        One ratio-of-sums Fst per pair, ordered (0,1), (0,2), ..., (0,n-1),
        (1,2), ..., (n-2,n-1). For five populations this is the same
        10-tuple, in the same order, as the original hand-unrolled version.

    Notes
    -----
    This definition reuses the name of the earlier two-argument helper and
    replaces it in the kernel once this cell has run.
    """
    # Count alleles once per population instead of once per pair.
    allele_counts = [
        allel.GenotypeArray(population).count_alleles() for population in g
    ]

    fsts = []
    n_populations = len(allele_counts)
    for i in range(n_populations):
        for j in range(i + 1, n_populations):
            num, den = allel.hudson_fst(allele_counts[i], allele_counts[j])
            fsts.append(np.sum(num) / np.sum(den))
    return tuple(fsts)

In [139]:
# g is ordered BEB, CHE, ESN, GBR, PEL, so the ten values are:
# BEB-CHE, BEB-ESN, BEB-GBR, BEB-PEL, CHE-ESN, CHE-GBR, CHE-PEL, ESN-GBR, ESN-PEL, GBR-PEL
calc_hudson_fst(g)

(0.025776758769816985,
 0.06423248562245433,
 0.038761426036041435,
 0.023094224232544584,
 0.10746792936412862,
 0.12433518262849666,
 0.03191078619039487,
 0.06410325174946345,
 0.035221064391849376,
 0.0756007809810408)

In [59]:
# GBR vs BEB: build GenotypeArrays, count alleles, then ratio-of-sums Hudson Fst.
# gbrG_GA, bebG_GA, ac1 (GBR counts) and ac2 (BEB counts) are reused by later cells.
gbrG_GA = allel.GenotypeArray(gbrG)
bebG_GA = allel.GenotypeArray(bebG)
ac1 = gbrG_GA.count_alleles()
ac2 = bebG_GA.count_alleles()
num, den = allel.hudson_fst(ac1, ac2)
fst = np.sum(num) / np.sum(den)
fst

0.038761426036041435

In [60]:
# GBR vs PEL (ac3 = PEL allele counts, reused by later cells)
pelG_GA = allel.GenotypeArray(pelG)
ac1 = gbrG_GA.count_alleles()
ac3 = pelG_GA.count_alleles()
num, den = allel.hudson_fst(ac1, ac3)
fst = np.sum(num) / np.sum(den)
fst

0.0756007809810408

In [61]:
# GBR vs CHE (the population this notebook's notes call Chb/CHB); ac4 = its allele counts
cheG_GA = allel.GenotypeArray(cheG)
ac1 = gbrG_GA.count_alleles()
ac4 = cheG_GA.count_alleles()
num, den = allel.hudson_fst(ac1, ac4)
fst = np.sum(num) / np.sum(den)
fst

0.12433518262849666

In [62]:
# GBR vs ESN (ac5 = ESN allele counts, reused by later cells)
esnG_GA = allel.GenotypeArray(esnG)
ac1 = gbrG_GA.count_alleles()
ac5 = esnG_GA.count_alleles()
num, den = allel.hudson_fst(ac1, ac5)
fst = np.sum(num) / np.sum(den)
fst

0.06410325174946345

In [69]:
# BEB vs PEL (ac2 and ac3 come from the GBR comparison cells above)
num, den = allel.hudson_fst(ac2, ac3)
fst = np.sum(num) / np.sum(den)
fst

0.023094224232544584

In [70]:
# BEB vs CHE (ac2 = BEB counts, ac4 = CHE counts)
num, den = allel.hudson_fst(ac2, ac4)
fst = np.sum(num) / np.sum(den)
fst

0.025776758769816985

In [71]:
# BEB vs ESN (ac2 holds the BEB counts and ac5 the ESN counts; CHE is not involved here)
num, den = allel.hudson_fst(ac2, ac5)
fst = np.sum(num) / np.sum(den)
fst

0.06423248562245433

In [72]:
# PEL vs CHE (ac3 = PEL counts, ac4 = CHE counts)
num, den = allel.hudson_fst(ac3, ac4)
fst = np.sum(num) / np.sum(den)
fst

0.03191078619039487

In [73]:
# PEL vs ESN (ac3 = PEL counts, ac5 = ESN counts)
num, den = allel.hudson_fst(ac3, ac5)
fst = np.sum(num) / np.sum(den)
fst

0.035221064391849376

In [74]:
# CHE vs ESN (ac4 = CHE counts, ac5 = ESN counts)
num, den = allel.hudson_fst(ac4, ac5)
fst = np.sum(num) / np.sum(den)
fst

0.10746792936412862

In [8]:
# Hudson Fst in moving windows of 100 variants, using whatever ac1/ac2 currently hold.
# NOTE(review): this cell's execution count (8) predates the cells that assign
# ac1/ac2 in this listing — confirm which populations and data it was run on.
allel.moving_hudson_fst(ac1, ac2, 100)

array([0.03781759, 0.04380646, 0.01445758, 0.04797316, 0.05831046,
       0.04578263, 0.05511765, 0.02167918, 0.07797163])

### Patterson_Fst

In [14]:
# 0-based variant indices, used as stand-in positions for the windowed statistics
posi = list(range(len((gbrG_GA))))
# Patterson Fst over all variants as a ratio of sums; nansum treats NaN terms as zero
num, den = allel.patterson_fst(ac1, ac2)
All = np.nansum(num) / np.nansum(den)
All

0.043420225487391266

In [12]:
# Patterson Fst in windows of size 100 from position 1 to 1000, stepping by 100.
# NOTE(review): posi starts at 0 while start=1, so the first variant falls
# outside every window — confirm this is intended.
allel.windowed_patterson_fst(posi, ac1, ac2, 100, start=1 , stop= 1000, step= 100)


(array([0.03649384, 0.04709216, 0.01445758, 0.04497913, 0.06339513,
        0.04578263, 0.05511765, 0.02167918, 0.07797163, 0.01734574]),
 array([[   1,  100],
        [ 101,  200],
        [ 201,  300],
        [ 301,  400],
        [ 401,  500],
        [ 501,  600],
        [ 601,  700],
        [ 701,  800],
        [ 801,  900],
        [ 901, 1000]]),
 array([100, 100, 100, 100, 100, 100, 100, 100, 100,  98]))

In [17]:
# allel.moving_weir_cockerham_fst(, posi, 100)

## Haplotype Diversity
### - Garud

In [76]:
# GBR: Garud's H statistics; garud_h returns (H1, H12, H123, H2/H1)
# genotype Array
gbrG_GA = allel.GenotypeArray(gbrG)
# haplotype Array
gbrG_HA = gbrG_GA.to_haplotypes()
allel.garud_h(gbrG_HA)

(0.4985508996498008,
 0.6872358410819949,
 0.7868614901581934,
 0.05383311129950354)

In [78]:
# BEB: Garud's H statistics; garud_h returns (H1, H12, H123, H2/H1)
# genotype Array
bebG_GA = allel.GenotypeArray(bebG)
# haplotype Array
bebG_HA = bebG_GA.to_haplotypes()
allel.garud_h(bebG_HA)

(0.30313683071930775,
 0.4384126554894538,
 0.5871416982152514,
 0.15599910793933988)

In [79]:
# PEL: Garud's H statistics; garud_h returns (H1, H12, H123, H2/H1)
# genotype Array
pelG_GA = allel.GenotypeArray(pelG)
# haplotype Array
pelG_HA = pelG_GA.to_haplotypes()
allel.garud_h(pelG_HA)

(0.21460207612456755,
 0.3369550173010381,
 0.4605536332179932,
 0.25443405353111914)

In [81]:
# CHE: Garud's H statistics; garud_h returns (H1, H12, H123, H2/H1)
# genotype Array
cheG_GA = allel.GenotypeArray(cheG)
# haplotype Array
cheG_HA = cheG_GA.to_haplotypes()
allel.garud_h(cheG_HA)

(0.2677443679894429,
 0.4911395984541426,
 0.6811198039400507,
 0.4507129026579827)

In [82]:
# ESN: Garud's H statistics; garud_h returns (H1, H12, H123, H2/H1)
# genotype Array
esnG_GA = allel.GenotypeArray(esnG)
# haplotype Array
esnG_HA = esnG_GA.to_haplotypes()
allel.garud_h(esnG_HA)

(0.3011937557392103,
 0.484287317620651,
 0.6825323946536068,
 0.20316734417344184)

In [19]:
# Garud's H statistics in moving windows of 100 variants.
# NOTE(review): arrH is not assigned by any live cell in this listing (only in
# commented-out code in the XP-EHH section) — confirm where it comes from.
allel.moving_garud_h(arrH, 100)

(array([0.25069436, 0.17310711, 0.77508755, 0.17298635, 0.21120638,
        0.26506461, 0.449523  , 0.18361309, 0.17552228]),
 array([0.37096969, 0.30268084, 0.84271223, 0.32441734, 0.39282695,
        0.41770318, 0.66477479, 0.2830576 , 0.28432556]),
 array([0.51956285, 0.41166526, 0.8729622 , 0.39149861, 0.52686874,
        0.61206376, 0.89397416, 0.42120517, 0.45242121]),
 array([0.17039981, 0.41332403, 0.00288229, 0.43298429, 0.41452258,
        0.28917995, 0.11182001, 0.38819467, 0.51685587]))

### estimated Haplotype diversity


Haplotype diversity (also known as gene diversity) represents the probability that two randomly sampled alleles are different, while nucleotide diversity is defined as the average number of nucleotide differences per site in pairwise comparisons among DNA sequences.

In [113]:
def haplo_div_calc(h):
    """Return the haplotype diversity of ``h`` as computed by ``allel.haplotype_diversity``."""
    return allel.haplotype_diversity(h)

In [114]:
# GBR haplotype diversity via the wrapper defined above
haplo_div_calc(gbrG_HA)

0.5042195373687085

In [21]:
# h = allel.HaplotypeArray(cheH)
# allel.haplotype_diversity(h)
## thats wrong array

0.34615885901541665

In [23]:
arrH # displays as 999 rows x 182 columns; NOTE(review): no live cell in this listing assigns arrH — confirm its source

Unnamed: 0,0,1,2,3,4,...,177,178,179,180,181,Unnamed: 12
0,0,0,0,0,0,...,0,0,0,0,0,
1,0,0,0,0,0,...,0,0,0,0,0,
2,0,0,0,0,0,...,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
996,0,0,0,0,0,...,0,0,0,0,0,
997,0,0,0,0,0,...,0,0,0,0,0,
998,0,0,0,0,0,...,0,0,0,0,0,


In [77]:
# british (GBR) — haplotype diversity over all variants
allel.haplotype_diversity(gbrG_HA)

0.5042195373687085

In [83]:
# bengali (BEB) — haplotype diversity over all variants
allel.haplotype_diversity(bebG_HA)

0.70093839249286

In [84]:
# peru (PEL) — haplotype diversity over all variants
allel.haplotype_diversity(pelG_HA)

0.7900452488687781

In [85]:
# china (CHE variables) — haplotype diversity over all variants
allel.haplotype_diversity(cheG_HA)

0.7358276107032915

In [86]:
# nigeria (ESN) — haplotype diversity over all variants
allel.haplotype_diversity(esnG_HA)

0.7023534840793723

In [97]:
# GBR haplotype diversity in moving windows of 5 variants
allel.moving_haplotype_diversity(gbrG_HA, 5)

array([0.        , 0.12385405, 0.19786291, 0.11420072, 0.        ,
       0.28176796, 0.3861332 ])

### Nucleotide diversity
Nucleotide diversity is defined as the average number of nucleotide differences per site in pairwise comparisons among DNA sequences.

In [106]:
# British (GBR): nucleotide diversity in windows of size 5.
# Positions are just the 0-based variant indices, not genomic coordinates.
posiGbrG = list(range(len((gbrG_GA))))
ac_gbrG_GA = gbrG_GA.count_alleles()
pi, windows, n_bases, counts = allel.windowed_diversity(posiGbrG, ac_gbrG_GA, 5)
pi

array([0.        , 0.02477081, 0.07740878, 0.04568029, 0.        ,
       0.05883067, 0.10782588])

In [104]:
# Bengali (BEB): nucleotide diversity in windows of size 5 over 0-based variant indices
posiBebG = list(range(len((bebG_GA))))
ac_bebG_GA = bebG_GA.count_alleles()
pi, windows, n_bases, counts = allel.windowed_diversity(posiBebG, ac_bebG_GA, 5)
pi

array([0.        , 0.05324357, 0.15764994, 0.08268734, 0.        ,
       0.09812322, 0.14602203])

In [103]:
# Peru (PEL): nucleotide diversity in windows of size 5 over 0-based variant indices
posiPelG = list(range(len((pelG_GA))))
ac_pelG_GA = pelG_GA.count_alleles()
pi, windows, n_bases, counts = allel.windowed_diversity(posiPelG, ac_pelG_GA, 5)
pi

array([0.        , 0.03994431, 0.18890359, 0.06861121, 0.04533241,
       0.11469544, 0.15820397])

In [101]:
# China (CHE variables): nucleotide diversity in windows of size 5 over 0-based variant indices
posiCheG = list(range(len((cheG_GA))))
ac_cheG_GA = cheG_GA.count_alleles()
pi, windows, n_bases, counts = allel.windowed_diversity(posiCheG, ac_cheG_GA, 5)
pi

array([0.        , 0.06289368, 0.19612598, 0.09723893, 0.00194175,
       0.14861473, 0.12965191])

In [102]:
# Nigeria (ESN): nucleotide diversity in windows of size 5 over 0-based variant indices
posiEsnG = list(range(len((esnG_GA))))
ac_esnG_GA = esnG_GA.count_alleles()
pi, windows, n_bases, counts = allel.windowed_diversity(posiEsnG, ac_esnG_GA, 5)
pi

array([0.0020202 , 0.        , 0.16653848, 0.0040404 , 0.01197764,
       0.06601036, 0.08467415])

### iHS (not useful here: the output is all NaN)

In [26]:
# 0-based row indices of arrH, used as stand-in positions for ihs/xpehh below
posiH = list(range(len((arrH))))

In [37]:
# Integrated haplotype score per variant of arrH.
# NOTE(review): posiH holds row indices rather than genomic coordinates, while
# gap_scale/max_gap are given as 20000/200000 — check whether these inputs
# explain the all-NaN output.
allel.ihs(arrH, posiH, map_pos=None, min_ehh=0.4, min_maf=0.05, include_edges=False,
        gap_scale=20000, max_gap=200000, is_accessible=None, use_threads=True)

array([        nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,      

### XP-EHH

In [32]:
# Second-population (BEB) haplotypes for the XP-EHH comparison.
# NOTE(review): the commented-out lines below suggest arrH was once built from
# gbrG_GA.to_haplotypes(); no live cell in this listing assigns it — confirm.
# # genotype Array
# gbrG_GA = allel.GenotypeArray(gbrG)
# # haplotype Array 1 gbrG_GA
# arrH = gbrG_GA.to_haplotypes()
# # haplotype Array 2 bebG_GA
#  bebG_GA = allel.GenotypeArray(bebG)
arrH2 = bebG_GA.to_haplotypes()


In [None]:
# Same index list as posiH; not referenced by any other cell in this listing
# (the xpehh call below passes posiH).
posi_posi = list(range(len((arrH))))

In [39]:
# Cross-population EHH between arrH and arrH2 (BEB haplotypes), with row indices as positions
allel.xpehh(arrH, arrH2, posiH, min_ehh=0.2)

array([            nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,             nan,             nan,             nan,
                   nan,  

### Tajima's D

In [50]:
# british (GBR) — Tajima's D over all variants
# NOTE: this rebinds `g`, which earlier held makeArray's tuple of population
# lists; calling calc_hudson_fst(g) after this cell no longer passes the five populations.
g = allel.GenotypeArray(gbrG)
ac = g.count_alleles()
allel.tajima_d(ac)

-0.7171168167623679

In [109]:
# bengali (BEB) — Tajima's D over all variants (rebinds the shared names g and ac)
g = allel.GenotypeArray(bebG)
ac = g.count_alleles()
allel.tajima_d(ac)

1.2795246729157983

In [108]:
# peru (PEL) — Tajima's D over all variants (rebinds the shared names g and ac)
g = allel.GenotypeArray(pelG)
ac = g.count_alleles()
allel.tajima_d(ac)

1.1526123512596067

In [107]:
# china (CHE variables) — Tajima's D over all variants (rebinds the shared names g and ac)
g = allel.GenotypeArray(cheG)
ac = g.count_alleles()
allel.tajima_d(ac)

1.370615872873514

In [110]:
# nigeria (ESN) — Tajima's D over all variants (rebinds the shared names g and ac)
g = allel.GenotypeArray(esnG)
ac = g.count_alleles()
allel.tajima_d(ac)

-0.04015478347124156

In [52]:
# Difference in Tajima's D between the populations behind ac1 and ac2, in moving windows of 100 variants.
# NOTE(review): ac1/ac2 are assigned as GBR/BEB counts elsewhere in this listing, but this
# cell's execution count (52) predates those cells — confirm what they held.
allel.moving_delta_tajima_d(ac1, ac2, 100)

array([-1.4390487 ,  1.11253559, -0.41472605,  0.69572628, -1.05028639,
        0.24645094,  0.73594978,  1.3457841 , -1.23238556])

In [None]:
## write a function, to get all fsts and delta_moving tajimas D for all possible combinations

In [None]:
# Fst: A with B, A with C, A with D, A with E, B with C, B with D, B with E, C with D, C with E:, D with E