diff --git a/functions/calcPi.txt b/functions/calcPi.txt
new file mode 100644
index 0000000..26f5d49
--- /dev/null
+++ b/functions/calcPi.txt
@@ -0,0 +1,65 @@
+function (float)calcPi(object genomes, [No muts = NULL], [Ni$ start = NULL], [Ni$ end = NULL])
+{
+	// Computes nucleotide diversity (pi): the mean number of pairwise differences among
+	// the given genomes, divided by the length of the sequence considered.
+	//   genomes    : non-empty vector of genomes; all must belong to the same species
+	//   muts       : mutations to consider, or NULL to use all mutations of the species
+	//   start, end : inclusive window bounds; both NULL (whole chromosome) or both non-NULL
+	// Returns pi per site; calls stop() when the arguments violate the rules above.
+	if (genomes.length() == 0)
+		stop("ERROR (calcPi()): genomes must be non-empty.");
+
+	// with more than one species in the community, the species is taken from the genomes
+	if (community.allSpecies.length() > 1)
+	{
+		species = unique(genomes.individual.subpopulation.species, preserveOrder=F);
+		if (species.length() != 1)
+			stop("ERROR (calcPi()): genomes must all belong to the same species.");
+		if (!isNULL(muts))
+			if (!all(muts.mutationType.species == species))
+				stop("ERROR (calcPi()): muts must all belong to the same species as genomes.");
+	}
+	else
+	{
+		species = community.allSpecies;
+	}
+
+	// sequence length defaults to the whole chromosome; a window overrides it below
+	length = species.chromosome.lastPosition + 1;
+
+	if (isNULL(muts))
+		muts = species.mutations;
+
+	// handle windowing
+	if (!isNULL(start) & !isNULL(end))
+	{
+		if (start > end)
+			stop("ERROR (calcPi()): start must be less than or equal to end.");
+		mpos = muts.position;
+		muts = muts[(mpos >= start) & (mpos <= end)];
+		length = end - start + 1;
+	}
+	else if (!isNULL(start) | !isNULL(end))
+	{
+		stop("ERROR (calcPi()): start and end must both be NULL or both be non-NULL.");
+	}
+
+	// narrow down to the mutations that are actually present in the genomes and aren't fixed
+	p = genomes.mutationFrequenciesInGenomes(muts);
+	muts = muts[(p != 0.0) & (p != 1.0)];
+
+	// do the calculation
+	n = genomes.size();
+	// count of the given genomes carrying each segregating mutation
+	varCount = genomes.mutationCountsInGenomes(muts);
+	// the remaining genomes do not carry it
+	invarCount = n - varCount;
+	// pairwise differences per site are the product of the counts of both alleles (equation 1 in Korunes and Samuk 2021), summed over all sites
+	diffs = sum(varCount * invarCount);
+	// number of distinct pairs of genomes: n choose 2, written out directly
+	pairCount = (n * (n - 1)) / 2;
+	// pi is the ratio of pairwise differences to the number of pairs
+	totalPi = diffs / pairCount;
+	// pi is conventionally averaged per site, consistent with SLiM's calculation of Watterson's theta
+	return totalPi / length;
+}
diff --git a/functions/calcPi_docString.txt b/functions/calcPi_docString.txt
new file mode 100644
index 0000000..4047b04
--- /dev/null
+++ b/functions/calcPi_docString.txt
@@ -0,0 +1,4 @@
+Calculates pi (a metric of genetic diversity based on pairwise sequence differences) for a vector of genomes, based upon the mutations in the genomes. The mathematical formulation (as an estimator of the population parameter theta) is based on work in (Nei and Li 1979; Nei and Tajima 1981; Tajima 1983 (eq. A3)) and the exact formula used here is common in textbooks (e.g. equation 3.3 in Hahn 2018 or 2.2 in Coop 2020). This value is averaged by the number of sites.
+Often genomes will be all of the genomes in a subpopulation, or in the entire population, but any genome vector may be used. By default, with muts=NULL, the calculation is based upon all mutations in the simulation; the calculation can instead be based upon a subset of mutations, such as mutations of a specific mutation type, by passing the desired vector of mutations for muts.
+The calculation can be narrowed to apply to only a window – a subrange of the full chromosome – by passing the interval bounds [start, end] for the desired window. In this case, the vector of mutations used for the calculation will be subset to include only mutations within the specified window. The default behavior, with start and end of NULL, provides the genome-wide pi.
+The implementation of calcPi(), viewable with functionSource(), treats every mutation as independent in the heterozygosity calculations.
One could regard this choice as embodying an infinite-sites interpretation of the segregating mutations, as with calcHeterozygosity(). Indeed, finite-sites models of pi have been derived (Tajima 1996) though are not used here. In most biologically realistic models, such genetic states will be quite rare, and so the impact of this assumption will be negligible; however, in some models this distinction may be important. See calcPairHeterozygosity() for further discussion. This function was written by Nick Bailey (currently affiliated with the CNRS and Laboratory of Biometry and Evolutionary Biology at University Lyon 1), based on code in calcWattersonsTheta, and with helpful input from Peter Ralph.
diff --git a/functions/calcTajimaD.txt b/functions/calcTajimaD.txt
new file mode 100644
index 0000000..4872f85
--- /dev/null
+++ b/functions/calcTajimaD.txt
@@ -0,0 +1,71 @@
+function (float)calcTajimaD(object genomes, [No muts = NULL], [Ni$ start = NULL], [Ni$ end = NULL])
+{
+	// Tajima's D for a vector of genomes: (pi - Watterson's theta), rescaled from per-site
+	// to whole-sequence values, divided by the square root of e1*S + e2*S*(S-1), where S is
+	// the number of segregating mutations (the docstring cites Tajima 1989, equation 38).
+	//   genomes    : non-empty vector of genomes, all from a single species
+	//   muts       : mutations to consider; NULL means all mutations of the species
+	//   start, end : inclusive window bounds; supply both or neither
+	if (genomes.length() == 0)
+		stop("ERROR (calcTajimaD()): genomes must be non-empty.");
+
+	// identify the species; with a single species there is nothing to cross-check
+	allSpecies = community.allSpecies;
+	if (allSpecies.length() <= 1)
+	{
+		species = allSpecies;
+	}
+	else
+	{
+		species = unique(genomes.individual.subpopulation.species, preserveOrder=F);
+		if (species.length() != 1)
+			stop("ERROR (calcTajimaD()): genomes must all belong to the same species.");
+		if (!isNULL(muts))
+			if (any(muts.mutationType.species != species))
+				stop("ERROR (calcTajimaD()): muts must all belong to the same species as genomes.");
+	}
+
+	// sequence length defaults to the whole chromosome; a window overrides it below
+	length = species.chromosome.lastPosition + 1;
+
+	if (isNULL(muts))
+		muts = species.mutations;
+
+	// windowing: start and end must be given together
+	if (isNULL(start) != isNULL(end))
+		stop("ERROR (calcTajimaD()): start and end must both be NULL or both be non-NULL.");
+	if (!isNULL(start))
+	{
+		if (start > end)
+			stop("ERROR (calcTajimaD()): start must be less than or equal to end.");
+		positions = muts.position;
+		muts = muts[(positions >= start) & (positions <= end)];
+		length = end - start + 1;
+	}
+
+	// keep only mutations that segregate within the given genomes (neither absent nor fixed)
+	freqs = genomes.mutationFrequenciesInGenomes(muts);
+	muts = muts[!((freqs == 0.0) | (freqs == 1.0))];
+
+	// numerator: calcPi() and calcWattersonsTheta() are per-site values, so multiply by the
+	// sequence length to undo that; the length is constant (no missing data or indels)
+	perSitePi = calcPi(genomes, muts, start, end);
+	perSiteTheta = calcWattersonsTheta(genomes, muts, start, end);
+	numerator = (perSitePi - perSiteTheta) * length;
+
+	// denominator: the a/b/c/e constants depend only on the number of genomes
+	segSites = size(muts);
+	n = genomes.size();
+	ranks = 1:(n - 1);
+	a1 = sum(1 / ranks);
+	a2 = sum(1 / ranks ^ 2);
+	b1 = (n + 1) / (3 * (n - 1));
+	b2 = 2 * (n ^ 2 + n + 3) / (9 * n * (n - 1));
+	c1 = b1 - 1 / a1;
+	c2 = b2 - (n + 2) / (a1 * n) + a2 / a1 ^ 2;
+	e1 = c1 / a1;
+	e2 = c2 / (a1 ^ 2 + a2);
+	variance = e1 * segSites + e2 * segSites * (segSites - 1);
+
+	return numerator / sqrt(variance);
+}
diff --git a/functions/calcTajimaD_docString.txt b/functions/calcTajimaD_docString.txt
new file mode 100644
index 0000000..0db0a47
--- /dev/null
+++ b/functions/calcTajimaD_docString.txt
@@ -0,0 +1,3 @@
+Calculates Tajima's D (a test of neutrality based on the allele frequency spectrum) for a vector of genomes, based upon the mutations in the genomes. The mathematical formulation is given in Tajima 1989 (equation 38) and remains unchanged (e.g. equations 2.30 in Durrett 2008, 8.4 in Hahn 2018, and 4.44 in Coop 2020). Often genomes will be all of the genomes in a subpopulation, or in the entire population, but any genome vector may be used.
By default, with muts=NULL, the calculation is based upon all mutations in the simulation; the calculation can instead be based upon a subset of mutations, such as mutations of a specific mutation type, by passing the desired vector of mutations for muts.
+The calculation can be narrowed to apply to only a window – a subrange of the full chromosome – by passing the interval bounds [start, end] for the desired window. In this case, the vector of mutations used for the calculation will be subset to include only mutations within the specified window. The default behavior, with start and end of NULL, provides the genome-wide Tajima's D.
+The implementation of calcTajimaD(), viewable with functionSource(), treats every mutation as independent in the heterozygosity calculations. One could regard this choice as embodying an infinite-sites interpretation of the segregating mutations, as with calcHeterozygosity(). Indeed, Tajima's D can be modified with finite-sites models of pi and theta (Misawa and Tajima 1997), though these are not used here. In most biologically realistic models, genetic states that violate the infinite-sites interpretation (such as more than one mutation at the same position) will be quite rare, and so the impact of this assumption will be negligible; however, in some models this distinction may be important. See calcPairHeterozygosity() for further discussion. This function was written by Nick Bailey (currently affiliated with the CNRS and Laboratory of Biometry and Evolutionary Biology at University Lyon 1), based on code in calcWattersonsTheta(), and with helpful input from Peter Ralph.