-
Notifications
You must be signed in to change notification settings - Fork 15
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding Pi and Tajima's D calculations as Eidos Functions #28
Changes from 2 commits
052a9b1
2f55662
37390f4
93a46f6
6e9df15
cd0187c
7a1a887
f6c4646
95b8311
13465b6
18b9510
1b8126c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
// Written by Nick Bailey, submitted on 2024-06-13 | ||
// Currently affiliated with CNRS and University of Lyon 1 | ||
// This code is provided in the public domain free of restriction, please credit SLiM-Extras (https://github.com/MesserLab/SLiM-Extras) and myself, Nick Bailey, if you use it | ||
|
||
// calculate population genetic statistic pi (designated PiTheta to differentiate from mathematical constant pi) | ||
// calculations originally in Nei and Li 1979 and Tajima 1983 | ||
function (float)calcPiTheta(object<Genome> genomes, [No<Mutation> muts = NULL], [Ni$ start = NULL], [Ni$ end = NULL]) | ||
// conduct various checks for proper usage copied straight from calcWattersonTheta() | ||
{ if (genomes.length() == 0) | ||
stop("ERROR (calcPiTheta()): genomes must be non-empty."); | ||
if (community.allSpecies.length() > 1) | ||
{ | ||
species = unique(genomes.individual.subpopulation.species, preserveOrder=F); | ||
if (species.length() != 1) | ||
stop("ERROR (calcPiTheta()): genomes must all belong to the same species."); | ||
if (!isNULL(muts)) | ||
if (!all(muts.mutationType.species == species)) | ||
stop("ERROR (calcPiTheta()): muts must all belong to the same species as genomes."); | ||
} | ||
else | ||
{ | ||
species = community.allSpecies; | ||
} | ||
|
||
if (isNULL(muts)) | ||
muts = species.mutations; | ||
|
||
// handle windowing | ||
if (!isNULL(start) & !isNULL(end)) | ||
{ | ||
if (start > end) | ||
stop("ERROR (calcPiTheta()): start must be less than or equal to end."); | ||
mpos = muts.position; | ||
muts = muts[(mpos >= start) & (mpos <= end)]; | ||
length = end - start + 1; | ||
} | ||
else if (!isNULL(start) | !isNULL(end)) | ||
{ | ||
stop("ERROR (calcPiTheta()): start and end must both be NULL or both be non-NULL."); | ||
} | ||
|
||
// narrow down to the mutations that are actually present in the genomes and aren't fixed | ||
p = genomes.mutationFrequenciesInGenomes(muts); | ||
muts = muts[(p != 0.0) & (p != 1.0)]; | ||
|
||
// do the calculation | ||
// multiply vector of mutation frequencies by integer count of sequences to get vector of counts of variant sequences | ||
varCount = genomes.mutationFrequenciesInGenomes(species.mutations) * size(genomes); | ||
// total count of sequences subtracted by count of variant sequences equals count of invariant sequences | ||
invarCount = size(genomes) - varCount; | ||
// count of pairwise differences per site (assuming a biallelic site) is the product of counts of both alleles, this is then summed for all sites | ||
diffs = sum(varCount * invarCount); | ||
// pi is the ratio of pairwise differences to number of possible combinations of the given sequences | ||
// the latter is calculated by a standard formula defined in a function above (not by default in SLiM) | ||
pi = sum(varCount * invarCount) / combinationTwo(genomes.size()); | ||
// pi is conventionally averaged per site and this is consistent with SLiM's calculation of Watterson's theta | ||
// sequences start at 0, so sequence length is final position plus 1 | ||
avg_pi = pi / (species.chromosome.lastPosition + 1); | ||
return avg_pi; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
// Written by Nick Bailey, submitted on 2024-06-13 | ||
// Currently affiliated with CNRS and University of Lyon 1 | ||
// This code is provided in the public domain free of restriction, please credit SLiM-Extras (https://github.com/MesserLab/SLiM-Extras) and myself, Nick Bailey, if you use it | ||
|
||
// calculate neutrality test based on the site frequency spectrum, Tajima's D | ||
// calculations originally in Tajima 1989 | ||
function (float)calcTajimaD(object<Genome> genomes, [No<Mutation> muts = NULL], [Ni$ start = NULL], [Ni$ end = NULL]) | ||
// conduct various checks for proper usage copied straight from calcWattersonTheta() | ||
{ if (genomes.length() == 0) | ||
stop("ERROR (calcTajimaD()): genomes must be non-empty."); | ||
if (community.allSpecies.length() > 1) | ||
{ | ||
species = unique(genomes.individual.subpopulation.species, preserveOrder=F); | ||
if (species.length() != 1) | ||
stop("ERROR (calcTajimaD()): genomes must all belong to the same species."); | ||
if (!isNULL(muts)) | ||
if (!all(muts.mutationType.species == species)) | ||
stop("ERROR (calcTajimaD()): muts must all belong to the same species as genomes."); | ||
} | ||
else | ||
{ | ||
species = community.allSpecies; | ||
} | ||
|
||
if (isNULL(muts)) | ||
muts = species.mutations; | ||
|
||
// handle windowing | ||
if (!isNULL(start) & !isNULL(end)) | ||
{ | ||
if (start > end) | ||
stop("ERROR (calcTajimaD()): start must be less than or equal to end."); | ||
mpos = muts.position; | ||
muts = muts[(mpos >= start) & (mpos <= end)]; | ||
length = end - start + 1; | ||
} | ||
else if (!isNULL(start) | !isNULL(end)) | ||
{ | ||
stop("ERROR (calcTajimaD()): start and end must both be NULL or both be non-NULL."); | ||
} | ||
|
||
// narrow down to the mutations that are actually present in the genomes and aren't fixed | ||
p = genomes.mutationFrequenciesInGenomes(muts); | ||
muts = muts[(p != 0.0) & (p != 1.0)]; | ||
|
||
// do the calculation | ||
// Pi and Watterson's theta functions divide by sequence length so this must be undone in Tajima's D | ||
// Sequence length is constant (i.e. no missing data or indels) so this can be applied equally over both metrics | ||
diff = (calcPiTheta(genomes) - calcWattersonsTheta(genomes)) * (species.chromosome.lastPosition + 1); | ||
npb596 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// calculate standard deviation of covariance of pi and Watterson's theta | ||
k = size(muts); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't it the case that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. k here should represent number of sites with a mutation. Not sure what to do here because I see how it's simpler to just calculate k/a_1 and divide by sites (as is done in calcWattersonsTheta) since that information is present already but as in the above comment I need to change this so intervals work properly over calcPiTheta and calcWattersonTheta. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah, I see - perhaps a comment? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since I kept the above interval handling, I have now just left a comment that the variables given also define Watterson's theta. |
||
n = genomes.size(); | ||
a_1 = sum(1 / 1:(n - 1)); | ||
a_2 = sum(1 / 1:(n - 1 ^ 2)); | ||
npb596 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
b_1 = (n + 1) / (3 * (n - 1)); | ||
b_2 = 2 * (n ^ 2 + n + 3) / (9 * n * (n - 1)); | ||
c_1 = b_1 - 1 / a_1; | ||
c_2 = b_2 - (n + 2) / (a_1 * n) + a_2 / a_1 ^ 2; | ||
e_1 = c_1 / a_1; | ||
e_2 = c_2 / (a_1 ^ 2 + a_2); | ||
covar = e_1 * k + e_2 * k * (k - 1); | ||
stdev = sqrt(covar); | ||
tajima_d = diff / stdev; | ||
return tajima_d; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
// Written by Nick Bailey, submitted on 2024-06-13 | ||
// Currently affiliated with CNRS and University of Lyon 1 | ||
// This code is provided in the public domain free of restriction, please credit SLiM-Extras (https://github.com/MesserLab/SLiM-Extras) and myself, Nick Bailey, if you use it | ||
|
||
// calculate simple combination equation with only two possible choices in a given selection | ||
function (integer)combinationTwo(integer x) | ||
{ | ||
return asInteger((x * (x - 1)) / 2); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this phrasing in the comment is confusing, since sites are not necessarily biallelic
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed though not sure my new wording is any better. I mention this a bit now in my bigger comment and within the docstrings. I also changed the comment within the script now so it states where I got the idea of just multiplying ref and var alleles, since I don't see that notation often.