Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add Rmd to simulate data for vignettes
- Loading branch information
1 parent
34f1f98
commit 396d625
Showing
1 changed file
with
217 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
--- | ||
title: "Data simulation for rgs3 vignettes" | ||
author: "Timothée Flutre (INRA)" | ||
date: "`r format(Sys.time(), '%d/%m/%Y %H:%M:%S')`" | ||
output: | ||
html_document: | ||
toc: true | ||
toc_depth: 3 | ||
toc_float: true | ||
number_sections: TRUE | ||
pdf_document: | ||
toc: true | ||
toc_depth: 3 | ||
number_sections: TRUE | ||
urlcolor: blue | ||
--- | ||
|
||
<!-- | ||
setwd("~/src/rgs3/misc/") | ||
library(rmarkdown) | ||
render("simul_for_rgs3.Rmd", "html_document") | ||
--> | ||
|
||
|
||
# Preamble | ||
|
||
This document requires the [rutilstimflutre](https://github.com/timflutre/rutilstimflutre) package to be installed, which itself may require other packages, depending on the functions: | ||
```{r load_pkg} | ||
suppressPackageStartupMessages(library(rutilstimflutre)) | ||
packageVersion("rutilstimflutre") | ||
library(rrBLUP) | ||
packageVersion("rrBLUP") | ||
``` | ||
|
||
This R chunk is used to assess how much time it takes to execute the R code in this document until the end: | ||
```{r time_0} | ||
t0 <- proc.time() | ||
``` | ||
|
||
To make all this reproducible, the seed of the pseudo-random number generator is set: | ||
```{r set_seed} | ||
set.seed(1859) | ||
``` | ||
|
||
|
||
# Simulate SNP genotypes | ||
|
||
```{r simul_geno} | ||
nb.inds <- 500 | ||
nb.chrs <- 10 | ||
Ne <- 10^4 | ||
L <- 5*10^5 | ||
mu <- 10^(-8) | ||
theta <- 4 * Ne * mu * L | ||
c <- 10^(-8) | ||
rho <- 4 * Ne * c * L | ||
genomes <- simulCoalescent(nb.inds=nb.inds, nb.reps=nb.chrs, chrom.len=L, | ||
pop.mut.rate=theta, pop.recomb.rate=rho) | ||
dim(X <- genomes$genos) | ||
X[1:3,1:4] | ||
afs <- estimSnpAf(X) | ||
summary(afs) # mean should be 0.5 | ||
hist(afs, breaks="FD", xlim=c(0,1), las=1, col="grey", border="white", | ||
main=paste0("AFs of ", ncol(X), " SNPs"), | ||
xlab="Allele frequency", ylab="Number of SNPs") | ||
mafs <- estimSnpMaf(X) | ||
sum(mafs < 0.01) | ||
plotHistMinAllelFreq(mafs=mafs) | ||
``` | ||
|
||
Check that the linkage disequilibrium is as expected: | ||
```{r ld, eval=TRUE} | ||
chr <- "chr1" | ||
min.maf <- 0.2 | ||
min.pos <- 0 | ||
max.pos <- L | ||
(length(snps.tokeep <- | ||
rownames(genomes$snp.coords[mafs >= min.maf & | ||
genomes$snp.coords$chr == chr & | ||
genomes$snp.coords$pos >= min.pos & | ||
genomes$snp.coords$pos <= max.pos,]))) | ||
(nrow(ld <- estimLd(X=genomes$genos[,snps.tokeep], | ||
snp.coords=genomes$snp.coords[snps.tokeep,], | ||
use.ldcorsv=FALSE))) | ||
summary(ld$cor2) | ||
snp.dist <- distSnpPairs(snp.pairs=ld[, c("loc1","loc2")], | ||
snp.coords=genomes$snp.coords[snps.tokeep,]) | ||
plotLd(snp.dist, | ||
## ld$cor2, estim="r2", | ||
sqrt(ld$cor2), estim="r", | ||
main=paste0(length(snps.tokeep), " SNPs with MAF >= ", min.maf, | ||
" on ", chr), | ||
use.density=TRUE, | ||
span=1/20, | ||
sample.size=2 * nb.inds, | ||
Ne=Ne, c=c) | ||
abline(v=500, lty=2) | ||
``` | ||
|
||
Check that the matrix of additive genetic relationships is as expected: | ||
```{r matrix_A} | ||
A <- estimGenRel(X=X, relationships="additive", method="vanraden1") | ||
summary(diag(A)) # mean should be 1 | ||
summary(A[upper.tri(A)]) # mean should be 0 | ||
``` | ||
|
||
For the rest, discard all SNPs with a minor allele frequency below 1%: | ||
```{r} | ||
thresh <- 0.01 | ||
dim(X <- X[, estimSnpMaf(X) >= thresh]) | ||
summary(afs <- estimSnpAf(X)) | ||
summary(mafs <- estimSnpMaf(X)) | ||
A <- estimGenRel(X=X, relationships="additive", method="vanraden1") | ||
summary(diag(A)) | ||
summary(A[upper.tri(A)]) | ||
``` | ||
|
||
|
||
# Simulate phenotypes | ||
|
||
```{r simul_pheno} | ||
model <- simulAnimalModel(T=1, Q=3, A=A, V.G.A=15, V.E=5) | ||
names(model) | ||
c(model$C) | ||
model$V.G.A | ||
model$V.E | ||
mean(model$dat$response1) | ||
var(model$dat$response1) | ||
summary(model$G.A[,1]) | ||
hist(model$G.A[,1], breaks="FD", main="True breeding values", | ||
las=1, col="grey", border="white") | ||
``` | ||
|
||
|
||
# Save data | ||
|
||
Path to the package: | ||
```{r} | ||
p2pkg <- "~/src/rgs3" | ||
``` | ||
|
||
Phenotypes: | ||
```{r save_phenos} | ||
if(! is.null(p2pkg)){ | ||
phenos.file <- paste0(p2pkg, "/inst/extdata/phenos_df.txt.gz") | ||
write.table(x=model$dat, file=gzfile(phenos.file), quote=FALSE, sep="\t") | ||
print(tools::md5sum(path.expand(phenos.file))) | ||
} | ||
``` | ||
|
||
Genotypes: | ||
```{r save_genos} | ||
if(! is.null(p2pkg)){ | ||
genos.file <- paste0(p2pkg, "/inst/extdata/genos_mat.txt.gz") | ||
write.table(x=X, file=gzfile(genos.file), quote=FALSE, sep="\t") | ||
print(tools::md5sum(path.expand(genos.file))) | ||
} | ||
``` | ||
|
||
Genotypic values: | ||
```{r save_genovals} | ||
if(! is.null(p2pkg)){ | ||
genovals.file <- paste0(p2pkg, "/inst/extdata/genovals.txt.gz") | ||
write.table(x=model$G.A[,1], file=gzfile(genovals.file), quote=FALSE, | ||
sep="\t", col.names=FALSE) | ||
print(tools::md5sum(path.expand(genovals.file))) | ||
} | ||
``` | ||
|
||
|
||
# Quick check | ||
|
||
## Use A | ||
|
||
Give the additive genetic relationship matrix to the [rrBLUP](https://cran.r-project.org/package=rrBLUP) package: | ||
```{r check_rrBLUP_A} | ||
fit.rrBLUP.A <- mixed.solve(y=model$Y[,1], Z=model$Z, K=A, X=model$W, | ||
method="REML", SE=TRUE, return.Hinv=TRUE) | ||
cbind(truth=model$C, estim=fit.rrBLUP.A$beta) | ||
c(truth=model$V.E, estim=fit.rrBLUP.A$Ve) | ||
c(truth=model$V.G.A, estim=fit.rrBLUP.A$Vu) | ||
summary(fit.rrBLUP.A$u) | ||
hist(fit.rrBLUP.A$u, breaks="FD", main="Predicted breeding values (via A)", | ||
las=1, col="grey", border="white") | ||
regplot(x=model$G.A, y=fit.rrBLUP.A$u, asp=1, legend.x="bottomright", las=1, | ||
xlab="True breeding values", ylab="Predicted breeding values (via A)") | ||
abline(h=0, v=0, lty=2) | ||
``` | ||
|
||
## Use X | ||
|
||
Give the SNP genotypes matrix to the [rrBLUP](https://cran.r-project.org/package=rrBLUP) package: | ||
```{r check_rrBLUP_X} | ||
fit.rrBLUP.X <- mixed.solve(y=model$Y[,1], Z=model$Z %*% (X - 1), K=NULL, X=model$W, | ||
method="REML", SE=TRUE, return.Hinv=TRUE) | ||
cbind(truth=model$C, estim=fit.rrBLUP.X$beta) | ||
c(truth=model$V.E, estim=fit.rrBLUP.X$Ve) | ||
fit.rrBLUP.X$Vu | ||
summary(fit.rrBLUP.X$u) | ||
hist(fit.rrBLUP.X$u, breaks="FD", main="Estimated additive SNP effects", | ||
las=1, col="grey", border="white") | ||
genovals.hat <- ((X - 1) %*% fit.rrBLUP.X$u)[,1] | ||
summary(genovals.hat) | ||
hist(genovals.hat, breaks="FD", main="Predicted breeding values (via X)", | ||
las=1, col="grey", border="white") | ||
regplot(x=model$G.A, y=genovals.hat, asp=1, legend.x="bottomright", las=1, | ||
xlab="True breeding values", ylab="Predicted breeding values (via X)") | ||
abline(h=0, v=0, lty=2) | ||
``` | ||
|
||
|
||
# Appendix | ||
|
||
```{r info} | ||
t1 <- proc.time(); t1 - t0 | ||
print(sessionInfo(), locale=FALSE) | ||
``` |