add Rmd to simulate data for vignettes

INRA · Jan 17, 2017 · 396d625 · 396d625
1 parent 34f1f98
commit 396d625
Showing 1 changed file with 217 additions and 0 deletions.
diff --git a/misc/simul_for_rgs3.Rmd b/misc/simul_for_rgs3.Rmd
@@ -0,0 +1,217 @@
+---
+title: "Data simulation for rgs3 vignettes"
+author: "Timothée Flutre (INRA)"
+date: "`r format(Sys.time(), '%d/%m/%Y %H:%M:%S')`"
+output:
+  html_document:
+    toc: true
+    toc_depth: 3
+    toc_float: true
+    number_sections: TRUE
+  pdf_document:
+    toc: true
+    toc_depth: 3
+    number_sections: TRUE
+urlcolor: blue
+---
+
+<!--
+setwd("~/src/rgs3/misc/")
+library(rmarkdown)
+render("simul_for_rgs3.Rmd", "html_document")
+-->
+
+
+# Preamble
+
+This document requires the [rutilstimflutre](https://github.com/timflutre/rutilstimflutre) package to be installed, which itself may require other packages, depending on the functions:
+```{r load_pkg}
+suppressPackageStartupMessages(library(rutilstimflutre))
+packageVersion("rutilstimflutre")
+library(rrBLUP)
+packageVersion("rrBLUP")
+```
+
+This R chunk is used to assess how much time it takes to execute the R code in this document until the end:
+```{r time_0}
+t0 <- proc.time()
+```
+
+To make all this reproducible, the seed of the pseudo-random number generator is set:
+```{r set_seed}
+set.seed(1859)
+```
+
+
+# Simulate SNP genotypes
+
+```{r simul_geno}
+nb.inds <- 500
+nb.chrs <- 10
+Ne <- 10^4
+L <- 5*10^5
+mu <- 10^(-8)
+theta <- 4 * Ne * mu * L
+c <- 10^(-8)
+rho <- 4 * Ne * c * L
+genomes <- simulCoalescent(nb.inds=nb.inds, nb.reps=nb.chrs, chrom.len=L,
+                           pop.mut.rate=theta, pop.recomb.rate=rho)
+dim(X <- genomes$genos)
+X[1:3,1:4]
+afs <- estimSnpAf(X)
+summary(afs) # mean should be 0.5
+hist(afs, breaks="FD", xlim=c(0,1), las=1, col="grey", border="white",
+     main=paste0("AFs of ", ncol(X), " SNPs"),
+     xlab="Allele frequency", ylab="Number of SNPs")
+mafs <- estimSnpMaf(X)
+sum(mafs < 0.01)
+plotHistMinAllelFreq(mafs=mafs)
+```
+
+Check that the linkage disequilibrium is as expected:
+```{r ld, eval=TRUE}
+chr <- "chr1"
+min.maf <- 0.2
+min.pos <- 0
+max.pos <- L
+(length(snps.tokeep <-
+          rownames(genomes$snp.coords[mafs >= min.maf &
+                                      genomes$snp.coords$chr == chr &
+                                      genomes$snp.coords$pos >= min.pos &
+                                      genomes$snp.coords$pos <= max.pos,])))
+(nrow(ld <- estimLd(X=genomes$genos[,snps.tokeep],
+                    snp.coords=genomes$snp.coords[snps.tokeep,],
+                    use.ldcorsv=FALSE)))
+summary(ld$cor2)
+snp.dist <- distSnpPairs(snp.pairs=ld[, c("loc1","loc2")],
+                         snp.coords=genomes$snp.coords[snps.tokeep,])
+plotLd(snp.dist,
+       ## ld$cor2, estim="r2",
+       sqrt(ld$cor2), estim="r",
+       main=paste0(length(snps.tokeep), " SNPs with MAF >= ", min.maf,
+                   " on ", chr),
+       use.density=TRUE,
+       span=1/20,
+       sample.size=2 * nb.inds,
+       Ne=Ne, c=c)
+abline(v=500, lty=2)
+```
+
+Check that the matrix of additive genetic relationships is as expected:
+```{r matrix_A}
+A <- estimGenRel(X=X, relationships="additive", method="vanraden1")
+summary(diag(A)) # mean should be 1
+summary(A[upper.tri(A)]) # mean should be 0
+```
+
+For the rest, discard all SNPs with a minor allele frequency below 1%:
+```{r}
+thresh <- 0.01
+dim(X <- X[, estimSnpMaf(X) >= thresh])
+summary(afs <- estimSnpAf(X))
+summary(mafs <- estimSnpMaf(X))
+A <- estimGenRel(X=X, relationships="additive", method="vanraden1")
+summary(diag(A))
+summary(A[upper.tri(A)])
+```
+
+
+# Simulate phenotypes
+
+```{r simul_pheno}
+model <- simulAnimalModel(T=1, Q=3, A=A, V.G.A=15, V.E=5)
+names(model)
+c(model$C)
+model$V.G.A
+model$V.E
+mean(model$dat$response1)
+var(model$dat$response1)
+summary(model$G.A[,1])
+hist(model$G.A[,1], breaks="FD", main="True breeding values",
+     las=1, col="grey", border="white")
+```
+
+
+# Save data
+
+Path to the package:
+```{r}
+p2pkg <- "~/src/rgs3"
+```
+
+Phenotypes:
+```{r save_phenos}
+if(! is.null(p2pkg)){
+  phenos.file <- paste0(p2pkg, "/inst/extdata/phenos_df.txt.gz")
+  write.table(x=model$dat, file=gzfile(phenos.file), quote=FALSE, sep="\t")
+  print(tools::md5sum(path.expand(phenos.file)))
+}
+```
+
+Genotypes:
+```{r save_genos}
+if(! is.null(p2pkg)){
+  genos.file <- paste0(p2pkg, "/inst/extdata/genos_mat.txt.gz")
+  write.table(x=X, file=gzfile(genos.file), quote=FALSE, sep="\t")
+  print(tools::md5sum(path.expand(genos.file)))
+}
+```
+
+Genotypic values:
+```{r save_genovals}
+if(! is.null(p2pkg)){
+  genovals.file <- paste0(p2pkg, "/inst/extdata/genovals.txt.gz")
+  write.table(x=model$G.A[,1], file=gzfile(genovals.file), quote=FALSE,
+              sep="\t", col.names=FALSE)
+  print(tools::md5sum(path.expand(genovals.file)))
+}
+```
+
+
+# Quick check
+
+## Use A
+
+Give the additive genetic relationship matrix to the [rrBLUP](https://cran.r-project.org/package=rrBLUP) package:
+```{r check_rrBLUP_A}
+fit.rrBLUP.A <- mixed.solve(y=model$Y[,1], Z=model$Z, K=A, X=model$W,
+                            method="REML", SE=TRUE, return.Hinv=TRUE)
+cbind(truth=model$C, estim=fit.rrBLUP.A$beta)
+c(truth=model$V.E, estim=fit.rrBLUP.A$Ve)
+c(truth=model$V.G.A, estim=fit.rrBLUP.A$Vu)
+summary(fit.rrBLUP.A$u)
+hist(fit.rrBLUP.A$u, breaks="FD", main="Predicted breeding values (via A)",
+      las=1, col="grey", border="white")
+regplot(x=model$G.A, y=fit.rrBLUP.A$u, asp=1, legend.x="bottomright", las=1,
+        xlab="True breeding values", ylab="Predicted breeding values (via A)")
+abline(h=0, v=0, lty=2)
+```
+
+## Use X
+
+Give the SNP genotypes matrix to the [rrBLUP](https://cran.r-project.org/package=rrBLUP) package:
+```{r check_rrBLUP_X}
+fit.rrBLUP.X <- mixed.solve(y=model$Y[,1], Z=model$Z %*% (X - 1), K=NULL, X=model$W,
+                            method="REML", SE=TRUE, return.Hinv=TRUE)
+cbind(truth=model$C, estim=fit.rrBLUP.X$beta)
+c(truth=model$V.E, estim=fit.rrBLUP.X$Ve)
+fit.rrBLUP.X$Vu
+summary(fit.rrBLUP.X$u)
+hist(fit.rrBLUP.X$u, breaks="FD", main="Estimated additive SNP effects",
+     las=1, col="grey", border="white")
+genovals.hat <- ((X - 1) %*% fit.rrBLUP.X$u)[,1]
+summary(genovals.hat)
+hist(genovals.hat, breaks="FD", main="Predicted breeding values (via X)",
+     las=1, col="grey", border="white")
+regplot(x=model$G.A, y=genovals.hat, asp=1, legend.x="bottomright", las=1,
+        xlab="True breeding values", ylab="Predicted breeding values (via X)")
+abline(h=0, v=0, lty=2)
+```
+
+
+# Appendix
+
+```{r info}
+t1 <- proc.time(); t1 - t0
+print(sessionInfo(), locale=FALSE)
+```