From f89eb2e52ebe028e71b2e98158bbf0f60a4f8059 Mon Sep 17 00:00:00 2001
From: Gavin Ha <gavinha@broadinstitute.org>
Date: Fri, 3 Aug 2018 14:44:44 -0400
Subject: [PATCH] qsub params; other bug fixes

---
 README.md                          |  4 ++--
 TitanCNA.snakefile                 | 15 ++++++++++++---
 code/getMoleculeCoverage.R         |  6 +++++-
 code/getPhasedHETSitesFromLLRVCF.R | 14 ++++++++------
 config/config.yaml                 | 17 +++++++++++++++--
 getPhasedAlleleCounts.snakefile    | 12 +++++++++---
 moleculeCoverage.snakefile         | 17 +++++++++++------
 7 files changed, 62 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 31d0588..86a1882 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Viswanathan SR*, Ha G*, Hoff A*, et al. Structural Alterations Driving Castratio
 Gavin Ha  
 Fred Hutchinson Cancer Research Center  
 contact: <gavinha@gmail.com> or <gha@fredhutch.org>  
-Date: July 26, 2018  
+Date: August 3, 2018  
 
 ## Requirements
 ### Software packages or libraries
@@ -62,7 +62,7 @@ snakemake -s TitanCNA.snakefile -np
 # run the workflow locally using 5 cores
 snakemake -s TitanCNA.snakefile --cores 5
 # run the workflow on qsub using a maximum of 50 jobs. Broad UGER cluster parameters can be set directly in config/cluster.sh. 
-snakemake -s TitanCNA.snakefile --cluster-sync "qsub" -j 50 --jobscript config/cluster.sh
+snakemake -s TitanCNA.snakefile --cluster-sync "qsub -l h_vmem={params.mem},h_rt={params.runtime}" -j 50 --jobscript config/cluster.sh
 ```
 This will also run both `moleculeCoverage.snakefile` and `getPhasedAlleleCounts.snakefile` which generate the necessary inputs for `TitanCNA.snakfile`.
 
diff --git a/TitanCNA.snakefile b/TitanCNA.snakefile
index d84ee4a..e3fc06c 100644
--- a/TitanCNA.snakefile
+++ b/TitanCNA.snakefile
@@ -55,7 +55,10 @@ rule runTitanCNA:
 		alphaR=config["TitanCNA_alphaR"],
 		#alleleModel=config["TitanCNA_alleleModel"],
 		txnExpLen=config["TitanCNA_txnExpLen"],
-		plotYlim=config["TitanCNA_plotYlim"]
+		plotYlim=config["TitanCNA_plotYlim"],
+		mem=config["TitanCNA_mem"],
+		runtime=config["TitanCNA_runtime"],
+		pe=config["TitanCNA_numCores"]
 	log:
 		"logs/titan/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.log"
 	shell:
@@ -77,7 +80,10 @@ rule combineTitanAndIchorCNA:
 		combineScript=config["TitanCNA_combineTitanIchorCNA"],
 		libdir=config["TitanCNA_libdir"],
 		centromere=config["centromere"],
-		gender=config["gender"]
+		gender=config["gender"],
+		mem=config["std_mem"],
+		runtime=config["std_runtime"],
+		pe=config["std_numCores"]
 	log:
 		"logs/titan/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.combineTitanIchorCNA.log"
 	shell:
@@ -92,7 +98,10 @@ rule selectSolution:
 		solutionsTxt="results/titan/optimalClusterSolution.txt",
 	params:
 		solutionRscript=config["TitanCNA_selectSolutionRscript"],
-		threshold=config["TitanCNA_solutionThreshold"]
+		threshold=config["TitanCNA_solutionThreshold"],
+		mem=config["std_mem"],
+		runtime=config["std_runtime"],
+		pe=config["std_numCores"]
 	log:
 		"logs/titan/optSolution/selectSolution.log"
 	shell:
diff --git a/code/getMoleculeCoverage.R b/code/getMoleculeCoverage.R
index 6764cb6..0c4258a 100644
--- a/code/getMoleculeCoverage.R
+++ b/code/getMoleculeCoverage.R
@@ -92,6 +92,7 @@ normalizeMaleX <- as.logical(opt$normalizeMaleX)
 includeHOMD <- as.logical(opt$includeHOMD)
 fracReadsInChrYForMale <- opt$fracReadsInChrYForMale
 outDir <- opt$outDir
+outPlotDir <- paste0(outDir, "/", patientID) 
 libdirTitanCNA <- opt$libdirTitanCNA
 libdirIchorCNA <- opt$libdirIchorCNA
 plotFileType <- opt$plotFileType
@@ -142,6 +143,8 @@ if (is.null(centromere) || centromere == "None" || centromere == "NULL"){ # no c
 }
 centromere <- read.delim(centromere,header=T,stringsAsFactors=F,sep="\t")
 
+save.image(outImage)
+
 ## LOAD IN WIG FILES ##
 numSamples <- 1
 tumour_counts <- list()
@@ -150,6 +153,7 @@ for (i in 1:numSamples) {
   id <- patientID
   ## create output directories for each sample ##
   dir.create(paste0(outDir, "/"), recursive = TRUE)
+  dir.create(paste0(outPlotDir, "/"), recursive = TRUE)
   ### LOAD TUMOUR AND NORMAL FILES ###
   message("Loading tumour files from ", tumour_file)
   tumour_doc <- loadBXcountsFromBEDDir(tumour_file, chrs = chrsAll, minReads = minReadsPerBX)
@@ -302,7 +306,7 @@ for (n in normal){
 				hmmResults.cor$results$n[s, iter] <- 1.0
     		}
       ## plot solution ##
-      outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_", "n", n, "-p", p)
+      outPlotFile <- paste0(outPlotDir, "/", id, "_genomeWide_", "n", n, "-p", p)
       mainName[counter] <- paste0(id, ", n: ", n, ", p: ", p, ", log likelihood: ", signif(hmmResults.cor$results$loglik[hmmResults.cor$results$iter], digits = 4))
       plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType=plotFileType, 
                      plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, main=mainName[counter])
diff --git a/code/getPhasedHETSitesFromLLRVCF.R b/code/getPhasedHETSitesFromLLRVCF.R
index 6de7deb..715c3c6 100644
--- a/code/getPhasedHETSitesFromLLRVCF.R
+++ b/code/getPhasedHETSitesFromLLRVCF.R
@@ -20,7 +20,7 @@ option_list <- list(
   make_option(c("--genomeStyle"), type = "character", default="NCBI", help = "Chr naming convention. NCBI (e.g. 1) or UCSC (e.g. chr1). Default: [%default]"),
   make_option(c("-s","--snpDB"), type="character", default=NULL, help= "VCF file of list of known SNPs (e.g. HapMap, 1000 Genomes, dbSNP)"),
   make_option(c("--altCountField"), type="character", default="AD", help="Alternate allele count field name in genotype. [Default: %default]"),
-  make_option(c("-o","--outVCFgz"), type = "character", help = "Output VCF file with suffix \"_phasedHets.vcf\" [Required]"),
+  make_option(c("-o","--outVCF"), type = "character", help = "Output VCF file with suffix \"_phasedHets.vcf\" [Required]"),
   make_option(c("--libdir"), type="character", default=NULL, help="Path to TitanCNA package directory. If provided, will source scripts after loading installed TitanCNA package.")
 )
 
@@ -29,6 +29,7 @@ opt <- parse_args(parseobj)
 print(opt)
 
 library(TitanCNA)
+library(GenomeInfoDb)
 library(VariantAnnotation)
 
 vcfFile <- opt$inVCF
@@ -39,8 +40,8 @@ chrs <- eval(parse(text = opt$chrs))
 minQUAL <- opt$minQuality
 minDepth <- opt$minDepth
 minVAF <- opt$minVAF
-outVCFgz <- opt$outVCFgz
-outVCF <- gsub("gz", "", outVCFgz)
+outVCF <- opt$outVCF
+outVCFgz <- paste0(outVCF, ".gz")
 libdir <- opt$libdir
 altCountField <- opt$altCountField
 
@@ -59,14 +60,14 @@ filterFlags <- c("PASS", "10X_RESCUED_MOLECULE_HIGH_DIVERSITY")
 #minQUAL <- 100
 keepGenotypes <- c("1|0", "0|1", "0/1")
 
-save.image("tmp.RData")
+
 
 #vcfFiles <- list.files(LRdir, pattern = "_phased_variants.vcf.gz$", full.name = TRUE)
 
 #for (i in 1:length(vcfFiles)){
 #id <- gsub("_phased_variants.vcf.gz", "", basename(vcfFile))
 hap <- getHaplotypesFromVCF(vcfFile, 
-			chrs = chrs, build = genomeBuild, style = genomeStyle,
+			chrs = chrs, build = genomeBuild, genomeStyle = genomeStyle,
 			filterFlags = filterFlags, minQUAL = minQUAL, minDepth = minDepth,
 			minVAF = minVAF, keepGenotypes = keepGenotypes, 
 			altCountField = altCountField, snpDB = snpFile)
@@ -75,10 +76,11 @@ hap <- getHaplotypesFromVCF(vcfFile,
 ## remove BX genotype field to make things faster
 geno(hap$vcf)$BX <- NULL
 
+message("Writing to file: ", outVCF)
 writeVcf(hap$vcf, filename = outVCF)
 bgzipFile <- bgzip(outVCF, dest = outVCFgz, overwrite = TRUE)
 indexTabix(bgzipFile, format = "vcf")
-	
+
 #	outFile <- paste0(outDir, "/", id, "_phasedGR.rds")
 #	saveRDS(hap$geno, file = outFile)
 								
diff --git a/config/config.yaml b/config/config.yaml
index f8d869e..ff49555 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -24,8 +24,15 @@ gender:  male
 bamFileName:  phased_possorted_bam.bam
 phaseVariantFileName:  phased_variants.vcf.gz
 
-## params for each step ##
+## standard parameters for qsub memory, runtime, pe (parallel environment)
+# use what your scheduler designates for parallel environment; does not apply to local run
+# invoke using:
+# snakemake -s TitanCNA.snakefile --cluster-sync "qsub -l h_vmem={params.mem},h_rt={params.runtime} {params.pe}" -j 200 --jobscript config/cluster.sh
+std_mem:  4G
+std_runtime:  "03:00:00" 
+std_numCores:  -pe smp 1 -binding linear:1
 
+## params for each step ##
 ## bxTools ##
 bx_mapQual:  60
 bx_bedFileRoot:  data/10kb_hg38/10kb_hg38
@@ -41,13 +48,14 @@ molCov_maxCN:  8
 het_minVCFQuality:  100
 het_minDepth:  10
 het_minVAF:  0.25
+het_mem:  16G
+het_runtime:  "06:00:00"
 
 ## bam filters for tumor allelic counts ##
 het_minBaseQuality:  10
 het_minMapQuality:  20
 
 ## TitanCNA params ##
-TitanCNA_numCores: 1
 TitanCNA_maxNumClonalClusters: 1
 TitanCNA_chrs:  c(1:22, \"X\")
 TitanCNA_normalInit:  0.5
@@ -61,4 +69,9 @@ TitanCNA_alphaR:  5000
 TitanCNA_txnExpLen: 1e15
 TitanCNA_plotYlim:  c(-2,4)
 TitanCNA_solutionThreshold: 0.05
+TitanCNA_mem:  16G
+TitanCNA_runtime:  "10:00:00"
+TitanCNA_numCores: -pe smp 1 -binding linear:1  
+
+
 
diff --git a/getPhasedAlleleCounts.snakefile b/getPhasedAlleleCounts.snakefile
index e3b3cb4..b1ee679 100644
--- a/getPhasedAlleleCounts.snakefile
+++ b/getPhasedAlleleCounts.snakefile
@@ -26,11 +26,14 @@ rule getHETsites:
 		minQual=config["het_minVCFQuality"],
 		minDepth=config["het_minDepth"],
 		minVAF=config["het_minVAF"],
-		libdir=config["TitanCNA_libdir"]
+		libdir=config["TitanCNA_libdir"],
+		mem=config["het_mem"],
+		runtime=config["het_runtime"],
+		pe=config["std_numCores"]
 	log:
 		"logs/phasedCounts/hetPosns/{tumor}.phasedHETsites.log"
 	shell:
-		"Rscript {params.getHETsitesScript} --inVCF {input} --genomeBuild {params.genomeBuild} -- genomeStyle {params.genomeStyle} --snpDB {params.snpDB} --minQuality {params.minQual} --minDepth {params.minDepth} --minVAF {params.minVAF} --altCountField AD --libdir {params.libdir} --outVCFgz {output} 2> {log}"
+		"Rscript {params.getHETsitesScript} --inVCF {input} --genomeBuild {params.genomeBuild} --genomeStyle {params.genomeStyle} --snpDB {params.snpDB} --minQuality {params.minQual} --minDepth {params.minDepth} --minVAF {params.minVAF} --altCountField AD --libdir {params.libdir} --outVCF {output} 2> {log}"
 
 
 rule getAlleleCountsByChr:
@@ -43,7 +46,10 @@ rule getAlleleCountsByChr:
 		countScript=config["phaseCounts_counts_script"],
 		mapQ=config["het_minMapQuality"],
 		baseQ=config["het_minBaseQuality"],
-		vcfQ=config["het_minVCFQuality"]
+		vcfQ=config["het_minVCFQuality"],
+		mem=config["std_mem"],
+		runtime=config["std_runtime"],
+		pe=config["std_numCores"]
 	log:
 		"logs/phasedCounts/tumCounts/{tumor}/{tumor}.tumCounts.{chr}.log"
 	shell:
diff --git a/moleculeCoverage.snakefile b/moleculeCoverage.snakefile
index 5978e4f..b366e4a 100644
--- a/moleculeCoverage.snakefile
+++ b/moleculeCoverage.snakefile
@@ -13,6 +13,8 @@ rule correctMolCov:
   input: 
   	expand("results/bxTile/{samples}/{samples}.bxTile.{chr}.bed", samples=config["samples"], chr=CHRS),
   	#expand("results/moleculeCoverage/{tumor}/{tumor}.cna.seg", tumor=config["pairings"]),
+  	#expand("results/moleculeCoverage/{tumor}/{tumor}.seg.txt", tumor=config["pairings"]),
+  	#expand("results/moleculeCoverage/{tumor}/{tumor}.params.txt", tumor=config["pairings"]),
   	expand("results/moleculeCoverage/{tumor}/{tumor}.BXcounts.txt", tumor=config["pairings"]),
   	expand("results/bxTile/{samples}/", samples=config["samples"])
 
@@ -33,8 +35,9 @@ rule bxTile:
 		samTools=config["samTools"],
 		mapQual=config["bx_mapQual"],
 		bedFile=config["bx_bedFileRoot"]
-	resources:
-		mem=4
+		mem=config["std_mem"],
+		runtime=config["std_runtime"],
+		pe=config["std_numCores"]
 	log:
 		"logs/bxTile/{samples}/{samples}.bxTile.{chr}.log"
 	shell:
@@ -48,7 +51,8 @@ rule moleculeCoverage:
 	output:
 		corrDepth="results/moleculeCoverage/{tumor}/{tumor}.BXcounts.txt",
 		cna="results/moleculeCoverage/{tumor}/{tumor}.cna.seg",
-		#segTxt="results/moleculeCoverage/{tumor}/{tumor}.seg.txt",
+		segTxt="results/moleculeCoverage/{tumor}/{tumor}.seg.txt",
+		paramTxt="results/moleculeCoverage/{tumor}/{tumor}.params.txt",
 		outDir="results/moleculeCoverage/{tumor}/",
 	params:
 		molCovScript=config["molCov_script"],
@@ -61,9 +65,10 @@ rule moleculeCoverage:
 		mapwig=config["molCov_mapWig"],
 		titanLibDir=config["TitanCNA_libdir"],
 		ichorLibDir=config["ichorCNA_libdir"],
-		centromere=config["centromere"]
-	resources:
-		mem=4
+		centromere=config["centromere"],
+		mem=config["std_mem"],
+		runtime=config["std_runtime"],
+		pe=config["std_numCores"]
 	log:
 		"logs/moleculeCoverage/{tumor}.molCov.log"	
 	shell: