From f89eb2e52ebe028e71b2e98158bbf0f60a4f8059 Mon Sep 17 00:00:00 2001 From: Gavin Ha Date: Fri, 3 Aug 2018 14:44:44 -0400 Subject: [PATCH] qsub params; other bug fixes --- README.md | 4 ++-- TitanCNA.snakefile | 15 ++++++++++++--- code/getMoleculeCoverage.R | 6 +++++- code/getPhasedHETSitesFromLLRVCF.R | 14 ++++++++------ config/config.yaml | 17 +++++++++++++++-- getPhasedAlleleCounts.snakefile | 12 +++++++++--- moleculeCoverage.snakefile | 17 +++++++++++------ 7 files changed, 62 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 31d0588..86a1882 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Viswanathan SR*, Ha G*, Hoff A*, et al. Structural Alterations Driving Castratio Gavin Ha Fred Hutchinson Cancer Research Center contact: or -Date: July 26, 2018 +Date: August 3, 2018 ## Requirements ### Software packages or libraries @@ -62,7 +62,7 @@ snakemake -s TitanCNA.snakefile -np # run the workflow locally using 5 cores snakemake -s TitanCNA.snakefile --cores 5 # run the workflow on qsub using a maximum of 50 jobs. Broad UGER cluster parameters can be set directly in config/cluster.sh. -snakemake -s TitanCNA.snakefile --cluster-sync "qsub" -j 50 --jobscript config/cluster.sh +snakemake -s TitanCNA.snakefile --cluster-sync "qsub -l h_vmem={params.mem},h_rt={params.runtime}" -j 50 --jobscript config/cluster.sh ``` This will also run both `moleculeCoverage.snakefile` and `getPhasedAlleleCounts.snakefile` which generate the necessary inputs for `TitanCNA.snakfile`. diff --git a/TitanCNA.snakefile b/TitanCNA.snakefile index d84ee4a..e3fc06c 100644 --- a/TitanCNA.snakefile +++ b/TitanCNA.snakefile @@ -55,7 +55,10 @@ rule runTitanCNA: alphaR=config["TitanCNA_alphaR"], #alleleModel=config["TitanCNA_alleleModel"], txnExpLen=config["TitanCNA_txnExpLen"], - plotYlim=config["TitanCNA_plotYlim"] + plotYlim=config["TitanCNA_plotYlim"], + mem=config["TitanCNA_mem"], + runtime=config["TitanCNA_runtime"], + pe=config["TitanCNA_numCores"] log: "logs/titan/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.log" shell: @@ -77,7 +80,10 @@ rule combineTitanAndIchorCNA: combineScript=config["TitanCNA_combineTitanIchorCNA"], libdir=config["TitanCNA_libdir"], centromere=config["centromere"], - gender=config["gender"] + gender=config["gender"], + mem=config["std_mem"], + runtime=config["std_runtime"], + pe=config["std_numCores"] log: "logs/titan/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.combineTitanIchorCNA.log" shell: @@ -92,7 +98,10 @@ rule selectSolution: solutionsTxt="results/titan/optimalClusterSolution.txt", params: solutionRscript=config["TitanCNA_selectSolutionRscript"], - threshold=config["TitanCNA_solutionThreshold"] + threshold=config["TitanCNA_solutionThreshold"], + mem=config["std_mem"], + runtime=config["std_runtime"], + pe=config["std_numCores"] log: "logs/titan/optSolution/selectSolution.log" shell: diff --git a/code/getMoleculeCoverage.R b/code/getMoleculeCoverage.R index 6764cb6..0c4258a 100644 --- a/code/getMoleculeCoverage.R +++ b/code/getMoleculeCoverage.R @@ -92,6 +92,7 @@ normalizeMaleX <- as.logical(opt$normalizeMaleX) includeHOMD <- as.logical(opt$includeHOMD) fracReadsInChrYForMale <- opt$fracReadsInChrYForMale outDir <- opt$outDir +outPlotDir <- paste0(outDir, "/", patientID) libdirTitanCNA <- opt$libdirTitanCNA libdirIchorCNA <- opt$libdirIchorCNA plotFileType <- opt$plotFileType @@ -142,6 +143,8 @@ if (is.null(centromere) || centromere == "None" || centromere == "NULL"){ # no c } centromere <- read.delim(centromere,header=T,stringsAsFactors=F,sep="\t") +save.image(outImage) + ## LOAD IN WIG FILES ## numSamples <- 1 tumour_counts <- list() @@ -150,6 +153,7 @@ for (i in 1:numSamples) { id <- patientID ## create output directories for each sample ## dir.create(paste0(outDir, "/"), recursive = TRUE) + dir.create(paste0(outPlotDir, "/"), recursive = TRUE) ### LOAD TUMOUR AND NORMAL FILES ### message("Loading tumour files from ", tumour_file) tumour_doc <- loadBXcountsFromBEDDir(tumour_file, chrs = chrsAll, minReads = minReadsPerBX) @@ -302,7 +306,7 @@ for (n in normal){ hmmResults.cor$results$n[s, iter] <- 1.0 } ## plot solution ## - outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_", "n", n, "-p", p) + outPlotFile <- paste0(outPlotDir, "/", id, "_genomeWide_", "n", n, "-p", p) mainName[counter] <- paste0(id, ", n: ", n, ", p: ", p, ", log likelihood: ", signif(hmmResults.cor$results$loglik[hmmResults.cor$results$iter], digits = 4)) plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType=plotFileType, plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, main=mainName[counter]) diff --git a/code/getPhasedHETSitesFromLLRVCF.R b/code/getPhasedHETSitesFromLLRVCF.R index 6de7deb..715c3c6 100644 --- a/code/getPhasedHETSitesFromLLRVCF.R +++ b/code/getPhasedHETSitesFromLLRVCF.R @@ -20,7 +20,7 @@ option_list <- list( make_option(c("--genomeStyle"), type = "character", default="NCBI", help = "Chr naming convention. NCBI (e.g. 1) or UCSC (e.g. chr1). Default: [%default]"), make_option(c("-s","--snpDB"), type="character", default=NULL, help= "VCF file of list of known SNPs (e.g. HapMap, 1000 Genomes, dbSNP)"), make_option(c("--altCountField"), type="character", default="AD", help="Alternate allele count field name in genotype. [Default: %default]"), - make_option(c("-o","--outVCFgz"), type = "character", help = "Output VCF file with suffix \"_phasedHets.vcf\" [Required]"), + make_option(c("-o","--outVCF"), type = "character", help = "Output VCF file with suffix \"_phasedHets.vcf\" [Required]"), make_option(c("--libdir"), type="character", default=NULL, help="Path to TitanCNA package directory. If provided, will source scripts after loading installed TitanCNA package.") ) @@ -29,6 +29,7 @@ opt <- parse_args(parseobj) print(opt) library(TitanCNA) +library(GenomeInfoDb) library(VariantAnnotation) vcfFile <- opt$inVCF @@ -39,8 +40,8 @@ chrs <- eval(parse(text = opt$chrs)) minQUAL <- opt$minQuality minDepth <- opt$minDepth minVAF <- opt$minVAF -outVCFgz <- opt$outVCFgz -outVCF <- gsub("gz", "", outVCFgz) +outVCF <- opt$outVCF +outVCFgz <- paste0(outVCF, ".gz") libdir <- opt$libdir altCountField <- opt$altCountField @@ -59,14 +60,14 @@ filterFlags <- c("PASS", "10X_RESCUED_MOLECULE_HIGH_DIVERSITY") #minQUAL <- 100 keepGenotypes <- c("1|0", "0|1", "0/1") -save.image("tmp.RData") + #vcfFiles <- list.files(LRdir, pattern = "_phased_variants.vcf.gz$", full.name = TRUE) #for (i in 1:length(vcfFiles)){ #id <- gsub("_phased_variants.vcf.gz", "", basename(vcfFile)) hap <- getHaplotypesFromVCF(vcfFile, - chrs = chrs, build = genomeBuild, style = genomeStyle, + chrs = chrs, build = genomeBuild, genomeStyle = genomeStyle, filterFlags = filterFlags, minQUAL = minQUAL, minDepth = minDepth, minVAF = minVAF, keepGenotypes = keepGenotypes, altCountField = altCountField, snpDB = snpFile) @@ -75,10 +76,11 @@ hap <- getHaplotypesFromVCF(vcfFile, ## remove BX genotype field to make things faster geno(hap$vcf)$BX <- NULL +message("Writing to file: ", outVCF) writeVcf(hap$vcf, filename = outVCF) bgzipFile <- bgzip(outVCF, dest = outVCFgz, overwrite = TRUE) indexTabix(bgzipFile, format = "vcf") - + # outFile <- paste0(outDir, "/", id, "_phasedGR.rds") # saveRDS(hap$geno, file = outFile) diff --git a/config/config.yaml b/config/config.yaml index f8d869e..ff49555 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -24,8 +24,15 @@ gender: male bamFileName: phased_possorted_bam.bam phaseVariantFileName: phased_variants.vcf.gz -## params for each step ## +## standard parameters for qsub memory, runtime, pe (parallel environment) +# use what your scheduler designates for parallel environment; does not apply to local run +# invoke using: +# snakemake -s TitanCNA.snakefile --cluster-sync "qsub -l h_vmem={params.mem},h_rt={params.runtime} {params.pe}" -j 200 --jobscript config/cluster.sh +std_mem: 4G +std_runtime: "03:00:00" +std_numCores: -pe smp 1 -binding linear:1 +## params for each step ## ## bxTools ## bx_mapQual: 60 bx_bedFileRoot: data/10kb_hg38/10kb_hg38 @@ -41,13 +48,14 @@ molCov_maxCN: 8 het_minVCFQuality: 100 het_minDepth: 10 het_minVAF: 0.25 +het_mem: 16G +het_runtime: "06:00:00" ## bam filters for tumor allelic counts ## het_minBaseQuality: 10 het_minMapQuality: 20 ## TitanCNA params ## -TitanCNA_numCores: 1 TitanCNA_maxNumClonalClusters: 1 TitanCNA_chrs: c(1:22, \"X\") TitanCNA_normalInit: 0.5 @@ -61,4 +69,9 @@ TitanCNA_alphaR: 5000 TitanCNA_txnExpLen: 1e15 TitanCNA_plotYlim: c(-2,4) TitanCNA_solutionThreshold: 0.05 +TitanCNA_mem: 16G +TitanCNA_runtime: "10:00:00" +TitanCNA_numCores: -pe smp 1 -binding linear:1 + + diff --git a/getPhasedAlleleCounts.snakefile b/getPhasedAlleleCounts.snakefile index e3b3cb4..b1ee679 100644 --- a/getPhasedAlleleCounts.snakefile +++ b/getPhasedAlleleCounts.snakefile @@ -26,11 +26,14 @@ rule getHETsites: minQual=config["het_minVCFQuality"], minDepth=config["het_minDepth"], minVAF=config["het_minVAF"], - libdir=config["TitanCNA_libdir"] + libdir=config["TitanCNA_libdir"], + mem=config["het_mem"], + runtime=config["het_runtime"], + pe=config["std_numCores"] log: "logs/phasedCounts/hetPosns/{tumor}.phasedHETsites.log" shell: - "Rscript {params.getHETsitesScript} --inVCF {input} --genomeBuild {params.genomeBuild} -- genomeStyle {params.genomeStyle} --snpDB {params.snpDB} --minQuality {params.minQual} --minDepth {params.minDepth} --minVAF {params.minVAF} --altCountField AD --libdir {params.libdir} --outVCFgz {output} 2> {log}" + "Rscript {params.getHETsitesScript} --inVCF {input} --genomeBuild {params.genomeBuild} --genomeStyle {params.genomeStyle} --snpDB {params.snpDB} --minQuality {params.minQual} --minDepth {params.minDepth} --minVAF {params.minVAF} --altCountField AD --libdir {params.libdir} --outVCF {output} 2> {log}" rule getAlleleCountsByChr: @@ -43,7 +46,10 @@ rule getAlleleCountsByChr: countScript=config["phaseCounts_counts_script"], mapQ=config["het_minMapQuality"], baseQ=config["het_minBaseQuality"], - vcfQ=config["het_minVCFQuality"] + vcfQ=config["het_minVCFQuality"], + mem=config["std_mem"], + runtime=config["std_runtime"], + pe=config["std_numCores"] log: "logs/phasedCounts/tumCounts/{tumor}/{tumor}.tumCounts.{chr}.log" shell: diff --git a/moleculeCoverage.snakefile b/moleculeCoverage.snakefile index 5978e4f..b366e4a 100644 --- a/moleculeCoverage.snakefile +++ b/moleculeCoverage.snakefile @@ -13,6 +13,8 @@ rule correctMolCov: input: expand("results/bxTile/{samples}/{samples}.bxTile.{chr}.bed", samples=config["samples"], chr=CHRS), #expand("results/moleculeCoverage/{tumor}/{tumor}.cna.seg", tumor=config["pairings"]), + #expand("results/moleculeCoverage/{tumor}/{tumor}.seg.txt", tumor=config["pairings"]), + #expand("results/moleculeCoverage/{tumor}/{tumor}.params.txt", tumor=config["pairings"]), expand("results/moleculeCoverage/{tumor}/{tumor}.BXcounts.txt", tumor=config["pairings"]), expand("results/bxTile/{samples}/", samples=config["samples"]) @@ -33,8 +35,9 @@ rule bxTile: samTools=config["samTools"], mapQual=config["bx_mapQual"], bedFile=config["bx_bedFileRoot"] - resources: - mem=4 + mem=config["std_mem"], + runtime=config["std_runtime"], + pe=config["std_numCores"] log: "logs/bxTile/{samples}/{samples}.bxTile.{chr}.log" shell: @@ -48,7 +51,8 @@ rule moleculeCoverage: output: corrDepth="results/moleculeCoverage/{tumor}/{tumor}.BXcounts.txt", cna="results/moleculeCoverage/{tumor}/{tumor}.cna.seg", - #segTxt="results/moleculeCoverage/{tumor}/{tumor}.seg.txt", + segTxt="results/moleculeCoverage/{tumor}/{tumor}.seg.txt", + paramTxt="results/moleculeCoverage/{tumor}/{tumor}.params.txt", outDir="results/moleculeCoverage/{tumor}/", params: molCovScript=config["molCov_script"], @@ -61,9 +65,10 @@ rule moleculeCoverage: mapwig=config["molCov_mapWig"], titanLibDir=config["TitanCNA_libdir"], ichorLibDir=config["ichorCNA_libdir"], - centromere=config["centromere"] - resources: - mem=4 + centromere=config["centromere"], + mem=config["std_mem"], + runtime=config["std_runtime"], + pe=config["std_numCores"] log: "logs/moleculeCoverage/{tumor}.molCov.log" shell: