MoTrPAC · mihirsamdarshi · Aug 14, 2022 · Aug 15, 2022 · Aug 23, 2022 · Sep 2, 2022
diff --git a/src/align_stats.sh b/src/align_stats.sh
@@ -85,12 +85,17 @@ align_stats() {
 export -f align_stats
 
+# GNU parallel job log for this run, named after the last path component of
+# $bamdir (in "file" mode $bamdir holds the newline-separated BAM list itself).
+joblog=~/mnt/tmp/"$(basename "$bamdir")"_align_stats_joblog.log
+
 if [ "$type" == "glob" ]; then
-  parallel --verbose --jobs "$cores" align_stats ::: "$(ls "${bamdir%/}"/*/*/align/rep*/*.trim.bam)"
+  parallel --joblog "$joblog" --progress --verbose --jobs "$cores" align_stats ::: "$(ls "${bamdir%/}"/*/*/align/rep*/*_R1.trim.bam)"
 elif [ "$type" == "file" ]; then
   readarray -t raw_bam_list <<<"$bamdir"
-  parallel --verbose --jobs "$cores" align_stats ::: "${raw_bam_list[@]}"
+  parallel --joblog "$joblog" --progress --verbose --jobs "$cores" align_stats ::: "${raw_bam_list[@]}"
 elif [ "$type" == "find" ]; then
-  parallel --verbose --jobs "$cores" align_stats ::: "$(find -name "*_R1.trim.bam" "$bamdir")"
+  # find takes its starting directory before the -name expression
+  parallel --joblog "$joblog" --progress --verbose --jobs "$cores" align_stats ::: "$(find "$bamdir" -name "*_R1.trim.bam")"
 else
   echo "type must be glob, file, or find"
   exit 1

diff --git a/src/align_stats_concat_only.sh b/src/align_stats_concat_only.sh
@@ -69,12 +69,17 @@ align_stats() {
 export -f align_stats
 
+# GNU parallel job log for this run, named after the last path component of
+# $bamdir (in "file" mode $bamdir holds the newline-separated BAM list itself).
+joblog=~/mnt/tmp/"$(basename "$bamdir")"_align_stats_concat_joblog.log
+
 if [ "$type" == "glob" ]; then
-  parallel --verbose --jobs "$cores" align_stats ::: "$(ls "${bamdir%/}"/*/*/align/rep*/*.trim.bam)"
+  parallel --joblog "$joblog" --progress --verbose --jobs "$cores" align_stats ::: "$(ls "${bamdir%/}"/*/*/align/rep*/*_R1.trim.bam)"
 elif [ "$type" == "file" ]; then
   readarray -t raw_bam_list <<<"$bamdir"
-  parallel --verbose --jobs "$cores" align_stats ::: "${raw_bam_list[@]}"
+  parallel --joblog "$joblog" --progress --verbose --jobs "$cores" align_stats ::: "${raw_bam_list[@]}"
 elif [ "$type" == "find" ]; then
-  parallel --verbose --jobs "$cores" align_stats ::: "$(find -name "*_R1.trim.bam" "$bamdir")"
+  # find takes its starting directory before the -name expression
+  parallel --joblog "$joblog" --progress --verbose --jobs "$cores" align_stats ::: "$(find "$bamdir" -name "*_R1.trim.bam")"
 else
   echo "type must be glob, file, or find"
   exit 1

diff --git a/src/croo.sh b/src/croo.sh
@@ -1,48 +1,65 @@
 #!/bin/bash
+
+set -eux
 #Usage : src/croo.sh <list_of_atac-seq_workflow_ids> <GCP_PATH_to_atac_seq_pipeline_workflows_without_trailing_slash> <gcp-path-to-output-without-trailing-slash> >>croo_copy_jobs.txt
 #Contributors : Archana Raja, Nicole Gay
 
 if [ $# -lt 3 ]; then
-  echo "Usage: ./croo.sh [WORKFLOW_ID_LIST] [GCP_PATH] [OUT_PATH]"
-  echd
-  echo "Example: croo.sh ids.txt gs://my-bucket/my_workflow/outputs/croo gs://my-bucket/my_workflow/processed"
+  echo "Usage: ./croo.sh [WORKFLOW_SUBMISSION_MAP] [GCP_PATH] [OUT_PATH] [PARSE_FROM_ID_LIST]"
+  echo # blank line between the synopsis and the example
+  echo "Example: croo.sh out.json gs://my-bucket/my_workflow/outputs/croo gs://my-bucket/my_workflow/processed"
   echo
-  echo "[WORKFLOW_ID_LIST]: A list of workflow ids to process"
+  echo "[WORKFLOW_SUBMISSION_MAP]: JSON array of objects with workflow_id and label fields, one per workflow to process"
   echo "[GCP_PATH]: This directory with the outputs of the pipeline"
   echo "[OUT_PATH]: The location to output the croo files to"
+  echo "[PARSE_FROM_ID_LIST] (Optional): Whether to use the workflow id list to parse the files to copy. If false/not set will use qc json to create a file name"
   echo
   exit 1
 fi
 
-WORKFLOW_ID_LIST=$1
+WORKFLOW_SUBMISSION_MAP=$1
 GCP_PATH=$2
 OUT_PATH=${3%/}
-
+PARSE_FROM_ID_LIST=${4:-} # optional 4th argument; defaults to empty so `set -u` does not abort when it is omitted
 
 function run_croo() {
   local line=$1
+  local sample_dir
   local out_dir
+  local descrip
 
   sample_dir=$GCP_PATH/${line%/}
   out_dir=${OUT_PATH%/}/${sample_dir#gs://}
 
-  # as long as the description is hyphenated and don't contain any spaces or special characters below would work
-  descrip=$(gsutil cat "$sample_dir"/call-qc_report/glob-*/qc.json | grep "description" | sed -e 's/.*": "//' -e 's/".*//')
-
-  if [ "$descrip" = "No description" ]; then
-    descrip=$(gstil cat "$sample_dir"/call-qc_report/glob-*/qc.json | grep "title" | sed -e 's/.*": "//' -e 's/".*//')
+  if [[ "$PARSE_FROM_ID_LIST" ]]; then
+    out_dir="$out_dir"/$(jq -r '.[] | select(.workflow_id == "'"$line"'") | .label' "$WORKFLOW_SUBMISSION_MAP")
+    echo "out_dir: $out_dir"
   else
-    descrip=$descrip
+    # as long as the description is hyphenated and don't contain any spaces or special characters below would work
+    descrip=$(gsutil cat "$sample_dir"/call-qc_report/glob-*/qc.json | grep "description" | sed -e 's/.*": "//' -e 's/".*//')
+
+    if [ "$descrip" = "No description" ]; then
+      descrip=$(gstil cat "$sample_dir"/call-qc_report/glob-*/qc.json | grep "title" | sed -e 's/.*": "//' -e 's/".*//')
+    else
+      descrip=$descrip
+    fi
+
+    out_dir="$out_dir"/"${descrip/gs:\/\///}"/
   fi
 
-  out_dir="$out_dir"/"${descrip/gs:\/\///}"/
   croo --method copy "$sample_dir"/metadata.json --out-dir "$out_dir"
 }
 
 export GCP_PATH
 export OUT_PATH
+export WORKFLOW_SUBMISSION_MAP
+export PARSE_FROM_ID_LIST
 export -f run_croo
+cores=10 # concurrent run_croo jobs; the unquoted $(jq ...) below is split on purpose into one argument per workflow id
+map_name=$(basename "$WORKFLOW_SUBMISSION_MAP") # the job log is named after the map file itself, not its directory path
+# shellcheck disable=SC2046
+parallel --joblog ~/mnt/tmp/"${map_name%%.*}"_croo.log --progress --verbose --jobs "$cores" run_croo ::: $(jq -r '.[].workflow_id' "$WORKFLOW_SUBMISSION_MAP")
 
-while IFS= read -r line; do
-  run_croo "$line" &
-done <"$WORKFLOW_ID_LIST"
+#for line in $(jq -r '.[].workflow_id' "$WORKFLOW_SUBMISSION_MAP"); do
+#  run_croo "$line"
+#done
diff --git a/src/merge_atac_qc_human.R b/src/merge_atac_qc_human.R
@@ -0,0 +1,96 @@
+#!/bin/R
+# Nicole Gay
+# 15 May 2020 
+# Fix and merge ATAC-seq QC 
+
+#Usage : Rscript src/merge_atac_qc_human.R -w ~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final/sample_metadata_20200928.csv -q ~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final/qc/atac_qc.tsv -a ~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final/merged_chr_info.csv -o ~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final/
+
+library(data.table)
+library(optparse)
+
+# Command-line interface: all four options are required (checked right after parsing).
+option_list <- list(
+  make_option(c("-w", "--sample_meta"), help = "Absolute path to wetlab sample metadata file, e.g. sample_metadata_YYYYMMDD.csv"),
+  make_option(c("-q", "--atac_qc"), help = "Absolute path to pipeline qc metrics file output of qc2tsv tool, e.g. atac_qc.tsv"),
+  make_option(c("-a", "--align_stats"), help = "Absolute path to genome alignment stats file, e.g. merged_chr_info.csv"),
+  make_option(c("-o", "--outdir"), help = "Absolute path to output directory for the merged qc reports")
+)
+
+opt_parse_inst <- OptionParser(option_list = option_list)
+# opt is read below by long flag name: sample_meta, atac_qc, align_stats, outdir
+opt <- parse_args(opt_parse_inst)
+
+# Abort with the usage help unless every required option was supplied
+required_opts <- c('sample_meta', 'atac_qc', 'align_stats', 'outdir')
+if (any(vapply(required_opts, function(nm) is.null(opt[[nm]]), logical(1)))) {
+  message("\033[31mERROR! Please provide all required arguments")
+  message("\033[34mExample: Rscript src/merge_atac_qc_human.R -w <sample_metadata_YYYYMMDD.csv> -q <atac_qc.tsv> -a <merged_chr_info.csv> -o <output_directory>")
+  print_help(opt_parse_inst)
+  # non-zero exit status so calling scripts can detect the failure
+  quit("no", status = 1)
+}
+
+# Load the three input tables; duplicate rows of the wet-lab sheet are dropped as it is read
+wet <- unique(fread(opt$sample_meta, sep = ',', header = TRUE))
+encode <- fread(opt$atac_qc, sep = '\t', header = TRUE)
+align_stat <- fread(opt$align_stats, sep = ',', header = TRUE)
+
+###################################################################################
+## fix ENCODE QC
+###################################################################################
+
+# Forward-fill every "general" column except general.description: a blank cell takes the nearest non-blank value above it
+cols <- Filter(function(nm) nm != 'general.description', grep('general', colnames(encode), value = TRUE))
+for (col in cols) {
+  print(col)
+  fill_value <- encode[1, get(col)] # the first row seeds the fill
+  for (i in seq_len(nrow(encode))) {
+    current <- encode[i, get(col)]
+    if (as.character(current) != '') {
+      fill_value <- current
+    } else {
+      encode[i, (col) := fill_value]
+    }
+  }
+}
+
+# split QC columns: any blank/NA cell marks a column as workflow-level, fully populated columns are sample-level
+workflow_level <- colnames(encode)[vapply(encode, function(v) any(is.na(v) | as.character(v) == ''), logical(1))]
+workflow_qc <- encode[replicate == 'rep1', c('general.description', workflow_level), with = FALSE]
+viallabel_qc <- encode[, colnames(encode)[!(colnames(encode) %in% workflow_level)], with = FALSE]
+
+###################################################################################
+## merge all sample-level QC
+###################################################################################
+
+# merge with wet lab QC; every sample-level QC row must survive the join
+m1 <- merge(viallabel_qc, wet, by.x = 'general.title', by.y = 'vial_label')
+stopifnot(nrow(m1) == nrow(viallabel_qc))
+# merge with align stats; again no row may be lost
+print(colnames(m1))
+print(colnames(align_stat))
+m2 <- merge(m1, align_stat, by.x = 'general.title', by.y = 'viallabel')
+stopifnot(nrow(m2) == nrow(viallabel_qc))
+# remove numeric columns whose every value is 0 or every value is 100
+# check_col(x): returns x itself when the column should be dropped, NA otherwise
+check_col <- function(x) {
+  # isTRUE(): a column containing NA makes all() yield NA; such a column is kept instead of aborting the `if`
+  if (is.numeric(x) && (isTRUE(all(x == 0)) || isTRUE(all(x == 100)))) {
+    return(x)
+  }
+  return(NA)
+}
+
+drop_cols <- names(m2)[vapply(m2, function(x) !identical(check_col(x), NA), logical(1))]
+# skip the data.table assignment entirely when nothing qualifies
+if (length(drop_cols) > 0) m2[, (drop_cols) := NULL]
+
+head(m2)
+
+# write out merged QC; trailing slashes are stripped once so both report paths are built the same way
+outdir <- trimws(opt$outdir, which = 'right', whitespace = '/')
+outfile <- paste0(outdir, '/', 'merged_atac_qc.tsv')
+write.table(m2, file = outfile, sep = '\t', col.names = TRUE, row.names = FALSE, quote = FALSE)
+# write out workflow-level QC
+outfile_workflow <- paste0(outdir, '/', 'encode_workflow_level_atac_qc.tsv')
+write.table(workflow_qc, file = outfile_workflow, sep = '\t', col.names = TRUE, row.names = FALSE, quote = FALSE)
diff --git a/src/pass_extract_atac_from_gcp.sh b/src/pass_extract_atac_from_gcp.sh
@@ -66,7 +66,7 @@ else
   gsutil -m cp -n "${CROO_OUTPUT_PATH}/*/*/align/rep?/*tagAlign.gz" tagalign
 
   # individual signal track (p-value)
-  gsutil -m cp -n "${CROO_OUTPUT_PATH}/*/*/signal/rep?/*pval.signal.bigwig signal"
+  gsutil -m cp -n "${CROO_OUTPUT_PATH}/*/*/signal/rep?/*pval.signal.bigwig" signal
 fi
 
 gs_copy() {
@@ -115,8 +115,8 @@ gs_copy_gcp() {
 
 if [[ "$copy_dest" == "gcp" ]]; then
   export -f gs_copy_gcp
-  parallel --verbose --jobs "$NUM_CORES" gs_copy_gcp ::: "$(gsutil ls "$CROO_OUTPUT_PATH" | grep -E "/$" | grep -v "final")"
+  parallel --progress --verbose --jobs "$NUM_CORES" gs_copy_gcp ::: "$(gsutil ls "$CROO_OUTPUT_PATH" | grep -E "/$" | grep -v "final")" # feeds the listing entries ending in "/" (minus any containing "final") to gs_copy_gcp; NOTE(review): the quoted $(...) reaches parallel as ONE newline-separated argument - confirm gs_copy_gcp expects that
 else
   export -f gs_copy
-  parallel --verbose --jobs "$NUM_CORES" gs_copy ::: "$(gsutil ls "$CROO_OUTPUT_PATH" | grep -E "/$" | grep -v "final")"
+  parallel --progress --verbose --jobs "$NUM_CORES" gs_copy ::: "$(gsutil ls "$CROO_OUTPUT_PATH" | grep -E "/$" | grep -v "final")" # same listing, handed to gs_copy; NOTE(review): single quoted argument here as well - confirm gs_copy splits it
 fi
diff --git a/src/qc2tsv.sh b/src/qc2tsv.sh
@@ -7,9 +7,9 @@ trap "echo ERR trap fired!" ERR
 gcp_path=$1
 outfile_name=$2
 
-gsutil ls ${gcp_path}/*/*/qc/qc.json >file_list.txt
+gsutil ls "${gcp_path}/*/*/qc/qc.json" >file_list.txt # the whole pattern is quoted so gsutil, not the local shell, expands the wildcards
 echo "Done creating file list"
-/home/araja7/miniconda3/bin/qc2tsv --file file_list.txt --collapse-header >${outfile_name}
-gsutil mv ${outfile_name} ${gcp_path}/final/
+qc2tsv --file file_list.txt --collapse-header >"$outfile_name" # qc2tsv is resolved from PATH
+gsutil mv "$outfile_name" "$gcp_path"/final/ # move the report under the run's final/ folder
 rm -rf file_list.txt
 echo "Done creating atac-seq qc report"