# scRNA-seq screen with force-sensitive enhancer library

Collaboration with Brian (BDC). This notebook processes the 10X data and performs differential expression (DE) testing using MAST in Seurat. <br>
Libraries generated in August 2021:
- Dual indexed expr libraries
- Single indexed grna libraries
- Aimed to recover 15K cells per lane, estimating ~12K singlets

### Create directory for logs

In [1]:
%%bash
# Directory that receives the SLURM log files written by the jobs below.
LOG_DIR=/data/gersbachlab/lrb53/brianCollab/logs
mkdir -p "${LOG_DIR}"

### Download sequencing data from the Duke seq core

There are two sequencing runs:
- expression libraries: 7246
- grna libraries: 7247
<br>
<br>
Expr libraries: S4 flow cell (28x10x10x90); grna libraries: S1 flow cell (28x8x0x91)

In [1]:
%%bash

# Submit a SLURM job that downloads the `Bounds_7246` project (expression
# libraries) with ddsclient into the expr data directory.
EXPR_DIR=/data/gersbachlab/lrb53/brianCollab/data/expr
mkdir -p "${EXPR_DIR}"
cd "${EXPR_DIR}"

module load ddsclient

sbatch \
    --output=/data/gersbachlab/lrb53/brianCollab/logs/download.expr.log \
    --partition=all \
    --cpus-per-task=8 \
    --mem=32G \
    <<'EOF'
#!/bin/bash

ddsclient download -p Bounds_7246 /data/gersbachlab/lrb53/brianCollab/data/expr

EOF

Submitted batch job 26128696


In [2]:
%%bash

# Submit a SLURM job that downloads the `Bounds_7247` project (gRNA libraries)
# with ddsclient into the grna data directory.
GRNA_DIR=/data/gersbachlab/lrb53/brianCollab/data/grna
mkdir -p "${GRNA_DIR}"
cd "${GRNA_DIR}"

module load ddsclient

sbatch \
    --output=/data/gersbachlab/lrb53/brianCollab/logs/download.grnas.log \
    --partition=all \
    --cpus-per-task=8 \
    --mem=32G \
    <<'EOF'
#!/bin/bash

ddsclient download -p Bounds_7247 /data/gersbachlab/lrb53/brianCollab/data/grna

EOF

Submitted batch job 26099233


### Demultiplex sequencing data

#### Create directories for each type of library

In [1]:
%%bash
# One FASTQ output directory per library type (expr first, then grna).
mkdir -p /data/gersbachlab/lrb53/brianCollab/data/{expr,grna}/fastq

#### Expression libraries

In [11]:
%%bash
# Print the Lane,Sample,Index sheet that is passed to `cellranger mkfastq --csv`
# for the expression libraries.
cat /data/gersbachlab/lrb53/brianCollab/data/samplesheets/expr.cellrangermkfastq.csv

Lane,Sample,Index
1,expr1,SI-TT-A6
1,expr2,SI-TT-B6
1,expr3,SI-TT-C6
1,expr4,SI-TT-D6
1,expr5,SI-TT-E6
1,expr6,SI-TT-F6
1,expr7,SI-TT-G6
1,expr8,SI-TT-H6
1,expr9,SI-TT-A7
1,expr10,SI-TT-B7
1,expr11,SI-TT-H3
1,expr12,SI-TT-H5
2,expr1,SI-TT-A6
2,expr2,SI-TT-B6
2,expr3,SI-TT-C6
2,expr4,SI-TT-D6
2,expr5,SI-TT-E6
2,expr6,SI-TT-F6
2,expr7,SI-TT-G6
2,expr8,SI-TT-H6
2,expr9,SI-TT-A7
2,expr10,SI-TT-B7
2,expr11,SI-TT-H3
2,expr12,SI-TT-H5
3,expr1,SI-TT-A6
3,expr2,SI-TT-B6
3,expr3,SI-TT-C6
3,expr4,SI-TT-D6
3,expr5,SI-TT-E6
3,expr6,SI-TT-F6
3,expr7,SI-TT-G6
3,expr8,SI-TT-H6
3,expr9,SI-TT-A7
3,expr10,SI-TT-B7
3,expr11,SI-TT-H3
3,expr12,SI-TT-H5
4,expr1,SI-TT-A6
4,expr2,SI-TT-B6
4,expr3,SI-TT-C6
4,expr4,SI-TT-D6
4,expr5,SI-TT-E6
4,expr6,SI-TT-F6
4,expr7,SI-TT-G6
4,expr8,SI-TT-H6
4,expr9,SI-TT-A7
4,expr10,SI-TT-B7
4,expr11,SI-TT-H3
4,expr12,SI-TT-H5


In [1]:
%%bash
# Demultiplex the expression run with `cellranger mkfastq` as a SLURM job.
# The job is submitted from the FASTQ directory, which is also --output-dir.
source activate /data/gersbachlab/lrb53/envs/conda
cd /data/gersbachlab/lrb53/brianCollab/data/expr/fastq

sbatch \
    --job-name=mkfastqExpr \
    --output=/data/gersbachlab/lrb53/brianCollab/logs/mkfastq.expr.log \
    --partition=all \
    --cpus-per-task=8 \
    --mem=32G \
    --mail-type=ALL \
    --mail-user=lrb53@duke.edu \
    <<'EOF'
#!/bin/bash

/data/gersbachlab/Software/cellranger-6.0.1/cellranger mkfastq \
    --id=expr \
    --localmem=4 \
    --localcores=8 \
    --run=/data/gersbachlab/lrb53/brianCollab/data/expr/210918_A00257_0692_BHML3KDSX2 \
    --csv=/data/gersbachlab/lrb53/brianCollab/data/samplesheets/expr.cellrangermkfastq.csv \
    --output-dir=/data/gersbachlab/lrb53/brianCollab/data/expr/fastq

EOF


Submitted batch job 26179129


#### sgRNA libraries

In [2]:
%%bash
# Print the sample sheet (with [Header]/[Reads]/[Data] sections) that is passed
# to `cellranger mkfastq --csv` for the gRNA libraries.
cat /data/gersbachlab/lrb53/brianCollab/data/samplesheets/grna.bcl2fastq.samplesheet.csv

[Header],,,
IEMFileVersion,4,,
Investigator Name,LRB,,
Experiment Name,brian_collab_grnas,,
Date,09/18/21,,
Workflow,GenerateFASTQ,,
,,,
[Reads],,,
28,,,
90,,,
,,,
[Settings],,,
,,,
[Data],,,
Lane,Sample_ID,Sample_Name,index
1,grna1,grna1,TAAGGCGA
1,grna2,grna2,CGTACTAG
1,grna3,grna3,AGGCAGAA
1,grna4,grna4,TCCTGAGC
1,grna5,grna5,GGACTCCT
1,grna6,grna6,TAGGCATG
1,grna7,grna7,CTCTCTAC
1,grna8,grna8,CGAGGCTG
1,grna9,grna9,AAGAGGCA
1,grna10,grna10,GTAGAGGA
1,grna11,grna11,GCTCATGA
1,grna12,grna12,ATCTCAGG
2,grna1,grna1,TAAGGCGA
2,grna2,grna2,CGTACTAG
2,grna3,grna3,AGGCAGAA
2,grna4,grna4,TCCTGAGC
2,grna5,grna5,GGACTCCT
2,grna6,grna6,TAGGCATG
2,grna7,grna7,CTCTCTAC
2,grna8,grna8,CGAGGCTG
2,grna9,grna9,AAGAGGCA
2,grna10,grna10,GTAGAGGA
2,grna11,grna11,GCTCATGA
2,grna12,grna12,ATCTCAGG


In [10]:
%%bash
# Demultiplex the gRNA run with `cellranger mkfastq` as a SLURM job.
# The job is submitted from the FASTQ directory, which is also --output-dir.
source activate /data/gersbachlab/lrb53/envs/conda
cd /data/gersbachlab/lrb53/brianCollab/data/grna/fastq

sbatch \
    --job-name=mkfastqGrna \
    --output=/data/gersbachlab/lrb53/brianCollab/logs/mkfastq.grna.log \
    --partition=all \
    --cpus-per-task=8 \
    --mem=32G \
    --mail-type=ALL \
    --mail-user=lrb53@duke.edu \
    <<'EOF'
#!/bin/bash

/data/gersbachlab/Software/cellranger-6.0.1/cellranger mkfastq \
    --id=grna \
    --localmem=4 \
    --localcores=8 \
    --run=/data/gersbachlab/lrb53/brianCollab/data/grna/210909_A00257_0685_AHKK53DRXY \
    --csv=/data/gersbachlab/lrb53/brianCollab/data/samplesheets/grna.bcl2fastq.samplesheet.csv \
    --output-dir=/data/gersbachlab/lrb53/brianCollab/data/grna/fastq

EOF


Submitted batch job 26099326


#### Merge fastqs

In [1]:
%%bash
# Concatenate the per-lane FASTQs (L001-L004) of each expression library into a
# single file per read type (I1, R1, R2) under fastq/merged, named as lane L001.
#
# Fails loudly instead of silently writing a bogus merged file: with no matching
# inputs a bare `cat` would read stdin and still create the output.
set -euo pipefail

SRC_DIR=/data/gersbachlab/lrb53/brianCollab/data/expr/fastq/HML3KDSX2
MERGED_DIR=/data/gersbachlab/lrb53/brianCollab/data/expr/fastq/merged

mkdir -p "${MERGED_DIR}"
for EXPR in {1..12}; do
    for READ_MATE in I1 R1 R2; do
        # Sorting by path gives every read type the same lane order, which
        # keeps the records of the merged I1/R1/R2 files in step.
        mapfile -t lane_fastqs < <(
            find "${SRC_DIR}" -type f -iname "*_L00[1234]_${READ_MATE}_001.fastq.gz" \
                | /bin/grep "expr${EXPR}_" | sort
        )
        if (( ${#lane_fastqs[@]} == 0 )); then
            echo "No ${READ_MATE} FASTQs found for expr${EXPR} under ${SRC_DIR}" >&2
            exit 1
        fi
        cat -- "${lane_fastqs[@]}" \
            > "${MERGED_DIR}/expr${EXPR}_S${EXPR}_L001_${READ_MATE}_001.fastq.gz"
    done
done

In [1]:
%%bash
# Concatenate the per-lane FASTQs (L001-L002) of each gRNA library into a single
# file per read type (I1, R1, R2) under fastq/merged, named as lane L001.
#
# The search root contains the merged/ directory itself, so merged outputs are
# excluded explicitly; otherwise a rerun would feed earlier outputs (which also
# match *_L001_*) back into `cat`. An empty match aborts instead of letting a
# bare `cat` read stdin and write a bogus file.
set -euo pipefail

SRC_DIR=/data/gersbachlab/lrb53/brianCollab/data/grna/fastq
MERGED_DIR=/data/gersbachlab/lrb53/brianCollab/data/grna/fastq/merged

mkdir -p "${MERGED_DIR}"
for grna in {1..12}; do
    for READ_MATE in I1 R1 R2; do
        # Sorting by path gives every read type the same lane order.
        mapfile -t lane_fastqs < <(
            find "${SRC_DIR}" -type f -not -path "${MERGED_DIR}/*" \
                -iname "*_L00[12]_${READ_MATE}_001.fastq.gz" \
                | /bin/grep "grna${grna}_" | sort
        )
        if (( ${#lane_fastqs[@]} == 0 )); then
            echo "No ${READ_MATE} FASTQs found for grna${grna} under ${SRC_DIR}" >&2
            exit 1
        fi
        cat -- "${lane_fastqs[@]}" \
            > "${MERGED_DIR}/grna${grna}_S${grna}_L001_${READ_MATE}_001.fastq.gz"
    done
done

Delete unmerged files to save space

In [2]:
%%bash
# Delete *.gz files directly inside the expr FASTQ directory and its HML3KDSX2
# subdirectory. The globs are not recursive, so fastq/merged/ is untouched.
# NOTE(review): irreversible; confirm the merged files are complete first.
rm /data/gersbachlab/lrb53/brianCollab/data/expr/fastq/*.gz
rm /data/gersbachlab/lrb53/brianCollab/data/expr/fastq/HML3KDSX2/*.gz

In [2]:
%%bash
# Delete *.gz files directly inside the grna FASTQ directory (non-recursive, so
# fastq/merged/ and any flow-cell subdirectory are untouched).
# NOTE(review): irreversible; confirm the merged files are complete first.
rm /data/gersbachlab/lrb53/brianCollab/data/grna/fastq/*.gz

### Use cellranger count to generate `expr` count data

In [3]:
%%bash
# Run `cellranger count` on the merged expression FASTQs, one SLURM array task
# per library (expr1..expr12). Tasks are submitted from the cellrangercount
# directory, so each run writes its expr<N>/ output folder there.
COUNT_DIR=/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount
mkdir -p "${COUNT_DIR}"
cd "${COUNT_DIR}"

sbatch \
    --job-name=countexpr \
    --array=1-12 \
    --partition=all \
    --cpus-per-task=16 \
    --mem=32G \
    --output=/data/gersbachlab/lrb53/brianCollab/logs/expr.cellranger_counts.%a.out \
    --mail-type=ALL \
    --mail-user=lrb53@duke.edu \
    <<'EOF'
#!/bin/bash
POOL=${SLURM_ARRAY_TASK_ID}

/data/gersbachlab/Software/cellranger-6.0.1/cellranger count \
    --id=expr${POOL} \
    --sample=expr${POOL} \
    --fastqs=/data/gersbachlab/lrb53/brianCollab/data/expr/fastq/merged \
    --transcriptome=/data/gersbachlab/Software/refdata-gex-GRCh38-2020-A \
    --expect-cells=15000 \
    --localcores=16 \
    --localmem=32

EOF

Submitted batch job 26194035


### Aggregate `expr` data across wells

In [4]:
%%writefile /data/gersbachlab/lrb53/brianCollab/data/samplesheets/cellranger_aggr_expect.expr.csv
sample_id,molecule_h5
expr1,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr1/outs/molecule_info.h5
expr2,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr2/outs/molecule_info.h5
expr3,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr3/outs/molecule_info.h5
expr4,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr4/outs/molecule_info.h5
expr5,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr5/outs/molecule_info.h5
expr6,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr6/outs/molecule_info.h5
expr7,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr7/outs/molecule_info.h5
expr8,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr8/outs/molecule_info.h5
expr9,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr9/outs/molecule_info.h5
expr10,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr10/outs/molecule_info.h5
expr11,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr11/outs/molecule_info.h5
expr12,/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr12/outs/molecule_info.h5

Writing /data/gersbachlab/lrb53/brianCollab/data/samplesheets/cellranger_aggr_expect.expr.csv


In [5]:
%%bash
# Aggregate the 12 expression libraries with `cellranger aggr` (no depth
# normalization, no secondary analysis) once the count array job has succeeded.
# Submitted from the cellrangercount directory, so the aggr/ output lands there.
cd /data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount

# The log path now includes the lrb53/ component, matching every other job log;
# the original pointed at /data/gersbachlab/brianCollab/logs.
sbatch \
    --depend=afterok:26194035 \
    --job-name=expraggr \
    --mail-user=lrb53@duke.edu \
    --mail-type=ALL \
    -p all \
    -o /data/gersbachlab/lrb53/brianCollab/logs/cellranger_aggr_expect.out \
    --mem=64G \
    --cpus-per-task=8 \
    <<'EOF'
#!/bin/bash

/data/gersbachlab/Software/cellranger-6.0.1/cellranger aggr \
    --id=aggr \
    --csv=/data/gersbachlab/lrb53/brianCollab/data/samplesheets/cellranger_aggr_expect.expr.csv \
    --nosecondary \
    --localcores=8 \
    --localmem=64 \
    --normalize=none

EOF

Submitted batch job 26194062


### Custom gRNA-cell assignment pipeline

#### Use cellranger count to generate `grna` count data

In [3]:
%%bash
# Run `cellranger count` on the merged gRNA FASTQs, one SLURM array task per
# library (grna1..grna12), skipping secondary analysis. Tasks are submitted
# from the cellrangercount directory, so each grna<N>/ output is written there.
COUNT_DIR=/data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount
mkdir -p "${COUNT_DIR}"
cd "${COUNT_DIR}"

sbatch \
    --job-name=countGrna \
    --array=1-12 \
    --partition=all \
    --cpus-per-task=16 \
    --mem=32G \
    --output=/data/gersbachlab/lrb53/brianCollab/logs/grna.cellranger_counts.%a.out \
    --mail-type=ALL \
    --mail-user=lrb53@duke.edu \
    <<'EOF'
#!/bin/bash
POOL=${SLURM_ARRAY_TASK_ID}

/data/gersbachlab/Software/cellranger-6.0.1/cellranger count \
    --id=grna${POOL} \
    --sample=grna${POOL} \
    --fastqs=/data/gersbachlab/lrb53/brianCollab/data/grna/fastq/merged \
    --transcriptome=/data/gersbachlab/Software/refdata-gex-GRCh38-2020-A \
    --expect-cells=15000 \
    --nosecondary \
    --localcores=16 \
    --localmem=32

EOF

Submitted batch job 26099328


#### Align grnas to custom bowtie index

In [4]:
%%bash
# For each gRNA pool (array task 1-12): dump the reads flagged unmapped
# (-f 4) in the cellranger BAM to FASTQ, align them with bowtie2 against the
# custom gRNA index, and store the alignments as possorted_genome_bam.bt2.bam.
module load bowtie2
module load samtools

sbatch \
    --depend=afterok:26099328 \
    --mail-user=lrb53@duke.edu \
    --mail-type=ALL \
    --job-name=bowtieGrnas \
    -p all \
    --mem 32G \
    --cpus-per-task 8 \
    --array 1-12 \
    --output /data/gersbachlab/lrb53/brianCollab/logs/bowtie2.extract_umi_counts_from_grna_bam.pool_%a.out \
    <<'EOF'
#!/bin/bash
# Stop at the first failing command, including any stage of a pipeline, so a
# bowtie2 error yields a non-zero exit status and `afterok` dependents do not
# start from a truncated BAM.
set -euo pipefail

OUTS=/data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs
UNMAPPED_FASTQ=${OUTS}/possorted_genome_bam.fastq.gz
PARTIAL_FASTQ=${OUTS}/possorted_genome_bam.partial.fastq.gz

if [[ ! -e "${UNMAPPED_FASTQ}" ]];
then
    # Write under a temporary name and rename on success, so an interrupted
    # run never leaves a partial file that the existence check would accept.
    samtools bam2fq \
        -n \
        -0 "${PARTIAL_FASTQ}" \
        -f 4 \
        "${OUTS}/possorted_genome_bam.bam"
    mv "${PARTIAL_FASTQ}" "${UNMAPPED_FASTQ}"
fi
bowtie2 \
    --trim5 23 \
    --trim3 48 \
    --no-unal \
    --end-to-end \
    -D 15 -R 2 -N 1 -L 18 -i S,1,0 \
    --score-min G,0,0 \
    --ignore-quals \
    -x /data/gersbachlab/lrb53/brianCollab/data/bowtieindex/scMigLibSecondary.bt2_ix \
    -U "${UNMAPPED_FASTQ}" \
    --threads 8 \
| samtools view -b \
> "${OUTS}/possorted_genome_bam.bt2.bam"
EOF


Submitted batch job 26099340


#### Extract UMI (`UB`) and cell barcode (`CB`) tags from the unmapped reads in the cellranger BAMs

In [1]:
%%bash
# Build a FASTQ whose "sequence" is each read's UB tag value:
#   - samtools view -f4 emits the reads flagged unmapped in the cellranger BAM;
#   - the two greps keep lines containing both "UB:" and "CB:";
#   - awk prints a 4-line record: "@" + read name, the UB:Z value captured by
#     gensub (gawk), "+", and a constant 12-character quality string.
# NOTE(review): --array=12 submits only task 12; presumably a single-pool
#   rerun — use 1-12 to process every pool.
# NOTE(review): the quality string is fixed at 12 characters, so a UB value of
#   another length (or a line the regex does not match, which gensub returns
#   unchanged) would give unequal sequence/quality lengths — confirm.
module load samtools
sbatch  \
    --depend=afterok:26099340 \
    --job-name=grnaUB \
    --array=12 \
    -p all \
    -o /data/gersbachlab/lrb53/brianCollab/logs/create_UB_fastq.%a.out \
<<'EOF'
#!/bin/bash

samtools view -f4 \
/data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/possorted_genome_bam.bam \
| /bin/grep UB: | /bin/grep CB: \
| awk -vOFS='\n' '{print "@"$1, gensub(/.*UB:Z:([CAGT]+).*/, "\\1", "g", $0), "+", "EEEEEEEEEEEE"}' \
> /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/possorted_genome_bam.UB.fastq ; 
EOF


Submitted batch job 26099968


In [2]:
%%bash
# Build a FASTQ whose "sequence" is each read's CB tag value:
#   - samtools view -f4 emits the reads flagged unmapped in the cellranger BAM;
#   - the two greps keep lines containing both "UB:" and "CB:";
#   - awk prints a 4-line record: "@" + read name, the CB:Z value with its
#     trailing "-1" removed (gawk gensub), "+", and a constant 16-character
#     quality string.
# NOTE(review): --array=12 submits only task 12; presumably a single-pool
#   rerun — use 1-12 to process every pool.
# NOTE(review): the quality string is fixed at 16 characters, so a captured
#   value of another length (or a line the regex does not match, which gensub
#   returns unchanged) would give unequal sequence/quality lengths — confirm.
module load samtools
sbatch \
    --depend=afterok:26099340 \
    --job-name=grnaCB \
    --array=12 \
    -p all \
    -o /data/gersbachlab/lrb53/brianCollab/logs/create_CB_fastq.%a.out \
<<'EOF'
#!/bin/bash

samtools view -f4 \
/data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/possorted_genome_bam.bam \
| /bin/grep UB: | /bin/grep CB: \
| awk -vOFS='\n' '{print "@"$1, gensub(/.*CB:Z:(.+)\-1.*/, "\\1", "g", $0), "+", "EEEEEEEEEEEEEEEE"}' \
> /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/possorted_genome_bam.CB.fastq ; 
EOF


Submitted batch job 26099969


#### Now annotate bam files with `UB` and `CB`

In [1]:
%%bash
# Annotate the bowtie2 BAM in two fgbio AnnotateBamWithUmis passes:
#   1. bt2.bam    + UB.fastq -> bt2.UB.bam          (tag UB)
#   2. bt2.UB.bam + CB.fastq -> bt2.with_UB_CB.bam  (tag CB)
# Each JVM is capped at 90 GB (-Xmx90g) inside a 128 GB SLURM allocation.
# NOTE(review): --array=7 submits only pool 7; presumably a single-pool
#   rerun — use 1-12 to process every pool.
# NOTE(review): no --depend is given here, and the second pass still runs if
#   the first one fails — confirm that is intended.
module load java
sbatch \
    --job-name=grnaMapBarcodes \
    --array=7 \
    --mem=128G \
    -p all \
    -o /data/gersbachlab/lrb53/brianCollab/logs/merge_bam_with_tags.%a.out \
<<'EOF'
#!/bin/bash

java -Xmx90g \
    -jar /data/reddylab/software/fgbio/0.8.1/fgbio.jar AnnotateBamWithUmis \
    -i /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/possorted_genome_bam.bt2.bam \
    -o /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/possorted_genome_bam.bt2.UB.bam \
    -f /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/possorted_genome_bam.UB.fastq \
    -t UB
java -Xmx90g \
    -jar /data/reddylab/software/fgbio/0.8.1/fgbio.jar AnnotateBamWithUmis \
    -i /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/possorted_genome_bam.bt2.UB.bam \
    -o /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/possorted_genome_bam.bt2.with_UB_CB.bam \
    -f /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/possorted_genome_bam.CB.fastq \
    -t CB

EOF


Submitted batch job 26117475


grna7 initially threw an error that the sequence and quality lengths differ. On rerun, the first annotation (UB) completed fine, but the second annotation (CB) ran out of memory; rerunning...

#### Finally, count the `grna` UMIs for cells in the `expr` data

In [1]:
%%bash
# Count gRNA UMIs per cell for each pool (array task 1-12), restricted to the
# cell barcodes in the matching expr<N> filtered matrix. Output goes to
# grna<N>/outs/mm.
source /data/reddylab/software/miniconda3/bin/activate alex_py3
module load bzip2

cd /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount

sbatch \
    --job-name=grnaCountUmi \
    --array=1-12 \
    -p all \
    -o /data/gersbachlab/lrb53/brianCollab/logs/grna%a.extract_umi_counts_from_grna_bam.from_annotated_bam.out \
    --mem=32G \
    --cpus-per-task=1 \
    <<'EOF'
#!/bin/bash

mkdir -p /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/mm

# Use if/else rather than `cmd && echo ... || echo ...`: the latter always ends
# with a successful echo, so the task exited 0 even when the script failed and
# jobs submitted with --depend=afterok on this array would still run.
if python /data/reddylab/Alex/reddylab_utils/scripts/scRNAseq.extract_umi_counts_from_grna_bam.py \
    --sam /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/possorted_genome_bam.bt2.with_UB_CB.bam \
    --sam-has-all-tags \
    --protospacers /data/gersbachlab/lrb53/brianCollab/data/grna_info/scMigLib.protospacers.txt \
    --outdir /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${SLURM_ARRAY_TASK_ID}/outs/mm \
    --cell-barcodes /data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/expr${SLURM_ARRAY_TASK_ID}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz \
    --cell-barcode-tag CB \
    --umi-tag UB \
    --outname grna${SLURM_ARRAY_TASK_ID} \
    --sam-flag 4 \
    --cell-pool ${SLURM_ARRAY_TASK_ID}
then
    echo "Done! scRNAseq.extract_umi_counts_from_grna_bam.py for pool${SLURM_ARRAY_TASK_ID} finished successfully."
else
    echo "Fail! scRNAseq.extract_umi_counts_from_grna_bam.py error out for pool${SLURM_ARRAY_TASK_ID}."
    exit 1
fi

EOF



Submitted batch job 26207822


#### Make `grna` output files in correct format for Seurat

In [2]:
%%bash

# For each pool (array task 1-12), gzip the barcode and matrix files and the
# protospacer list into the three file names expected by Read10X
# (barcodes.tsv.gz, matrix.mtx.gz, features.tsv.gz) inside grna<N>/outs/mm.
sbatch \
    --job-name=grnaMakeMM \
    --depend=afterok:26207822 \
    --array=1-12 \
    --partition=all \
    --cpus-per-task=1 \
    --mem=2G \
    --output=/data/gersbachlab/lrb53/brianCollab/logs/grna%a.format_mm_files_for_10X.out \
    --mail-type=ALL \
    --mail-user=lrb53@duke.edu \
    <<'EOF'
#!/bin/bash

POOL=${SLURM_ARRAY_TASK_ID}

cd /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna${POOL}/outs/mm
gzip -c grna${POOL}.barcodes.tsv > barcodes.tsv.gz
gzip -c grna${POOL}.matrix.tsv > matrix.mtx.gz
gzip -c /data/gersbachlab/lrb53/brianCollab/data/grna_info/scMigLib.protospacers.txt > features.tsv.gz

EOF


Submitted batch job 26207834


### Generate Seurat object to get coverage and MOI

In [4]:
%%bash
# Directories for the R scripts written below and for analysis results.
mkdir -p \
    /data/gersbachlab/lrb53/brianCollab/scripts \
    /data/gersbachlab/lrb53/brianCollab/data/results

In [18]:
%%writefile /data/gersbachlab/lrb53/brianCollab/scripts/generateSeuratObject.nofilter.R
#!/usr/bin/env /data/reddylab/software/miniconda3/envs/alex_py3/bin/Rscript
# Build a Seurat object from the aggregated expression matrix ("RNA" assay)
# plus a 0/1 gRNA-per-cell matrix ("gRNAs" assay), add percent.mt, and save it
# without any cell filtering.

suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(SparseData))
suppressPackageStartupMessages(library(purrr))

# One Read10X folder per gRNA pool (grna1..grna12)
sgrna_dirs <- unlist(lapply(c(1:12), function (x){paste0("/data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna", x, "/outs/mm")}))

gRNAs.counts<- purrr::reduce(purrr::map(sgrna_dirs, Read10X, gene.column=1), combine)

# The combine function would not assign the label "-1" to the first batch of barcodes,
# here we adjust for that by appending "-1" to every cell barcode without a "-".
# A logical index is used instead of 1:sum(...): that form errors when no
# barcode needs a suffix (1:0 selects c(1, 0)) and renames the wrong columns if
# the unsuffixed barcodes are not the leading ones.
needs_suffix <- !grepl("-", colnames(gRNAs.counts))
if (any(needs_suffix)) {
    colnames(gRNAs.counts)[needs_suffix] <- paste0(colnames(gRNAs.counts)[needs_suffix], "-1")
}

original_rownames <- row.names(gRNAs.counts)

# TRUE where a gRNA has at least 5 UMIs in a cell
filter_5tags <- gRNAs.counts>=5

# Divide each stored count by the total gRNA UMI count of its cell (column)
gRNAs.counts@x <- gRNAs.counts@x / rep.int(Matrix::colSums(gRNAs.counts), diff(gRNAs.counts@p))

# Keep a gRNA in a cell only if it is >0.5% of the cell's gRNA UMIs AND has >=5 UMIs
gRNAs.counts <- Matrix::drop0((gRNAs.counts>0.005)*filter_5tags)

cells.counts <- Read10X(data.dir="/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/aggr/outs/count/filtered_feature_bc_matrix/")

# cells <- CreateSeuratObject(counts=cells.counts$'Gene Expression')
cells <- CreateSeuratObject(counts=cells.counts, assay="RNA")
cells.counts <- NULL

cells <- NormalizeData(object = cells, assay = "RNA")

# Add gRNAs data
cells[["gRNAs"]] <- CreateAssayObject(counts = gRNAs.counts)
gRNAs.counts <- NULL


# Add MT percentage:
cells[["percent.mt"]] <- PercentageFeatureSet(cells, pattern = "^MT-")

# NOTE(review): relative path, resolved against the working directory of the
# submitting job — confirm where the RDS is expected to land.
saveRDS(cells, file = "../seuratobject.nofilter.rds", compress=FALSE)


Overwriting /data/gersbachlab/lrb53/brianCollab/scripts/generateSeuratObject.nofilter.R


In [19]:
%%bash
# Run generateSeuratObject.nofilter.R as a SLURM job inside the alex_py3
# conda environment.
source /data/reddylab/software/miniconda3/bin/activate alex_py3
sbatch \
    --job-name=Seuratobject \
    --partition=all \
    --cpus-per-task=2 \
    --mem=128G \
    --output=/data/gersbachlab/lrb53/brianCollab/logs/generateSeuratObject.nofilter.R \
    --mail-type=END \
    --mail-user=lrb53@duke.edu \
    <<'EOF'
#!/bin/bash

Rscript /data/gersbachlab/lrb53/brianCollab/scripts/generateSeuratObject.nofilter.R

EOF

Submitted batch job 26207941


In [38]:
%%writefile /data/gersbachlab/lrb53/brianCollab/scripts/generateSeuratFeaturePlots.R
#!/usr/bin/env /data/reddylab/software/miniconda3/envs/alex_py3/bin/Rscript
# Build the Seurat object (RNA + gRNAs assays, percent.mt) and write violin
# plots of nFeature_RNA, nCount_RNA and percent.mt before and after the
# nCount_RNA > 10000 & percent.mt < 20 cell filter.

suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(SparseData))
suppressPackageStartupMessages(library(purrr))

# One Read10X folder per gRNA pool (grna1..grna12)
sgrna_dirs <- unlist(lapply(c(1:12), function (x){paste0("/data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna", x, "/outs/mm")}))

gRNAs.counts<- purrr::reduce(purrr::map(sgrna_dirs, Read10X, gene.column=1), combine)

# The combine function would not assign the label "-1" to the first batch of barcodes,
# here we adjust for that by appending "-1" to every cell barcode without a "-".
# A logical index is used instead of 1:sum(...): that form errors when no
# barcode needs a suffix (1:0 selects c(1, 0)) and renames the wrong columns if
# the unsuffixed barcodes are not the leading ones.
needs_suffix <- !grepl("-", colnames(gRNAs.counts))
if (any(needs_suffix)) {
    colnames(gRNAs.counts)[needs_suffix] <- paste0(colnames(gRNAs.counts)[needs_suffix], "-1")
}

original_rownames <- row.names(gRNAs.counts)

# TRUE where a gRNA has at least 5 UMIs in a cell
filter_5tags <- gRNAs.counts>=5

# Divide each stored count by the total gRNA UMI count of its cell (column)
gRNAs.counts@x <- gRNAs.counts@x / rep.int(Matrix::colSums(gRNAs.counts), diff(gRNAs.counts@p))

# Keep a gRNA in a cell only if it is >0.5% of the cell's gRNA UMIs AND has >=5 UMIs
gRNAs.counts <- Matrix::drop0((gRNAs.counts>0.005)*filter_5tags)

cells.counts <- Read10X(data.dir="/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/aggr/outs/count/filtered_feature_bc_matrix/")

# cells <- CreateSeuratObject(counts=cells.counts$'Gene Expression')
cells <- CreateSeuratObject(counts=cells.counts, assay="RNA")
cells.counts <- NULL

cells <- NormalizeData(object = cells, assay = "RNA")

# Add gRNAs data
cells[["gRNAs"]] <- CreateAssayObject(counts = gRNAs.counts)
gRNAs.counts <- NULL


# Add MT percentage:
cells[["percent.mt"]] <- PercentageFeatureSet(cells, pattern = "^MT-")

# Print the object summary to the job log
cells

# Seurat: plots for nFeature, nCount, perc.mt
# NOTE(review): relative output path, resolved against the job's working
# directory — confirm.
pdf("../brianCollab/SeuratFeaturePlots.nofilter.pdf")

myplot <- VlnPlot(cells, 
        features = c("nFeature_RNA","nCount_RNA","percent.mt"), 
        pt.size = 0
        )

print(myplot)
dev.off()

# subset cells same as for MAST
cells <- subset(cells, subset = nCount_RNA > 10000 & percent.mt < 20)
cells

# Seurat: plots for nFeature, nCount, perc.mt
pdf("../brianCollab/SeuratFeaturePlots.withfilter.pdf")

myplot <- VlnPlot(cells, 
        features = c("nFeature_RNA","nCount_RNA","percent.mt"), 
        pt.size = 0
        )

print(myplot)
dev.off()

Overwriting /data/gersbachlab/lrb53/brianCollab/scripts/generateSeuratFeaturePlots.R


In [39]:
%%bash
# Run generateSeuratFeaturePlots.R as a SLURM job inside the alex_py3 conda
# environment; job output is written to logs/generateFeaturePlots.R.
source /data/reddylab/software/miniconda3/bin/activate alex_py3
sbatch \
    --mail-type=END \
    --mail-user=lrb53@duke.edu \
    --job-name=SeuratFeaturePlot \
    -p all \
    -o /data/gersbachlab/lrb53/brianCollab/logs/generateFeaturePlots.R \
    --mem 128G \
    --cpus-per-task 2 \
    <<'EOF'
#!/bin/bash

    Rscript /data/gersbachlab/lrb53/brianCollab/scripts/generateSeuratFeaturePlots.R
    
EOF

Submitted batch job 26211039


In [36]:
%%writefile /data/gersbachlab/lrb53/brianCollab/scripts/generateHistograms.withfilter.R
#!/usr/bin/env /data/reddylab/software/miniconda3/envs/alex_py3/bin/Rscript
# Build the Seurat object (RNA + gRNAs assays), apply the
# nCount_RNA > 10000 & percent.mt < 20 cell filter, and plot
#   - coverage: number of cells per sgRNA, and
#   - MOI: number of sgRNAs per cell,
# printing the median and mean of each to the job log.

suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(SparseData))
suppressPackageStartupMessages(library(purrr))

# One Read10X folder per gRNA pool (grna1..grna12)
sgrna_dirs <- unlist(lapply(c(1:12), function (x){paste0("/data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna", x, "/outs/mm")}))

gRNAs.counts<- purrr::reduce(purrr::map(sgrna_dirs, Read10X, gene.column=1), combine)

# The combine function would not assign the label "-1" to the first batch of barcodes,
# here we adjust for that by appending "-1" to every cell barcode without a "-".
# A logical index is used instead of 1:sum(...): that form errors when no
# barcode needs a suffix (1:0 selects c(1, 0)) and renames the wrong columns if
# the unsuffixed barcodes are not the leading ones.
needs_suffix <- !grepl("-", colnames(gRNAs.counts))
if (any(needs_suffix)) {
    colnames(gRNAs.counts)[needs_suffix] <- paste0(colnames(gRNAs.counts)[needs_suffix], "-1")
}

original_rownames <- row.names(gRNAs.counts)

# TRUE where a gRNA has at least 5 UMIs in a cell
filter_5tags <- gRNAs.counts>=5

# Divide each stored count by the total gRNA UMI count of its cell (column)
gRNAs.counts@x <- gRNAs.counts@x / rep.int(Matrix::colSums(gRNAs.counts), diff(gRNAs.counts@p))

# Keep a gRNA in a cell only if it is >0.5% of the cell's gRNA UMIs AND has >=5 UMIs
gRNAs.counts <- Matrix::drop0((gRNAs.counts>0.005)*filter_5tags)

cells.counts <- Read10X(data.dir="/data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/aggr/outs/count/filtered_feature_bc_matrix/")

# cells <- CreateSeuratObject(counts=cells.counts$'Gene Expression')
cells <- CreateSeuratObject(counts=cells.counts, assay="RNA")
cells.counts <- NULL

cells <- NormalizeData(object = cells, assay = "RNA")

# Add gRNAs data
cells[["gRNAs"]] <- CreateAssayObject(counts = gRNAs.counts)
gRNAs.counts <- NULL


# Add MT percentage:
cells[["percent.mt"]] <- PercentageFeatureSet(cells, pattern = "^MT-")

# Subset cells for:
# <20% MT reads (mostly mitochondrial reads)
# remove cells with <10K umis
cells <- subset(cells, subset = nCount_RNA > 10000 & percent.mt < 20)

suppressPackageStartupMessages(library(tidyverse))

# Coverage histogram (cells per sgRNA)
# NOTE(review): relative output paths, resolved against the job's working
# directory — confirm.
pdf("../brianCollab/sgRNACoverage.pdf")

# Row sums of the 0/1 gRNA matrix = number of cells carrying each sgRNA
df <- data.frame(rowSums(cells[['gRNAs']]))
df$grna <- row.names(df)
row.names(df) <- NULL
colnames(df) <- c("cells","grna")

myplot <- df %>% ggplot +
  geom_histogram(aes(x = cells), bins = 50) +
  geom_vline(xintercept = median(df$cells), color = "red") +
  geom_vline(xintercept = mean(df$cells), color = "blue") +
  ggtitle("Median (red) and mean (blue) cells per sgRNA (umi cutoff >= 5)") +
  xlab("Number of cells") +
  ylab("Number of sgRNAs")

print(myplot)
dev.off()

median(df$cells)
mean(df$cells)

# MOI histogram (sgRNAs per cell)
pdf("../brianCollab/sgRNAmoi.pdf")

# Column sums of the 0/1 gRNA matrix = number of sgRNAs assigned to each cell.
# The columns keep their original names: "umis" holds that per-cell sgRNA
# count and "grna" holds the row names of the column sums (cell barcodes).
df <- data.frame(colSums(cells[['gRNAs']]))
df$grna <- row.names(df)
row.names(df) <- NULL
colnames(df) <- c("umis","grna")


myplot2 <- df %>% ggplot +
  geom_bar(aes(x = umis)) +
  geom_vline(xintercept = median(df$umis), color = "red") +
  geom_vline(xintercept = mean(df$umis), color = "blue") +
  ggtitle("Median (red) and mean (blue) sgRNAs per cell (umi cutoff >= 5)") +
  xlab("Number of sgRNAs") +
  ylab("Number of cells")

print(myplot2)
dev.off()

median(df$umis)
mean(df$umis)

Overwriting /data/gersbachlab/lrb53/brianCollab/scripts/generateHistograms.withfilter.R


In [37]:
%%bash
# Run generateHistograms.withfilter.R as a SLURM job inside the alex_py3
# conda environment.
source /data/reddylab/software/miniconda3/bin/activate alex_py3
sbatch \
    --job-name=histograms \
    --partition=all \
    --cpus-per-task=2 \
    --mem=128G \
    --output=/data/gersbachlab/lrb53/brianCollab/logs/generateHistograms.withfilter.R \
    --mail-type=END \
    --mail-user=lrb53@duke.edu \
    <<'EOF'
#!/bin/bash

Rscript /data/gersbachlab/lrb53/brianCollab/scripts/generateHistograms.withfilter.R

EOF

Submitted batch job 26210989


In [12]:
%%bash
# Remove every file whose name starts with "core" in the project root.
# NOTE(review): presumably core dumps left by crashed jobs — confirm nothing
# else matches core* before running.
rm /data/gersbachlab/lrb53/brianCollab/core*

### DE testing using MAST

In [16]:
%%writefile /data/gersbachlab/lrb53/brianCollab/scripts/grna.de.markers.all.region_of_interest.brianCollab.single_job.R
#!/usr/bin/env /data/reddylab/software/miniconda3/envs/alex_py3/bin/Rscript
# Differential-expression test for ONE gRNA (selected by --job) using Seurat
# FindMarkers. This first section sets options, loads libraries and parses
# the command line.

# Raise the `future` globals size limit to 32000 MiB
options(future.globals.maxSize = 32000 * 1024^2)
suppressPackageStartupMessages(library(argparse))
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(SparseData))
suppressPackageStartupMessages(library(purrr))
parser <- ArgumentParser()

# Specify our desired options;
# by default ArgumentParser will add a help option.
parser$add_argument("-j", "--job", help="Integer used as index in the list of gRNAs")
parser$add_argument("-s", "--sgrnas", nargs='+', 
                    help="List of folders containing sgRNAs data for Read10X function. Each folder should have 3 files: MarketMatrix file (matrix.mtx.gz); Cell barcodes file (barcodes.tsv.gz); sgRNA identifiers file (features.tsv.gz).")
parser$add_argument("-t", "--transcript-data", nargs='+', 
                    help="List of folders containing transcriptomic data for Read10X function. Each folder should have 3 files: MarketMatrix file (matrix.mtx.gz); Cell barcodes file (barcodes.tsv.gz); Gene/transcript identifiers file (features.tsv.gz).")
parser$add_argument("--de-method", default="MAST", help="Method to define DE genes (test.use argument in Seurat FindMarkers function)")
# parser$add_argument("--gene-annotations-file", help="Bed6 file with gene definitions. Do not include headers. Expected columns: c('chrom', 'start', 'end', 'gene_symbol', 'score', 'strand')")
parser$add_argument("--output-basename", help="Basename used to create output file (args$output-basename + args$de_method + args$job)")
parser$add_argument("--cell-barcodes-whitelist", help="When specified, only restrict analysis to these cell barcodes", required=F)
parser$add_argument("--do-not-rename-barcodes", action="store_true", default=FALSE, 
                    help="Do not rename cell barcodes (see code)", required=F)
parser$add_argument("--pseudocount", help="Pseudocount used to compute log2 fold-changes (Default: 1)", default=1)


# Get command line options; if the help option is encountered, print help and exit,
# otherwise options not found on the command line take their defaults.
args <- parser$parse_args()
# NOTE(review): `job` is not used in the visible code; the index is re-read
# from args$job further down as grna_ix.
job <- as.integer(args$job)
sgrna_dirs <- args$sgrnas

# Read the sgRNA count matrix of every folder and merge them into one matrix
gRNAs.counts<- purrr::reduce(purrr::map(sgrna_dirs, Read10X, gene.column=1), combine)

# Optionally restrict to the whitelisted cell barcodes (first column of a
# tab-separated file)
if (!is.null(args$cell_barcodes_whitelist)){
    cell_barcodes_whitelist <- read.table(args$cell_barcodes_whitelist, sep="\t")$V1
    gRNAs.counts <- gRNAs.counts[, colnames(gRNAs.counts) %in% cell_barcodes_whitelist]
}

# The combine function would not assign the label "-1" to the first batch of barcodes,
# here we adjust for that by appending "-1" to every cell barcode without a "-".
# A logical index is used instead of 1:sum(...): that form errors when no
# barcode needs a suffix (1:0 selects c(1, 0)) and renames the wrong columns if
# the unsuffixed barcodes are not the leading ones.
if (!args$do_not_rename_barcodes){
    needs_suffix <- !grepl("-", colnames(gRNAs.counts))
    if (any(needs_suffix)) {
        colnames(gRNAs.counts)[needs_suffix] <- paste0(colnames(gRNAs.counts)[needs_suffix], "-1")
    }
}

# preserve original gRNA names (saved in row names)
original_rownames <- row.names(gRNAs.counts)

# Create a filter to identify gRNA presence in cells with at least 5 UMIs
filter_5tags <- gRNAs.counts>=5

# Divide each UMI count by the total number of UMI counts per cell (analogous to library size)
gRNAs.counts@x <- gRNAs.counts@x / rep.int(Matrix::colSums(gRNAs.counts), diff(gRNAs.counts@p))

# Keep only gRNAs: 1) representing >0.5% of cell UMI counts, and 2) at least 5 UMI counts per gRNA
gRNAs.counts <- Matrix::drop0((gRNAs.counts>0.005)*filter_5tags)

# Read the 10X expression matrix of every --transcript-data folder, then merge
# them pairwise with combine()
transcript_matrices <- purrr::map(args$transcript_data, Read10X)
cells.counts <- purrr::reduce(transcript_matrices, combine)
rm(transcript_matrices)

# Apply the optional cell-barcode whitelist to the expression matrix as well
if (!is.null(args$cell_barcodes_whitelist)){
    keep_cells <- colnames(cells.counts) %in% cell_barcodes_whitelist
    cells.counts <- cells.counts[, keep_cells]
}

# Seurat object with a normalized "RNA" assay
cells <- CreateSeuratObject(counts=cells.counts, assay="RNA")
cells.counts <- NULL
cells <- NormalizeData(object = cells, assay = "RNA")

# Attach the 0/1 gRNA matrix as a second assay
cells[["gRNAs"]] <- CreateAssayObject(counts = gRNAs.counts)
gRNAs.counts <- NULL

# Earlier approach (disabled): derive the genes to test from an annotation
# BED file overlapping a target region.
# genes <- GRanges(read.csv('/data/reddylab/Reference_Data/Gencode/v19/gencode.v19.annotation.genes.bed', 
#                   sep='\t', col.names=c('chrom', 'start', 'end', 'gene_symbol', 'score', 'strand')))
# genes <- GRanges(genes)

# genes <- GRanges(read.csv(args$gene_annotations_file, 
#                   sep='\t', col.names=c('chrom', 'start', 'end', 'gene_symbol', 'score', 'strand')))

# grna_genes <- genes[queryHits(findOverlaps(genes, 
#                                            target_region, 
#                                            ignore.strand=TRUE)), ]$gene_symbol

# Genes to test, read from a headerless file (first column ends up in V1)
grna_genes <- read.delim("/data/gersbachlab/lrb53/brianCollab/data/genes.txt", header = FALSE)

# Compute the mitochondrial percentage, then keep only cells with
#    - more than 10k mRNA UMIs, and
#    - less than 20% mitochondrial reads
cells[["percent.mt"]] <- PercentageFeatureSet(cells, pattern = "^MT-")
cells <- subset(cells, subset = nCount_RNA > 10000 & percent.mt < 20)

### Run MAST through FindMarkers

grna_ix <- as.integer(args$job)
cat(grna_ix)

# If a sgRNA is present in a cell, assign as '1'. If cell has no sgRNA observed in the experiment, assign 2. 
# Test between cells with a given sgRNA vs all cells that received at least one sgRNA but not that one

Idents(object = cells) <- 0
Idents(object = cells, cells = which(as.vector(cells[['gRNAs']][grna_ix,]>0))) <- 1
Idents(object = cells, cells = which(as.vector(colSums(cells[['gRNAs']])==0))) <- 2

# Run the DE test for the selected gRNA: cells labelled 1 vs cells labelled 0, restricted to
# the genes listed in grna_genes$V1. The FindMarkers pre-filters (logfc.threshold, min.pct,
# min.diff.pct, min.cells.*) are set to their most permissive values.
#
# The value of tryCatch() is assigned so a failed test leaves a well-defined result behind.
# Before, the NA returned by the error handler was discarded, so a failure inside FindMarkers
# only surfaced later as "object 'grna.de.markers' not found", hiding the real cause.
grna.de.markers <- tryCatch({
    de <- FindMarkers(
        cells,
        ident.1 = 1,
        ident.2 = 0,
        test.use = args$de_method,
        features = which(row.names(cells[['RNA']]) %in% grna_genes$V1),
        assay = 'RNA',
        logfc.threshold = 0,
        min.pct = 0,
        min.diff.pct = -Inf,
        min.cells.feature = 0,
        min.cells.group = 0,
        pseudocount.use = as.numeric(args$pseudocount),
        verbose = FALSE)
    # FDR correction over the genes tested for this gRNA only.
    de['pval_fdr_corrected'] <- p.adjust(de$p_val,
                                         method = 'fdr',
                                         n = length(de$p_val))
    # Record which gRNA this table belongs to.
    de['grna'] <- row.names(cells[['gRNAs']])[grna_ix]
    de
}, error = function(e) {
    # Report the actual failure in the job log instead of swallowing it.
    message("DE test failed for gRNA index ", grna_ix, ": ", conditionMessage(e))
    NA
})

if (!is.data.frame(grna.de.markers)) {
    # No result table. Exit non-zero without writing an output file, so the task is
    # reported as failed and is not skipped as "already found" on a resubmission.
    quit(save = "no", status = 1)
}

# Convert to a data.table (row names are dropped) and append the gene names, which
# FindMarkers returns as row names, as the last column.
grna.de.markers.all <- data.table::rbindlist(list(grna.de.markers))
grna.de.markers.all[, gene_symbol := rownames(grna.de.markers)]
head(grna.de.markers.all)

# Output file: <output_basename>.<de_method>.<job>.txt, tab-separated, with header.
write.table(grna.de.markers.all,
            paste(args$output_basename, args$de_method, args$job, "txt", sep="."),
            sep='\t', quote=FALSE, row.names=FALSE)
            

Overwriting /data/gersbachlab/lrb53/brianCollab/scripts/grna.de.markers.all.region_of_interest.brianCollab.single_job.R


In [33]:
%%bash
# Directory for the per-task log files of the MAST array jobs.
mast_log_dir=/data/gersbachlab/lrb53/brianCollab/logs/MAST
mkdir -p "${mast_log_dir}"

In [34]:
%%bash
# Submit one SLURM array task per gRNA index. Each task runs the single-gRNA DE script,
# which writes <output-basename>.<de-method>.<task id>.txt.
DE_METHOD=MAST
mkdir -p "/data/gersbachlab/lrb53/brianCollab/data/results/${DE_METHOD}"
source /data/reddylab/software/miniconda3/bin/activate alex_py3

sbatch \
    --job-name=brianMAST \
    --mail-type=END \
    --mail-user=lrb53@duke.edu \
    -p all \
    --mem=64G \
    --array=2-1005%100 \
    -o "/data/gersbachlab/lrb53/brianCollab/logs/${DE_METHOD}/grna.de.markers.all.region_of_interest.${DE_METHOD}.filter_zeros.%a.out" \
    <<'EOF'
#!/bin/bash
set -euo pipefail

# The heredoc delimiter is quoted, so nothing from the submitting shell is expanded
# here; DE_METHOD has to be defined again inside the job script.
DE_METHOD=MAST

# The resume check and the Rscript call share one basename, so the file that is tested
# for is always the file the R script writes (the check used to hard-code "MAST").
OUTPUT_BASENAME="/data/gersbachlab/lrb53/brianCollab/data/results/${DE_METHOD}/grna.de.markers"
RESULT_FILE="${OUTPUT_BASENAME}.${DE_METHOD}.${SLURM_ARRAY_TASK_ID}.txt"

if [[ ! -e "${RESULT_FILE}" ]]; then
    # The brace expansion yields one matrix directory per gRNA library (grna1 .. grna12),
    # each passed as a separate argument to --sgrnas.
    Rscript /data/gersbachlab/lrb53/brianCollab/scripts/grna.de.markers.all.region_of_interest.brianCollab.single_job.R \
        --job "${SLURM_ARRAY_TASK_ID}" \
        --sgrnas /data/gersbachlab/lrb53/brianCollab/data/grna/cellrangercount/grna{1..12}/outs/mm \
        --transcript-data /data/gersbachlab/lrb53/brianCollab/data/expr/cellrangercount/aggr/outs/count/filtered_feature_bc_matrix \
        --de-method "${DE_METHOD}" \
        --output-basename "${OUTPUT_BASENAME}"
else
    echo "${DE_METHOD} run of job${SLURM_ARRAY_TASK_ID} already found!"
fi

EOF

Submitted batch job 26210789


#### Aggregate MAST results

In [35]:
%%bash

# Queue the aggregation of the per-gRNA MAST tables behind array job 26210789.
# afterany lets this job start once every array task has ended, whatever its exit code.
# With afterok, a single failed task makes the dependency impossible to satisfy and the
# aggregation would never run.
sbatch \
    --depend=afterany:26210789 \
    --job-name=brianAggrMast \
    --partition all \
    --mem=2G \
    --mail-user=lrb53@duke.edu \
    --mail-type=END \
    << 'EOF'
#!/bin/bash
set -euo pipefail

DE_METHOD=MAST

cd "/data/gersbachlab/lrb53/brianCollab/data/results/${DE_METHOD}"
mkdir -p aggr

# Per-task tables to merge. The aggregate is written to aggr/, so this glob never
# matches it.
shopt -s nullglob
parts=(grna.de.markers."${DE_METHOD}".*.txt)
if (( ${#parts[@]} == 0 )); then
    echo "No grna.de.markers.${DE_METHOD}.*.txt files found in ${PWD}" >&2
    exit 1
fi

# Keep the first header line (first field starts with p_val) and drop every later one.
# The header is taken from whichever table comes first, so the result does not depend
# on the output of task 1 being present.
awk '$1 ~ /^p_val/ { if (seen_header++) next } { print }' "${parts[@]}" \
    > "aggr/grna.de.markers.${DE_METHOD}.all.txt"

EOF

Submitted batch job 26210890


### Update 2021/10/16: run MAST for a given window around each DHS

In [2]:
%%bash
# Submit brianCollab.DEtest.centeronDhs.FindMarkers.v2.R as a single (non-array) job.
DE_METHOD=MAST
project_dir=/data/gersbachlab/lrb53/brianCollab
run_tag=${DE_METHOD}_20211016

# Result and log directories for this run.
mkdir -p \
    ${project_dir}/data/results20211016/${DE_METHOD} \
    ${project_dir}/logs/${run_tag}
source /data/reddylab/software/miniconda3/bin/activate alex_py3

sbatch \
    --partition=all \
    --mem=128G \
    --job-name=brianMAST \
    --mail-user=lrb53@duke.edu \
    --mail-type=END \
    --output=${project_dir}/logs/${run_tag}/v2.grna.de.markers.all.region_of_interest.${run_tag}.filter_zeros.fixggname.fixfilename.out \
    <<'EOF'
#!/bin/bash

Rscript /data/gersbachlab/lrb53/brianCollab/scripts/brianCollab.DEtest.centeronDhs.FindMarkers.v2.R

EOF

Submitted batch job 26620241


In [2]:
%%bash
# Submit the v3 DE script as a one-task array (task id 1 only).
DE_METHOD=MAST
mkdir -p "/data/gersbachlab/lrb53/brianCollab/data/results/${DE_METHOD}_20211023"
mkdir -p "/data/gersbachlab/lrb53/brianCollab/logs/${DE_METHOD}_20211023"

source /data/reddylab/software/miniconda3/bin/activate alex_py3

sbatch \
    --job-name=brianMAST \
    --mail-type=END \
    --mail-user=lrb53@duke.edu \
    -p all \
    --mem=64G \
    --array=1 \
    -o "/data/gersbachlab/lrb53/brianCollab/logs/${DE_METHOD}_20211023/grna.de.markers.all.region_of_interest.${DE_METHOD}.filter_zeros.%a.out" \
    <<'EOF'
#!/bin/bash
set -euo pipefail

# The heredoc delimiter is quoted, so nothing from the submitting shell is expanded
# here; DE_METHOD has to be defined again inside the job script.
DE_METHOD=MAST

# The resume check is derived from the basename handed to the R script, so the two
# cannot drift apart (the check used to hard-code "MAST").
# NOTE(review): presumes the v3 script writes <basename>.<de-method>.<job>.txt -- confirm.
OUTPUT_BASENAME="/data/gersbachlab/lrb53/brianCollab/data/results/${DE_METHOD}_20211023/grna.de.markers"
RESULT_FILE="${OUTPUT_BASENAME}.${DE_METHOD}.${SLURM_ARRAY_TASK_ID}.txt"

if [[ ! -e "${RESULT_FILE}" ]]; then
    Rscript /data/gersbachlab/lrb53/brianCollab/scripts/brianCollab.DEtest.centeronDhs.FindMarkers.v3.R \
        --job "${SLURM_ARRAY_TASK_ID}" \
        --de-method "${DE_METHOD}" \
        --output-basename "${OUTPUT_BASENAME}"
else
    echo "${DE_METHOD} run of job${SLURM_ARRAY_TASK_ID} already found!"
fi

EOF

Submitted batch job 26624749


#### Test all genes within +/- 2Mb of each targeting sgRNA

In [1]:
%%bash
# Submit the v4 DE script as a SLURM array: tasks 1-1005, at most 100 running at once.
DE_METHOD=MAST
mkdir -p "/data/gersbachlab/lrb53/brianCollab/data/results/${DE_METHOD}_20211023"
mkdir -p "/data/gersbachlab/lrb53/brianCollab/logs/${DE_METHOD}_20211023"

source /data/reddylab/software/miniconda3/bin/activate alex_py3

sbatch \
    --job-name=v4mast \
    --mail-type=END \
    --mail-user=lrb53@duke.edu \
    -p all \
    --mem=64G \
    --array=1-1005%100 \
    -o "/data/gersbachlab/lrb53/brianCollab/logs/${DE_METHOD}_20211023/v4.grna.de.markers.all.region_of_interest.${DE_METHOD}.filter_zeros.%a.out" \
    <<'EOF'
#!/bin/bash
set -euo pipefail

# The heredoc delimiter is quoted, so nothing from the submitting shell is expanded
# here; DE_METHOD has to be defined again inside the job script.
DE_METHOD=MAST

# The resume check is derived from the basename handed to the R script, so the two
# cannot drift apart (the check used to hard-code "MAST").
# NOTE(review): presumes the v4 script writes <basename>.<de-method>.<job>.txt -- confirm.
OUTPUT_BASENAME="/data/gersbachlab/lrb53/brianCollab/data/results/${DE_METHOD}_20211023/v4.grna.de.markers"
RESULT_FILE="${OUTPUT_BASENAME}.${DE_METHOD}.${SLURM_ARRAY_TASK_ID}.txt"

if [[ ! -e "${RESULT_FILE}" ]]; then
    Rscript /data/gersbachlab/lrb53/brianCollab/scripts/brianCollab.DEtest.centeronDhs.FindMarkers.v4.R \
        --job "${SLURM_ARRAY_TASK_ID}" \
        --de-method "${DE_METHOD}" \
        --output-basename "${OUTPUT_BASENAME}"
else
    echo "${DE_METHOD} run of job${SLURM_ARRAY_TASK_ID} already found!"
fi

EOF

Submitted batch job 26624751


In [2]:
%%bash

# Queue the aggregation of the v4 per-gRNA tables; it starts once every task of array
# job 26624751 has ended, whatever its exit code (afterany).
sbatch \
    --depend=afterany:26624751 \
    --job-name=brianAggrMast \
    --partition all \
    --mem=2G \
    --mail-user=lrb53@duke.edu \
    --mail-type=END \
    << 'EOF'
#!/bin/bash
set -euo pipefail

DE_METHOD=MAST

cd "/data/gersbachlab/lrb53/brianCollab/data/results/${DE_METHOD}_20211023"
mkdir -p aggr

# Per-task tables to merge. The aggregate is written to aggr/, so this glob never
# matches it.
shopt -s nullglob
parts=(v4.grna.de.markers."${DE_METHOD}".*.txt)
if (( ${#parts[@]} == 0 )); then
    echo "No v4.grna.de.markers.${DE_METHOD}.*.txt files found in ${PWD}" >&2
    exit 1
fi

# Keep the first header line (first field starts with p_val) and drop every later one.
# Because of afterany, task 1 may have failed; taking the header from whichever table
# comes first means a missing task-1 file no longer aborts the whole aggregation.
awk '$1 ~ /^p_val/ { if (seen_header++) next } { print }' "${parts[@]}" \
    > "aggr/v4.grna.de.markers.${DE_METHOD}.all.txt"

EOF

Submitted batch job 26625759


#### Test all genes within +/- 1Mb of each targeting sgRNA

In [1]:
%%bash
# Submit the v5 DE script as a SLURM array: tasks 1-1005, at most 50 running at once.
DE_METHOD=MAST
mkdir -p "/data/gersbachlab/lrb53/brianCollab/data/results/${DE_METHOD}_20211030"
mkdir -p "/data/gersbachlab/lrb53/brianCollab/logs/${DE_METHOD}_20211030"

source /data/reddylab/software/miniconda3/bin/activate alex_py3

sbatch \
    --job-name=v5mast \
    --mail-type=END \
    --mail-user=lrb53@duke.edu \
    -p all \
    --mem=64G \
    --array=1-1005%50 \
    -o "/data/gersbachlab/lrb53/brianCollab/logs/${DE_METHOD}_20211030/v5.grna.de.markers.all.region_of_interest.${DE_METHOD}.filter_zeros.%a.out" \
    <<'EOF'
#!/bin/bash
set -euo pipefail

# The heredoc delimiter is quoted, so nothing from the submitting shell is expanded
# here; DE_METHOD has to be defined again inside the job script.
DE_METHOD=MAST

# The resume check is derived from the basename handed to the R script, so the two
# cannot drift apart (the check used to hard-code "MAST").
# NOTE(review): presumes the v5 script writes <basename>.<de-method>.<job>.txt -- confirm.
OUTPUT_BASENAME="/data/gersbachlab/lrb53/brianCollab/data/results/${DE_METHOD}_20211030/v5.grna.de.markers"
RESULT_FILE="${OUTPUT_BASENAME}.${DE_METHOD}.${SLURM_ARRAY_TASK_ID}.txt"

if [[ ! -e "${RESULT_FILE}" ]]; then
    Rscript /data/gersbachlab/lrb53/brianCollab/scripts/brianCollab.DEtest.centeronDhs.FindMarkers.v5.R \
        --job "${SLURM_ARRAY_TASK_ID}" \
        --de-method "${DE_METHOD}" \
        --output-basename "${OUTPUT_BASENAME}"
else
    echo "${DE_METHOD} run of job${SLURM_ARRAY_TASK_ID} already found!"
fi

EOF

Submitted batch job 26685029


In [2]:
%%bash

# Queue the aggregation of the v5 per-gRNA tables; it starts once every task of array
# job 26685029 has ended, whatever its exit code (afterany).
sbatch \
    --depend=afterany:26685029 \
    --job-name=brianAggrMast \
    --partition all \
    --mem=2G \
    --mail-user=lrb53@duke.edu \
    --mail-type=END \
    << 'EOF'
#!/bin/bash
set -euo pipefail

DE_METHOD=MAST

cd "/data/gersbachlab/lrb53/brianCollab/data/results/${DE_METHOD}_20211030"
mkdir -p aggr

# Per-task tables to merge. The aggregate is written to aggr/, so this glob never
# matches it.
shopt -s nullglob
parts=(v5.grna.de.markers."${DE_METHOD}".*.txt)
if (( ${#parts[@]} == 0 )); then
    echo "No v5.grna.de.markers.${DE_METHOD}.*.txt files found in ${PWD}" >&2
    exit 1
fi

# Keep the first header line (first field starts with p_val) and drop every later one.
# Because of afterany, task 1 may have failed; taking the header from whichever table
# comes first means a missing task-1 file no longer aborts the whole aggregation.
awk '$1 ~ /^p_val/ { if (seen_header++) next } { print }' "${parts[@]}" \
    > "aggr/v5.grna.de.markers.${DE_METHOD}.all.txt"

EOF

Submitted batch job 26685080
