# In [None]:  (Jupyter cell prompt left over from the notebook export)
###############################################################################
# NET-Seq analysis pipeline
#
# The script assumes that gzipped sample FASTQ files (.fastq.gz) are organized as follows:
#
# ..-data
#    |
#    |-sample1
#    | |-sample1.fastq.gz
#    |
#    |-sample2
#    | |-sample2.fastq.gz
#    .....
#    and so on.
###############################################################################

# --- Locations (all relative to the directory the script is run from) -------
index='../ref/MG1655'          # passed to bowtie2 as the -x index basename
base_data='../data'            # parent directory of the per-sample input folders
base_results='../results'      # parent directory of the per-sample output folders

# --- Tunables ---------------------------------------------------------------
# Number of bowtie2 threads (-p). 30 was picked for "snowflake"; adjust as needed.
NUMPROC=30
# Reads are trimmed to this many bases (bowtie2 --trim-to).
READLEN=20

# Wall-clock start in seconds since the epoch; used for the final run-time report.
start_time=$(date +%s)

###############################################################################
# Print one single-base BED interval per read on the requested strand.
#
# Arguments:
#   $1 - strand to keep, '+' or '-' (compared against BED column 6)
#   $2 - six-column BED file as written by `bedtools bamtobed`
# Output (stdout, tab-separated): chrom, start, end, ".", column 5, strand
#   '+' reads -> [start, start+1)
#   '-' reads -> [end-1, end)
# i.e. the first aligned base of each read in the read's own orientation.
# NOTE(review): the output files are named "*_3end_*"; presumably this base
# corresponds to the RNA 3' end in this library prep -- confirm.
###############################################################################
single_base_ends() {
    # Filtering on column 6 (rather than grepping the whole line) keeps a '+'
    # or '-' inside a read name from selecting the wrong strand.
    awk -v strand="$1" '
        $6 != strand  { next }
        strand == "+" { print $1 "\t" $2 "\t" ($2 + 1) "\t.\t" $5 "\t" $6; next }
                      { print $1 "\t" ($3 - 1) "\t" $3 "\t.\t" $5 "\t" $6 }
    ' "$2"
}

# Process every sample directory found under $base_data. For each sample:
#   1. align the trimmed reads with bowtie2, drop unmapped reads (-F 4) and
#      convert the alignments to BED;
#   2. split the BED file by strand into single-base interval files.
if [ -d "$base_data" ]; then
    # Glob instead of parsing `ls`: safe for unusual names, directories only.
    for sampledir in "$base_data"/*/
    do
        [ -d "$sampledir" ] || continue      # glob matched nothing
        sampledir=${sampledir%/}
        sample=${sampledir##*/}

        fastq="$sampledir/$sample.fastq.gz"
        resultdir="$base_results/$sample"
        bedfile="$resultdir/${sample}_aligned.bed"

        if [ ! -f "$fastq" ]; then
            echo "warning: $fastq not found, skipping $sample" >&2
            continue
        fi

        # The output directory has to exist before anything is redirected into it.
        mkdir -p -- "$resultdir" || continue

        # pipefail is enabled only inside this subshell so that a failure of any
        # stage (not just the last one) is detected without changing the
        # options of the calling shell.
        if ! (
            set -o pipefail
            zcat "$fastq" \
                | bowtie2 -p "$NUMPROC" --trim-to "$READLEN" -x "$index" -U - \
                | samtools view -bhS -F 4 - \
                | bedtools bamtobed -i - \
                > "$bedfile"
        ); then
            echo "error: alignment pipeline failed for $sample" >&2
            continue
        fi

        single_base_ends + "$bedfile" > "$resultdir/plus_3end_2.bed"
        single_base_ends - "$bedfile" > "$resultdir/minus_3end_2.bed"
    done
else
    echo "error: data directory '$base_data' not found" >&2
fi

# Report the elapsed wall-clock time (whole seconds) since start_time was taken.
echo "run time is $(( $(date +%s) - start_time )) s"