# Install Required Software

In [None]:
#Mambaforge
!curl -L -O https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh
!bash Mambaforge-$(uname)-$(uname -m).sh -b -u -p $HOME/mambaforge

#Trimmomatic, Fastqc, bowtie, samtools, rsem
!$HOME/mambaforge/bin/mamba install -y -c conda-forge -c bioconda trimmomatic fastqc bowtie samtools rsem

# Download Files

We need to create the folders we will use, as well as download the fastq and reference files that will be used for analysis. 

In [None]:
## Create Directories to hold files
!mkdir -p data
!mkdir -p data/raw_fastq
!mkdir -p data/trimmed
!mkdir -p data/fastqc
!mkdir -p data/reference
!mkdir -p data/rsem

## Download Fastq files

# Download run SA161911
!gsutil cp gs://hon_350/fastq/SE8159_SA161911_S2_L007_R1_001.fastq.gz data/raw_fastq/SA161911_1.fastq.gz
!gsutil cp gs://hon_350/fastq/SE8159_SA161911_S2_L007_R2_001.fastq.gz data/raw_fastq/SA161911_2.fastq.gz
# Download run SA161915
!gsutil cp gs://hon_350/fastq/SE8159_SA161915_S6_L007_R1_001.fastq.gz data/raw_fastq/SA161915_1.fastq.gz
!gsutil cp gs://hon_350/fastq/SE8159_SA161915_S6_L007_R2_001.fastq.gz data/raw_fastq/SA161915_2.fastq.gz

## Adapter sequences file for trimmomatic
!gsutil cp gs://hon_350/reference/TruSeq3-PE.fa data/reference/TruSeq3-PE.fa

## (Optional) Download reference files for building transcriptome index
#!gsutil cp gs://hon_350/referebce/GRCz11.dna_sm.toplevel.fa.gz data/reference/GRCz11.dna_sm.toplevel.fa.gz
#!gsutil cp gs://hon_350/referebce/GRCz11.108.gtf.gz data/reference/GRCz11.108.gtf.gz
#!echo 'unzipping...'
#!gzip -d data/reference/GRCz11.dna_sm.toplevel.fa.gz
#!gzip -d data/reference/GRCz11.107.gtf.gz
#!echo 'finished unzipping.'

## Download and unzip premade index files
!gsutil cp gs://hon_350/reference/rsem_grcz11.108_transcriptome.tar.gz data/reference/rsem_grcz11.108_transcriptome.tar.gz
!echo 'unzipping...'
!tar -xzvf data/reference/rsem_grcz11.108_transcriptome.tar.gz
!rm data/reference/rsem_grcz11.108_transcriptome.tar.gz
!echo 'finished unzipping.'

# Trimmomatic

Trimmomatic trims fastq files to remove adapter sequences and low-quality bases from reads.

In [None]:
# Run trimmomatic

# on SA161911
!trimmomatic PE -threads 2 data/raw_fastq/SA161911_1.fastq.gz data/raw_fastq/SA161911_2.fastq.gz data/trimmed/SA161911_1_trimmed.fastq.gz data/trimmed/SA161911_1_orphans.fastq.gz data/trimmed/SA161911_2_trimmed.fastq.gz data/trimmed/SA161911_2_orphans.fastq.gz ILLUMINACLIP:data/reference/TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36
# on SA161915
!trimmomatic PE -threads 2 data/raw_fastq/SA161915_1.fastq.gz data/raw_fastq/SA161915_2.fastq.gz data/trimmed/SA161915_1_trimmed.fastq.gz data/trimmed/SA161915_1_orphans.fastq.gz data/trimmed/SA161915_2_trimmed.fastq.gz data/trimmed/SA161915_2_orphans.fastq.gz ILLUMINACLIP:data/reference/TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36


# Fastqc

Fastqc is a tool to inspect the quality of reads in fastq files.

In [None]:
# Run fastqc

# on SA161911
!fastqc -o data/fastqc data/trimmed/SA161911_1_trimmed.fastq.gz
# on SA161915
!fastqc -o data/fastqc data/trimmed/SA161915_1_trimmed.fastq.gz


# Example of in-window display of fastqc output file (SA161911).
from IPython.display import IFrame
IFrame(src='./data/fastqc/SA161911_1_trimmed_fastqc.html', width=800, height=600)

# Rsem

In order to map reads to genes, and quantify read counts, an index must be created. In this case, we are using rsem for index creation and quantifying read counts.

In [None]:
## Build transcriptome reference
# (Optional) Left commented out because the download cell fetches a premade index.
# To build the index yourself, first run the optional reference download/unzip commands
# in the download cell, then uncomment the line below.
# Inputs: the unzipped GTF annotation and genome FASTA; output prefix: data/reference/zebrafish_rsem/GRCz11_108.
# --bowtie also builds bowtie indices; -p 8 uses 8 threads.
# NOTE(review): presumably the output directory data/reference/zebrafish_rsem must already exist - confirm.

#!rsem-prepare-reference --gtf data/reference/GRCz11.108.gtf --bowtie -p 8 data/reference/GRCz11.dna_sm.toplevel.fa data/reference/zebrafish_rsem/GRCz11_108 


In [None]:
# Calculate read counts

# on SA161911
!rsem-calculate-expression -p 8 --strandedness reverse data/trimmed/SA161911_1_trimmed.fastq.gz data/reference/zebrafish_rsem/GRCz11_108 data/rsem/SA161911
# on SA161915
!rsem-calculate-expression -p 8 --strandedness reverse data/trimmed/SA161915_1_trimmed.fastq.gz data/reference/zebrafish_rsem/GRCz11_108 data/rsem/SA161915


# Read count file outputs

RSEM writes one read-count table per run (`<run id>.genes.results`, e.g. `SA161911.genes.results`). Each table contains several columns, including expression reported as transcripts per million (TPM) and as fragments per kilobase of exon per million mapped fragments (FPKM).

In [None]:
# Example of read count table output
# (sorted by top 10 expressed genes)

## on SA161911
!head -1 data/rsem/SA161911.genes.results | column -t
!sort -nrk 6 data/rsem/SA161911.genes.results |  column -t | head -10  | column -t


In [None]:
## on SA161915
!head -1 data/rsem/SA161915.genes.results | column -t
!sort -nrk 6 data/rsem/SA161915.genes.results |  column -t | head -10  | column -t


# Upload files to bucket
In order for the files to be shared and downloaded, we need to upload them to the Google Cloud Storage bucket.

In [None]:
## Upload to bucket

## upload SA161911
!gsutil cp data/rsem/SA161911.genes.results gs://hon_350/results
## upload SA161915
!gsutil cp data/rsem/SA161915.genes.results gs://hon_350/results


# (Extra) Some examples of using variables and loops for easier batch processing

Variables can be used to make writing commands easier. For example, we run trimmomatic multiple times, but our inputs are very similar, following the pattern of SA######_1 / SA######_2.

So for instance, we could instead have our commands reference a 'variable', instead of a specific run id. That way, we can just change the variable, instead of having to retype the entire command.

We can just copy and paste the command, but change the variable, to run trimmomatic on different runs.

In [None]:
# Example of trimmomatic referencing a variable instead of a specific run id
fastq_id='SA161911'
!trimmomatic PE -threads 2 data/raw_fastq/{fastq_id}_1.fastq.gz data/raw_fastq/{fastq_id}_2.fastq.gz data/trimmed/{fastq_id}_1_trimmed.fastq.gz data/trimmed/{fastq_id}_1_orphans.fastq.gz data/trimmed/{fastq_id}_2_trimmed.fastq.gz data/trimmed/{fastq_id}_2_orphans.fastq.gz ILLUMINACLIP:data/reference/TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36

# The same command will now run on SA161915, because the referenced variable has been changed.
fastq_id='SA161915'
!trimmomatic PE -threads 2 data/raw_fastq/{fastq_id}_1.fastq.gz data/raw_fastq/{fastq_id}_2.fastq.gz data/trimmed/{fastq_id}_1_trimmed.fastq.gz data/trimmed/{fastq_id}_1_orphans.fastq.gz data/trimmed/{fastq_id}_2_trimmed.fastq.gz data/trimmed/{fastq_id}_2_orphans.fastq.gz ILLUMINACLIP:data/reference/TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36


Using variables combined with loops, pipes, and lists can make writing commands even easier.

We can 'pipe' a list into a command to have it run over the entire list.

In [None]:
# First we need a list of run ids.

# you can create this however you like.
# This might be easier in R, python, or other languages.
# for the sake of the example, I'll create one using command line
# (below is a pretty poor way of doing it, 
# but I am bad at regular expressions)
!ls data/raw_fastq/*_1.fastq.gz | paste | sed 's|data/raw_fastq/||g' | sed 's|_1.fastq.gz||g' | paste > run_ids.txt
!cat run_ids.txt

In [None]:
# Now that list can be piped into a bash command
# similar to how variables were used above
# where here {} is replaced by each line in the list.
# this command will iterate trimmomatic on the entire above list

!cat run_ids.txt |  xargs -I {} trimmomatic PE -threads 2 data/raw_fastq/{}_1.fastq.gz data/raw_fastq/{}_2.fastq.gz data/trimmed/{}_1_trimmed.fastq.gz data/trimmed/{}_1_orphans.fastq.gz data/trimmed/{}_2_trimmed.fastq.gz data/trimmed/{}_2_orphans.fastq.gz ILLUMINACLIP:data/reference/TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36


In [None]:
# scratch - the cells below are ad-hoc timing experiments, not part of the pipeline.

In [107]:
# Scratch: print the current date and time from the shell.
!date

Tue Dec 27 22:30:35 UTC 2022


In [None]:
# Run trimmomatic
!date
# on SA161911
!trimmomatic PE -threads 4 data/raw_fastq/SA161911_1.fastq.gz data/raw_fastq/SA161911_2.fastq.gz data/trimmed/SA161911_1_trimmed.fastq.gz data/trimmed/SA161911_1_orphans.fastq.gz data/trimmed/SA161911_2_trimmed.fastq.gz data/trimmed/SA161911_2_orphans.fastq.gz ILLUMINACLIP:data/reference/TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36
# on SA161915
#!trimmomatic PE -threads 2 data/raw_fastq/SA161915_1.fastq.gz data/raw_fastq/SA161915_2.fastq.gz data/trimmed/SA161915_1_trimmed.fastq.gz data/trimmed/SA161915_1_orphans.fastq.gz data/trimmed/SA161915_2_trimmed.fastq.gz data/trimmed/SA161915_2_orphans.fastq.gz ILLUMINACLIP:data/reference/TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36
!date