In [None]:
# Request an interactive session. 100 GB of memory is a lot: ask for it only
# when you will be creating a new index; otherwise 50 GB should be enough.
# If the run is reported as "killed", you most likely did not have enough memory.
srun --pty --cpus-per-task=12 --partition=interactive --time=0-12:00 --mem=100GB /bin/bash

In [None]:
# Enter your scratch folder (replace the placeholder path below with your own)
cd /path/to/folder/

In [None]:
# GENCODE mouse release M25 transcript sequences, equivalent to GRCm38
# Release list: https://www.gencodegenes.org/mouse/releases.html
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.transcripts.fa.gz


In [None]:
# Alternative download: Ensembl release 102 cDNA sequences for the older GRCm38 build
wget https://ftp.ensembl.org/pub/release-102/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz


In [None]:
# Start from a clean module environment, then load the cluster's miniconda module
module purge
module load miniconda3/4.10.3
# Set conda up for zsh, then source the zsh configuration so conda is usable in this session
conda init zsh
# Previously this sourced ~/.bashrc; drop this reminder once ~/.zshrc is confirmed to work for others
# NOTE(review): the interactive session is started with /bin/bash - confirm that sourcing ~/.zshrc is intended
. ~/.zshrc

In [None]:
# Register the channels used by the installs in this notebook
# NOTE(review): channel priority presumably depends on the order of these --add calls - confirm against the conda docs before reordering
conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
# Make sure conda is not left in offline mode, so packages can be downloaded
conda config --set offline false

Part 1:
Creating your environments

In [None]:
# Make a conda environment for the quality-control, trimming and alignment tools
# (answer "y" when conda asks whether to proceed)
conda create -n rnaseq

# Switch into the new environment
conda activate rnaseq

In [None]:
# Install STAR from bioconda
# (answer "y" when conda asks whether to proceed)
conda install --channel bioconda star

In [None]:
# Install FastQC from bioconda
# (answer "y" when conda asks whether to proceed)
conda install --channel bioconda fastqc

In [None]:
# Install MultiQC from bioconda
# (answer "y" when conda asks whether to proceed)
conda install --channel bioconda multiqc

In [None]:
# Install Trimmomatic from bioconda
# (answer "y" when conda asks whether to proceed)
conda install --channel bioconda trimmomatic

In [None]:
# Copy the Nextera paired-end adapter file into the current folder.
# It is looked up inside the active conda environment ($CONDA_PREFIX is set by
# `conda activate`) instead of relying on another user's home directory.
# The location should look something like
#   <env>/share/trimmomatic-0.39-1/adapters/NexteraPE-PE.fa
# If nothing is found there, search the whole filesystem instead
# (this takes some time on the cluster, >10 minutes - go do something else while you wait):
#   find / -name NexteraPE-PE.fa 2>/dev/null
adapter_file=$(find "$CONDA_PREFIX/share" -name NexteraPE-PE.fa -print -quit 2>/dev/null)

if [ -n "$adapter_file" ]; then
    rsync -ahP "$adapter_file" ./
else
    # Report the problem rather than copying nothing silently
    echo "NexteraPE-PE.fa not found under $CONDA_PREFIX/share - is trimmomatic installed in the active environment?" >&2
fi


In [None]:
# Leave the rnaseq environment
conda deactivate

In [None]:
# Make a second conda environment for the post-alignment tools
# (samtools and RSEM are installed into it next; answer "y" when conda asks whether to proceed)
conda create -n post_alignment

# Switch into the new environment
conda activate post_alignment

In [None]:
# Install samtools from bioconda
# (answer "y" when conda asks whether to proceed)
conda install --channel bioconda samtools

In [None]:
# Install RSEM from bioconda
# (answer "y" when conda asks whether to proceed)
conda install --channel bioconda rsem

In [None]:
# Leave the post_alignment environment
conda deactivate

Part 2: Pre-processing (FastQC, MultiQC, adapter trimming, alignment)

In [None]:
# Go into the rnaseq environment (FastQC, MultiQC, Trimmomatic and STAR were installed there)
conda activate rnaseq

In [None]:
# In your folder, make this folder and put your FASTQ files (*.fastq.gz) in it.
# The name must be fastq_files: the FastQC and Trimmomatic scripts read from ./fastq_files.
# -p keeps the command from failing if the folder already exists.
mkdir -p fastq_files

In [None]:
# Run as a fastqc.sh in the future


In [None]:
# FastQC script below
# Usage when saved as fastqc.sh:  bash fastqc.sh [threads]

#!/bin/bash

# Number of threads handed to FastQC: first script argument, 12 when none is given
threads="${1:-12}"

# Directory containing the FASTQ files
fastq_dir="./fastq_files"

# Directory to store FastQC outputs
output_dir="./FASTQC_report"

# Create output directory if it doesn't exist
mkdir -p "$output_dir"

# Loop through all the FASTQ files in the fastq_dir
found=0
for file in "$fastq_dir"/*.fastq.gz; do
    # When nothing matches, the pattern is left as a literal string; skip it
    [ -e "$file" ] || continue
    found=1

    # Run FastQC on each FASTQ file
    # NOTE(review): FastQC's -t is documented as the number of files processed at once,
    # so with one file per call extra threads may not help - consider passing all files in one call
    fastqc -o "$output_dir" -t "$threads" "$file"
done

# Make an empty or misnamed input folder visible instead of finishing silently
if [ "$found" -eq 0 ]; then
    echo "No .fastq.gz files found in $fastq_dir" >&2
fi


Question for the script above: how would you let the user specify the number of threads to request?

In [None]:
# Combine the per-file FastQC reports into one MultiQC output.
# Uses $output_dir from the FastQC script when it is set in this session,
# otherwise falls back to that script's report folder.
multiqc -f "${output_dir:-./FASTQC_report}" -o multiqc_fastqc_results

View the output by downloading the HTML file from multiqc_fastqc_results to your desktop.

In [None]:
# Trim adapters with trimmomatic, takes a few minutes
# Usage when saved as a script:  bash <script>.sh [threads]
#!/bin/bash

# Number of threads handed to Trimmomatic: first script argument, 8 when none is given
threads="${1:-8}"

# Define the directory containing fastq files
fastq_dir="./fastq_files"

# Define the output directory for Trimmomatic
output_dir="./trimmed_fastq"

# Adapter FASTA, expected in the current folder
adapters="NexteraPE-PE.fa"

# Make the output directory if it does not already exist
mkdir -p "$output_dir"

if [ ! -f "$adapters" ]; then
    # Without the adapter file ILLUMINACLIP cannot do its job, so do not start any run
    echo "Adapter file $adapters not found in $(pwd); copy it here before trimming" >&2
else
    # Loop through all R1 fastq files in the directory
    for file in "$fastq_dir"/*_R1_001.fastq.gz; do
        # When nothing matches, the pattern is left as a literal string; skip it
        [ -e "$file" ] || continue

        # Sample name: the file name without the path and the _R1_001.fastq.gz suffix
        base=$(basename "$file" _R1_001.fastq.gz)

        # The mate file must exist for paired-end trimming
        mate="$fastq_dir/${base}_R2_001.fastq.gz"
        if [ ! -f "$mate" ]; then
            echo "Skipping $base: mate file $mate not found" >&2
            continue
        fi

        # Run Trimmomatic on each paired-end FASTQ file
        trimmomatic PE \
            -threads "$threads" -phred33 "$file" \
            "$mate" \
            "$output_dir/${base}_trimmed_R1.fastq.gz" \
            "$output_dir/${base}_trimmed_R1_unpaired.fastq.gz" \
            "$output_dir/${base}_trimmed_R2.fastq.gz" \
            "$output_dir/${base}_trimmed_R2_unpaired.fastq.gz" \
            "ILLUMINACLIP:${adapters}:2:30:10" \
            LEADING:3 \
            TRAILING:3 \
            SLIDINGWINDOW:4:15 \
            MINLEN:36
    done
fi

In [None]:
# Run fastqc, then multiqc to view data
# Usage when saved as a script:  bash <script>.sh [threads]

# Number of threads handed to FastQC: first script argument, 12 when none is given
threads="${1:-12}"

# Directory containing the trimmed FASTQ files
fastq_dir="./trimmed_fastq"

# Directory to store FastQC outputs
output_dir="./trimmed_report"

# Create output directory if it doesn't exist
mkdir -p "$output_dir"

# Loop through all the FASTQ files in the fastq_dir
for file in "$fastq_dir"/*.fastq.gz; do
    # When nothing matches, the pattern is left as a literal string; skip it
    [ -e "$file" ] || continue

    # Run FastQC on each FASTQ file
    fastqc -o "$output_dir" -t "$threads" "$file"
done

# MultiQC, verify adapters are removed (reports for the *unpaired* files are left out)
multiqc -f "$output_dir" --ignore '*unpaired*' -o multiqc_trim_results


Run STAR

In [None]:
# Create an index file (only needs to be done once; if someone already has it, just copy it
# into your folder, because building it requires a lot of RAM)

#######################################################
# Do Not Execute, code to make index for GENCODE M31  #
#######################################################

# Make a text file
nano index_M31.sh

# Unpack files
# NOTE(review): of these archives only gencode.vM25.transcripts.fa.gz is downloaded by the
# wget cells shown in this notebook - the others have to be fetched before unpacking
gzip -d gencode.vM31.transcripts.fa.gz
gzip -d gencode.vM31.annotation.gtf.gz
gzip -d gencode.vM25.transcripts.fa.gz
gzip -d gencode.vM25.annotation.gtf.gz

# index_M31.sh
#!/bin/bash
#SBATCH -c 12
#SBATCH -t 0-12:00
#SBATCH -p short
#SBATCH --mem=100GB
#SBATCH -o hostname_%j.out
#SBATCH -e hostname_%j.err
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=ivl912@hms.harvard.edu

# NOTE: change --mail-user above to your own address before submitting

module purge
module load miniconda3/4.10.3
conda init zsh

# Previously this sourced ~/.bashrc; drop this reminder once ~/.zshrc is confirmed to work for others

. ~/.zshrc
conda activate rnaseq

# GENCODE M31
ref_genome=/path/to/folder/Test/gencode.vM31.transcripts.fa
# Defined for reference; it is not passed to the STAR command below
gtf_file=/path/to/folder/Test/gencode.vM31.annotation.gtf

# Define the directory where the star index will be stored, and create exactly that directory
star_index_dir=/path/to/folder/Test/star_index_M31
mkdir -p "$star_index_dir"

# Increase RAM usage: upper limit handed to STAR for index generation
# (presumably bytes - confirm in the STAR manual; keep it within the --mem request above)
RAM=104454248032

# Command to run STAR to create the index
# Threads follow the Slurm allocation (-c above); 12 is used when run outside Slurm
# NOTE(review): the FASTA given here holds transcript sequences, not a genome assembly - confirm this is intended
STAR \
  --runThreadN "${SLURM_CPUS_PER_TASK:-12}" \
  --runMode genomeGenerate \
  --genomeDir "$star_index_dir" \
  --genomeFastaFiles "$ref_genome" \
  --limitGenomeGenerateRAM "$RAM"

# Run the code once the script is saved
sbatch index_M31.sh

#######################################################
# Do Not Execute, code to make index for GENCODE M25  #
#######################################################

# Make a text file (same name as the script submitted with sbatch at the end)
nano index_M25.sh

# Before submitting, make sure gencode.vM25.transcripts.fa.gz and
# gencode.vM25.annotation.gtf.gz have been unpacked with gzip -d

# index_M25.sh
#!/bin/bash
#SBATCH -c 12
#SBATCH -t 0-12:00
#SBATCH -p short
#SBATCH --mem=100GB
#SBATCH -o hostname_%j.out
#SBATCH -e hostname_%j.err
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=ivl912@hms.harvard.edu

# NOTE: change --mail-user above to your own address before submitting

module purge
module load miniconda3/4.10.3
conda init zsh

# Previously this sourced ~/.bashrc; drop this reminder once ~/.zshrc is confirmed to work for others
. ~/.zshrc
conda activate rnaseq

# GENCODE M25
ref_genome=/path/to/folder/Test/gencode.vM25.transcripts.fa
# Points at the unpacked GTF; defined for reference, it is not passed to the STAR command below
gtf_file=/path/to/folder/Test/gencode.vM25.annotation.gtf

# Define the directory where the star index will be stored, and create exactly that directory
star_index_dir=/path/to/folder/Test/star_index_M25
mkdir -p "$star_index_dir"

# Increase RAM usage: upper limit handed to STAR for index generation
# (presumably bytes - confirm in the STAR manual; keep it within the --mem request above)
RAM=104454248032

# Command to run STAR to create the index
# Threads follow the Slurm allocation (-c above); 12 is used when run outside Slurm
# NOTE(review): the FASTA given here holds transcript sequences, not a genome assembly - confirm this is intended
STAR \
  --runThreadN "${SLURM_CPUS_PER_TASK:-12}" \
  --runMode genomeGenerate \
  --genomeDir "$star_index_dir" \
  --genomeFastaFiles "$ref_genome" \
  --limitGenomeGenerateRAM "$RAM"

# Run the code once the script is saved
sbatch index_M25.sh