# Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write
# project files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Project root on Google Drive.
# It is exported through os.environ (not assigned inside a `%%bash` cell) because
# every `%%bash` / `!` cell runs in its own short-lived shell: a plain shell
# assignment disappears when that cell finishes, whereas the kernel's environment
# is inherited by every later subprocess, so "${PROJECT_DIR}" resolves there.
os.environ["PROJECT_DIR"] = "/content/drive/MyDrive/BioinformaticsProject"

# Installing Fasterq-Dump

In [None]:
# Remove any previous SRA Toolkit install so the next cell starts clean:
#   sratoolkit     - the renamed install directory
#   sratoolkit.*   - the downloaded tarball and the freshly extracted directory
# -r is required because these are directories (plain `rm sratoolkit/` always fails);
# -f keeps the cell quiet on a fresh runtime where nothing exists yet.
!rm -rf sratoolkit sratoolkit.*

In [None]:
# Install fasterq-dump through the sratoolkit.
# -O pins the local file name, so re-running the cell overwrites the tarball
# instead of saving a second copy as "...tar.gz.1" (which tar would then ignore).
!wget -O sratoolkit.current-ubuntu64.tar.gz https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-ubuntu64.tar.gz
!tar -xzf sratoolkit.current-ubuntu64.tar.gz
# The extracted directory name contains the version; rename it to a stable path.
!mv sratoolkit.*-ubuntu64 sratoolkit
# No `!export PATH=...` here: each `!` line runs in its own subshell, so an export
# would be discarded immediately. PATH has to be extended through os.environ.

In [None]:
# Make fasterq-dump callable from every later cell by adding the toolkit's bin
# directory to the kernel's PATH (subprocesses started by `!` / `%%bash` inherit it).
# The membership check makes the cell safe to re-run without stacking duplicates.
import os

SRATOOLKIT_BIN = "/content/sratoolkit/bin"

path_entries = os.environ.get("PATH", "").split(os.pathsep)
if SRATOOLKIT_BIN not in path_entries:
    os.environ["PATH"] = os.pathsep.join([*path_entries, SRATOOLKIT_BIN])

In [None]:
# Check that fasterq-dump is installed and reachable through PATH.
# A "command not found" error here means /content/sratoolkit/bin is not on PATH.
!fasterq-dump --version

# Installing Micromamba

The following cells install micromamba and create the environment we'll use.

In [None]:
%%bash
# Install the micromamba binary as /usr/local/bin/micromamba.
# Fail fast: stop at the first failing command instead of carrying on with a
# missing or broken archive.
set -euo pipefail

cd /usr/local

# -f makes curl return an error on HTTP failures rather than saving the error
# page as "micromamba.tar.bz2"; -L follows redirects; -s silences the progress meter.
curl -fLs https://micro.mamba.pm/api/micromamba/linux-64/latest \
  -o micromamba.tar.bz2

# Extract only bin/micromamba. Paths in the archive are relative, so from
# /usr/local the binary lands directly in /usr/local/bin.
tar -xvjf micromamba.tar.bz2 bin/micromamba

# The archive is not needed once the binary has been extracted.
rm -f micromamba.tar.bz2

In [None]:
%%bash
# Create the "beeasm" environment that all assembler/QC tools are installed into.
# Run once per Colab session (the runtime's filesystem is reset between sessions).
set -e

# Enable micromamba's shell integration (defines the `micromamba activate` function).
eval "$(micromamba shell hook -s bash)"

# The channel is given explicitly: a fresh micromamba install has no channel
# configuration to fall back on, so the solver needs to be told where python lives.
micromamba create -y -n beeasm -c conda-forge python=3.10

In [None]:
#@title CARPENTER BEE DUMP
# Download run SRR24955310 and convert it to FASTQ.
# Output goes to /content/CarpenterBee_fastq, scratch files to /content/fasterq_tmp;
# six worker threads are used and a progress bar is shown.
!fasterq-dump SRR24955310 \
  --outdir /content/CarpenterBee_fastq \
  --temp /content/fasterq_tmp \
  --threads 6 --progress

In [None]:
#@title HONEY BEE DUMP
# Download run SRR36076821 and convert it to FASTQ in the current working
# directory, using six threads and showing a progress bar.
!fasterq-dump SRR36076821 --threads 6 --progress

In [None]:
# Line counts of the carpenter bee reads (FASTQ uses 4 lines per read, so
# reads = lines / 4). R1 and R2 should report the same number.
# The files live in /content/CarpenterBee_fastq because the dump cell passes
# that directory as its output location.
!wc -l /content/CarpenterBee_fastq/SRR24955310_1.fastq
!wc -l /content/CarpenterBee_fastq/SRR24955310_2.fastq

In [None]:

# Back up the raw carpenter bee reads to Drive.
# mkdir -p first: if "new_reads" did not exist, cp would create a regular FILE
# with that name and the second copy would silently overwrite the first.
# The trailing slash on the destination makes cp fail loudly if it is not a directory.
!mkdir -p /content/drive/MyDrive/BioinformaticsProject/Reads/CarpenterBee/new_reads
# Source is /content/CarpenterBee_fastq, the output directory of the carpenter bee dump cell.
!cp /content/CarpenterBee_fastq/SRR24955310_1.fastq /content/drive/MyDrive/BioinformaticsProject/Reads/CarpenterBee/new_reads/
!cp /content/CarpenterBee_fastq/SRR24955310_2.fastq /content/drive/MyDrive/BioinformaticsProject/Reads/CarpenterBee/new_reads/


In [None]:
# Back up the raw honeybee reads to Drive.
# mkdir -p first: if "new_reads" did not exist, cp would create a regular FILE
# with that name and the second copy would silently overwrite the first.
# The trailing slash on the destination makes cp fail loudly if it is not a directory.
!mkdir -p /content/drive/MyDrive/BioinformaticsProject/Reads/Honeybee/new_reads
!cp /content/SRR36076821_1.fastq /content/drive/MyDrive/BioinformaticsProject/Reads/Honeybee/new_reads/
!cp /content/SRR36076821_2.fastq /content/drive/MyDrive/BioinformaticsProject/Reads/Honeybee/new_reads/

In [None]:
#@title CLEAR WORKING DIRECTORY
# WARNING: destructive. Deletes every entry directly under /content except the
# names listed in `keep`. That includes downloaded FASTQ files, trimmed or
# subsampled reads and assembly output directories - copy anything you still
# need to Drive before running this cell.
import os
import shutil

# Top-level entries of /content that must survive the cleanup.
keep = {"drive", ".config", ".ipynb_checkpoints", "sratoolkit"}

for name in os.listdir("/content"):
    if name in keep:
        continue

    path = os.path.join("/content", name)

    # os.path.isdir() follows symlinks, but shutil.rmtree() refuses to operate on
    # a symlink and raises. Only real directories are removed recursively; files
    # and symlinks (whatever they point to) are simply unlinked.
    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path)
    else:
        os.remove(path)

The following cell installs the SPAdes assembler.



In [None]:
%%bash
# Load micromamba's shell functions, then switch to the project environment.
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# SPAdes is distributed through bioconda; conda-forge supplies its dependencies.
micromamba install --yes --channel bioconda --channel conda-forge spades

# Confirm the assembler is callable from inside the environment.
spades.py --version


Run spades on our bacteria

In [None]:
%%bash
# Assemble the bacterial test dataset with SPAdes (single-end, R1 only).
set -e

# Activate micromamba env
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# Fall back to the Drive project root when PROJECT_DIR was not exported into
# this shell; otherwise OUTDIR would silently become "/Reads/...".
PROJECT_DIR="${PROJECT_DIR:-/content/drive/MyDrive/BioinformaticsProject}"

# One Illumina dataset (single-end from R1)
READS="${PROJECT_DIR}/Reads/Bacteria/SRR3924617_1.fastq"
# Bacterial results get their own folder; the carpenter bee SPAdes run writes
# to Reads/CarpenterBee/result_spades and must not collide with this one.
OUTDIR="${PROJECT_DIR}/Reads/Bacteria/result_spades"
THREADS=8    # adjust if needed
MEM_GB=12    # RAM limit in GB passed to SPAdes via -m

# Optional quick-test subset: the first NREADS reads (FASTQ = 4 lines per read).
SUBREADS="/content/subsamble_5000.fastq"
NREADS=5000
head -n $((NREADS * 4)) "$READS" > "$SUBREADS"

# File actually assembled. The full dataset is used, as before; switch this to
# "$SUBREADS" for a fast smoke test of the pipeline.
ASSEMBLY_INPUT="$READS"

mkdir -p "$OUTDIR"

spades.py \
  -s "$ASSEMBLY_INPUT" \
  -o "$OUTDIR" \
  -t "$THREADS" \
  -m "$MEM_GB" \
  --careful

echo "SPAdes finished. Assembly:"
ls -lh "$OUTDIR/contigs.fasta"


Next, prepare a clean output directory for the SPAdes run on the honeybee.

In [None]:
%%bash
# Start from an empty honeybee SPAdes output directory: wipe the directory and
# anything sharing its name prefix, then recreate it.
HONEYBEE_OUT=/content/result_spades_honeybee
rm -rf "$HONEYBEE_OUT"*
mkdir "$HONEYBEE_OUT"

# Check reads with FastQC

In [None]:
# Refresh the apt package index, then install FastQC without an interactive prompt.
!apt-get update
!apt-get install --yes fastqc

In [None]:
# Quality report for the raw honeybee reads.
# FastQC does not create the -o directory itself and aborts if it is missing,
# so make sure it exists first.
!mkdir -p /content/qc_reports
!fastqc /content/SRR36076821_1.fastq /content/SRR36076821_2.fastq -o /content/qc_reports/

# Trimmomatic

This tool is a part of our preprocessing stage of trimming the whole genome.

In [None]:
%%bash
#@title Installation
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
# Installs Trimmomatic plus a Java runtime to execute its jar.
apt-get update
apt-get install -y trimmomatic default-jre


In [None]:
%%bash
# List the contents of /usr/share/trimmomatic (the trimming cells read their
# adapter file, TruSeq3-PE.fa, from this directory).
ls -lh /usr/share/trimmomatic

In [None]:
%%bash
#@title 1) Trim whole genome - Honeybee
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

# Raw paired-end reads.
R1="/content/SRR36076821_1.fastq"
R2="/content/SRR36076821_2.fastq"

# Trimmomatic PE writes four files: pairs where both mates survived (*.trimmed.fq)
# and mates whose partner was dropped (*.unpaired.fq).
OUT1P="/content/SRR36076821_1.trimmed.fq"
OUT1U="/content/SRR36076821_1.unpaired.fq"
OUT2P="/content/SRR36076821_2.trimmed.fq"
OUT2U="/content/SRR36076821_2.unpaired.fq"

TRIMJAR="/usr/share/java/trimmomatic.jar"
ADAPTERS="/usr/share/trimmomatic/TruSeq3-PE.fa"

# ILLUMINACLIP:<adapters>:2:30:10 - adapter removal (seed mismatches 2,
#   palindrome clip threshold 30, simple clip threshold 10).
# SLIDINGWINDOW:4:20 - cut once the mean quality in a 4-base window drops below 20.
# MINLEN:50 - discard reads shorter than 50 bases after trimming.
java -jar "$TRIMJAR" PE -threads 8 \
  "$R1" "$R2" \
  "$OUT1P" "$OUT1U" \
  "$OUT2P" "$OUT2U" \
  "ILLUMINACLIP:${ADAPTERS}:2:30:10" \
  SLIDINGWINDOW:4:20 \
  MINLEN:50

echo "Trimmed reads:"
ls -lh "$OUT1P" "$OUT2P"


In [None]:
%%bash
#@title 2) Generate subreads - Honeybee
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

micromamba install -y -c bioconda -c conda-forge seqtk

# Input: the paired survivors written by the honeybee trimming cell.
R1=/content/SRR36076821_1.trimmed.fq
R2=/content/SRR36076821_2.trimmed.fq

NREADS=500000   # number of read pairs to keep

SUB1="/content/SRR36076821_1.sub${NREADS}.fq"
SUB2="/content/SRR36076821_2.sub${NREADS}.fq"

echo "Creating subsampled reads (${NREADS} pairs)..."

echo "Using seqtk for random subsampling."
# The same seed (-s100) must be used for both files so that the same records are
# picked from R1 and R2 and the mates stay paired.
seqtk sample -s100 "$R1" "$NREADS" > "$SUB1"
seqtk sample -s100 "$R2" "$NREADS" > "$SUB2"

echo "Subsampled files:"
ls -lh "$SUB1" "$SUB2"

In [None]:
%%bash
#@title 1) Trim whole genome - Carpenter Bee
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

# Raw paired-end reads. The carpenter bee dump cell writes them to
# /content/CarpenterBee_fastq (not to /content directly).
R1="/content/CarpenterBee_fastq/SRR24955310_1.fastq"
R2="/content/CarpenterBee_fastq/SRR24955310_2.fastq"

# Trimmomatic PE writes four files: pairs where both mates survived (*.trimmed.fq)
# and mates whose partner was dropped (*.unpaired.fq).
OUT1P="/content/SRR24955310_1.trimmed.fq"
OUT1U="/content/SRR24955310_1.unpaired.fq"
OUT2P="/content/SRR24955310_2.trimmed.fq"
OUT2U="/content/SRR24955310_2.unpaired.fq"

TRIMJAR="/usr/share/java/trimmomatic.jar"
ADAPTERS="/usr/share/trimmomatic/TruSeq3-PE.fa"

# ILLUMINACLIP:<adapters>:2:30:10 - adapter removal (seed mismatches 2,
#   palindrome clip threshold 30, simple clip threshold 10).
# SLIDINGWINDOW:4:20 - cut once the mean quality in a 4-base window drops below 20.
# MINLEN:50 - discard reads shorter than 50 bases after trimming.
java -jar "$TRIMJAR" PE -threads 8 \
  "$R1" "$R2" \
  "$OUT1P" "$OUT1U" \
  "$OUT2P" "$OUT2U" \
  "ILLUMINACLIP:${ADAPTERS}:2:30:10" \
  SLIDINGWINDOW:4:20 \
  MINLEN:50

echo "Trimmed reads:"
ls -lh "$OUT1P" "$OUT2P"


In [None]:
%%bash
#@title 2) Generate subreads - Carpenter Bee
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

micromamba install -y -c bioconda -c conda-forge seqtk

# Input: the paired survivors written by the carpenter bee trimming cell.
R1=/content/SRR24955310_1.trimmed.fq
R2=/content/SRR24955310_2.trimmed.fq

NREADS=500000   # number of read pairs to keep

SUB1="/content/SRR24955310_1.sub${NREADS}.fq"
SUB2="/content/SRR24955310_2.sub${NREADS}.fq"

echo "Creating subsampled reads (${NREADS} pairs)..."

echo "Using seqtk for random subsampling."
# The same seed (-s100) must be used for both files so that the same records are
# picked from R1 and R2 and the mates stay paired.
seqtk sample -s100 "$R1" "$NREADS" > "$SUB1"
seqtk sample -s100 "$R2" "$NREADS" > "$SUB2"

echo "Subsampled files:"
ls -lh "$SUB1" "$SUB2"

In [None]:
%%bash
#@title Spades Assembler - Honeybee
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

# Activate micromamba env
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# Paired-end input: the 500,000-pair subsample written by the
# "Generate subreads - Honeybee" cell (the same reads the MEGAHIT honeybee run uses).
# Previously -1/-2 were empty strings, which SPAdes cannot open.
R1="/content/SRR36076821_1.sub500000.fq"
R2="/content/SRR36076821_2.sub500000.fq"

# Defined explicitly so the final `ls` checks the directory SPAdes wrote to.
OUTDIR="/content/result_spades_honeybee"
THREADS=8    # adjust if needed
MEM_GB=12    # RAM limit in GB passed to SPAdes via -m

mkdir -p "$OUTDIR"

spades.py \
  -1 "$R1" \
  -2 "$R2" \
  -o "$OUTDIR" \
  -t "$THREADS" \
  -m "$MEM_GB" \
  --careful

echo "SPAdes finished. Assembly:"
ls -lh "$OUTDIR/contigs.fasta"


# Megahit Installation



In [None]:
%%bash
# Load micromamba's shell functions, then switch to the project environment.
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# MEGAHIT comes from bioconda; conda-forge supplies its dependencies.
micromamba install --yes --channel bioconda --channel conda-forge megahit

# Confirm the assembler is callable from inside the environment.
megahit --version


In [None]:
%%bash
# Show which megahit executable is resolved inside the beeasm environment.
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

command -v megahit


and this cell runs it on bacteria

In [None]:
%%bash
#@title Megahit - bacteria
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# Full dataset (single-end)
READS="/content/drive/MyDrive/BioinformaticsProject/Reads/Bacteria/SRR3924617_1.fastq"

# Subset: first 5000 reads -> 4 * 5000 = 20000 lines
NREADS=5000
SUB="/content/SRR3924617_1.first5000.fastq"
OUTDIR="/content/megahit_bacteria_5k"
THREADS=4

echo "Creating subset with first ${NREADS} reads..."
# head stops reading after the requested lines, whereas `sed -n '1,20000p'`
# keeps scanning the entire (Drive-hosted) file to the end.
head -n $((NREADS * 4)) "$READS" > "$SUB"
ls -lh "$SUB"

echo
echo "Running MEGAHIT on subset..."
# -r: single-end reads. MEGAHIT creates OUTDIR itself.
megahit \
  -r "$SUB" \
  -o "$OUTDIR" \
  --num-cpu-threads "$THREADS"

echo
echo "MEGAHIT finished. Assembly:"
ls -lh "$OUTDIR/final.contigs.fa"


Now we run SPAdes on the carpenter bee reads (single-end, R1 only).

In [None]:
%%bash
# Assemble the carpenter bee R1 reads with SPAdes and store the result on Drive.
set -e

# Activate micromamba env
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# One Illumina dataset (single-end from R1).
# Points at /content/CarpenterBee_fastq, the output directory of the carpenter
# bee dump cell. CHANGE PATHNAME IF NEEDED.
READS="/content/CarpenterBee_fastq/SRR24955310_1.fastq"
OUTDIR="/content/drive/MyDrive/BioinformaticsProject/Reads/CarpenterBee/result_spades"
THREADS=8      # adjust if needed
MEM_GB=12      # RAM limit in GB passed to SPAdes via -m

mkdir -p "$OUTDIR"

spades.py \
  -s "$READS" \
  -o "$OUTDIR" \
  -t "$THREADS" \
  -m "$MEM_GB" \
  --careful

echo "SPAdes finished. Assembly:"
ls -lh "$OUTDIR/contigs.fasta"


In [None]:
# Confirm the carpenter bee SPAdes contigs exist on Drive and show their size.
# Explicit `!` shell escape, like the rest of the notebook, instead of relying
# on IPython's automagic `ls` alias.
!ls -lh /content/drive/MyDrive/BioinformaticsProject/Reads/CarpenterBee/result_spades/contigs.fasta

In [None]:
# Move the carpenter bee R2 reads from the runtime to Drive.
# After this the file no longer exists under /content/CarpenterBee_fastq.
# Explicit `!` shell escape instead of relying on IPython's automagic `mv` alias;
# the trailing slash makes mv fail rather than rename the file if the target
# directory does not exist.
!mv /content/CarpenterBee_fastq/SRR24955310_2.fastq /content/drive/MyDrive/BioinformaticsProject/Reads/CarpenterBee/

In [None]:
%%bash
# Mirror everything in the carpenter bee FASTQ directory to Drive, preserving
# file attributes and printing overall transfer progress in human-readable units.
SRC_DIR="/content/CarpenterBee_fastq"
DEST_DIR="/content/drive/MyDrive/BioinformaticsProject/Reads/CarpenterBee"

rsync --archive --human-readable --info=progress2 "$SRC_DIR"/* "$DEST_DIR"


In [None]:
%%bash
# Copy the contents of /content/CarpenterBee_fastq to the CarpenterBee folder on
# Drive (-a archive mode, -h human-readable sizes, overall progress display).
# NOTE(review): this cell is an exact duplicate of the previous one - presumably
# kept to resume an interrupted transfer; confirm whether it is still needed.
rsync -ah --info=progress2 \
  /content/CarpenterBee_fastq/* \
  "/content/drive/MyDrive/BioinformaticsProject/Reads/CarpenterBee"


In [None]:
%%bash
# List the current working directory to see which files are present.
ls


In [None]:
%%bash
#@title Megahit - Carpenter Bee
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

# Activate micromamba env
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# Input: the 500,000-pair subsample written by "Generate subreads - Carpenter Bee".
R1="/content/SRR24955310_1.sub500000.fq"
R2="/content/SRR24955310_2.sub500000.fq"
OUTDIR="/content/megahit_carpenterbee"

# MEGAHIT refuses to start when OUTDIR already exists. Uncomment to re-run from
# scratch (this deletes the previous assembly):
#echo "Cleaning output dir ${OUTDIR}..."
#rm -rf "$OUTDIR"

# k-mer sizes 21,31,...,81 (--k-min/--k-max/--k-step); --min-count 2 ignores
# k-mers seen only once; -t 8 threads.
megahit \
  -1 "$R1" \
  -2 "$R2" \
  -o "$OUTDIR" \
  --min-count 2 \
  --k-min 21 \
  --k-max 81 \
  --k-step 10 \
  --mem-flag 2 \
  -t 8

echo "Assembly complete:"
ls -lh "$OUTDIR/final.contigs.fa"

In [None]:
# Copy the carpenter bee MEGAHIT output directory to the CarpenterBee folder on Drive.
!cp -r /content/megahit_carpenterbee/ /content/drive/MyDrive/BioinformaticsProject/Reads/CarpenterBee/


In [None]:
%%bash
#@title Megahit - Honeybee
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

# Activate micromamba env
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# Input: the 500,000-pair subsample written by "Generate subreads - Honeybee".
R1="/content/SRR36076821_1.sub500000.fq"
R2="/content/SRR36076821_2.sub500000.fq"
OUTDIR="/content/megahit_honeybee"

# MEGAHIT refuses to start when OUTDIR already exists. Uncomment to re-run from
# scratch (this deletes the previous assembly):
#echo "Cleaning output dir ${OUTDIR}..."
#rm -rf "$OUTDIR"

# k-mer sizes 21,31,...,81 (--k-min/--k-max/--k-step); --min-count 2 ignores
# k-mers seen only once; -t 8 threads.
megahit \
  -1 "$R1" \
  -2 "$R2" \
  -o "$OUTDIR" \
  --min-count 2 \
  --k-min 21 \
  --k-max 81 \
  --k-step 10 \
  --mem-flag 2 \
  -t 8

echo "Assembly complete:"
ls -lh "$OUTDIR/final.contigs.fa"


# Quast Evaluation

In [None]:
%%bash
#@title Installation
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# QUAST (assembly evaluation) from bioconda; conda-forge supplies dependencies.
micromamba install -y -c bioconda -c conda-forge quast

In [None]:
%%bash
#@title Honeybee Reference Genome
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

# Download honeybee reference genome (RefSeq assembly Amel_HAv3.1) into the
# current working directory.
REF_FNA="GCF_003254395.2_Amel_HAv3.1_genomic.fna"
REF_URL="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/254/395/GCF_003254395.2_Amel_HAv3.1/${REF_FNA}.gz"

# Skip the download when the unpacked reference is already present, so re-running
# the cell neither fetches a "...gz.1" duplicate nor trips gunzip on an existing file.
if [ ! -s "$REF_FNA" ]; then
  wget -O "${REF_FNA}.gz" "$REF_URL"
  gunzip -f "${REF_FNA}.gz"
fi

ls -lh "$REF_FNA"


In [None]:
%%bash
#@title Honeybee - Run Quast on Spades
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

# Activate micromamba env
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# Reference is looked up relative to the working directory, where the
# "Honeybee Reference Genome" cell unpacks it.
REFERENCE="GCF_003254395.2_Amel_HAv3.1_genomic.fna"
# The honeybee SPAdes cell writes to /content/result_spades_honeybee
# (not /content/spades_honeybee).
ASSEMBLY="/content/result_spades_honeybee/contigs.fasta"
OUTDIR="/content/quast_honeybee_spades"

quast.py "$ASSEMBLY" \
  -r "$REFERENCE" \
  -o "$OUTDIR"

In [None]:
#@title Save to Drive
# Copy the QUAST report directory for the honeybee SPAdes assembly to Drive.
# The source path is relative, so it is resolved against the current working directory.
!cp -r quast_honeybee_spades/ /content/drive/MyDrive/BioinformaticsProject/Reads/Honeybee/

In [None]:
# Bring the honeybee MEGAHIT results stored on Drive back into the runtime and
# rename the folder to /content/result_megahit_honeybee, the path the
# "Run Quast on Megahit" cell reads its assembly from.
# NOTE(review): no cell visible in this notebook copies a "result_megahit" folder
# to Reads/Honeybee on Drive (the MEGAHIT honeybee cell writes to
# /content/megahit_honeybee) - confirm the folder exists before running.
!cp -r /content/drive/MyDrive/BioinformaticsProject/Reads/Honeybee/result_megahit /content/
!mv /content/result_megahit /content/result_megahit_honeybee

In [None]:
%%bash
#@title Honeybee - Run Quast on Megahit
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

# Activate micromamba env
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# Reference is looked up relative to the working directory, where the
# "Honeybee Reference Genome" cell unpacks it.
REFERENCE="GCF_003254395.2_Amel_HAv3.1_genomic.fna"
# MEGAHIT results restored from Drive by the preceding cp/mv cell.
ASSEMBLY="/content/result_megahit_honeybee/final.contigs.fa"
OUTDIR="/content/quast_honeybee_megahit"

quast.py "$ASSEMBLY" \
  -r "$REFERENCE" \
  -o "$OUTDIR"

In [None]:
#@title Save results
# Copy the QUAST report directory for the honeybee MEGAHIT assembly to Drive.

!cp -r /content/quast_honeybee_megahit /content/drive/MyDrive/BioinformaticsProject/Reads/Honeybee

In [None]:
%%bash
#@title Carpenter Bee Reference Genome - Install Datasets CLI
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# -y answers the confirmation prompt: a %%bash cell has no interactive stdin,
# so without it the install cannot be confirmed.
micromamba install -y -c conda-forge ncbi-datasets-cli


In [None]:
%%bash
#@title Carpenter Bee Reference Genome - Pull from accession number
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

# Fetch only the genome sequence for assembly GCA_049004755.1. The archive is
# saved as ncbi_dataset.zip in the current working directory.
datasets download genome accession GCA_049004755.1 --include genome

In [None]:
# Unpack the NCBI Datasets archive into carpenterbee_reference_archive/.
# -o overwrites existing files without asking, so re-running the cell does not
# stall on unzip's interactive "replace?" prompt.
!unzip -o ncbi_dataset.zip -d carpenterbee_reference_archive/


In [None]:
%%bash
#@title Carpenter Bee - Run Quast on Spades
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

# Activate micromamba env
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

REFERENCE="/content/carpenterbee_reference_archive/ncbi_dataset/data/GCA_049004755.1/GCA_049004755.1_ASM4900475v1_genomic.fna"
# SPAdes contigs, as written to Drive by the carpenter bee SPAdes cell.
# (This previously pointed at the MEGAHIT contigs, so the "SPAdes" report was
# actually a second evaluation of the MEGAHIT assembly.)
ASSEMBLY="/content/drive/MyDrive/BioinformaticsProject/Reads/CarpenterBee/result_spades/contigs.fasta"
OUTDIR="/content/quast_carpenterbee_spades"

quast.py "$ASSEMBLY" \
  -r "$REFERENCE" \
  -o "$OUTDIR"

In [None]:
%%bash
#@title Carpenter Bee - Run Quast on Megahit
# `%%bash` has to be the very first line of the cell to be recognised as a cell
# magic; the Colab title line is an ordinary comment to bash, so it goes second.
set -e

# Activate micromamba env
eval "$(micromamba shell hook -s bash)"
micromamba activate beeasm

REFERENCE="/content/carpenterbee_reference_archive/ncbi_dataset/data/GCA_049004755.1/GCA_049004755.1_ASM4900475v1_genomic.fna"
# Contigs exactly where the "Megahit - Carpenter Bee" cell writes them; no cell
# creates /content/carpenterbee_megahit_contigs.fa.
ASSEMBLY="/content/megahit_carpenterbee/final.contigs.fa"
OUTDIR="/content/quast_carpenterbee_megahit"

quast.py "$ASSEMBLY" \
  -r "$REFERENCE" \
  -o "$OUTDIR"