# Miniconda installation

In [None]:
# Install Miniconda into ~/miniconda3, create the project environment and
# install the QC / assembly-evaluation tools used later in this notebook.

# -p creates parent directories as needed and does not fail if the directory exists.
mkdir -p ~/miniconda3
# Download the 64-bit Linux Miniconda installer.
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
# Run the installer: -b = batch (no prompts), -u = update an existing install,
# -p = installation prefix.
bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
# Remove the installer once it has run to free up space.
rm -f ~/miniconda3/miniconda.sh

# Register conda with bash so that future shells can use `conda activate`.
~/miniconda3/bin/conda init bash
# `conda init` only affects NEW shells: either close and reopen the shell, or
# load conda's shell functions into the current session as done here.
source ~/miniconda3/etc/profile.d/conda.sh

# Create and activate the project environment.
conda create -n project_env python=3.9 -y
conda activate project_env

# Configure channels. Each --add puts the channel at the TOP of the list, so the
# last one added has the highest priority; Bioconda recommends conda-forge on top.
# Channel names are lower-case.
conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge

# Install the required packages (-y answers the confirmation prompt, which
# would otherwise block a non-interactive notebook cell).
conda install -y -c bioconda fastqc
conda install -y -c bioconda quast
conda install -y -c bioconda bbmap



# Quality control and filtering of long reads

In [None]:

# Quality-check all reads with FastQC, then filter the long reads with NanoFilt.

# Create the data directory (-p: no error if it already exists) and move into it.
mkdir -p data
cd data
# NOTE: long_reads.fastq, short_reads_1.fastq and short_reads_2.fastq must be
# present in this directory before the commands below are run.

# Output directory for the FastQC reports.
mkdir -p fastqc_output
# Run FastQC on the long reads and on both files of the paired-end short reads.
fastqc long_reads.fastq short_reads_1.fastq short_reads_2.fastq -o fastqc_output

# Install (or upgrade) NanoFilt.
pip install --upgrade nanofilt
# Filter the long reads:
#   -q 12   drops reads with an average quality score below 12
#   -l 2000 drops reads shorter than 2000 bp
# The filtered reads are written to cleaned_reads.fastq.
NanoFilt -q 12 -l 2000 < long_reads.fastq > cleaned_reads.fastq




# Repairing the paired-end short reads with BBMap

In [None]:

# A previous SPAdes run reported a problem with the short reads (the two
# paired files were inconsistent), so BBMap's repair.sh is used to re-pair them.

# Install BBMap (provides repair.sh). A single install from the main bioconda
# channel is used; also installing from the legacy `bioconda/label/cf201901`
# label could replace it with an outdated build.
conda install -y -c bioconda bbmap

# First repair pass: re-pair the two short-read files.
repair.sh in1=short_reads_1.fastq in2=short_reads_2.fastq out1=short_reads_1_fixed.fastq out2=short_reads_2_fixed.fastq

# Check that both files now hold the same number of reads.
# A FASTQ record is 4 lines, so reads = lines / 4. (Counting lines that start
# with "@" is unreliable because quality strings may also begin with "@".)
awk 'END { print NR / 4 }' short_reads_1_fixed.fastq
awk 'END { print NR / 4 }' short_reads_2_fixed.fastq

# Second repair pass on the output of the first pass.
repair.sh in1=short_reads_1_fixed.fastq in2=short_reads_2_fixed.fastq out1=short_reads_1_fixedd.fastq out2=short_reads_2_fixedd.fastq

# Confirm the read counts match; once they do, the reads are ready for assembly.
awk 'END { print NR / 4 }' short_reads_1_fixedd.fastq
awk 'END { print NR / 4 }' short_reads_2_fixedd.fastq

# Spades assembly

In [None]:

# Hybrid SPAdes assembly from the repaired paired-end short reads and the
# filtered nanopore long reads.
# NOTE(review): spades.py is not installed anywhere in this notebook — it is
# assumed to be available on PATH already; confirm.
#   --careful  enables SPAdes' careful mode
#   -t 8       number of threads
#   -m 64      memory limit in GB
spades.py \
  -1 short_reads_1_fixedd.fastq \
  -2 short_reads_2_fixedd.fastq \
  --nanopore cleaned_reads.fastq \
  -o spades_output \
  --careful \
  -t 8 \
  -m 64

# Assembly statistics with QUAST. "$HOME/data" replaces the hard-coded
# /home/hanaj/data so the cell also works for other users.
quast "$HOME/data/spades_output/contigs.fasta" -o "$HOME/data/quast_output"
# The QUAST output showed multiple contigs, so further cleaning was required.



# Racon polishing

In [None]:
# Polish the SPAdes contigs with Racon using the filtered long reads.

# Create and activate a dedicated environment for Racon.
conda create -n racon_env python=3.9 -y
conda activate racon_env

# Tools (all from bioconda; -y avoids the interactive confirmation prompt):
#   racon    - polishes genome assemblies
#   samtools - SAM/BAM conversion, sorting and indexing
#   bwa      - maps low-divergence sequences against a large reference genome
#   minimap2 - fast aligner for long-read data
conda install -y -c bioconda racon
conda install -y -c bioconda samtools
conda install -y -c bioconda bwa
conda install -y -c bioconda minimap2

# Map the filtered long reads onto the SPAdes contigs. This produces the
# alignment.sam file that every command below reads.
# NOTE(review): the original cell never created alignment.sam; the map-ont
# preset is assumed because the reads were passed to SPAdes as --nanopore.
minimap2 -ax map-ont spades_output/contigs.fasta cleaned_reads.fastq > alignment.sam

# Convert SAM to BAM (the input format is auto-detected by current samtools).
samtools view -b alignment.sam > alignment.bam
# Sort the alignments; sorting is required before indexing and variant calling.
samtools sort -o sorted_alignment.bam alignment.bam
# Index the sorted BAM file.
samtools index sorted_alignment.bam

# Racon arguments, in order: reads, overlaps/alignments, target sequences.
# The polished assembly is written to stdout.
racon cleaned_reads.fastq alignment.sam spades_output/contigs.fasta > polished_contigs.fasta

# Leave the Racon environment.
conda deactivate

# Evaluate the polished assembly. A separate output directory is used so the
# QUAST report of the unpolished SPAdes assembly is not overwritten.
quast "$HOME/data/polished_contigs.fasta" -o "$HOME/data/quast_output_polished"
# A single contig was seen, which allowed annotation to proceed.


# Annotation with prokka



In [None]:
# Annotate the reference genomes and the newly isolated strain with Prokka.
conda install -y -c conda-forge -c bioconda -c defaults prokka

# Directory for the Prokka results (-p: no error if it already exists).
mkdir -p prokka_output
cd prokka_output

# Reference genomes: the input files live one level up (../). The accession,
# i.e. everything before "_ASM" in the file name, is used both as the output
# directory and as the file prefix.
for genome in \
    GCA_000204195.2_ASM20419v1_genomic.fna \
    GCA_000204215.2_ASM20421v2_genomic.fna \
    GCF_000204215.1_ASM20421v2_genomic.fna \
    GCF_000204195.1_ASM20419v1_genomic.fna
do
  accession="${genome%%_ASM*}"
  prokka --outdir "./${accession}" --prefix "${accession}" "../${genome}"
done

# Newly isolated strain: the Racon-polished assembly.
prokka --outdir ./Acidovoraxradicis --prefix Acidovoraxradicis ../polished_contigs.fasta


# Annotation with pgap 

In [None]:

# Install Docker Engine on Ubuntu, then run the NCBI PGAP annotation pipeline.

# Refresh the package index.
sudo apt-get update
# Install the prerequisites used by the steps below.
sudo apt-get install -y ca-certificates curl gnupg lsb-release

# Add Docker's official GPG key (--yes overwrites an existing key file instead
# of prompting, so the cell can be re-run).
sudo mkdir -p /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor --yes -o /etc/apt/keyrings/docker.gpg
sudo chmod a+r /etc/apt/keyrings/docker.gpg

# Register the Docker apt repository.
echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \
  $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

# Refresh the package index again so the new repository is picked up.
sudo apt-get update
# Install Docker Engine (-y avoids the interactive confirmation prompt).
sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

# Start Docker now and enable it at boot.
sudo systemctl start docker
sudo systemctl enable docker

# Verify the installation.
# NOTE: running docker without sudo requires the current user to be in the
# `docker` group (log out and back in after being added).
docker run hello-world

# Move to the directory that contains pgap.py.
# TODO: replace /path/to/pgap.py with the real location of the script.
cd "$(dirname /path/to/pgap.py)"
# Make sure the script is executable and verify its permissions.
chmod +x pgap.py
ls -l pgap.py

# Run PGAP.
# NOTE(review): confirm the meaning of -r and -d against `./pgap.py --help`
# for the installed PGAP version.
./pgap.py -r -o "$HOME/data/pgap_annotation/pgap_output" -d "$HOME/data/input_data" input.yaml


# Busco

In [None]:
# BUSCO was used to assess the completeness of the genome assemblies.
# The bacteria lineage was specified and genome mode was used.
#   -i  input file (genome in FASTA format)
#   -o  output directory name
#   -m  mode (genome)
#   -l  lineage dataset (bacteria_odb10)

busco -i polished_contigs.fasta -o Genome_Acidovoraxradicis -m genome -l bacteria_odb10
busco -i GCA_020510705.1_ASM2051070v1_genomic.fna -o Genome_GCA_020510705.1 -m genome -l bacteria_odb10
busco -i GCA_000204195.2_ASM20419v1_genomic.fna -o Genome_GCA_000204195.2 -m genome -l bacteria_odb10
busco -i GCA_000204215.2_ASM20421v2_genomic.fna -o Genome_GCA_000204215.2 -m genome -l bacteria_odb10

# The same assessment on the predicted protein sets, again with the bacteria
# lineage dataset.
#   -m  mode (protein)
busco -i acidovorax_protein.faa -o strain_Acidovoraxradicis_protein -m protein -l bacteria_odb10
busco -i GCA_020510705.1_protein.faa -o strain_GCA_020510705.1_protein -m protein -l bacteria_odb10
busco -i GCF_000204195.1_protein.faa -o strain_GCF_000204195.1_protein -m protein -l bacteria_odb10
busco -i GCF_000204215.1_protein.faa -o strain_GCF_000204215.1_protein -m protein -l bacteria_odb10

# Preparing the BUSCO summaries for visualisation.

# generate_plot.py ships with BUSCO; take it from the BUSCO environment for
# both plots so the cell does not depend on the current working directory.
# NOTE(review): busco_env is not created in this notebook — confirm the path.
BUSCO_PLOT_SCRIPT="$HOME/miniconda3/envs/busco_env/bin/generate_plot.py"

# Genome-mode summaries: copy the plotting script next to the summaries.
mkdir -p ~/data/BUSCO_summaries/scripts
cp "$BUSCO_PLOT_SCRIPT" ~/data/BUSCO_summaries/scripts/

# Change to the directory holding the genome-mode BUSCO summaries and
# generate the plot for the summaries in that directory.
cd ~/data/BUSCO_summaries
python3 scripts/generate_plot.py -wd .

# Protein-mode summaries: a separate directory stores these summaries.
mkdir -p ~/data/BUSCO_summaries_proteins_2/scripts
cp "$BUSCO_PLOT_SCRIPT" ~/data/BUSCO_summaries_proteins_2/scripts/
# Change to that directory and generate the plot for the protein-mode runs.
cd ~/data/BUSCO_summaries_proteins_2
python3 scripts/generate_plot.py -wd .


# Roary pangenome analysis 

In [None]:
# Pangenome analysis with Roary.

# Create and activate a dedicated environment (-y avoids the confirmation prompt).
conda create -n roary_env python=3.8 -y
conda activate roary_env
# Install Roary into the new environment; the fresh environment does not
# contain it otherwise.
conda install -y -c conda-forge -c bioconda roary

# Working directory for the analysis (-p: no error if it already exists).
mkdir -p roary_dir
cd roary_dir
# The Prokka-annotated GFF files for the newly isolated strain and the strains
# downloaded from NCBI were moved into this directory manually.
# Roary is run with its default parameters on every GFF file present.
roary *.gff
