In [None]:
# Input files
ctx_seqs.fasta


In [None]:
#!/usr/bin/env python3
"""
Script author: Ibrahim ElZahaby
Micro section number: 097220
Script function: split a multi-record FASTA file into 262 single-record plasmid FASTA files (one output file per sequence)
Usage: python3 single_fasta.py
"""
import os


def _read_fasta_records(fasta_file):
    """
    Yield (header, sequence) tuples from a FASTA file.
    A record starts on a line whose first character is '>'; the header is returned
    without the leading '>' and the sequence lines are joined into one string.
    Blank lines and any text before the first header are ignored.
    :param fasta_file: path of the FASTA file to read
    """
    header = None
    chunks = []
    with open(fasta_file, "r") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                # a new header closes the previous record, if there was one
                if header is not None:
                    yield header, "".join(chunks)
                header = line[1:].strip()
                chunks = []
            elif header is not None:
                chunks.append(line)
    # emit the last record of the file
    if header is not None:
        yield header, "".join(chunks)


def _accession_from_header(header):
    """
    Return the identifier used to name the output file for one record.
    The accession is the field after the third '|' character; when the header has
    fewer fields (or that field is empty) the first whitespace-delimited token of
    the header is used instead. Returns an empty string for an empty header.
    :param header: FASTA header line without the leading '>'
    """
    fields = header.split("|")
    if len(fields) > 3 and fields[3].strip():
        return fields[3].strip()
    tokens = header.split()
    return tokens[0] if tokens else ""


def split_fasta(fasta_file, output_dir="."):
    """
    Splits a multi-sequence FASTA file into individual files, one for each sequence.
    The output files will be named with the accession number extracted from the sequence header
    (see _accession_from_header) and each contains the original header followed by the
    sequence on a single line, terminated by a newline.
    Records that share an accession overwrite each other; records with an empty header are skipped.
    :param fasta_file: input FASTA file includes all genomic records
    :param output_dir: directory that receives the per-record files (created if missing,
                       defaults to the current working directory)
    :return: list with the paths of the files that were written, in input order
    """
    os.makedirs(output_dir, exist_ok=True)
    written = []
    for header, sequence in _read_fasta_records(fasta_file):
        accession = _accession_from_header(header)
        if not accession:
            # nothing usable to build a file name from
            continue
        out_path = os.path.join(output_dir, f"{accession}.fasta")
        # write the sequence to a text file with the accession number as the output file name
        with open(out_path, "w") as out_file:
            out_file.write(f">{header}\n{sequence}\n")
        written.append(out_path)
    return written


# Only split the input when executed as a script / notebook cell, not on import
if __name__ == "__main__":
    # declare input fasta file variable
    fasta_file = "ctx_seqs.fasta"
    split_fasta(fasta_file)

In [None]:
# Run the FASTA splitter on the Linux server. The script opens ctx_seqs.fasta by
# relative path, so launch it from the directory that contains that file.
python3 single_fasta.py

In [None]:
# Directory that receives one PlasmidFinder output folder per input FASTA file
PLASMIDFINDER_RES=/mnt/DATAPOOL/mmibstudentnew/dummy/output_files/plasmidfinder_res
PLASMIDFINDER_DB=/mnt/DATAPOOL/mmibstudentnew/plasmidfinder/plasmidfinder_db/

# create plasmidfinder results directory (-p: no error when it already exists)
mkdir -p "$PLASMIDFINDER_RES"

# run plasmidfinder on 262 ctx-M-15 dummy plasmids
# Only *.fasta entries are processed so that directories and scripts in the
# working directory are not passed to plasmidfinder.
for file in *.fasta; do
    # skip the literal pattern when no FASTA file matches
    [ -f "$file" ] || continue
    mkdir -p "$PLASMIDFINDER_RES/$file/"
    # The trailing backslashes keep all options on ONE command; without them the
    # shell would run "plasmidfinder.py -i FILE" alone and treat "-o ..." as a new command.
    plasmidfinder.py -i "$file" \
        -o "$PLASMIDFINDER_RES/$file/" \
        -t 0.60 -x -p \
        "$PLASMIDFINDER_DB"
done

# move some output files to plasmidfinder results directory
# These files only exist in the working directory if a run wrote there
# (presumably a run without -o — TODO confirm); missing ones are skipped.
for leftover in tmp results.txt results_tab.tsv data.json Hit_in_genome_seq.fsa Plasmid_seqs.fsa; do
    if [ -e "$leftover" ]; then
        mv "$leftover" "$PLASMIDFINDER_RES/"
    fi
done

In [None]:
# Directory holding the per-plasmid FASTA files, and the prokka working directory inside it
OUTPUT_DIR=/mnt/DATAPOOL/mmibstudentnew/dummy/output_files
PROKKA_RES="$OUTPUT_DIR/prokka_res"

# create prokka results directory (-p: no error when it already exists)
mkdir -p "$PROKKA_RES"
# copy fasta files to prokka results directory
cp "$OUTPUT_DIR"/*.fasta "$PROKKA_RES"/

# run prokka on 262 ctx-M-15 dummy plasmids
# Absolute paths are used instead of relying on the current working directory, so
# the annotation folders are created under prokka_res next to the copied FASTA files.
# NOTE(review): the output folder name is the file name written twice
# (e.g. X.fastaX.fasta), kept exactly as in the original command — confirm this is intended.
for file in "$PROKKA_RES"/*.fasta; do
    # skip the literal pattern when no FASTA file matches
    [ -f "$file" ] || continue
    name=$(basename "$file")
    prokka --kingdom Bacteria --outdir "$PROKKA_RES/$name$name" --prefix "$name" --force "$file"
done

In [None]:
# Source of the prokka annotations and destination for the panaroo input files
PROKKA_RES=/mnt/DATAPOOL/mmibstudentnew/dummy/output_files/prokka_res/
PANAROO_DATA=/mnt/DATAPOOL/mmibstudentnew/dummy/output_files/panaroo_data

# create directory for panaroo (-p: no error when it already exists)
mkdir -p "$PANAROO_DATA"

# copy all gff files found below the prokka results into the panaroo directory
# The trailing backslashes keep the whole find expression on one command line.
find "$PROKKA_RES" \
    -type f -name "*.gff" -exec cp {} \
    "$PANAROO_DATA" \;

In [None]:
# create the panaroo working directory with one sub-folder per size class
# (-p creates missing parents and does not fail when panaroo_data already exists)
mkdir -p panaroo_data/large_files panaroo_data/small_files
cd panaroo_data

In [None]:
#!/usr/bin/env python3
"""
Script author: Ibrahim ElZahaby
Student number: 1069624
Script function: separate 239 GFF files into two folders: files larger than 1 KB and files of 1 KB or smaller
Usage: python3 file_size_separator.py
"""
import os
import shutil

# Define the paths to the input and output directories
gffs_dir = '/mnt/DATAPOOL/mmibstudentnew/dummy/output_files/panaroo_data/'
large_gffs = '/mnt/DATAPOOL/mmibstudentnew/dummy/output_files/panaroo_data/large_files/'
small_gffs = '/mnt/DATAPOOL/mmibstudentnew/dummy/output_files/panaroo_data/small_files/'

# Size cut-off in bytes (1 KB): files strictly larger than this are "large"
SIZE_THRESHOLD_BYTES = 1 * 1024


def separate_by_size(src_dir, large_dir, small_dir, threshold=SIZE_THRESHOLD_BYTES):
    """
    Move the GFF files of src_dir into large_dir or small_dir depending on their size.

    Only regular files whose name ends with '.gff' are considered; sub-directories
    (including the two destination folders, which live inside the input directory)
    and any other files are left untouched.

    :param src_dir: directory that contains the GFF files
    :param large_dir: destination for files larger than ``threshold`` bytes
    :param small_dir: destination for files of ``threshold`` bytes or fewer
    :param threshold: size cut-off in bytes
    :return: dict with the keys 'large' and 'small', each a sorted list of moved file names
    :raises shutil.Error: if a file with the same name already exists in the destination
    """
    os.makedirs(large_dir, exist_ok=True)
    os.makedirs(small_dir, exist_ok=True)
    moved = {"large": [], "small": []}

    # Loop over the files in the input directory (sorted for a reproducible order)
    for filename in sorted(os.listdir(src_dir)):
        # Construct the full path to the file
        filepath = os.path.join(src_dir, filename)
        # Skip directories and anything that is not a GFF file
        if not os.path.isfile(filepath) or not filename.endswith(".gff"):
            continue
        # Check if the file size is greater than the threshold (1 KB by default)
        if os.path.getsize(filepath) > threshold:
            # Move the file to the large files directory
            shutil.move(filepath, large_dir)
            moved["large"].append(filename)
        else:
            # Move the file to the small files directory
            shutil.move(filepath, small_dir)
            moved["small"].append(filename)
    return moved


# Only move files when executed as a script / notebook cell, not on import
if __name__ == "__main__":
    separate_by_size(gffs_dir, large_gffs, small_gffs)

In [None]:
# Run the size-separator script on the Linux server
python3 file_size_separator.py

In [None]:
# Panaroo working directory; the GFF files were sorted into large_files/ and small_files/
PANAROO_DATA=/mnt/DATAPOOL/mmibstudentnew/dummy/output_files/panaroo_data

# run_panaroo IN_DIR OUT_DIR
# Runs panaroo on every *.gff file in IN_DIR and writes the results to OUT_DIR
# (created if missing). The option set is identical for both size classes; the
# trailing backslashes keep all options on one command line.
run_panaroo() {
    local in_dir=$1
    local out_dir=$2
    mkdir -p "$out_dir"
    panaroo -i "$in_dir"/*.gff -o "$out_dir" --clean-mode strict --remove-invalid-genes \
        -f 0.5 --len_dif_percent 0.98 --merge_paralogs --refind_prop_match 0.5 \
        --search_radius 1000 --aligner clustal --core_threshold 0.98 -t 10
}

# run panaroo on large_gff files
run_panaroo "$PANAROO_DATA/large_files" "$PANAROO_DATA/large_res/"

# run panaroo on small_gff files
run_panaroo "$PANAROO_DATA/small_files" "$PANAROO_DATA/small_res/"

In [None]:
# create new directory for abricate results (-p: no error when it already exists)
mkdir -p abricate_res
# run abricate on fasta files against the vfdb database and store the report
# inside the results directory created above
abricate --db vfdb --quiet /mnt/DATAPOOL/mmibstudentnew/dummy/output_files/*.fasta > abricate_res/abr_output.tab
# create output summary
abricate --summary abricate_res/abr_output.tab > abricate_res/abr_sum.tab