In [1]:
import param
import panel as pn
import subprocess
import pandas as pd
from Bio import SeqIO
from io import StringIO
import ray
from subprocess import Popen, PIPE, run
import os
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import gc_fraction
from glob import glob
import matplotlib.pyplot as plt
import collections
#subprocess mess
from pathlib import Path
import numpy as np
import shutil

In [2]:
# Load Panel's 'floatpanel' extension; the welcome dialog in this cell is
# built with pn.layout.FloatPanel.
pn.extension('floatpanel')

def search(event):
    """Look up genome assemblies for the taxon typed into ``taxon_input``.

    Runs ``datasets summary genome taxon <taxon> --assembly-source genbank``
    and stores its output as ``<taxon>.json`` in the current working
    directory (taxon lower-cased, spaces replaced by underscores). The number
    of hits is read from ``.total_count`` with ``jq`` and shown in
    ``num_of_genomes_str``. One CSV row per report (accession, organism name,
    strain, tax id) is written to ``info_table.csv``.

    The external tools are invoked with argument lists instead of shell
    strings, so the user-supplied taxon name can never be interpreted as
    shell syntax.

    Parameters
    ----------
    event : object
        Click event passed by the button callback; not used.
    """
    loading.visible = True
    loading.value = True
    loading.name = "Searching NCBI database ..."
    taxon_file = taxon_input.value.lower().replace(" ", "_")
    summary_path = f"{taxon_file}.json"
    table_filter = (
        ".reports[]? | [.accession, .organism.organism_name, "
        ".organism.infraspecific_names.strain, .organism.tax_id] | @csv"
    )
    try:
        with open(summary_path, "wb") as summary_out:
            subprocess.run(
                ["datasets", "summary", "genome", "taxon", taxon_input.value,
                 "--assembly-source", "genbank"],
                stdout=summary_out,
                check=True,
            )
        counted = subprocess.run(
            ["jq", ".total_count", summary_path],
            stdout=subprocess.PIPE,
            check=True,
        )
        # Empty or "null" output (no usable summary) raises ValueError here.
        num = int(counted.stdout.decode('ascii').strip())
        num_of_genomes_str.object = f"{num} genomes found."
        table = subprocess.run(
            ["jq", "-r", table_filter, summary_path],
            stdout=subprocess.PIPE,
            check=True,
        )
        # Keep at most `num` lines, mirroring the former `head -<num>`.
        with open("info_table.csv", "wb") as table_out:
            table_out.writelines(table.stdout.splitlines(keepends=True)[:num])
        loading.name = "Done. Please download the wanted amount of genomes"
    except (subprocess.CalledProcessError, OSError, ValueError):
        num_of_genomes_str.object = "No genomes found. Please check for potential spelling errors"
        loading.name = ""
    finally:
        loading.value = False
    
def download(event):
    """Download the first N genomes of the last search into the project folder.

    Steps:

    1. Read accessions from ``<taxon>.json`` with ``jq`` and keep the first
       ``num_of_genomes_input.value`` of them in ``accession_list.txt``.
    2. Fetch them with ``datasets download genome accession`` into
       ``<taxon>.zip`` and unzip the archive into ``project_folder``.
    3. For every accession, move one file out of
       ``ncbi_dataset/data/<accession>/`` into ``project_folder``: the single
       file present is stored as ``<accession>.fna``; otherwise
       ``genomic.gbff`` is stored as ``<accession>.gbff``.
    4. Delete the temporary download artefacts and refresh the statistics
       table via ``update_df()``.

    Commands are run with argument lists (no shell), files are moved by full
    path instead of changing the working directory per genome, and the
    accession list file is closed deterministically.

    Parameters
    ----------
    event : object
        Click event passed by the button callback; not used.

    Raises
    ------
    subprocess.CalledProcessError, OSError
        Re-raised after the spinner has been reset when an external tool or a
        file operation fails.
    """
    loading.visible = True
    loading.value = True
    loading.name = "Downloading genomes ..."
    taxon_file = taxon_input.value.lower().replace(" ", "_")
    summary_path = f"{taxon_file}.json"
    archive_path = f"{taxon_file}.zip"
    try:
        listing = subprocess.run(
            ["jq", "-r", ".reports[] | .accession", summary_path],
            stdout=subprocess.PIPE,
            check=True,
            text=True,
        )
        # Equivalent of the former `head -<n>` on the accession list.
        accessions = [
            line.strip()
            for line in listing.stdout.splitlines()[:num_of_genomes_input.value]
        ]
        with open("accession_list.txt", "w") as accession_out:
            accession_out.writelines(f"{accession}\n" for accession in accessions)

        subprocess.run(
            ["datasets", "download", "genome", "accession",
             "--include", "gbff,genome",
             "--inputfile", "accession_list.txt",
             "--filename", archive_path],
            check=True,
        )
        subprocess.run(["unzip", archive_path, "-d", project_folder], check=True)

        for accession in accessions:
            genome_dir = os.path.join(project_folder, "ncbi_dataset", "data", accession)
            entries = os.listdir(genome_dir)
            if len(entries) == 1:
                # Only one file was delivered: store it as nucleotide FASTA.
                shutil.move(
                    os.path.join(genome_dir, entries[0]),
                    os.path.join(project_folder, f"{accession}.fna"),
                )
            elif "genomic.gbff" in entries:
                # Prefer the annotated GenBank file when it is available.
                shutil.move(
                    os.path.join(genome_dir, "genomic.gbff"),
                    os.path.join(project_folder, f"{accession}.gbff"),
                )

        os.chdir(project_folder)
        shutil.rmtree("ncbi_dataset")
        os.remove(archive_path)
        os.remove("accession_list.txt")
        os.remove(summary_path)
        os.remove("README.md")

        update_df()
    except Exception:
        # Do not leave the spinner running forever when something went wrong.
        loading.value = False
        loading.name = "Download failed. See the server log for details."
        raise
    loading.value = False
    loading.name = "Done. Please upload your own genomes or continue to next stage."

def save(event):
    """Store the file(s) chosen in ``genome_input`` and refresh the table.

    Does nothing when no file has been selected. Otherwise the upload is
    written via ``genome_input.save`` under the uploaded filename(s) and
    ``update_df()`` is called to pick up the new genomes.

    Parameters
    ----------
    event : object
        Click event passed by the button callback; not used.
    """
    if genome_input.filename is None:
        return

    loading.value = True
    loading.visible = True
    loading.name = "Uploading genomes ..."

    genome_input.save(genome_input.filename)
    update_df()

    loading.value = False
    loading.name = "Done. Please continue to next stage."

def _load_name_table():
    """Return the accession-indexed organism table read from ``info_table.csv``.

    An empty table is returned when the file is missing or empty (for example
    when a project is opened before any NCBI search was run).
    """
    columns = ['organism', 'strain', 'tax_id']
    try:
        return pd.read_csv('info_table.csv', index_col=0, names=columns)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return pd.DataFrame(columns=columns)


def _organism_info(name_df, genome_name):
    """Return ``(organism, strain, tax_id)`` for a genome, or dashes if unknown."""
    try:
        return (
            name_df.at[genome_name, 'organism'],
            name_df.at[genome_name, 'strain'],
            name_df.at[genome_name, 'tax_id'],
        )
    except (KeyError, ValueError, TypeError):
        return ('-', '-', '-')


def _n50(lengths):
    """Return the N50 of a list of sequence lengths (0 for an empty list).

    Sequences are visited from longest to shortest; the N50 is the length of
    the sequence at which the running total first reaches half of the total.
    """
    half_total = sum(lengths) / 2
    running = 0
    for length in sorted(lengths, reverse=True):
        running += length
        if running >= half_total:
            return length
    return 0


def _stats_row(genome_name, name_df, lengths, chunks, gene_count, plasmid_count):
    """Build one row of the statistics table for a genome."""
    organism, strain, tax_id = _organism_info(name_df, genome_name)
    return {
        'Name': genome_name,
        'Organism': organism,
        'Strain': strain,
        'Taxonomy id': tax_id,
        'Length': sum(lengths),
        'Sequences': len(lengths),
        'Gene count': gene_count,
        'Plasmids': plasmid_count,
        # GC is computed over all sequences of the genome joined together.
        'GC%': round(gc_fraction("".join(chunks)) * 100, 2),
        'N50': _n50(lengths),
    }


def _scan_genbank(genbank_file, genome_name):
    """Parse a GenBank file once and collect what the statistics need.

    Returns ``(lengths, chunks, gene_count, plasmid_count, proteins)`` where
    ``proteins`` is a list of dicts with the keys ``id``, ``genome``,
    ``translation`` and ``product`` for every non-pseudo CDS.
    """
    lengths, chunks, proteins = [], [], []
    gene_count = 0
    plasmid_count = 0
    for record in SeqIO.parse(genbank_file, 'genbank'):
        lengths.append(len(record.seq))
        chunks.append(str(record.seq))
        for feature in record.features:
            qualifiers = feature.qualifiers
            if feature.type == "gene":
                gene_count += 1
            elif feature.type == "CDS":
                if 'pseudo' in qualifiers:
                    continue
                # A CDS without translation or locus tag cannot be stored;
                # skip it instead of aborting the whole scan with a KeyError.
                if 'translation' not in qualifiers or 'locus_tag' not in qualifiers:
                    continue
                proteins.append({
                    'id': qualifiers['locus_tag'][0],
                    'genome': genome_name,
                    'translation': qualifiers['translation'][0],
                    'product': qualifiers['product'][0] if 'product' in qualifiers else '',
                })
            elif feature.type == "source":
                if 'plasmid' in qualifiers:
                    plasmid_count += 1
    return lengths, chunks, gene_count, plasmid_count, proteins


def _store_proteins(new_proteins):
    """Merge newly extracted proteins into ``genome_to_protein.csv``.

    Rows already stored for other genomes are kept; rows for the genomes
    being added are replaced. Nothing is written when there are no new rows.
    """
    if not new_proteins:
        return
    columns = ['id', 'genome', 'translation', 'product']
    merged = pd.DataFrame(new_proteins, columns=columns)
    if os.path.isfile('genome_to_protein.csv'):
        try:
            # usecols ignores any stray index columns left in the file.
            stored = pd.read_csv('genome_to_protein.csv', usecols=columns)[columns]
        except (ValueError, pd.errors.EmptyDataError):
            stored = pd.DataFrame(columns=columns)
        stored = stored[~stored['genome'].isin(merged['genome'])]
        merged = pd.concat([stored, merged], ignore_index=True)
    merged.to_csv('genome_to_protein.csv')


def update_df():
    """Add statistics for genome files that are not yet in ``sequence_df``.

    Every ``*.gbff`` and ``*.f*`` file in ``project_folder`` whose derived
    genome name is not already an index entry of ``sequence_df.value`` is
    parsed, and a row with name, organism, strain, taxonomy id, total length,
    number of sequences, gene count, plasmid count, GC% and N50 is appended.
    Gene and plasmid counts are ``'-'`` for FASTA files. ``clusters.fasta`` is
    never treated as a genome.

    The table is persisted to ``sequence_stats.csv`` and proteins of non-pseudo
    CDS features are merged into ``genome_to_protein.csv`` (both relative to
    the current working directory). The loading spinner is reset even when
    parsing fails.
    """
    loading.visible = True
    loading.value = True
    loading.name = "Updating sequence statistic..."
    try:
        name_df = _load_name_table()
        known = set(sequence_df.value.index.tolist())
        rows = []
        proteins = []

        for genbank_file in glob(f'{project_folder}/*.gbff'):
            genome_name = os.path.basename(genbank_file).split('.gbff')[0]
            if genome_name in known:
                continue
            lengths, chunks, gene_count, plasmid_count, found = _scan_genbank(genbank_file, genome_name)
            proteins.extend(found)
            rows.append(_stats_row(genome_name, name_df, lengths, chunks, gene_count, plasmid_count))
            # Prevents a second row if a FASTA file with the same name exists.
            known.add(genome_name)

        for fasta_file in glob(f'{project_folder}/*.f*'):
            file_name = os.path.basename(fasta_file)
            if file_name == 'clusters.fasta':
                continue
            genome_name = file_name.split('.f')[0]
            if genome_name in known:
                continue
            lengths, chunks = [], []
            for record in SeqIO.parse(fasta_file, 'fasta'):
                lengths.append(len(record.seq))
                chunks.append(str(record.seq))
            rows.append(_stats_row(genome_name, name_df, lengths, chunks, '-', '-'))
            known.add(genome_name)

        if rows:
            new_stats = pd.DataFrame(rows).set_index('Name')
            sequence_df.value = pd.concat([sequence_df.value, new_stats])
            sequence_df.value.to_csv('sequence_stats.csv')
        # Reload from disk when available; a fresh project has no file yet.
        if os.path.isfile('sequence_stats.csv'):
            sequence_df.value = pd.read_csv('sequence_stats.csv', index_col=0)

        _store_proteins(proteins)
    finally:
        loading.value = False
        loading.name = ""


def remove(event):
    """Remove the first selected genome from the table, the CSVs and disk.

    The selected row is dropped from ``sequence_df`` and
    ``sequence_stats.csv`` is rewritten. Rows of that genome are filtered out
    of ``genome_to_protein.csv`` when that file exists, and the first file in
    ``project_folder`` matching ``<genome_name>.*`` is deleted when present.
    The call is a no-op when no row is selected.

    Parameters
    ----------
    event : object
        Click event passed by the button callback; not used.
    """
    if not sequence_df.selection:
        return

    genome_name = sequence_df.value.iloc[sequence_df.selection[0]].name
    # Clear the selection first: the stored position would otherwise point at
    # a different (or non-existent) row once the table has shrunk.
    sequence_df.selection = []
    sequence_df.value = sequence_df.value.drop([genome_name])
    sequence_df.value.to_csv('sequence_stats.csv')

    if os.path.isfile('genome_to_protein.csv'):
        # index_col=0 keeps the round trip stable instead of adding another
        # unnamed index column on every removal.
        genome_to_protein_df = pd.read_csv('genome_to_protein.csv', index_col=0)
        genome_to_protein_df = genome_to_protein_df[genome_to_protein_df.genome != genome_name]
        genome_to_protein_df.to_csv('genome_to_protein.csv')

    genome_files = glob(f"{project_folder}/{genome_name}.*")
    if genome_files:
        os.remove(genome_files[0])

def create(event):
    """Create a new project folder below the current working directory.

    The global ``project_folder`` is set to ``<cwd>/<project name>``. When
    that path already exists the name field is cleared and a hint is shown in
    its placeholder. Otherwise the folder is created, made the working
    directory, populated via ``project_structure()`` and the welcome dialog
    is hidden.

    Parameters
    ----------
    event : object
        Click event passed by the button callback; not used.
    """
    global project_folder
    project_folder = f"{os.getcwd()}/{project_name.value}"

    if os.path.exists(project_folder):
        project_name.value=""
        project_name.placeholder = "This folder already exists!"
        return

    os.mkdir(project_folder)
    os.chdir(project_folder)
    project_structure()
    floatpanel.visible = False


def open_proj(event):
    """Open an existing project folder entered in ``open_proj_name``.

    When the entered path is an existing directory, the global
    ``project_folder`` is set to its absolute path, the process changes into
    it, the sub-folders are ensured via ``project_structure()``, the welcome
    dialog is hidden, a stored ``sequence_stats.csv`` is loaded into
    ``sequence_df`` and ``update_df()`` is run. Otherwise the input is cleared
    and a hint is shown in its placeholder.

    The path is made absolute *before* changing directory: a relative path
    would otherwise be resolved a second time from inside the project folder
    by every later ``f"{project_folder}/..."`` lookup.

    Parameters
    ----------
    event : object
        Click event passed by the button callback; not used.
    """
    global project_folder
    requested = open_proj_name.value
    if not os.path.isdir(requested):
        open_proj_name.value=""
        open_proj_name.placeholder = "This folder does not exist!"
        return

    project_folder = os.path.abspath(requested)
    os.chdir(project_folder)
    project_structure()
    floatpanel.visible = False

    stats_path = os.path.join(project_folder, 'sequence_stats.csv')
    if os.path.isfile(stats_path):
        sequence_df.value = pd.read_csv(stats_path, index_col=0)
    update_df()

def project_structure():
    """Ensure the ``aligned`` and ``trees`` sub-folders exist in the project.

    Each folder is created inside the global ``project_folder`` only when no
    file system entry of that name exists yet.
    """
    for subfolder in ("aligned", "trees"):
        if not os.path.exists(f"{project_folder}/{subfolder}"):
            os.mkdir(os.path.join(project_folder, subfolder))
        
# --- Welcome dialog: create a new project or open an existing one ----------
welcome_Str = pn.pane.Str(
    "Welcome to PanLoki!\nPlease create a new project...",
    styles={'font-size': '12pt'},
)
project_name = pn.widgets.TextInput(
    name='Project Name:',
    placeholder='Enter a project name here...',
)
create_proj_button = pn.widgets.Button(name='Create new project', button_type='primary')
or_Str = pn.pane.Str(
    "\n\n... or open an existing project by entering the path to the project folder",
    styles={'font-size': '12pt'},
)
open_proj_name = pn.widgets.TextInput(
    name='Project Path',
    placeholder='Enter path to existing project...',
)
open_proj_button = pn.widgets.Button(name='Open existing project', button_type='primary')

# Every header control of the floating panel is removed via its config.
config = {
    "headerControls": {
        "close": "remove",
        "maximize": "remove",
        "normalize": "remove",
        "minimize": "remove",
        "smallify": "remove",
    }
}
floatpanel = pn.layout.FloatPanel(
    pn.Column(
        welcome_Str,
        pn.Row(project_name, create_proj_button),
        or_Str,
        pn.Row(open_proj_name, open_proj_button),
    ),
    name='Welcome',
    margin=20,
    config=config,
)

# --- NCBI search and download widgets ---------------------------------------
taxon_input = pn.widgets.TextInput(
    name='Fetch genomes from NCBI',
    placeholder='Enter taxon name here...',
)
search_button = pn.widgets.Button(name='Search', button_type='primary')
num_of_genomes_str = pn.pane.Str("", styles={'font-size': '12pt'})
num_of_genomes_input = pn.widgets.IntInput(name='Number of genomes', value=5, step=1, start=0)
download_button = pn.widgets.Button(name='Download', button_type='primary')

# --- Upload of local genome files -------------------------------------------
genome_input = pn.widgets.FileInput(accept='.fasta, .gbff, .fna, .fa', multiple=True)
genome_save_button = pn.widgets.Button(name='Save', button_type='primary')
upload_str = pn.pane.Str(
    "\n\nUpload genomes below. The maximum \nfile size for uploading is 100 Mb.\nAllowed formats: .gbff, .fa, .fna, .fasta",
    styles={'font-size': '12pt'},
)

# --- Statistics table and progress indicator --------------------------------
sequence_df = pn.widgets.DataFrame(pd.DataFrame(), height=700, width=1000)
remove_button = pn.widgets.Button(name='Remove Genome', width=150, button_type='primary')
loading = pn.indicators.LoadingSpinner(value=False, name='', visible=False)

# --- Wire the buttons to their callbacks ------------------------------------
create_proj_button.on_click(create)
open_proj_button.on_click(open_proj)
remove_button.on_click(remove)
search_button.on_click(search)
download_button.on_click(download)
genome_save_button.on_click(save)

# --- Layout -----------------------------------------------------------------
genome_interface = pn.Row(
    pn.Column(
        pn.Row(taxon_input, search_button),
        num_of_genomes_str,
        pn.Row(num_of_genomes_input, download_button),
        pn.Column(upload_str, pn.Row(genome_input, genome_save_button)),
    ),
    pn.Row(sequence_df, remove_button),
)

gspec_spinner = pn.GridSpec()
gspec_spinner[2, 0:2] = loading

gspec_genome = pn.GridSpec(height=600)
gspec_genome[0:3, 0:4] = genome_interface
gspec_genome[4, 4] = gspec_spinner

pn.Column(floatpanel, gspec_genome).servable()

In [1]:
    
    #proteins=[]
    #GC_contents = {}
    #for genbank_file in file_names:
    #    for gb_obj in SeqIO.parse(genbank_file,'genbank'):
    #        genome_name = os.path.basename(genbank_file).split('.gbff')[0]
    #        GC_contents[genome_name] = gc_fraction(gb_obj.seq)
    #        for feature in gb_obj.features:
    #             if feature.type == "CDS":
    #                protein = SeqRecord(Seq(feature.qualifiers['translation'][0]), id=feature.qualifiers['locus_tag'][0], description='')
    #                proteins.append(protein)
    #SeqIO.write(proteins,outputfile,'fasta')
    #plt.bar(*zip(*GC_contents.items()), color = "green")
    #plt.title('GC content')
    #plt.xticks(rotation='vertical')
    #plt.ylabel('GC%')
    #plt.axis(ymin=min(GC_contents.values())-0.01)
    #plt.savefig(f'{project_folder}/GC%.png')

    