In [2]:
import param
import panel as pn
import subprocess
import pandas as pd
from Bio import SeqIO, AlignIO
from io import StringIO
import os
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import gc_fraction
from glob import glob
import shutil
import csv
from collections import Counter
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import multiprocessing
import numpy as np
from bokeh.models import ColumnDataSource, Plot, Grid, Range1d
from bokeh.plotting import figure
from bokeh.models.glyphs import Text, Rect
from bokeh.layouts import gridplot
import pyperclip as pc
from collections import defaultdict
import json
import matplotlib.cm as cm
import matplotlib.colors as mplc
import re
from enum import Enum

In [3]:
#jupyter config with updated file input settings!

In [4]:
pn.extension('floatpanel', 'tabulator', 'plotly')
pipeline = pn.pipeline.Pipeline(debug=True)

In [5]:
project_folder= os.getcwd()
mafft_folder=f"{project_folder}/aligned"
tree_folder=f"{project_folder}/trees"
images_folder = f"{project_folder}/images"
blast_folder = f"{project_folder}/blast_results"
export_folder = f"{project_folder}/export"

show_float = True
num_cpus = multiprocessing.cpu_count()

In [7]:
class Genomes(param.Parameterized):

    def view(self):
        def create(event):
            global project_folder
            if project_path_input.value == '':
                project_path_input.value = project_path_input.placeholder
            if project_path_input.value[-1] == '/':
                project_folder = f"{project_path_input.value}{project_name.value}"
            else:
                project_folder = f"{project_path_input.value}/{project_name.value}"
            if not os.path.exists(project_folder):
                os.mkdir(project_folder)
                os.chdir(project_folder)
                project_structure()
                floatpanel.visible = False
            else:
                project_name.value=""
                project_name.placeholder = "This folder already exists!"

        def open_proj(event):
            global project_folder
            if os.path.exists(open_proj_name.value) and os.path.isdir(open_proj_name.value):
                project_folder = open_proj_name.value
                os.chdir(project_folder)
                project_structure()
                floatpanel.visible = False
                with os.scandir(project_folder) as entries:
                    for entry in entries:
                        if 'sequence_stats.csv' == entry.name:
                            sequence_df.value = pd.read_csv('sequence_stats.csv', index_col=0)
                update_df()
            else:
                open_proj_name.value=""
                open_proj_name.placeholder = "This folder does not exist!"
        
        def project_structure():
            global show_float
            global mafft_folder
            global tree_folder
            global image_folder
            global blast_folder
            global export_folder
            show_float = False
            mafft_folder=f"{project_folder}/aligned"
            tree_folder=f"{project_folder}/trees"
            images_folder = f"{project_folder}/images"
            blast_folder = f"{project_folder}/blast_results"
            export_folder = f"{project_folder}/export"
            if os.path.exists(mafft_folder) == False:
                os.mkdir(os.path.join(project_folder, "aligned"))
            if os.path.exists(tree_folder) == False:
                os.mkdir(os.path.join(project_folder, "trees"))
            if os.path.exists(images_folder) == False:
                os.mkdir(os.path.join(project_folder, "images"))
            if os.path.exists(blast_folder) == False:
                os.mkdir(os.path.join(project_folder, "blast_results"))
            if os.path.exists(export_folder) == False:
                os.mkdir(os.path.join(project_folder, "export"))

        def update_df():
            if os.path.isfile(f'{project_folder}/sequence_stats.csv') == True: 
                sequence_df.value = pd.read_csv('sequence_stats.csv', index_col=0)
            loading.visible = True
            loading.value = True
            loading.name = "Updating sequence statistic..."
            genome_to_proteins=[]
            dict_list = []
            gbff_names=glob(f'{project_folder}/*.gbff')
            name_df = pd.read_csv('info_table.csv', index_col=0, names=['organism', 'strain', 'tax_id'])
            for genbank_file in gbff_names:
                genome_name = os.path.basename(genbank_file).split('.gbff')[0]
                if not genome_name in sequence_df.value.index.tolist():
                    genome_length = 0
                    count = 0
                    plasmid_counter = 0
                    tot = 0
                    n50 = 0
                    rRNA_count = 0
                    tRNA_count = 0
                    protein_number = 0
                    try:
                        organism = name_df.at[genome_name, 'organism']
                        strain = name_df.at[genome_name, 'strain']
                        tax_id = name_df.at[genome_name, 'tax_id']
                    except:
                        organism = '-'
                        strain = '-'
                        tax_id = '-'
                    seq_concat = ""
                    for gb_obj in SeqIO.parse(genbank_file,'genbank'):
                        genome_length += len(gb_obj.seq)
                        count += 1
                        seq_concat += gb_obj.seq
                        for feature in gb_obj.features:
                            if feature.type == "CDS":
                                if not 'pseudo' in feature.qualifiers:
                                    prot_id = feature.qualifiers['locus_tag'][0]
                                    translation = feature.qualifiers['translation'][0]
                                    if 'product' in feature.qualifiers:
                                        product = feature.qualifiers['product'][0]
                                    else:
                                        product = ''
                                    genome_to_proteins_dict = {'id': prot_id, 'genome': genome_name, 'sequence': str(translation), 'product': product, 'type' : 'protein', 'protein number': protein_number}
                                    genome_to_proteins.append(genome_to_proteins_dict)
                                    protein_number +=1
                            elif feature.type == "rRNA" or feature.type == "tRNA" or feature.type == "tmRNA":
                                if feature.type == 'rRNA':
                                    rRNA_count += 1
                                elif feature.type == 'tRNA':
                                    tRNA_count += 1 
                                RNA_id = feature.qualifiers['locus_tag'][0]
                                sequence = feature.extract(gb_obj).seq
                                if 'product' in feature.qualifiers:
                                    product = feature.qualifiers['product'][0]
                                    genome_to_proteins_dict = {'id': RNA_id, 'genome': genome_name, 'sequence': str(sequence), 'product': product, 'type' : feature.type, 'protein number': '-'}
                                    genome_to_proteins.append(genome_to_proteins_dict)
                            if feature.type == "source":
                                if 'plasmid' in feature.qualifiers:
                                    plasmid_counter += 1
                    gc_content = round(gc_fraction(seq_concat) *100, 2)
                    for gb_obj in SeqIO.parse(genbank_file,'genbank'):
                        tot += len(gb_obj.seq)
                        if( n50 == 0 and tot > genome_length/2 ):
                            n50 = len(gb_obj.seq)
                    row_dict = {'Name': genome_name, 'Organism': organism, 'Strain': strain, 'Taxonomy id': tax_id, 'Length': genome_length, 'Contigs': count, 'Gene count': protein_number+tRNA_count+rRNA_count, 'Plasmids': plasmid_counter, 'GC%': gc_content, 'N50': n50, 'tRNA': tRNA_count, 'rRNA': rRNA_count, 'protein count': protein_number}
                    dict_list.append(row_dict)

            fasta_names=glob(f'{project_folder}/*.f*')
            for fasta_file in fasta_names:
                genome_name = os.path.basename(fasta_file).split('.f')[0]
                if not genome_name in sequence_df.value.index.tolist():
                    loading.name = f"Running Prodigal for {genome_name}..."
                    subprocess.run(f"prodigal -i {os.path.basename(fasta_file)} -q -a proteins.faa", shell=True)
                    loading.name = f"Running barrnap for {genome_name}..."
                    subprocess.run(f" barrnap {os.path.basename(fasta_file)} -q --threads {num_cpus} --outseq rRNA.fasta", shell=True)
                    loading.name = f"Running aragorn for {genome_name}..."
                    subprocess.run(f"aragorn -l -gc11 -fo -o tRNA.fasta {os.path.basename(fasta_file)}", shell=True)
                    loading.name = "Updating sequence statistic..."
                    genome_length = 0
                    count = 0
                    gene_count = 0
                    plasmid_counter = '-'
                    tot = 0
                    n50 = 0
                    rRNA_count = 0
                    tRNA_count = 0
                    protein_number = 0
                    try:
                        organism = name_df.at[genome_name, 'organism']
                        strain = name_df.at[genome_name, 'strain']
                        tax_id = name_df.at[genome_name, 'tax_id']
                    except:
                        organism = '-'
                        strain = '-'
                        tax_id = '-'
                    seq_concat = ""
                    for fa_obj in SeqIO.parse(fasta_file,'fasta'):
                        genome_length += len(fa_obj.seq)
                        count += 1
                        seq_concat += fa_obj.seq
                    gc_content = round(gc_fraction(seq_concat) *100, 2)
                    for fa_obj in SeqIO.parse(fasta_file,'fasta'):
                        tot += len(fa_obj.seq)
                        if( n50 == 0 and tot > genome_length/2 ):
                            n50 = len(fa_obj.seq)
                    for prot_obj in SeqIO.parse(f"{project_folder}/proteins.faa",'fasta'):
                        gene_count += 1
                        genome_to_proteins_dict = {'id':  f'prot_{genome_name[-6:]}_{gene_count}', 'genome': genome_name, 'sequence': str(prot_obj.seq), 'product': '-', 'type': 'protein', 'protein number': protein_number}
                        genome_to_proteins.append(genome_to_proteins_dict)
                        protein_number +=1
                    for rRNA_obj in SeqIO.parse(f"{project_folder}/rRNA.fasta",'fasta'):
                        gene_count += 1
                        rRNA_count +=1
                        genome_to_proteins_dict = {'id': f'rRNA_{genome_name[-6:]}_{gene_count}', 'genome': genome_name, 'sequence': str(rRNA_obj.seq), 'product': rRNA_obj.id.split('::')[0].split('_')[0] + " ribosomal RNA", 'type': 'rRNA', 'protein number': '-'}
                        genome_to_proteins.append(genome_to_proteins_dict)
                    for tRNA_obj in SeqIO.parse(f"{project_folder}/tRNA.fasta",'fasta'):
                        gene_count += 1
                        type = 'tRNA'
                        tRNA_count +=1
                        if tRNA_obj.id == 'tmRNA':
                            type = 'tmRNA'
                            tRNA_count -= 1
                        genome_to_proteins_dict = {'id': f'tRNA_{genome_name[-6:]}_{gene_count}', 'genome': genome_name, 'sequence': str(tRNA_obj.seq), 'product': tRNA_obj.id, 'type': type, 'protein number': '-'}
                        genome_to_proteins.append(genome_to_proteins_dict)
                    os.remove("proteins.faa")
                    os.remove("rRNA.fasta")
                    os.remove("tRNA.fasta")
                    os.remove(f"{os.path.basename(fasta_file)}.fai")
                    row_dict = {'Name': genome_name, 'Organism': organism, 'Strain': strain, 'Taxonomy id': tax_id, 'Length': genome_length, 'Contigs': count, 'Gene count': protein_number+tRNA_count+rRNA_count, 'Plasmids': plasmid_counter, 'GC%': gc_content, 'N50': n50, 'tRNA': tRNA_count, 'rRNA': rRNA_count, 'protein count': protein_number}
                    dict_list.append(row_dict)

            df = pd.DataFrame.from_dict(dict_list)
            if not df.empty:
                df = pd.concat([sequence_df.value, df.set_index('Name')])
                sequence_df.value=df
                sequence_df.value.to_csv('sequence_stats.csv')
            sequence_df.value = pd.read_csv('sequence_stats.csv', index_col=0)
            genome_to_protein_add_df = pd.DataFrame.from_dict(genome_to_proteins)
            if os.path.isfile(f"{project_folder}/genome_to_protein.csv") == True:
                genome_to_protein_df = pd.read_csv('genome_to_protein.csv')
            else:
                genome_to_protein_df = pd.DataFrame()
            genome_to_protein_df = pd.concat([genome_to_protein_df, genome_to_protein_add_df]).reset_index(drop=True)
            genome_to_protein_df.to_csv('genome_to_protein.csv', index=False)

            
            genome_to_protein_df = pd.read_csv('genome_to_protein.csv')
            print(genome_to_protein_df)
            genome_to_protein_df.set_index('id', inplace=True)
            duplicate_index = genome_to_protein_df.index[genome_to_protein_df.index.duplicated()]
            print(duplicate_index)
            genome_to_protein_df.reset_index(inplace=True)
            if not len(duplicate_index)==0:
                genome_to_protein_df['sequence'] = genome_to_protein_df.groupby('id')['sequence'].transform(lambda x: ''.join(x))
                genome_to_protein_df.drop_duplicates(subset='id', inplace=True)
                print(genome_to_protein_df)
            genome_to_protein_df.to_csv('genome_to_protein.csv', index=False)

            
            fig = make_subplots(rows=1, cols=3)
            columns_to_plot = ['Length', 'Gene count', 'GC%']
            for i, col in enumerate(columns_to_plot, start=1):
                fig.add_trace(go.Bar(x=sequence_df.value.index, y=sequence_df.value[col], name=col,  xaxis=f'x{i}', yaxis=f'y{i}'))
            for i in range(1, len(columns_to_plot) + 1):
                fig.update_layout(xaxis=dict(title='Genomes', tickangle=-90), xaxis2=dict(title='Genomes', tickangle=-90), xaxis3=dict(title='Genomes', tickangle=-90))
            
            fig.update_yaxes(title_text="Length in bp", row=1, col=1)
            fig.update_yaxes(title_text="Gene count", row=1, col=2)
            fig.update_yaxes(title_text="GC%", row=1, col=3)    
            
            fig.update_layout(
                title='Sequence Statistics',
                xaxis_title='Genomes',
                colorway=px.colors.qualitative.T10,
                grid={'rows': 1, 'columns': len(columns_to_plot)},
                height=400,
                width=1200,
            )
            fig.write_image("images/sequence_stats.png")
            stats_fig.object = fig
            stats_fig.visible = True
            
            
            loading.value = False
            loading.name = ""
            

        def search(event):
            loading.visible = True
            loading.value = True
            loading.name = "Searching NCBI database ..."
            taxon_file = taxon_input.value.lower().replace(" ", "_")
            try:
                subprocess.run(f"""datasets summary genome taxon "{taxon_input.value}" --assembly-source genbank >  {taxon_file}.json""", shell=True)
                num = subprocess.Popen(f"""jq '.total_count' {taxon_file}.json""", stdout=subprocess.PIPE, shell=True)
                num = int(num.stdout.read().decode('ascii').strip())
                num_of_genomes_str.object = f"{num} genomes found."
                subprocess.run(f"cat {taxon_file}.json | jq -r '.reports[] | [.accession, .organism.organism_name, .organism.infraspecific_names.strain, .organism.tax_id]| @csv' | head -{num} > info_table.csv", shell=True) 
                loading.name = "Done. Please download the wanted amount of genomes"
            except:
                num_of_genomes_str.object = "No genomes found. Please check for potential spelling errors"
                loading.name = ""
            loading.value = False
            
        def download(event):
            loading.visible = True
            loading.value = True
            loading.name = "Downloading genomes ..."
            taxon_file = taxon_input.value.lower().replace(" ", "_")
            subprocess.run(f"cat {taxon_file}.json | jq -r '.reports[]| .accession' | head -{num_of_genomes_input.value} > accession_list.txt", shell=True)
            subprocess.run(f"datasets download genome accession  --include gbff,genome --inputfile accession_list.txt --filename {taxon_file}.zip", shell=True)
            subprocess.run(f"unzip {taxon_file}.zip -d {project_folder}", shell=True)
            genome_iterator = open('accession_list.txt', 'r')
            genome_lines = genome_iterator.readlines()
            for line in genome_lines:
                genome_path = f"{project_folder}/ncbi_dataset/data/{line.strip()}"
                os.chdir(genome_path)
                genome_iterator = os.listdir(genome_path)
                if len(genome_iterator) == 1:
                    subprocess.run(f"""mv {genome_iterator[0]} {line.strip()}.fna""", shell=True)
                    subprocess.run(f"mv {line.strip()}.fna {project_folder}", shell=True)
                else:
                    subprocess.run(f"""mv genomic.gbff {line.strip()}.gbff""", shell=True)
                    subprocess.run(f"mv {line.strip()}.gbff {project_folder}", shell=True)
            os.chdir(project_folder)
            shutil.rmtree("ncbi_dataset")     
            os.remove(f"{taxon_file}.zip")
            os.remove("accession_list.txt")
            os.remove(f"{taxon_file}.json")
            os.remove("README.md")
            
            update_df()
            loading.value = False
            loading.name = "Done. Please upload your own genomes or continue to next stage."
        
        def save(event):
            if genome_input.filename != None:
                loading.value = True
                loading.visible = True
                loading.name = "Uploading genomes ..."
                genome_input.save(genome_input.filename)
                update_df()
                loading.value = False
                loading.name = "Done. Please continue to next stage."

        def remove(event):
            genome_name = sequence_df.value.iloc[sequence_df.selection[0]].name
            sequence_df.value = sequence_df.value.drop([genome_name])
            sequence_df.value.to_csv('sequence_stats.csv')
            genome_to_protein_df = pd.read_csv('genome_to_protein.csv')
            genome_to_protein_df = genome_to_protein_df[genome_to_protein_df.genome != genome_name].reset_index(drop=True)
            genome_to_protein_df.to_csv('genome_to_protein.csv', index=False)
            os.remove(glob(f"{project_folder}/{genome_name}.*")[0])
                
        welcome_Str = pn.pane.Str("Welcome to PanLoki!\nPlease create a new project...",styles={'font-size': '12pt'})
        project_name = pn.widgets.TextInput(name='Project Name:', placeholder='Enter project name ...', width=150)
        project_path_input = pn.widgets.TextInput(name='Project Path:', placeholder=os.getcwd(), width=450)
        create_proj_button = pn.widgets.Button(name='Create new project', button_type='primary')
        or_Str = pn.pane.Str("\n\n... or open an existing project by entering the path to the project folder",styles={'font-size': '12pt'})
        open_proj_name = pn.widgets.TextInput(name='Project Path', placeholder='Enter path to existing project...', width=600)
        open_proj_button  = pn.widgets.Button(name='Open existing project', button_type='primary')
        
        create_proj_button.on_click(create)
        open_proj_button.on_click(open_proj)

        
        config = {"headerControls": {"close": "remove", "maximize": "remove", "normalize": "remove", "minimize": "remove", "smallify": "remove"}}
        floatpanel = pn.layout.FloatPanel(pn.Column(welcome_Str,pn.Row(project_path_input, project_name, create_proj_button), or_Str, pn.Row(open_proj_name, open_proj_button)), name='Welcome', margin=20, config=config)

        taxon_input = pn.widgets.TextInput(name='Fetch genomes from NCBI', placeholder='Enter taxon name here...')
        search_button = pn.widgets.Button(name='Search', button_type='primary')
        num_of_genomes_str = pn.pane.Str("",styles={'font-size': '12pt'})
        
        num_of_genomes_input = pn.widgets.IntInput(name='Number of genomes', value=5, step=1, start=0)
        download_button = pn.widgets.Button(name='Download', button_type='primary')
        
        genome_input = pn.widgets.FileInput(accept='.fasta, .gbff, .fna, .fa', multiple=True)
        genome_save_button =pn.widgets.Button(name='Save', button_type='primary')
        upload_str = pn.pane.Str("\n\nUpload genomes below. The maximum \nfile size for uploading is 100 Mb.\nAllowed formats: .gbff, .fa, .fna, .fasta",styles={'font-size': '12pt'})
        
        sequence_df = pn.widgets.DataFrame(pd.DataFrame(),height=450, width=1000)
        stats_fig = pn.pane.Plotly(visible=False)

        remove_button = pn.widgets.Button(name='Remove Genome', width=150, button_type='primary')
        remove_button.on_click(remove)
        
        
        
        search_button.on_click(search)
        download_button.on_click(download)
        genome_save_button.on_click(save)
        
        genome_interface = pn.Row( pn.Column(pn.Row(taxon_input, search_button),num_of_genomes_str, pn.Row(num_of_genomes_input, download_button), pn.Column(upload_str, pn.Row(genome_input, genome_save_button))), pn.Row(sequence_df, remove_button))
        
        loading = pn.indicators.LoadingSpinner(value=False, name='', visible = False)
        
        gspec_spinner = pn.GridSpec()
        gspec_spinner[2,0:2] = loading
        
        gspec_genome = pn.GridSpec(height = 750)
        gspec_genome[0:2,   0:4  ] = genome_interface
        gspec_genome[3, 1:4] = pn.Column(stats_fig)
        gspec_genome[4,   4] = gspec_spinner

        if show_float == False:
            update_df()

        if show_float == True:
            return pn.Column(floatpanel, gspec_genome)
        else:
            return pn.Column(gspec_genome)

    def panel(self):
        return pn.Row(self.view,)


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 57)

In [4]:
class Diamond(param.Parameterized):
    def view(self):

        def cluster(event):
            loading.visible = True
            loading.name = "Diamond clustering ..."
            loading.value = True
            proteins = []
            rnas= []
            rna_dict = defaultdict(list)
            rna_list = []
            for index, row in gene_df.value.iterrows():
                if row['type'] == 'protein':
                    protein = SeqRecord(Seq(row['sequence']), id=index, description='')
                    proteins.append(protein)
                else:
                    rna = SeqRecord(Seq(row['sequence']), id=index, description='')
                    rnas.append(rna)
                    if row["type"] == "tRNA" :
                        rna_dict[row['product'].split('(')[0]].append(index)
                    elif row["type"] == "rRNA" :
                        rna_dict[row['product'].split(' .')[0]].append(index)
                    else:
                        rna_dict['tmRNA'].append(index)
            for key in rna_dict:
                cluster = rna_dict[key]
                genome_list = []
                row_cluster_to_rna = {'Name': cluster[-1], 'genes': cluster, 'pangenome': '-'}
                for rna in cluster:
                    genome_list.append(genome_to_protein_df.loc[rna]['genome'])
                row_cluster_to_rna['genomes']=genome_list
                row_cluster_to_rna['genomes_set']= "-"
                row_cluster_to_rna['type']=genome_to_protein_df.loc[cluster[-1]]['type']
                rna_list.append(row_cluster_to_rna)
            cluster_to_rna_df = pd.DataFrame.from_dict(rna_list)
            cluster_to_rna_df.to_csv('cluster_to_rna.csv' ,index=False)            
            outputfile_proteins=f'{project_folder}/clusters.fasta'
            outputfile_rnas=f'{project_folder}/rnas.fasta'
            SeqIO.write(proteins,outputfile_proteins,'fasta')
            SeqIO.write(rnas,outputfile_rnas,'fasta')
            subprocess.run('makeblastdb -in rnas.fasta -parse_seqids -out blastdb/rnagenes_db -dbtype nucl', shell=True)
            subprocess.run("diamond makedb --in clusters.fasta -d clusterdb",shell=True)
            os.remove("clusters.fasta")
            os.remove("rnas.fasta")
            subprocess.run(f"diamond cluster --db clusterdb --evalue 0.00001 --approx-id {approx_slider.value} --member-cover {member_slider.value} --out {project_folder}/clusters.txt --cluster-steps faster sensitive ultra-sensitive",shell=True)
            loading.name = "Diamond reclustering ..."
            subprocess.run(f"diamond recluster --db clusterdb --evalue 0.00001 --approx-id {approx_slider.value} --member-cover {member_slider.value} --clusters {project_folder}/clusters.txt --out {project_folder}/reclusters.txt --cluster-steps faster sensitive ultra-sensitive" ,shell=True)
            loading.value = False
            loading.name = "Done. Please continue to next stage."

        def save_changes_protein_table(event):
            loading.name = 'Saving changes.'
            loading.value = True
            gene_df.value.to_csv('genome_to_protein.csv')
            gene_df.value = pd.read_csv('genome_to_protein.csv', index_col=0)
            loading.name = 'Done. Please cluster proteins.'
            loading.value = False
        cluster_button = pn.widgets.Button(name='Cluster Proteins', button_type='primary')
        approx_slider = pn.widgets.EditableFloatSlider(name='Approximate sequence identity threshold in %', fixed_start=0.0, fixed_end=100.0, step=0.5, value=50.0)
        member_slider= pn.widgets.EditableFloatSlider(name='Coverage threshold of the cluster member sequence in %', fixed_start=0.0, fixed_end=100.0, step=0.5, value=50.0)
        cluster_button.on_click(cluster)
        cluster_interface = pn.Column(approx_slider, member_slider, cluster_button)
        genome_to_protein_df = pd.read_csv('genome_to_protein.csv', index_col=0)
        gene_df = pn.widgets.Tabulator(genome_to_protein_df , header_filters=True, pagination="remote", widths={'id': '10%', 'genome': '15%', 'sequence': '40%', 'product': '25%', 'type':'10%'}, height=700, width=1000)
        save_changes_button = pn.widgets.Button(name='Save changes', button_type='primary')
        save_changes_button.on_click(save_changes_protein_table)

        loading = pn.indicators.LoadingSpinner(value=False, name='Cluster the proteins with Diamond.', visible = True)
        if os.path.isfile(f'{project_folder}/reclusters.txt') == True:
            loading.name = "Calculated Dimond clusters detected. This stage can be skipped."
            loading.visible = True
        gspec_spinner = pn.GridSpec()
        gspec_spinner[2,0:2] = loading
        
        gspec_cluster = pn.GridSpec(height = 600)
        gspec_cluster[0:4,   0:3  ] = pn.Row(pn.Column(gene_df, save_changes_button), cluster_interface) 
        gspec_cluster[4,   4] = gspec_spinner
        
        
        return gspec_cluster
    def panel(self):
        return self.view()

NameError: name 'param' is not defined

In [3]:
class Pan_computation(param.Parameterized):
    def view(self):
        def pan_comp(event):
            loading.visible = True
            loading.name = "Calculating Pangenome Characteristics..."
            loading.value = True

            cluster_df = pd.read_csv(f'{project_folder}/reclusters.txt', sep='\t', header=None)
            prot_lists= []
            prot_old = ''
            for index, row in cluster_df.iterrows():
                prot = row[0]
                if prot != prot_old:
                    if index > 0:
                        prot_list.append(prot_list.pop(prot_list.index(prot_old)))
                        prot_lists.append(prot_list)
                    prot_list = []
                    prot_list.append(row[1])
                else:
                    prot_list.append(row[1])
                prot_old = row[0]
            prot_lists.append(prot_list)
            genome_to_protein_df = pd.read_csv('genome_to_protein.csv', index_col=0)
            cluster_to_rna_df = pd.read_csv('cluster_to_rna.csv', converters = {"genes": pd.eval, 'genomes': pd.eval})
            for index, row in cluster_to_rna_df.iterrows():
                cluster_to_rna_df.at[index, 'genomes_set'] = set(cluster_to_rna_df.loc[index]['genomes'])
            cluster_to_protein_df = pd.DataFrame()
            cluster_to_protein_dict = {}
            cluster_to_protein_list = []
            #cluster_list = []
            for cluster in prot_lists:
                #helper_list = []
                genome_list = []
                row_cluster_to_protein = {'Name': cluster[-1], 'genes': cluster, 'pangenome': '-'}
                for protein in cluster:
                    genome_list.append(genome_to_protein_df.loc[protein]['genome'])
                    #helper_list.append(SeqRecord(Seq(genome_to_protein_df.loc[protein]['sequence']), id=protein, description=''))
                row_cluster_to_protein['genomes']=genome_list
                row_cluster_to_protein['genomes_set']=set(genome_list)
                row_cluster_to_protein['type']=genome_to_protein_df.loc[cluster[-1]]['type']
                #cluster_list.append(helper_list)
                cluster_to_protein_list.append(row_cluster_to_protein)
            #for index, row in cluster_to_rna_df.iterrows():
                #helper_list = []
                #for rna in row['genes']:
                #    helper_list.append(SeqRecord(Seq(genome_to_protein_df.loc[rna]['sequence']), id=rna, description=''))
                #cluster_list.append(helper_list)
            cluster_to_protein_df = pd.DataFrame.from_dict(cluster_to_protein_list)
            cluster_to_gene_df = pd.concat([cluster_to_protein_df, cluster_to_rna_df], axis=0).set_index("Name")
            
            stats = open('sequence_stats.csv', 'r')
            reader = csv.reader(stats)
            next(reader, None)
            genome_names = [] 
            for row in reader:
                genome_names.append(row[0])
            pan_df = pd.DataFrame(0, index=genome_names, columns=['Cloud','Shell','Soft-Core','Core']) 
            soft_core = round(len(genome_names)*0.95)
            for index, row in cluster_to_gene_df.iterrows():
                counter =  Counter(row['genomes']).items()
                if len(counter)==1:
                    for item, count in counter:
                        pan_df.loc[item]['Cloud'] += count
                        cluster_to_gene_df.at[index, 'pangenome'] = 'Cloud'
                elif 1 < len(counter) < soft_core:
                    for item, count in counter:
                        pan_df.loc[item]['Shell'] += count
                        cluster_to_gene_df.at[index, 'pangenome'] = 'Shell'
                elif soft_core != len(genome_names) and soft_core <= len(counter) < len(genome_names):
                    for item, count in counter:
                        pan_df.loc[item]['Soft-Core'] += count
                        cluster_to_gene_df.at[index, 'pangenome'] = 'Soft-Core'
                elif len(counter) == len(genome_names):
                    for item, count in counter:
                        pan_df.loc[item]['Core'] += count
                        cluster_to_gene_df.at[index, 'pangenome'] = 'Core'
            pan_df.to_csv('pan.csv')
            cluster_to_gene_df.to_csv('cluster_to_gene.csv')

            add_genomes_set = set()
            pan_core_size_df = pd.DataFrame(0, index=genome_names, columns=['Core','Pan']) 
            for genome in genome_names:
                add_genomes_set.add(genome)
                core_size = 0
                pan_size = 0
                for index, row in cluster_to_gene_df.iterrows():
                    if add_genomes_set.issubset(row['genomes_set']):
                        core_size += 1
                    elif len(add_genomes_set.intersection(row['genomes_set'])) > 0:
                        pan_size +=1
                pan_core_size_df.loc[genome]['Core']= core_size
                pan_core_size_df.loc[genome]['Pan']= pan_size+core_size
            pan_core_size_df.to_csv('pan_core_size.csv')
            show_pan_core()
            loading.name = "Done. Please continue to next stage."
            loading.value = False

        def show_pan_core():
            pan_df = pd.read_csv('pan.csv', index_col=0)
            traces = []
            for col in pan_df.columns:
                trace = go.Bar(x=pan_df.index, y=pan_df[col], name=col,  hoverinfo='y')
                traces.append(trace)
                
            layout = go.Layout(
                title='Pan Genome Distribution',
                xaxis=dict(title='Genome'),
                yaxis=dict(title='Cluster'),
                barmode='stack',
                xaxis_tickangle=-90,
                colorway=px.colors.qualitative.T10,
                width=1200,
                height=500
            )
            
            fig_pc = go.Figure(data=traces, layout=layout)
            fig_pc.write_image("images/pan_core.png")
            pan_core_fig.object =fig_pc
            pan_core_fig.visible = True

            pan_core_size_df = pd.read_csv('pan_core_size.csv', index_col=0)
            trace_core = go.Scatter(x=pan_core_size_df.index, y=pan_core_size_df['Core'], mode='lines+markers', name='Core', hoverinfo='y')
            trace_pan = go.Scatter(x=pan_core_size_df.index, y=pan_core_size_df['Pan'], mode='lines+markers', name='Pan',  hoverinfo='y')
            
            layout = go.Layout(
                title='Pan-Core Chart',
                xaxis=dict(title='Genomes'),
                yaxis=dict(title='Cluster'),
                xaxis_tickangle=-90,
                colorway=px.colors.qualitative.T10,
                width=1200,
                height=500
            )
            
            fig_pcs = go.Figure(data=[trace_core, trace_pan], layout=layout)
            fig_pcs.write_image("images/pan_core_size.png")
            pan_core_size_fig.object = fig_pcs
            pan_core_size_fig.visible = True

            
        msa_hmm_button=pn.widgets.Button(name='Pangenome Charecteristics Calculation', button_type='primary')
        msa_hmm_button.on_click(pan_comp)
        
        pan_core_fig = pn.pane.Plotly(visible=False)
        pan_core_size_fig = pn.pane.Plotly(visible=False)

        if os.path.isfile('pan_core_size.csv') and os.path.isfile('pan.csv'):
            show_pan_core()

        loading = pn.indicators.LoadingSpinner(value=False, name='', visible = False)

        if os.path.exists(f"{project_folder}/pan.csv") == True and os.path.exists(f"{project_folder}/pan_core_size.csv") == True:
            loading.visible = True
            loading.name = 'Characteristics detected. This stage can be skipped.'
            
        
        msa_interface = msa_hmm_button

        
        gspec_msa = pn.GridSpec(height = 600)
        gspec_msa[0,   0  ] = msa_interface
        gspec_msa[1:10, 0:7] = pn.Column(pan_core_fig,pan_core_size_fig)
        gspec_msa[10,   7:10] = loading
        
        return gspec_msa
    def panel(self):
        return self.view()

NameError: name 'param' is not defined

In [4]:
class Sequence_Workbench(param.Parameterized):
    def view(self):
        cluster_to_protein_df = pd.read_csv('cluster_to_gene.csv', index_col=0, converters = {"genes": pd.eval})
        genome_to_protein_df = pd.read_csv('genome_to_protein.csv', index_col=0)
        cluster_overview_df = cluster_to_protein_df[['genes', 'type', 'pangenome']].copy().rename(columns={"genes": "size"})
        cluster_overview_df['product'] = '-'
        for index, row in cluster_overview_df.iterrows():
            cluster_overview_df.at[index, 'size'] = len(row['size'])
            product = ''
            for protein in row['size']:
                if protein in genome_to_protein_df.index:
                    product = genome_to_protein_df.loc[protein]['product']
                    break
            cluster_overview_df.at[index, 'product'] = product
        cluster_overview_df = cluster_overview_df.astype({'size': 'int64'})
        nested_cluster_df = lambda row: pn.widgets.Tabulator(genome_to_protein_df.loc[cluster_to_protein_df.loc[row.name]['genes']], width=900, layout='fit_columns', widths={'id': '10%', 'genome': '15%', 'translation': '40%', 'product': '35%'})

        seq_stats_df = pd.read_csv('sequence_stats.csv')
        color_df = seq_stats_df['Name'].copy().to_frame()
        color_df['Color'] = '-'
        color_df['Color2'] = '-'
        num_genomes = len(color_df.index)
        colors = cm.rainbow(np.linspace(0, 1, num_genomes))
        for index, row in color_df.iterrows():
            color_df.at[index, 'Color'] ='#ff' + str(mplc.rgb2hex(colors[index])).rsplit('#')[1]
            color_df.at[index, 'Color2'] = str(mplc.rgb2hex(colors[index]))
        color_df.to_csv("color.csv", index = False)
        color_df = pd.read_csv('color.csv', index_col=0)

        legend = pn.pane.Plotly(visible=True)
        fig = px.bar(color_df, y=color_df.index, color='Color2', color_discrete_sequence=color_df['Color2'], orientation='h')
        fig.update_layout(title='Genome Colors',
                          showlegend=False,
                          width=200, 
                          height=700)
        legend.object = fig

        def get_colors(seqs, protein):
            """make colors for bases in sequence"""
            text = [i for s in list(seqs) for i in s]
            if protein == True:
                clrs =	{'A': 'lightgreen', 'G': 'lightgreen', 'C': 'green', 'D': 'darkgreen', 'E': 'darkgreen', 'N': 'darkgreen', 'Q': 'darkgreen',
                        'I': 'blue', 'L': 'blue', 'M': 'blue', 'V': 'blue', 'F': 'palevioletred', 'W': ' palevioletred', 'Y': ' palevioletred', 'H': 'darkblue', 'K': 'orange',
                        'R': 'orange', 'P': 'pink', 'S': 'red', 'T': 'red', '-': 'white', 'X' : 'gray'}
            else:
                clrs =  {'A':'red','T':'green','G':'orange','C':'blue','-':'white', 'U': 'purple', 'N':'yellow', 'R':'yellow', 'Y':'yellow'}
            colors = [clrs[i.upper()] for i in text]
            return colors
        
        
        def view_alignment(aln, protein_bool, fontsize="9pt", plot_width=800):
            """Bokeh sequence alignment view"""
        
            #make sequence and id lists from the aln object
            seqs = [rec.seq for rec in (aln)]
            ids = [rec.id for rec in aln]    
            text = [i for s in list(seqs) for i in s.upper()]
            colors = get_colors(seqs, protein_bool)    
            N = len(seqs[0])
            S = len(seqs)    
            width = .4
        
            x = np.arange(1,N+1)
            y = np.arange(0,S,1)
            #creates a 2D grid of coords from the 1D arrays
            xx, yy = np.meshgrid(x, y)
            #flattens the arrays
            gx = xx.ravel()
            gy = yy.flatten()
            #use recty for rect coords with an offset
            recty = gy+.5
            h= 1/S
            #now we can create the ColumnDataSource with all the arrays
            source = ColumnDataSource(dict(x=gx, y=gy, recty=recty, text=text, colors=colors ))#colors=colors
            plot_height = len(seqs)*15+50
            x_range = Range1d(0,N+1, bounds='auto')
            if N>100:
                viewlen=100
            else:
                viewlen=N
            #view_range is for the close up view
            view_range = (0,viewlen)
            tools="xpan, xwheel_zoom, reset, save"
        
            #entire sequence view (no text, with zoom)
            p = figure(title=None,  height=50, width=plot_width,   #plot_width= plot_width,
                       x_range=x_range, y_range=(0,S), tools=tools,
                       min_border=0, toolbar_location='below')
            rects = Rect(x="x", y="recty",  width=1, height=1, fill_color="colors",
                         line_color=None, fill_alpha=0.6)
            p.add_glyph(source, rects)
            p.yaxis.visible = False
            p.grid.visible = False  
        
            #sequence text view with ability to scroll along x axis
            p1 = figure(title=None, width=plot_width, height=plot_height,
                        x_range=view_range, y_range=ids, tools="xpan,reset",
                        min_border=0, toolbar_location='below')#, lod_factor=1)          
            glyph = Text(x="x", y="y", text="text", text_align='center',text_color="black", text_font_size=fontsize)
            rects = Rect(x="x", y="recty",  width=1, height=1, fill_color="colors",
                        line_color=None, fill_alpha=0.4)
            p1.add_glyph(source, glyph)
            p1.add_glyph(source, rects)
        
            p1.grid.visible = False
            p1.xaxis.major_label_text_font_style = "bold"
            p1.yaxis.minor_tick_line_width = 0
            p1.yaxis.major_tick_line_width = 0
        
            p = gridplot([[p],[p1]], toolbar_location='below')
            return p
        
        
        def alignment_view(event):
            cluster_name = cluster_overview.value.iloc[cluster_overview.selection].index.to_list()[0]
            if cluster_overview.value.loc[cluster_name]['size'] >= 2:
                protein_bool = True
                if cluster_overview.value.loc[cluster_name]['type'] != 'protein':
                    protein_bool = False
                cluster = []
                loading.name = "Generating MSA ..."
                loading.value = True
                loading.visible = True
                for gene in cluster_to_protein_df.loc[cluster_name]['genes']:
                    cluster.append(SeqRecord(Seq(genome_to_protein_df.loc[gene]['sequence']), id=gene, description=''))
                out_handle = StringIO()
                SeqIO.write(cluster, out_handle, "fasta")
                fasta_data = out_handle.getvalue()
                mafft = subprocess.Popen(["mafft", "--auto", "--thread", "-1", "-"], stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
                stdout_data, stderr_data = mafft.communicate(input=fasta_data)
                output_file = f'{mafft_folder}/{cluster[-1].id}.fasta'
                with open(output_file, 'w') as output_handle:
                    output_handle.write(stdout_data)
                aln = AlignIO.read(f"{mafft_folder}/{cluster_name}.fasta",'fasta')
                alignment_viewer = view_alignment(aln=aln, protein_bool=protein_bool ,plot_width=1500)
                alignment_viewer_bokeh.object = alignment_viewer
                loading.value = False
                loading.visible = False
            else:
                str_tree_explanation.object = 'The cluster needs to be at least of size two to display an alignment.'

        tree_view = pn.pane.HTML("""
        <iframe src="https://phylogenetictreedraw.web.app/?hide=true" id="myFrame" height="600" width="850" allow_embedding=True>
        </iframe>""",
        styles={'background-color': '#F6F6F6',
        'border': '2px solid black',
        'border-radius': '5px',
        'padding': '10px'})

        html_helper = pn.pane.HTML(" ")

        def is_valid(input_string):
            elements = input_string.split(',')
            for element in elements:
                num_part = element.split(':')[1].split(')')[0]
                if num_part != '0.0':
                    return True
            return False

        def insert_value_after_identifier(string):
            pattern = r'\b[A-Za-z][A-Za-z0-9_.\[\]]+(?=:)'
            matches = re.findall(pattern, string)
            for match in matches:
                string = string.replace(match, f"{match}[{color_df.loc[genome_to_protein_df.loc[match]['genome']]['Color']}]")
            return string
        
        def get_tree(event):
            cluster_name = cluster_overview.value.iloc[cluster_overview.selection].index.to_list()[0]
            if os.path.exists(f"{mafft_folder}/{cluster_name}.fasta") == True:
                loading.name = "Generating Tree ..."
                loading.value = True
                loading.visible = True
                if cluster_overview.value.loc[cluster_name]['size'] >= 2:
                    if cluster_overview.value.loc[cluster_name]['type'] == 'protein':
                        subprocess.run(f"fasttree < {mafft_folder}/{cluster_name}.fasta >  {tree_folder}/{cluster_name}.tre",shell=True)
                    else:
                        subprocess.run(f"fasttree -gtr -nt < {mafft_folder}/{cluster_name}.fasta >  {tree_folder}/{cluster_name}.tre",shell=True)
                    with open(f'{tree_folder}/{cluster_name}.tre', 'r') as f:
                        data = f.read()
                    data = data[0:-1]
                    if is_valid(data) == True:
                        html_helper.object = f"""<script type="text/javascript">
                        var panel_row_elements = document.getElementsByClassName('bk-panel-models-layout-Column');
                        console.log(panel_row_elements);
                        var htmlElements = panel_row_elements[0].shadowRoot.childNodes[10].shadowRoot.childNodes[9].shadowRoot.childNodes[12];
                        console.log(htmlElements);
                        var myFrame = htmlElements.shadowRoot.childNodes[9].shadowRoot.childNodes[8].childNodes[0];
                        console.log(myFrame);
                        myFrame.contentWindow.postMessage(`{insert_value_after_identifier(data)}`, '*');
                        </script>"""
                        html_helper.param.trigger('object')
                        loading.value = False
                        loading.visible = False
                    else:
                        str_tree_explanation.object = 'The sequences are identical. It is not possible to draw a tree.'
                else:
                    str_tree_explanation.object = 'The cluster needs to be at least of size two to display a tree.'
            else:
                str_tree_explanation.object = 'Please generate the MSA before you draw the tree.'
        
        draw_tree_button = pn.widgets.Button(name='Draw Tree', width=150, button_type='primary')
        draw_tree_button.on_click(get_tree)
        
        str_tree_explanation = pn.pane.Str(
            'Please select a cluster in the table \non the left and use "Show MSA" first \nand "Draw Tree" or "Copy MSA to clipboar" second.',
            styles={'font-size': '12pt'}
        )
                    
        alignment_viewer_bokeh = pn.pane.Bokeh()

        def copy_to_clipboard(event):
            cluster_name = cluster_overview.value.iloc[cluster_overview.selection].index.to_list()[0]
            if os.path.exists(f"{mafft_folder}/{cluster_name}.fasta") == True:
                if cluster_overview.value.loc[cluster_name]['size'] >= 2: 
                    with open(f'{mafft_folder}/{cluster_name}.fasta', 'r') as f:
                        data = f.read()
                    pc.copy(data) 
                else:
                    str_tree_explanation.object = 'The cluster needs to be at least of size two to copy the alignment.'
            else:
                str_tree_explanation.object = 'Please generate the MSA before you draw the tree.'
        
        copy_button = pn.widgets.Button(name='Copy MSA to clipboard', width=150, button_type='primary')
        copy_button.on_click(copy_to_clipboard)
        
        
        cluster_overview = pn.widgets.Tabulator(cluster_overview_df, header_filters=True, pagination='remote', row_content=nested_cluster_df, height = 700, width=900, layout= 'fit_columns')
        alignment_button = pn.widgets.Button(name='Show MSA', button_type='primary')
        loading = pn.indicators.LoadingSpinner(value=False, name='', visible = False, size = 50)

        alignment_button.on_click(alignment_view)

        workspace_interface = pn.Column(html_helper, pn.Row(cluster_overview, pn.Column(alignment_button,draw_tree_button, copy_button, str_tree_explanation, loading)), pn.Row(tree_view, legend), alignment_viewer_bokeh )
                
        return workspace_interface
    def panel(self):
        return self.view()

NameError: name 'param' is not defined

In [2]:
class Blast(param.Parameterized):
    def view(self):
        def get_colors(seqs, protein):
            """make colors for bases in sequence"""
            text = [i for s in list(seqs) for i in s]
            if protein == True:
                clrs =	{'A': 'lightgreen', 'G': 'lightgreen', 'C': 'green', 'D': 'darkgreen', 'E': 'darkgreen', 'N': 'darkgreen', 'Q': 'darkgreen',
                        'I': 'blue', 'L': 'blue', 'M': 'blue', 'V': 'blue', 'F': 'palevioletred', 'W': ' palevioletred', 'Y': ' palevioletred', 'H': 'darkblue', 'K': 'orange',
                        'R': 'orange', 'P': 'pink', 'S': 'red', 'T': 'red', '-': 'white', 'X' : 'gray'}
            else:
                clrs =  {'A':'red','T':'green','G':'orange','C':'blue','-':'white', 'U': 'purple', 'N':'yellow', 'R':'yellow', 'Y':'yellow'}
            colors = [clrs[i.upper()] for i in text]
            return colors
        
        
        def view_alignment(aln, protein_bool, fontsize="9pt", plot_width=800):
            """Bokeh sequence alignment view"""
        
            #make sequence and id lists from the aln object
            seqs = [rec.seq for rec in (aln)]
            ids = [rec.id for rec in aln]    
            text = [i for s in list(seqs) for i in s.upper()]
            colors = get_colors(seqs, protein_bool)    
            N = len(seqs[0])
            S = len(seqs)    
            width = .4
        
            x = np.arange(1,N+1)
            y = np.arange(0,S,1)
            #creates a 2D grid of coords from the 1D arrays
            xx, yy = np.meshgrid(x, y)
            #flattens the arrays
            gx = xx.ravel()
            gy = yy.flatten()
            #use recty for rect coords with an offset
            recty = gy+.5
            h= 1/S
            #now we can create the ColumnDataSource with all the arrays
            source = ColumnDataSource(dict(x=gx, y=gy, recty=recty, text=text, colors=colors ))#colors=colors
            plot_height = len(seqs)*15+50
            x_range = Range1d(0,N+1, bounds='auto')
            if N>100:
                viewlen=100
            else:
                viewlen=N
            #view_range is for the close up view
            view_range = (0,viewlen)
            tools="xpan, xwheel_zoom, reset, save"
        
            #entire sequence view (no text, with zoom)
            p = figure(title=None,  height=50, width=plot_width,   #plot_width= plot_width,
                       x_range=x_range, y_range=(0,S), tools=tools,
                       min_border=0, toolbar_location='below')
            rects = Rect(x="x", y="recty",  width=1, height=1, fill_color="colors",
                         line_color=None, fill_alpha=0.6)
            p.add_glyph(source, rects)
            p.yaxis.visible = False
            p.grid.visible = False  
        
            #sequence text view with ability to scroll along x axis
            p1 = figure(title=None, width=plot_width, height=plot_height,
                        x_range=view_range, y_range=ids, tools="xpan,reset",
                        min_border=0, toolbar_location='below')#, lod_factor=1)          
            glyph = Text(x="x", y="y", text="text", text_align='center',text_color="black", text_font_size=fontsize)
            rects = Rect(x="x", y="recty",  width=1, height=1, fill_color="colors",
                        line_color=None, fill_alpha=0.4)
            p1.add_glyph(source, glyph)
            p1.add_glyph(source, rects)
        
            p1.grid.visible = False
            p1.xaxis.major_label_text_font_style = "bold"
            p1.yaxis.minor_tick_line_width = 0
            p1.yaxis.major_tick_line_width = 0
        
            p = gridplot([[p],[p1]], toolbar_location='below')
            return p
        
        def blastp(event):
            blastmode = 'blastp'
            if blast_switch.value == True:
                blastmode = 'blastx'
            with open("query.fasta", "w") as output_handle:
                SeqIO.write(SeqRecord(Seq(seq_input.value), id=id_input.value, description=''), output_handle, "fasta")
            subprocess.run(f"diamond {blastmode} -d clusterdb -q query.fasta -o {blast_folder}/{id_input.value}.tsv",shell=True)
            os.remove('query.fasta')
            results_df = pd.read_csv(f"{blast_folder}/{id_input.value}.tsv", sep='\t', index_col=1, names=['Query accession', 'Target accession', 'Sequence identity', 'Length', 'Mismatches', 'Gap openings', 'Query start', 'Query end', 'Target start', 'Target end', "E-value", "Bit score"])
            results_tabulator.value = results_df
            protein_df = pd.read_csv('genome_to_protein.csv', index_col=0)
            cluster_df = pd.read_csv('cluster_to_gene.csv', index_col=0)
            recluster_df = pd.read_csv('reclusters.txt', sep='\t', index_col=0, names=['Cluster', 'Protein'])
            matching_sequence_list = []
            if not results_df.empty:
                align = []
                if blast_switch.value == False: 
                    align.append(SeqRecord(Seq(seq_input.value), id=id_input.value, description=''))
                for index, row in results_df.iterrows():
                    cluster = recluster_df.index[recluster_df['Protein'] == index].tolist()[0]
                    sequence = protein_df.loc[index]['sequence']
                    matching_sequence_row = {'Name': index, 'Cluster': cluster, 'Pangenome': cluster_df.loc[cluster]['pangenome'],'Sequence': sequence, 'Product': protein_df.loc[index]['product'], 'Type': protein_df.loc[index]['type'], }
                    matching_sequence_list.append(matching_sequence_row)
                    align.append(SeqRecord(Seq(protein_df.loc[index]['sequence']), id=index, description=''))
                matching_sequence_df = pd.DataFrame.from_dict(matching_sequence_list)
                matching_sequence_df.set_index('Name', inplace=True)
                sequence_tabulator.value = matching_sequence_df
                sequence_tabulator.visible = True
                out_handle = StringIO()
                SeqIO.write(align, out_handle, "fasta")
                fasta_data = out_handle.getvalue()
                mafft = subprocess.Popen(["mafft", "--auto", "--thread", "-1", "-"], stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
                stdout_data, stderr_data = mafft.communicate(input=fasta_data)
                output_file = f'{blast_folder}/{id_input.value}_alignment.fasta'
                with open(output_file, 'w') as output_handle:
                        output_handle.write(stdout_data)
                protein_bool = True
                aln = AlignIO.read(output_file,'fasta')
                alignment_viewer = view_alignment(aln=aln, protein_bool=protein_bool ,plot_width=1600)
                alignment_viewer_bokeh.object = alignment_viewer

        def blastn(event):
            with open("query.fasta", "w") as output_handle:
                SeqIO.write(SeqRecord(Seq(seq_input_rna.value), id=id_input_rna.value, description=''), output_handle, "fasta")
            subprocess.run(f"blastn -db blastdb/rnagenes_db -query query.fasta -num_threads {num_cpus} -mt_mode 0 -outfmt 0 -out {blast_folder}/{id_input_rna.value}_res",shell=True)
            subprocess.run(f"blastn -db blastdb/rnagenes_db -query query.fasta -num_threads {num_cpus} -mt_mode 0 -outfmt 6 -out {blast_folder}/{id_input_rna.value}.tsv",shell=True)
            os.remove('query.fasta')
            results_df = pd.read_csv(f"{blast_folder}/{id_input_rna.value}.tsv", sep='\t', index_col=1, names=['Query accession', 'Target accession', 'Sequence identity', 'Length', 'Mismatches', 'Gap openings', 'Query start', 'Query end', 'Target start', 'Target end', "E-value", "Bit score"])
            results_tabulator.value = results_df
            protein_df = pd.read_csv('genome_to_protein.csv', index_col=0)
            cluster_df = pd.read_csv('cluster_to_gene.csv', index_col=0)
            matching_sequence_list = []
            if not results_df.empty:
                results_df.reset_index(inplace=True)
                results_df.drop_duplicates(subset='Target accession', inplace=True)
                results_df.set_index('Target accession', inplace=True)
                align = []
                align.append(SeqRecord(Seq(seq_input_rna.value), id=id_input_rna.value, description=''))
                for index, row in results_df.iterrows():
                    cluster = cluster_df.index[cluster_df['genes'].apply(lambda x: index in x)].tolist()[0]
                    sequence = protein_df.loc[index]['sequence']
                    matching_sequence_row = {'Name': index, 'Cluster': cluster, 'Pangenome': cluster_df.loc[cluster]['pangenome'],'Sequence': sequence, 'Product': protein_df.loc[index]['product'], 'Type': protein_df.loc[index]['type'], }
                    matching_sequence_list.append(matching_sequence_row)
                    align.append(SeqRecord(Seq(protein_df.loc[index]['sequence']), id=index, description=''))
                matching_sequence_df = pd.DataFrame.from_dict(matching_sequence_list)
                matching_sequence_df.set_index('Name', inplace=True)
                sequence_tabulator.value = matching_sequence_df
                sequence_tabulator.visible = True
                out_handle = StringIO()
                SeqIO.write(align, out_handle, "fasta")
                fasta_data = out_handle.getvalue()
                mafft = subprocess.Popen(["mafft", "--auto", "--thread", "-1", "-"], stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
                stdout_data, stderr_data = mafft.communicate(input=fasta_data)
                output_file = f'{blast_folder}/{id_input_rna.value}_alignment.fasta'
                with open(output_file, 'w') as output_handle:
                        output_handle.write(stdout_data)
                aln = AlignIO.read(output_file,'fasta')
                protein_bool = False
                alignment_viewer = view_alignment(aln=aln, protein_bool=protein_bool ,plot_width=1600)
                alignment_viewer_bokeh.object = alignment_viewer

        diamond_blast_str = pn.pane.Str(
            'Blast on protein database:',
            styles={'font-size': '12pt'}
        )
        id_input = pn.widgets.TextInput(name='Name of query', placeholder='Enter an identifier for you blast search...')
        seq_input = pn.widgets.TextInput(name='Sequence', placeholder='Enter Sequence...')
        blast_switch = pn.widgets.Switch(name='Switch')
        protein_text = pn.widgets.StaticText(value='Amino acids')
        nucleic_text = pn.widgets.StaticText(value='Nucleic acids')
        
        rna_blast_str = pn.pane.Str(
            'Blast on RNA encoding gene database:',
            styles={'font-size': '12pt'}
        )
        blast_button = pn.widgets.Button(name='Diamond BLAST on proteins', button_type='primary')
        blast_button.on_click(blastp)

        id_input_rna = pn.widgets.TextInput(name='Name of query', placeholder='Enter an identifier for you blast search...')
        seq_input_rna = pn.widgets.TextInput(name='Sequence', placeholder='Enter Sequence...')
        
        blast_rna_button = pn.widgets.Button(name='\n\nBLAST on RNA genes', button_type='primary')
        blast_rna_button.on_click(blastn)
        
        results_tabulator = pn.widgets.Tabulator()
        sequence_tabulator = pn.widgets.Tabulator(pagination='remote', width=1600, visible=False, layout = "fit_columns")
        alignment_viewer_bokeh = pn.pane.Bokeh()
        
        blast_interface = pn.Column(diamond_blast_str,pn.Row(id_input, seq_input, pn.Row(protein_text, blast_switch, nucleic_text), blast_button), rna_blast_str, pn.Row(id_input_rna,seq_input_rna,blast_rna_button), results_tabulator, alignment_viewer_bokeh ,sequence_tabulator)
        return blast_interface
    def panel(self):
        return self.view()

SyntaxError: invalid syntax (1874445932.py, line 105)

In [6]:
class Gene_atlas(param.Parameterized):
    def view(self):
        sequence_df = pd.read_csv('sequence_stats.csv', index_col=0)
        gene_df = pd.read_csv('genome_to_protein.csv', index_col=0)
        cluster_df = pd.read_csv('cluster_to_gene.csv', index_col=0, converters = {'genes': pd.eval})
        recluster_df = pd.read_csv('reclusters.txt', sep='\t', index_col=0, names=['Cluster', 'Protein'])
        
        cluster_overview_df = cluster_df[['genes', 'type', 'pangenome']].copy().rename(columns={"genes": "size"})
        cluster_overview_df = cluster_overview_df.loc[cluster_overview_df['pangenome'] == 'Core']
        cluster_overview_df['product'] = '-'
        for index, row in cluster_overview_df.iterrows():
            cluster_overview_df.at[index, 'size'] = len(row['size'])
            product = ''
            for protein in row['size']:
                if protein in gene_df.index:
                    product = gene_df.loc[protein]['product']
                    break
            cluster_overview_df.at[index, 'product'] = product
        cluster_overview_df = cluster_overview_df.astype({'size': 'int64'})
        nested_cluster_df = lambda row: pn.widgets.Tabulator(gene_df.loc[cluster_df.loc[row.name]['genes']], width=900, layout='fit_columns', widths={'id': '10%', 'genome': '15%', 'translation': '40%', 'product': '35%'})
        
        genome_select = []
        
        for index, row in sequence_df.iterrows():
            genome_select.append(index)
            
        atlas_view = pn.pane.HTML("""
        <iframe src="https://geneatlas.web.app/" id="myFrame" height="900" width="850" allow_embedding=True>
        </iframe>
        """,
        styles={'background-color': '#F6F6F6',
        'border': '2px solid black',
        'border-radius': '5px',
        'padding': '10px'}, height=900)
        html_helper = pn.pane.HTML(" ")
        
        def send_atlas(event):
            
            genome_selection = genome_df.selection
            reference_genome = select_ref.value
            
            if len(cluster_overview.selection) == 0:
                select_warning_str.object = "Select a starting gene set for the gene atlas below."                
            else:
                select_warning_str = pn.pane.Str(
                    'It is recommended to only select up to 10 genomes \naside the referennce genome for best visibility.\nThe best results are optained, if the assembly\nof the genome is complete and made up of one sequence.\nSelect a gene set below. It will be\ncoloured deep blue.',
                    styles={'font-size': '12pt'})
                loading.value = True
                loading.visible = True
                cluster_name = cluster_overview.value.iloc[cluster_overview.selection].index.to_list()[0]
                cluster_numbers = []
                for gene in cluster_df.loc[cluster_name]['genes']:
                        cluster_numbers.append({gene_df.loc[gene]['genome'] : gene_df.loc[gene]['protein number']})
                genome_dd = defaultdict(list)
                for d in cluster_numbers:
                    for key, value in d.items():
                        genome_dd[key].append(value)
                genome_dd = dict(genome_dd)
                
                genome_list = []
                helper_list = []
                if len(genome_selection) == 0:
                    protein_position = genome_dd[reference_genome]
                    protein_position.sort()
                    genome_list.append({reference_genome: [int(sequence_df.loc[reference_genome]['protein count']), int(protein_position[0])]})
                    helper_list.append(reference_genome)
                else:
                    for genome in genome_selection:
                        id = genome_df.value.iloc[genome].name
                        protein_position = genome_dd[id]
                        protein_position.sort()
                        genome_list.append({id: [int(sequence_df.loc[id]['protein count']), int(protein_position[0])]})
                        helper_list.append(id)
                    
                    protein_position = genome_dd[reference_genome]
                    protein_position.sort()
                    if {reference_genome: [int(sequence_df.loc[reference_genome]['protein count']), int(protein_position[0])]} in genome_list:
                        genome_list.append(genome_list.pop(genome_list.index({reference_genome: [int(sequence_df.loc[reference_genome]['protein count']), int(protein_position[0])]})))
                        helper_list.append(helper_list.pop(helper_list.index(reference_genome)))
                    else:
                        genome_list.append({reference_genome: [int(sequence_df.loc[reference_genome]['protein count']), int(protein_position[0])]})
                        helper_list.append(reference_genome)
    
                cluster_dict_list = []
                for index, row in gene_df.loc[(gene_df['genome'] == reference_genome) & (gene_df['type'] == 'protein')].iterrows():
                    cluster_dict  = defaultdict(list)
                    cluster_index = recluster_df.loc[recluster_df['Protein']==index].index[0]
                    gene_info = cluster_df.loc[cluster_index]['genes']
                    for gene in gene_info:
                        if gene_df.loc[gene]['genome'] in helper_list:
                            cluster_dict[gene_df.loc[gene]['genome']].append(int(gene_df.loc[gene]['protein number']))
                    cluster_dict_list.append(dict(cluster_dict))
                atlas_json = {"genomes": genome_list, "gensets": cluster_dict_list}
                atlas_json = json.dumps(atlas_json)
                loading.value = False
                loading.visible = False
                html_helper.object = f"""<script type="text/javascript">
                var panel_row_elements = document.getElementsByClassName('bk-panel-models-layout-Column');
                console.log(panel_row_elements);
                var myFrame = panel_row_elements[0].shadowRoot.childNodes[10].shadowRoot.childNodes[9].shadowRoot.childNodes[9].shadowRoot.childNodes[9].shadowRoot.childNodes[8].childNodes[0];
                console.log(myFrame);
                myFrame.contentWindow.postMessage(`{atlas_json}`, '*');
                </script>"""
                html_helper.param.trigger('object')
        
        select_ref = pn.widgets.Select(name='Reference Genome', options=genome_select)
        atlas_button = pn.widgets.Button(name='Draw Geneatlas', width=150, button_type='primary')
        atlas_button.on_click(send_atlas)

        genome_df = pn.widgets.Tabulator(sequence_df , header_filters=True, pagination="remote", height=400, width=800, selectable="checkbox")
        select_warning_str = pn.pane.Str(
            'It is recommended to only select up to 10 genomes \naside the referennce genome for best visibility.\nThe best results are optained, if the assembly\nof the genome is complete and made up of one sequence.\nSelect a gene set below. It will be\ncoloured deep blue.',
            styles={'font-size': '12pt'}
        )
        loading = pn.indicators.LoadingSpinner(value=False, name='Calculating Genealtas...' ,visible = False)
        gspec_spinner = pn.GridSpec()
        gspec_spinner[2,0:2] = loading

        cluster_overview = pn.widgets.Tabulator(cluster_overview_df, header_filters=True, pagination='remote', row_content=nested_cluster_df, height = 700, width=800, layout= 'fit_columns', selectable="checkbox-single")

        
        gspec_atlas = pn.GridSpec(height = 1200)
        gspec_atlas[0:4,   0:3  ] = pn.Row(atlas_view, pn.Column(select_ref ,atlas_button, html_helper, select_warning_str, genome_df, cluster_overview))
        gspec_atlas[4,   0] = gspec_spinner
        
        
        return gspec_atlas
    def panel(self):
        return self.view()

NameError: name 'param' is not defined

In [1]:
class Export(param.Parameterized):
    
    def view(self):
        
        class SELECT(Enum):
            GENOMES = 1
            CLUSTERS = 2
            GENES = 3
       
        def display(event):
            global enum
            if select_export.value == "Genomes":
                df = pd.read_csv('sequence_stats.csv', index_col=0)
                enum = SELECT.GENOMES
            elif select_export.value == "Clusters":
                cluster_to_protein_df = pd.read_csv('cluster_to_gene.csv', index_col=0, converters = {"genes": pd.eval})
                df = cluster_to_protein_df[['genes', 'type', 'pangenome']].copy()
                enum = SELECT.CLUSTERS
            else:
                df = pd.read_csv('genome_to_protein.csv', index_col=0)
                enum = SELECT.GENES
            select_tabulator.value = df
            select_tabulator.visible = True
            export_button.visible = True
            file_name_input.visible = True

        def export(event):
            if file_name_input.value == "":
                debug_str.object = "Please specify a file name."
                return 0
            gene_df = pd.read_csv('genome_to_protein.csv', index_col=0)
            if len(select_tabulator.selection) > 0:
                selected_genes = []
                out_handle = StringIO()
                if enum.value == 1:
                    for genome in select_tabulator.selection:
                        genome_name = select_tabulator.value.iloc[genome].name
                        selected_genes_df = gene_df.loc[(gene_df['genome'] == genome_name)]
                        for index, row in selected_genes_df.iterrows():
                            selected_genes.append(SeqRecord(Seq(row['sequence']), id=index, description=''))

                elif enum.value == 2:
                    for cluster in select_tabulator.selection:
                        cluster_name = select_tabulator.value.iloc[cluster].name
                        for gene in select_tabulator.value.loc[cluster_name]['genes']:
                            selected_genes.append(SeqRecord(Seq(gene_df.loc[gene]['sequence']), id=gene, description=''))
                else:
                    for gene in select_tabulator.selection:
                        gene_name = select_tabulator.value.iloc[gene].name
                        selected_genes.append(SeqRecord(Seq(select_tabulator.value.loc[gene_name]['sequence']), id=gene_name, description=''))
                SeqIO.write(selected_genes, out_handle, "fasta")
                output_file = f'{export_folder}/{file_name_input.value}.fasta'
                with open(output_file, 'w') as output_handle:
                        output_handle.write(out_handle.getvalue())
                debug_str.object = "Exporting sequences successfull."
            else:
                debug_str.object = "Please select at least one checkbox on the right side of the table."
        
        select_export = pn.widgets.Select(name='Select Export', options=['Genomes', 'Clusters', 'Genes'])
        select_button = pn.widgets.Button(name='Select', button_type='primary')
        select_button.on_click(display)
        select_tabulator = pn.widgets.Tabulator(selectable='checkbox', header_filters=True, pagination="remote",  width=1000, height=650, layout='fit_columns', visible = False)
        file_name_input = pn.widgets.TextInput(name='Name of export file', placeholder='Enter an identifier for you export file...', visible = False, value = "")
        export_button = pn.widgets.Button(name='Export Selection', button_type='primary', visible = False)        
        export_button.on_click(export)
        debug_str = pn.pane.Str("",styles={'font-size': '12pt'})

        export_interface = pn.Column(pn.Row(select_export, select_button), pn.Row(select_tabulator, file_name_input,export_button), debug_str)
        return export_interface
    def panel(self):
        return self.view()

NameError: name 'param' is not defined

In [21]:
pipeline.add_stage('Add Genomes', Genomes)
pipeline.add_stage('Clustering Proteins', Diamond)
pipeline.add_stage('Pangenome Characteristics', Pan_computation)
pipeline.add_stage('Sequence Workbench', Sequence_Workbench)
pipeline.add_stage('Blast', Blast)
pipeline.add_stage('Geneatlas', Gene_atlas)
pipeline.add_stage('Export Data', Export)

In [1]:
pipeline.servable();

NameError: name 'pipeline' is not defined