In [1]:
import csv
import json
import os
import subprocess
from collections import Counter
from glob import glob
from io import StringIO

import numpy as np
import pandas as pd
import panel as pn
import param
import pyperclip as pc
from Bio import SeqIO, AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import gc_fraction
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, Plot, Grid, Range1d
from bokeh.models.glyphs import Text, Rect
from bokeh.plotting import figure
#import ray
#import plotly.graph_objs as go
#import plotly.express as px

In [2]:
# Root folder of the analysis project. Set the SEQUENCE_WORKBENCH_PROJECT
# environment variable to point the notebook at another project without editing
# this cell; when it is unset (or empty) the original location is used.
project_folder = (
    os.environ.get("SEQUENCE_WORKBENCH_PROJECT")
    or "/home/infosebi/Documents/Programmieren/MastersProject/test_3"
)
# Folder holding the per-cluster alignments ("<cluster>.fasta") read by the workbench.
mafft_folder = f"{project_folder}/aligned"
# Folder the workbench writes the per-cluster trees ("<cluster>.tre") to.
tree_folder = f"{project_folder}/trees"

In [3]:
# Initialise Panel with the 'tabulator' extension; the workbench below builds
# its tables with pn.widgets.Tabulator.
pn.extension('tabulator')
# Pipeline that the workbench stage is registered on further down.
# NOTE(review): debug=True presumably surfaces stage errors while developing - confirm
# against the Panel Pipeline documentation before shipping.
pipeline = pn.pipeline.Pipeline(debug=True)

In [4]:
# Residue -> colour palettes used by the alignment viewer. Keys are upper-case
# one-letter codes; '-' is the alignment gap.
_PROTEIN_COLORS = {
    'A': 'lightgreen', 'G': 'lightgreen', 'C': 'green',
    'D': 'darkgreen', 'E': 'darkgreen', 'N': 'darkgreen', 'Q': 'darkgreen',
    'I': 'blue', 'L': 'blue', 'M': 'blue', 'V': 'blue',
    # W and Y used to carry a stray leading space (' palevioletred'); all three
    # aromatic residues now share the same clean colour name.
    'F': 'palevioletred', 'W': 'palevioletred', 'Y': 'palevioletred',
    'H': 'darkblue', 'K': 'orange', 'R': 'orange', 'P': 'pink',
    'S': 'red', 'T': 'red', '-': 'white',
}
_NUCLEOTIDE_COLORS = {'A': 'red', 'T': 'green', 'G': 'orange', 'C': 'blue', '-': 'white'}
# Colour for every symbol missing from the palettes (ambiguity codes, '*', 'U', ...).
_UNKNOWN_RESIDUE_COLOR = 'lightgray'


def _get_colors(seqs, protein):
    """Return one colour name per residue of ``seqs``, flattened sequence by sequence.

    Parameters
    ----------
    seqs : iterable of str-like sequences
        Aligned sequences; iterating a sequence must yield single characters.
    protein : bool
        Truthy selects the amino-acid palette, falsy the nucleotide palette.

    Returns
    -------
    list of str
        Colour names in the same order as the flattened residues. Symbols that
        are not in the selected palette get ``_UNKNOWN_RESIDUE_COLOR`` instead
        of raising ``KeyError``.
    """
    palette = _PROTEIN_COLORS if protein else _NUCLEOTIDE_COLORS
    return [
        palette.get(residue.upper(), _UNKNOWN_RESIDUE_COLOR)
        for seq in seqs
        for residue in seq
    ]


def _view_alignment(aln, protein_bool, fontsize="9pt", plot_width=800):
    """Build the Bokeh view of a multiple sequence alignment.

    Two stacked figures share one data source: a 50 px overview of the whole
    alignment (coloured rectangles only, x pan/zoom) and a close-up showing the
    residue letters for at most the first 100 columns (x pan).

    Parameters
    ----------
    aln : iterable of records
        Alignment whose records expose ``.seq`` and ``.id`` (e.g. the object
        returned by ``AlignIO.read``).
    protein_bool : bool
        Whether to colour the residues with the amino-acid palette.
    fontsize : str
        Font size of the residue letters in the close-up.
    plot_width : int
        Width of both figures in pixels.

    Returns
    -------
    The ``gridplot`` layout containing both figures.
    """
    seqs = [str(rec.seq).upper() for rec in aln]
    ids = [rec.id for rec in aln]
    text = [residue for seq in seqs for residue in seq]
    colors = _get_colors(seqs, protein_bool)
    n_columns = len(seqs[0])
    n_rows = len(seqs)

    # One (x, y) point per alignment cell, flattened row by row so the
    # coordinates line up with `text` and `colors`.
    xx, yy = np.meshgrid(np.arange(1, n_columns + 1), np.arange(0, n_rows, 1))
    gx = xx.ravel()
    gy = yy.flatten()
    # Rectangles are positioned by their centre, hence the half-row offset.
    recty = gy + .5
    source = ColumnDataSource(dict(x=gx, y=gy, recty=recty, text=text, colors=colors))

    plot_height = n_rows * 15 + 50
    x_range = Range1d(0, n_columns + 1, bounds='auto')
    # The close-up initially shows at most the first 100 columns.
    view_range = (0, min(n_columns, 100))
    tools = "xpan, xwheel_zoom, reset, save"

    # Entire sequence view (no text, with zoom).
    overview = figure(title=None, height=50, width=plot_width,
                      x_range=x_range, y_range=(0, n_rows), tools=tools,
                      min_border=0, toolbar_location='below')
    overview.add_glyph(source, Rect(x="x", y="recty", width=1, height=1, fill_color="colors",
                                    line_color=None, fill_alpha=0.6))
    overview.yaxis.visible = False
    overview.grid.visible = False

    # Sequence text view with the ability to scroll along the x axis.
    detail = figure(title=None, width=plot_width, height=plot_height,
                    x_range=view_range, y_range=ids, tools="xpan,reset",
                    min_border=0, toolbar_location='below')
    detail.add_glyph(source, Text(x="x", y="y", text="text", text_align='center',
                                  text_color="black", text_font_size=fontsize))
    detail.add_glyph(source, Rect(x="x", y="recty", width=1, height=1, fill_color="colors",
                                  line_color=None, fill_alpha=0.4))
    detail.grid.visible = False
    detail.xaxis.major_label_text_font_style = "bold"
    detail.yaxis.minor_tick_line_width = 0
    detail.yaxis.major_tick_line_width = 0

    return gridplot([[overview], [detail]], toolbar_location='below')


class Sequence_Workbench(param.Parameterized):
    """Pipeline stage for browsing clusters, their alignments and their trees."""

    def view(self):
        """Load the cluster tables and build the workbench layout.

        Reads ``cluster_to_protein.csv`` and ``genome_to_protein.csv`` from
        ``project_folder`` and returns a Panel column holding the cluster table,
        the action buttons, the embedded tree viewer and the alignment pane.
        """
        # NOTE(review): changes the working directory of the whole process; kept
        # because the CSV files below are opened with relative paths.
        os.chdir(project_folder)
        # NOTE(review): pd.eval evaluates the text of every "proteins" cell as an
        # expression. If these CSV files can come from an untrusted source, a
        # literal-only parser such as ast.literal_eval would be the safer converter.
        clusters_to_protein_df = pd.read_csv('cluster_to_protein.csv', index_col=0, converters={"proteins": pd.eval})
        genome_to_protein_df = pd.read_csv('genome_to_protein.csv', index_col=0)

        def first_product(proteins):
            """Return the product of the first protein found in the genome table, '' if none is."""
            for protein in proteins:
                if protein in genome_to_protein_df.index:
                    return genome_to_protein_df.loc[protein]['product']
            return ''

        # `product` is derived before `size` is overwritten: both start from the
        # protein lists stored in the renamed "size" column.
        cluster_overview_df = (
            clusters_to_protein_df[['proteins', 'type', 'pangenome']]
            .rename(columns={"proteins": "size"})
            .assign(
                product=lambda df: df['size'].map(first_product),
                size=lambda df: df['size'].map(len),
            )
            .astype({'size': 'int64'})
        )

        def nested_cluster_df(row):
            """Build the expandable per-row table listing the proteins of one cluster."""
            members = clusters_to_protein_df.loc[row.name]['proteins']
            # Skip members that are missing from the genome table; indexing with
            # an unknown label would raise KeyError and break the row expansion.
            present = [protein for protein in members if protein in genome_to_protein_df.index]
            return pn.widgets.Tabulator(
                genome_to_protein_df.loc[present], width=900, layout='fit_columns',
                widths={'id': '10%', 'genome': '15%', 'translation': '40%', 'product': '35%'})

        # ---- widgets -------------------------------------------------------
        tree_view = pn.pane.HTML("""
        <iframe src="https://phylogenetictreedraw.web.app/?hide=true" id="myFrame" height="600" width="850" allow_embedding=True>
        </iframe>""",
        styles={'background-color': '#F6F6F6',
        'border': '2px solid black',
        'border-radius': '5px',
        'padding': '10px'})

        # Invisible pane used to inject the script that hands the tree to the iframe.
        html_helper = pn.pane.HTML(" ")

        default_hint = 'Please select a cluster in the table \non the left and click the "Draw Tree"  or "Show MSA".'
        str_tree_explanation = pn.pane.Str(default_hint, styles={'font-size': '12pt'})

        alignment_viewer_bokeh = pn.pane.Bokeh()

        cluster_overview = pn.widgets.Tabulator(cluster_overview_df, header_filters=True, pagination='remote', row_content=nested_cluster_df, width=900, layout= 'fit_columns')
        alignment_button = pn.widgets.Button(name='Show MSA', button_type='primary')
        draw_tree_button = pn.widgets.Button(name='Draw Tree', width=150, button_type='primary')
        copy_button = pn.widgets.Button(name='Copy MSA to clipboard', width=150, button_type='primary')

        # ---- callbacks -----------------------------------------------------
        def selected_cluster(action):
            """Return the selected table row if it is usable for ``action``, else None.

            ``action`` completes the sentence shown when the cluster is too small.
            The row's ``.name`` is the cluster name. A message is written to the
            explanation pane whenever None is returned.
            """
            selection = cluster_overview.selection
            if not selection:
                # Previously an empty selection raised IndexError inside the callback.
                str_tree_explanation.object = 'No cluster is selected. Please select a cluster in the table first.'
                return None
            row = cluster_overview.value.iloc[selection[0]]
            if row['size'] < 2:
                str_tree_explanation.object = f'The cluster needs to be at least of size two to {action}.'
                return None
            # Clear any message left over from an earlier failed action.
            str_tree_explanation.object = default_hint
            return row

        def alignment_view(event):
            """Show the alignment of the selected cluster in the Bokeh pane."""
            row = selected_cluster('display an alignment')
            if row is None:
                return
            try:
                aln = AlignIO.read(f"{mafft_folder}/{row.name}.fasta", 'fasta')
            except (OSError, ValueError) as err:
                str_tree_explanation.object = f'Could not read the alignment of cluster {row.name}: {err}'
                return
            alignment_viewer_bokeh.object = _view_alignment(
                aln=aln, protein_bool=bool(row['type'] == 'protein'), plot_width=900)

        def get_tree(event):
            """Run FastTree on the selected cluster and send the tree to the embedded viewer."""
            row = selected_cluster('display a tree')
            if row is None:
                return
            cluster_name = row.name
            alignment_path = f"{mafft_folder}/{cluster_name}.fasta"
            tree_path = f"{tree_folder}/{cluster_name}.tre"

            # Argument list plus explicit file handles instead of a shell command
            # string: the cluster name never passes through a shell.
            command = ['fasttree']
            if row['type'] != 'protein':
                command += ['-gtr', '-nt']
            try:
                os.makedirs(tree_folder, exist_ok=True)
                with open(alignment_path, 'rb') as alignment_file, open(tree_path, 'wb') as tree_file:
                    result = subprocess.run(command, stdin=alignment_file, stdout=tree_file)
            except OSError as err:
                # Missing alignment, unwritable tree folder or fasttree not installed.
                str_tree_explanation.object = f'Could not run FastTree for cluster {cluster_name}: {err}'
                return
            if result.returncode != 0:
                str_tree_explanation.object = f'FastTree failed for cluster {cluster_name} (exit code {result.returncode}).'
                return

            with open(tree_path, 'r') as f:
                # Strip surrounding whitespace only; slicing off the last character
                # would drop the closing ';' of a file without a trailing newline.
                data = f.read().strip()
            # Encode the tree as a JavaScript string literal so quotes, backticks,
            # backslashes or "</script>" in sequence names cannot break the script.
            payload = json.dumps(data).replace('</', '<\\/')
            # The script locates the iframe by walking the rendered layout by
            # child index, so it depends on the exact nesting built below.
            html_helper.object = f"""<script type="text/javascript">
                var panel_row_elements = document.getElementsByClassName('bk-panel-models-layout-Column');
                var htmlElements = panel_row_elements[0].shadowRoot.childNodes[10].shadowRoot.childNodes[9].shadowRoot.childNodes[11]
                console.log(htmlElements);
                var myFrame = htmlElements.shadowRoot.childNodes[9].shadowRoot.childNodes[8].childNodes[0];
                console.log(myFrame);
                myFrame.contentWindow.postMessage({payload}, '*');
                </script>"""
            # Force a re-render even when the same tree is drawn twice in a row.
            html_helper.param.trigger('object')

        def copy_to_clipboard(event):
            """Copy the alignment file of the selected cluster to the clipboard.

            pyperclip writes to the clipboard of the machine running this Python
            process, which is only the user's clipboard when the app runs locally.
            """
            row = selected_cluster('copy the alignment')
            if row is None:
                return
            try:
                with open(f'{mafft_folder}/{row.name}.fasta', 'r') as f:
                    data = f.read()
                pc.copy(data)
            except OSError as err:
                str_tree_explanation.object = f'Could not read the alignment of cluster {row.name}: {err}'
            except pc.PyperclipException as err:
                str_tree_explanation.object = f'Could not access the clipboard: {err}'

        alignment_button.on_click(alignment_view)
        draw_tree_button.on_click(get_tree)
        copy_button.on_click(copy_to_clipboard)

        # Keep this nesting and order unchanged: the script in get_tree addresses
        # the iframe through fixed child positions of this layout.
        workspace_interface = pn.Column(html_helper, pn.Row(cluster_overview, pn.Column(alignment_button,draw_tree_button, copy_button, str_tree_explanation)), pn.Row(tree_view, alignment_viewer_bokeh) )

        return workspace_interface

    def panel(self):
        """Return the workbench layout built by :meth:`view`."""
        return self.view()

In [5]:
# Register the workbench class as the pipeline stage named 'Sequence Workbench'.
pipeline.add_stage('Sequence Workbench', Sequence_Workbench)

In [6]:
# Mark the pipeline as servable; the trailing ';' suppresses the cell's output repr.
pipeline.servable();

In [None]:
#{'source', 'repeat_region', 'misc_binding', 'ncRNA', 'gene', 'misc_feature', 'tRNA', 'CDS', 'rRNA'}
#{'source', 'sig_peptide', 'repeat_region', 'misc_binding', 'tmRNA', 'ncRNA', 'gene', 'tRNA', 'CDS', 'rRNA'}
#{'CDS', 'source', 'assembly_gap', 'gene', 'tRNA', 'misc_RNA', 'rRNA'}
#{'source', 'repeat_region', 'regulatory', 'misc_binding', 'ncRNA', 'gene', 'misc_feature', 'tRNA', 'CDS', 'rRNA'}
#{'rRNA', 'source', 'repeat_region', 'misc_binding', 'tmRNA', 'ncRNA', 'gene', 'tRNA', 'CDS', 'sig_peptide'}