In [None]:
%matplotlib notebook
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from bokeh.models import CustomJS, Dropdown, Button
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
#output_file - save the layout to a file, show - display the layout, output_notebook - configure the default output state so that output is rendered in the Jupyter notebook.
from bokeh.io import output_file, show, output_notebook
#ColumnDataSource makes selection of the column easier and Select is used to create drop down 
from bokeh.models import ColumnDataSource, Select, Spinner, MultiSelect, ColorPicker, RangeSlider, DataTable, TableColumn, HTMLTemplateFormatter
#Figure objects have many glyph methods that can be used to draw vectorized graphical glyphs. Examples of glyphs: circle, line, scatter etc.
from bokeh.plotting import figure 
#To create an interactive plot we need CustomJS to add callback methods.
from bokeh.models import CustomJS, Legend
#This is for creating layout
from bokeh.layouts import column, gridplot, row
from bokeh.core.enums import MarkerType
plt.rcParams['legend.fontsize'] = 10
from bokeh.palettes import inferno, viridis, magma, Spectral

# from matplotlib.ticker import MaxNLocator
import math
import random
from random import shuffle

# Configure where Bokeh sends its output: a standalone "test.html" file and the notebook itself.
output_file("test.html")
output_notebook() # set the default output state so that layouts are rendered inside the notebook

In [None]:
def convert_strings_list_to_integer_dict(dataframe:pd.DataFrame)->dict:
    """Assign a 1-based integer code to every distinct taxon name.

    The ranks genus, family, order, class and phylum are processed in that
    order; ranks missing from the dataframe are skipped. Numbering restarts
    at 1 for every rank and all ranks share one dictionary, so a name that
    occurs in several ranks keeps the code of the rank processed last.

    :param dataframe
        :type pd.DataFrame

    :return codes -> taxon name to integer code
        :type dict
    """
    taxonomic_ranks = ('genus', 'family', 'order', 'class', 'phylum')
    try:
        codes = {}
        present_ranks = [rank for rank in taxonomic_ranks if rank in dataframe.columns]
        for rank in present_ranks:
            distinct_names = dataframe[rank].unique()
            codes.update(
                (name, position)
                for position, name in enumerate(distinct_names, start=1)
            )
        return codes
    except Exception as e:
        raise Exception("[-] ERROR with exception : {}".format(e))

In [None]:
def transform_dataframe_for_pca(dataframe:pd.DataFrame,transformer_dict:dict)->pd.DataFrame:
    """Replace taxon names by their integer codes in a copy of the dataframe.

    Every taxonomy column that is present (genus, family, order, class,
    phylum) is translated through ``transformer_dict``. The input dataframe
    is left untouched.

    :param dataframe
        :type pd.DataFrame
    :param transformer_dict -> taxon name to integer code
        :type dict

    :return dataframe -> copy with encoded taxonomy columns
        :type pd.DataFrame

    :raises Exception -> e.g. when a taxon name is missing in transformer_dict
    """
    try:
        dataframe = dataframe.copy()
        for tax in ['genus','family','order','class','phylum']:
            if tax in dataframe.columns:
                # Replace the whole column instead of writing through
                # ``.loc[:, tax]``: since pandas 2.0 the latter sets values in
                # place and keeps the original (text) dtype, which leaves the
                # column non-numeric.
                dataframe[tax] = dataframe[tax].apply(lambda name: transformer_dict[name])
        return dataframe
    except Exception as e:
        raise Exception("[-] ERROR with exception : {}".format(e)) from e

In [None]:
def produce_3d_plot(pca_df:pd.DataFrame,qseqid:str,pca_selection:PCA,color_dict:dict):
    """Draw the first three principal components as a 3D matplotlib scatter plot.

    One scatter series is drawn per distinct value of the 'color' column and
    labelled with the taxa that map to this color in ``color_dict``.

    :param pca_df -> principal components in columns 0, 1, 2 plus a 'color' column
        :type pd.DataFrame
    :param qseqid -> query identifier used in the plot title
        :type str
    :param pca_selection -> fitted PCA, needs at least three components
        :type PCA
    :param color_dict -> taxon to color
        :type dict

    :return ax -> the 3D axes holding the plot
    """
    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(111, projection='3d')

    # Invert taxon -> color once. Several taxa can share one color; all of
    # them are listed in the legend entry instead of only the first one.
    taxa_by_color = {}
    for taxon, taxon_color in color_dict.items():
        taxa_by_color.setdefault(taxon_color, []).append(str(taxon))

    for color in pca_df['color'].unique():
        temp_df = pca_df[pca_df['color'] == color]
        # Fall back to the color name when no taxon is registered for it.
        label = ", ".join(taxa_by_color.get(color, [str(color)]))
        ax.scatter(temp_df[0], temp_df[1], temp_df[2],c=color,marker='o',s=200,linewidth=1,edgecolors='black', alpha=0.5, label=label)

    ax.set_title("Principal Component Analysis with {} RBH dataframe".format(qseqid))
    ax.set_xlabel('captured variance {var:.3f}%'.format(var=pca_selection.explained_variance_ratio_[0]*100))
    ax.set_ylabel('captured variance {var:.3f}%'.format(var=pca_selection.explained_variance_ratio_[1]*100))
    ax.set_zlabel('captured variance {var:.3f}%'.format(var=pca_selection.explained_variance_ratio_[2]*100))
    ax.legend()
    return ax


In [None]:
def load_domain_query_data(path_to_query_domains:str)->dict:
    """Read the domain hits of the query sequences into a dictionary.

    The tab separated file has no header row and is expected to hold 14
    columns (qseqid ... stitle). Keys are the query ids cut at the first
    ".", values are the unique 'sacc' entries found for that query id.

    :param path_to_query_domains
        :type str

    :return domain_dict -> query id (part before the first ".") to array of domain accessions
        :type dict
    """
    try:
        column_names = "qseqid qlen sacc slen qstart qend sstart send sseq qseq bitscore evalue pident stitle".split(" ")
        query_hits = pd.read_table(path_to_query_domains,header=None)
        query_hits.columns = column_names
        return {
            query_id.split(".")[0]: query_hits.loc[query_hits.qseqid == query_id, 'sacc'].unique()
            for query_id in query_hits.qseqid.unique()
        }
    except Exception as e:
        raise Exception("[-] ERROR loading domain query dictionary with exception: {}".format(e))

In [None]:
#columns of the domain search output: qseqid qlen sacc slen qstart qend sstart send bitscore evalue pident
def load_domain_data(path_to_domains:str,qseqid_key:str,domain_dict:dict)->pd.DataFrame:
    """Build the sequence x domain matrix that is used as PCA input.

    The first row belongs to the query itself (index ``qseqid_key``): 1 for
    every query domain and 0 additional domains. Each following row belongs
    to one sequence of the domain search output (index = its qseqid) and
    holds, per query domain, the percent identity of the first hit of that
    domain scaled to 0-1 (0 if the domain was not found). The last column
    'additional_domains' counts the distinct domains of the sequence that
    are not query domains.

    :param path_to_domains -> tab separated file without header, 11 columns
        :type str
    :param qseqid_key -> key of the query in domain_dict
        :type str
    :param domain_dict -> query id to domain accessions
        :type dict

    :return cdd_dataframe
        :type pd.DataFrame

    :raises Exception -> on any error while reading or transforming the data
    """
    try:
        header = "qseqid qlen sacc slen qstart qend sstart send bitscore evalue pident".split(" ")
        cdd_output = pd.read_table(path_to_domains,header=None)
        cdd_output.columns=header
        cdd_output['pident'] = cdd_output.pident.apply(lambda x : round(x/100,2))

        # Column layout of the result: all query domains followed by the
        # counter column. Built from a plain list because DataFrame.append
        # no longer exists in pandas >= 2.0.
        domain_columns = list(domain_dict[qseqid_key]) + ['additional_domains']
        known_domains = set(domain_columns)
        query_domain_df = pd.DataFrame({'sacc': domain_columns})

        # Reference row of the query: every query domain present, nothing additional.
        rows = [pd.DataFrame([[1] * (len(domain_columns) - 1) + [0]],
                             index=[qseqid_key], columns=domain_columns)]

        # sort=False keeps the sequences in order of first appearance in the file.
        for query_seq, hits in cdd_output.groupby('qseqid', sort=False):
            # Only the first hit of each domain contributes its identity.
            domain_df = hits[['sacc','pident']].drop_duplicates(subset='sacc')
            counter = sum(1 for domain in domain_df.sacc if domain not in known_domains)

            # The right merge aligns the hits to the query domain order;
            # domains without a hit get an identity of 0.
            merged_df = domain_df.merge(query_domain_df,on=['sacc'],how='right')
            merged_df['pident'] = merged_df['pident'].fillna(0)
            merged_df.loc[merged_df['sacc'] == 'additional_domains','pident'] = counter

            rows.append(pd.DataFrame([merged_df['pident'].to_numpy()],
                                     index=[query_seq], columns=domain_columns))

        # One concat at the end instead of growing the frame inside the loop.
        return pd.concat(rows)
    except Exception as e:
        raise Exception("[-] ERROR loading domain dataframe with exception: {}".format(e)) from e

In [None]:
def add_color_column_to_dataframe(result_df:pd.DataFrame, taxonomic_unit)->tuple:
    '''add_color_column_to_dataframe

    This function adds a color column to the input dataframe based on
    a tax column of this dataframe. The tax column is defined by the
    provided taxonomic unit. The column is written into the passed
    dataframe itself, which is also returned.

    Colors are CSS4 color names. Every taxon receives its own color as long
    as there are at most as many taxa as usable colors; only beyond that
    colors are reused.

    :param result_df --> RBH result dataframe
        :type pd.DataFrame
    :param taxonomic_unit
        :type str

    :return result_df
        :type pd.DataFrame
    :return color_dict --> taxon to color name
        :type dict
    '''
    try:
        # public location of the CSS4 name -> hex table
        from matplotlib.colors import CSS4_COLORS

        # Keep one name per distinct hex value: spelling aliases such as
        # grey/gray or aqua/cyan would otherwise count as different colors.
        # "lightgrey" (and therefore its alias) stays excluded as before --
        # presumably it is reserved for other plot elements.
        seen_hex = {CSS4_COLORS["lightgrey"].lower()}
        palette = []
        for name, hex_code in CSS4_COLORS.items():
            hex_code = hex_code.lower()
            if hex_code not in seen_hex:
                seen_hex.add(hex_code)
                palette.append(name)

        taxa = result_df[taxonomic_unit].unique()

        # Shuffle once and hand the colors out in turn. Drawing a random color
        # per taxon produced duplicates and the shared fallback 'black'.
        random.shuffle(palette)
        colors = [palette[position % len(palette)] for position in range(len(taxa))]

        color_dict = dict(zip(taxa, colors))
        result_df['color'] = result_df[taxonomic_unit].apply(lambda value: color_dict[value])

        return result_df, color_dict
    except Exception as e:
        raise Exception("[-] ERROR creating color column for result dataframe with exception: {}".format(e)) from e

In [None]:
def build_dataframe_for_bokeh(cdd_dataframe:pd.DataFrame,pca_df:pd.DataFrame,selection:pd.DataFrame)->tuple:
    '''build_dataframe_for_bokeh

    This function transforms the result dataframe from the CDD domain search, the RBH
    result dataframe and the PCA result dataframe for bokeh ColumnData input.
    Additionally returns a list of columns for the bokeh DataTable.

    The passed pca_df is modified: its numeric column labels are renamed to
    PC0 ... PCX and the annotation columns of the selection are added to it.
    The rows of pca_df and selection have to be in the same order.

    :param cdd_dataframe -> first row is skipped in the merge
        :type pandas.DataFrame
    :param pca_df
        :type pandas.DataFrame
    :param selection -> indexed by accession, same row order as pca_df
        :type pandas.DataFrame

    :return pca_df -> transformed dataframe
        :type pandas.DataFrame
    :return header -> CDD domains, 'additional_domains', 'sacc' and 'transformed_sacc_x'
        :type list[str]
    '''
    def _principal_component_label(col):
        # Numeric labels (0, 1, ...) become "PC0", "PC1", ...; everything else is kept.
        try:
            int(col)
        except (TypeError, ValueError):
            return col
        return "PC"+str(col)

    try:
        cdd_dataframe = cdd_dataframe.reset_index()
        cdd_dataframe['sacc'] = list(cdd_dataframe['index'])
        cdd_dataframe = cdd_dataframe.drop("index",axis=1)
        cdd_dataframe['transformed_sacc'] = cdd_dataframe['sacc'].apply(lambda x: x.split(".")[0])

        #get all PC0 -> PCX columns of the pca_df dataframe
        pca_df.columns = [_principal_component_label(col) for col in pca_df.columns]

        # Copy the annotations positionally. Bracket access is used throughout
        # because attribute access breaks for names like 'class' and can be
        # shadowed by DataFrame attributes.
        pca_df['sacc'] = list(selection.index)
        for annotation in ('bitscore','pident','order','evalue','family','genus','phylum','class','stitle'):
            pca_df[annotation] = list(selection[annotation])

        pca_df['transformed_sacc'] = pca_df['sacc'].apply(lambda x : x.split(".")[0])
        # Row 0 of cdd_dataframe is left out of the merge. Both frames carry
        # 'transformed_sacc', so the merge yields the suffixed columns
        # 'transformed_sacc_x' and 'transformed_sacc_y'.
        pca_df = pca_df.merge(cdd_dataframe.iloc[1:], on='sacc')

        #header for bokeh data table
        header=list(cdd_dataframe.columns)
        header.append('transformed_sacc_x')
        header.remove('transformed_sacc')

        return pca_df, header
    except Exception as e:
        raise Exception("[-] ERROR during creation of bokeh dataframe with exception: {}".format(e)) from e

In [None]:
def build_taxonomy_menu(bokeh_dataframe:pd.DataFrame,taxonomic_unit:str):
    '''build_taxonomy_menu

    This function constructs a taxonomy menu for the bokeh plot. The menu
    offers every distinct value of the taxonomic unit column and pre-selects
    the first two of them (fewer if the column holds fewer distinct values).

    :param bokeh_dataframe
        :type pd.DataFrame
    :param taxonomic_unit
        :type str

    :return tax_menu
        :type MultiSelect -> bokeh
    '''
    try:
        unique_tax = list(bokeh_dataframe[taxonomic_unit].unique())
        # The slice covers all cases at once: two or more taxa, exactly one
        # taxon, and an empty column (which failed with an IndexError before).
        tax_menu = MultiSelect(options=unique_tax, value=unique_tax[:2],
                               title='Select: ' + taxonomic_unit.capitalize())
        return tax_menu
    except Exception as e:
        raise Exception("[-] ERROR creating taxonomy menu for bokeh plot with exception: {}".format(e)) from e

In [None]:
def build_json_callback_for_selection(column_dat:ColumnDataSource, table_dat:ColumnDataSource,table_header:list)->CustomJS:
    """Build the CustomJS callback that copies the selected points into the table source.

    The JavaScript reads the selected row indices from ``cb_obj.indices``,
    empties the table columns 'transformed_sacc_x', 'index', 'sacc' and every
    column named in ``table_header``, then fills them with the values of the
    selected rows of ``column_dat`` ('index' receives the selected row index)
    and emits a change event on the table source.

    :param column_dat -> source of the scatter plot (``sc`` in the JavaScript)
        :type ColumnDataSource
    :param table_dat -> source of the data table (``table_data`` in the JavaScript)
        :type ColumnDataSource
    :param table_header -> column names that are copied (``columns`` in the JavaScript)
        :type list

    :return selection_callback
        :type CustomJS
    """
    selection_callback=CustomJS(args=dict(sc=column_dat, table_data=table_dat, columns=table_header), code="""
            var call_back_object = cb_obj.indices
            

            table_data.data['transformed_sacc_x'] = []
            table_data.data['index'] = []
            table_data.data['sacc'] = []
            for(var i = 0; i < columns.length; i++){
                table_data.data[columns[i]] = []
            }

            for(var i = 0; i < call_back_object.length; i++){
                    for(var j = 0; j < columns.length; j++){
                        table_data.data[columns[j]].push(sc.data[columns[j]][call_back_object[i]])
                    }
                    table_data.data['transformed_sacc_x'].push(sc.data['transformed_sacc_x'][call_back_object[i]])
                    table_data.data['sacc'].push(sc.data['sacc'][call_back_object[i]])
                    table_data.data['index'].push(call_back_object[i])

            }
            
            //circle.glyph.x = 'PC1'
            //circle.glyph.y = 'PC0'
            
            table_data.change.emit();
            """)
    return selection_callback

In [None]:
def build_json_callback_for_taxonomy(column_dat:ColumnDataSource,static_dat:ColumnDataSource,
                                     table_dat:ColumnDataSource,domains:list,taxonomic_unit:str,tax_selection:dict)->CustomJS:
    """Build the CustomJS callback that filters plot and table by the chosen taxa.

    The JavaScript reads the selected menu entries from ``cb_obj.value`` and
    empties every column of ``column_dat`` as well as the table columns
    'transformed_sacc_x', 'index', 'sacc' and all ``domains``. It then walks
    over all rows of ``static_dat`` and, for each row whose ``taxonomic_unit``
    value equals a selected entry, copies the row into ``column_dat`` and
    ``table_dat`` and records its order, class, family and genus. Finally the
    widgets in ``tax_selection`` stored under the keys 'order', 'class',
    'family' or 'genus' get the recorded values as options and as selection,
    and change events are emitted on both sources.

    :param column_dat -> source of the scatter plot, gets rewritten (``sc``)
        :type ColumnDataSource
    :param static_dat -> unfiltered copy of the data that is read only (``source``)
        :type ColumnDataSource
    :param table_dat -> source of the data table (``table_data``)
        :type ColumnDataSource
    :param domains -> table column names that are copied
        :type list
    :param taxonomic_unit -> column the menu filters on (``tax_unit``)
        :type str
    :param tax_selection -> rank name to menu widget that should be updated (``selected_taxonomy``)
        :type dict

    :return tax_menu_callback
        :type CustomJS
    """
    tax_menu_callback = CustomJS(args=dict(sc=column_dat,
                                           source=static_dat,
                                           tax_unit=taxonomic_unit,
                                           columns=list(column_dat.data.keys()),
                                           domains=domains,
                                           selected_taxonomy=tax_selection,
                                           table_data=table_dat), code="""
                        var call_back_object = cb_obj.value                        
                        for(var i = 0;i < columns.length;i++){
                            sc.data[columns[i]]=[]
                        }
                        
                        table_data.data['transformed_sacc_x'] = []
                        table_data.data['index'] = []
                        table_data.data['sacc'] = []
                        
                        for(var i = 0; i < domains.length; i++){
                            table_data.data[domains[i]] = []
                        }
                        
                        var unique_class = []
                        var unique_order = []
                        var unique_family = []
                        var unique_genus = []
                        
                        for(var i = 0; i < source.get_length(); i++){
                            for(var j = 0; j < call_back_object.length; j++){
                                if(source.data[tax_unit][i] == call_back_object[j]){
                                
                                    if(unique_order.includes(source.data['order'][i]) == false){
                                        unique_order.push(source.data['order'][i])
                                    }
                                    
                                    if(unique_class.includes(source.data['class'][i]) == false){
                                        unique_class.push(source.data['class'][i])
                                    }
                                    

                                    if(unique_family.includes(source.data['family'][i]) == false){
                                        unique_family.push(source.data['family'][i])
                                    }
                                    
                                    if(unique_genus.includes(source.data['genus'][i]) == false){
                                        unique_genus.push(source.data['genus'][i])
                                    }
                                    
                                    
                                    for(var k = 0; k < columns.length;k++){
                                            sc.data[columns[k]].push(source.data[columns[k]][i])
                                        }
                                        
                                    for(var y = 0; y < domains.length; y++){
                                            table_data.data[domains[y]].push(source.data[domains[y]][i])
                                        }
                                        
                                    table_data.data['transformed_sacc_x'].push(source.data['transformed_sacc_x'][i])
                                    table_data.data['sacc'].push(source.data['sacc'][i])
                                    table_data.data['index'].push(i)
                                }
                            }
                        }
                        
                        for(var key in selected_taxonomy) {
                            if(key == 'order'){
                                selected_taxonomy[key].options = unique_order
                                selected_taxonomy[key].value = unique_order                                                        
                            }
                            
                            if(key == 'class'){
                                selected_taxonomy[key].options = unique_class
                                selected_taxonomy[key].value = unique_class                                                        
                            }
                            
                            if(key == 'family'){
                                selected_taxonomy[key].options = unique_family
                                selected_taxonomy[key].value = unique_family                                                        
                            }
                            
                            if(key == 'genus'){
                                selected_taxonomy[key].options = unique_genus
                                selected_taxonomy[key].value = unique_genus                                                        
                            }
                        }

                        
                        table_data.change.emit();
                        sc.change.emit();
                        """)
    return tax_menu_callback

In [None]:
def build_table_html_formatter(value:str):
    """Create the cell formatter that colors a table value by its magnitude.

    ``value`` is inserted into the template as the expression that gets
    compared: below 0.25 the text is red, from 0.25 orange, from 0.5
    lightgreen and from 0.75 green.

    :param value -> expression placed into the comparisons of the template
        :type str

    :return formatter
        :type HTMLTemplateFormatter
    """
    # JavaScript branches that pick the color name
    color_rules = """ 
    if({val}<0.25)
        {{return("red")}}
    else if({val}>=0.25 && {val}<0.5)
        {{return("orange")}}
    else if({val}>=0.5 && {val}<0.75)
        {{return("lightgreen")}}
    else if({val}>=0.75)
        {{return("green")}}
    """.format(val=value)
    # paragraph whose inline style is computed by the branches above
    cell_template = """
                <p style="color:<%=
                    (function colorfromint(){{
                        {cond}        
                        }}()) %>;"> 
                <%= value %>
                </p>
            """.format(cond=color_rules)
    return HTMLTemplateFormatter(template=cell_template)

In [None]:
def build_table_columns(cdd_dataframe:pd.DataFrame)->tuple:
    '''build_table_columns

    Turns the columns of the passed dataframe into TableColumn objects for
    the bokeh table. ":" in a column name is replaced by "_" first. Names
    containing "CDD" get a colored formatter, 'transformed_sacc_x' is shown
    as "Accession ID" and 'additional_domains' as "Add. Domains"; all other
    columns are left out.

    :param cdd_dataframe
        :type pandas.DataFrame

    :return table_columns
        :type list[TableColumn]
    :return cdd_header -> names of the CDD columns and 'additional_domains'
        :type list[str]
    '''
    try:
        bokeh_columns = []
        domain_fields = []

        for raw_name in cdd_dataframe.columns:
            field = raw_name.replace(":","_")

            if "CDD" in field:
                bokeh_columns.append(
                    TableColumn(field=field, title=field, formatter=build_table_html_formatter(field))
                )
            elif field == "transformed_sacc_x":
                # the accession column is displayed but is not part of the header list
                bokeh_columns.append(TableColumn(field=field, title="Accession ID"))
                continue
            elif field == 'additional_domains':
                bokeh_columns.append(TableColumn(field=field, title="Add. Domains"))
            else:
                continue

            domain_fields.append(field)

        return bokeh_columns, domain_fields
    except Exception as e:
        raise Exception("[-] ERROR creating table column for bokeh table with exception: {}".format(e)) 

In [None]:
def build_bokeh_plot(bokeh_dataframe:pd.DataFrame, domains:list, taxonomic_unit:str,variances:list, query_sequence:str):
    '''build_bokeh_plot

    This function produces an interactive bokeh plot for the visualization of
    the CDD result dataframe: a scatter plot of the first two principal
    components, a data table and one selection menu per taxonomic rank.

    :param bokeh_dataframe -> PCA results and additional information
        :type pd.DataFrame
    :param domains -> list[str] for bokeh DataTable
        :type list
    :param taxonomic_unit -> column used for the legend
        :type str
    :param variances -> captured variance (percent) of the first two components
        :type list
    :param query_sequence
        :type str

    :return layout -> row of the plot and a column with table and menus
    :return circle -> glyph renderer of the scatter plot
    '''
    try:
        #bokeh data
        # ":" is replaced in the domain names, the same way build_table_columns does it
        sanitized_domains = [domain.replace(":","_") for domain in domains]
        bokeh_dataframe = bokeh_dataframe.rename(columns=dict(zip(domains, sanitized_domains)))
        domains = sanitized_domains

        base_columns = ['PC0','PC1','sacc','color','pident','bitscore','evalue','genus','family','order','phylum','class','stitle','additional_domains']
        # `domains` can repeat names of the base columns (e.g. 'sacc',
        # 'additional_domains'); dict.fromkeys drops the repeats but keeps the
        # order, so no column is handed to the data sources twice.
        columns = list(dict.fromkeys(base_columns + domains))
        tab_dat = list(dict.fromkeys(domains + ['additional_domains']))

        column_dat = ColumnDataSource(bokeh_dataframe[columns])
        # unfiltered copy the taxonomy callbacks read from
        static_data = ColumnDataSource(bokeh_dataframe[columns])
        table_dat = ColumnDataSource(bokeh_dataframe[tab_dat])
        table_columns, table_header = build_table_columns(bokeh_dataframe)
        TOOLTIPS = [
            ("family: ","@family"),
            ("order: ", "@order"),
            ("genus: ", "@genus"),
            ("bitscore,pident: ", "@bitscore, @pident"),
            ("sacc: ", "@sacc"),
            ("title: ","@stitle")
        ]

        #main figure properties
        p = figure(x_axis_label='PC1 with {}% captured variance'.format(variances[0]),
                   y_axis_label='PC2 with {}% captured variance'.format(variances[1]),
                   sizing_mode="scale_both",tools="lasso_select, reset,save, box_zoom,undo,redo,wheel_zoom, pan",#plot_height=700, plot_width=700,
                   tooltips=TOOLTIPS,
                   title="Principal Component Analysis of the percent identitiy of inferred domains of {} reciprocal best hits".format(query_sequence),
                           )

        p.add_layout(Legend(), 'left')
        p.legend.glyph_width = 40
        p.legend.glyph_height = 40
        #scatter plot
        circle = p.circle(x='PC0', y='PC1',
                          color='color', size=20, line_width=1, line_color='black',
                          source=column_dat,
                          legend_field=taxonomic_unit)

        #bokeh table
        table = DataTable(source=table_dat,width=400, height=275,
                          sizing_mode="stretch_both", reorderable=True, sortable=True, fit_columns=True,
                          columns=table_columns)

        #defining select functionality
        selection_callback=build_json_callback_for_selection(column_dat,table_dat,table_header)
        column_dat.selected.js_on_change('indices' ,selection_callback)

        #taxonomy menus in bokeh plot, ordered from the broadest to the narrowest rank
        ranks = ['phylum','class','order','family','genus']
        menus = {rank: build_taxonomy_menu(bokeh_dataframe,rank) for rank in ranks}

        # Every menu filters the data and refreshes the menus of all narrower ranks.
        for position, rank in enumerate(ranks):
            narrower_menus = {child: menus[child] for child in ranks[position+1:]}
            menu_callback = build_json_callback_for_taxonomy(column_dat,static_data,table_dat,table_header,rank,narrower_menus)
            menus[rank].js_on_change('value',menu_callback)

        return row(p,column(table,*menus.values())), circle
    except Exception as e:
        raise Exception("[-] ERROR during creation of bokeh plots with exception: {}".format(e)) from e

In [None]:
def produce_3d_pca_plot(result_df:pd.DataFrame,qseqid:str,path_to_query_domains:str,path_to_domains:str,taxonomic_unit:str):
    """Run the PCA on the domain matrix of one query and build its plots.

    Steps: add a color column to ``result_df`` (based on ``taxonomic_unit``),
    select the hits of ``qseqid``, load the domain data, join both on the
    accession, run the PCA on the query domain columns, draw the matplotlib
    3D plot (only if at least three components exist) and build the bokeh
    layout.

    :param result_df -> RBH result dataframe with taxonomy columns
        :type pd.DataFrame
    :param qseqid -> query sequence identifier
        :type str
    :param path_to_query_domains -> domain file of the queries
        :type str
    :param path_to_domains -> domain file of the RBH sequences
        :type str
    :param taxonomic_unit -> column used for coloring and the legend
        :type str

    :return tuple (pca_df, pca_selection, selection, cdd_dataframe, color_dict,
            grid, p, domain_dict, bk_df), or None if fewer than two principal
            components can be computed

    :raises Exception -> on any error in one of the steps
    """
    try:

        #add color column to result_df
        result_df, color_dict = add_color_column_to_dataframe(result_df,taxonomic_unit)

        selection_columns = ['sacc','color','pident','bitscore','evalue',
                             'genus','family','order','phylum','class','stitle']
        selection = result_df.loc[result_df['qseqid'] == qseqid, selection_columns]
        # drop=True discards the old index directly; this also works when the
        # index of result_df carries a name.
        selection = selection.reset_index(drop=True)
        selection.index = selection['sacc']

        #load domain data
        domain_dict = load_domain_query_data(path_to_query_domains)
        cdd_dataframe = load_domain_data(path_to_domains,qseqid,domain_dict)

        #the counter column is not used as PCA feature
        cols = [col for col in cdd_dataframe.columns if col != 'additional_domains']
        if not cols:
            return None

        selection = pd.merge(selection, cdd_dataframe, left_index=True, right_index=True)

        #perform principal component analysis
        # PCA accepts at most min(n_samples, n_features) components.
        n_components = min(len(cols), len(selection))
        # The bokeh plot needs PC0 and PC1; with fewer components there is nothing to plot.
        if n_components < 2:
            return None

        pca_selection = PCA(n_components=n_components, svd_solver='full')
        principal_components_selection = pca_selection.fit_transform(selection[cols])
        pca_df = pd.DataFrame(data = principal_components_selection)
        pca_df['color'] = list(selection['color'])

        # The 3D plot reads three components.
        if pca_selection.n_components_ >= 3:
            produce_3d_plot(pca_df,qseqid,pca_selection,color_dict)

        #bokeh plotting
        variances = [round(pca_selection.explained_variance_ratio_[0]*100,3),round(pca_selection.explained_variance_ratio_[1]*100,3)]
        bk_df,header = build_dataframe_for_bokeh(cdd_dataframe,pca_df,selection)
        grid,p = build_bokeh_plot(bk_df, header,taxonomic_unit,variances,qseqid)

        return pca_df, pca_selection, selection, cdd_dataframe, color_dict, grid,p, domain_dict, bk_df

    except Exception as e:
        raise Exception("[-] ERROR with exception : {}".format(e)) from e

In [None]:
# Reciprocal best hit table with taxonomy information of the aGPCR project.
# Other inputs that were used before:
#   '../data/blast_to_pca/reciprocal_results_with_taxonomy.csv' (index_col=0)
#   '../data/database_statistics/bokeh/big/reciprocal_results_with_taxonomy.csv'
result_df = pd.read_csv('../data/cdd_extension_project/aGPCRs/reciprocal_results_with_taxonomy.csv', index_col=0)

# Run the PCA pipeline for one query sequence, colored by taxonomic class.
(pca_df, pca_selection, selection, cdd_dataframe, color_dict,
 grid, p, domain_dict, bk_df) = produce_3d_pca_plot(
    result_df=result_df,
    qseqid='NP_001367041',
    path_to_query_domains="../data/cdd_extension_project/aGPCRs/query_domains.tsf",
    path_to_domains="../data/cdd_extension_project/aGPCRs/NP_001367041/cdd_domains.tsf",
    taxonomic_unit='class',
)

In [None]:
# Display the layout returned by produce_3d_pca_plot (plot next to table and taxonomy menus).
show(grid)

In [None]:
def extract_value_counts(cdd_dataframe:pd.DataFrame)->pd.DataFrame:
    '''extract_value_counts

    This function calculates distinct CDD domain patterns and
    returns a corresponding pandas dataframe. Every value is first reduced
    to 1 (not equal to 0) or 0, then identical rows are counted.

    :param cdd_dataframe
        :type pd.DataFrame

    :return cdd_data -> one row per distinct pattern: the columns of the
            input plus a 'counts' column
        :type pd.DataFrame
    '''
    try:
        # presence/absence matrix: anything different from 0 counts as present
        presence = (cdd_dataframe != 0).astype(int)
        # value_counts() yields one entry per distinct row; reset_index turns
        # the pattern index back into columns. An empty input gives an empty
        # dataframe with the expected columns.
        return presence.value_counts().reset_index(name="counts")
    except Exception as e:
        raise Exception("[-] ERROR creating cdd value dataframe with exception: {}".format(e)) from e

In [None]:
# Count the distinct domain patterns of the domain matrix and preview the first rows.
values=extract_value_counts(cdd_dataframe)
values.head()

In [None]:
# Pattern number -> list of the pattern's values (all columns except 'counts').
# Comprehensions are used so that no loop variable stays behind in the
# notebook namespace: a variable called `row` would overwrite the bokeh `row`
# layout function imported in the first cell and break build_bokeh_plot on re-run.
pattern_columns = [name for name in values.columns if name != 'counts']
value_dict = {
    pattern_index: [values[name][pattern_index] for name in pattern_columns]
    for pattern_index in range(len(values))
}

# Biopython: protein --> CDD interaction

In [None]:
# NOTE(review): `Entrez` and `targets` are not defined in any cell visible here --
# presumably `from Bio import Entrez` and a list of protein ids from another cell;
# confirm before running the notebook from top to bottom.
# Look up the "protein_cdd" links of the first 500 entries of `targets`.
search = Entrez.elink(dbfrom='protein',id=targets[0:500],linkname="protein_cdd")
record = Entrez.read(search)
search.close()

In [None]:
# Collect the "Id" of every link in the first link set of the first result entry.
linked = [link["Id"] for link in record[0]["LinkSetDb"][0]["Link"]]

In [None]:
# Show the collected ids.
linked

In [None]:
# Fetch the document summaries (XML) of the linked ids from the "cdd" database.
# Note: `record` is reassigned here and no longer holds the elink result.
fetch = Entrez.efetch(db="cdd",id=linked,rettype="docsum",retmode="xml")
record = Entrez.read(fetch)
fetch.close()

In [None]:
# Show the parsed efetch result.
record

# Bokeh BLAST result plots

In [None]:
def create_color_and_marker_dictionaries_for_bokeh_dataframe(result_data:pd.DataFrame)->tuple:
    """Create the color lookup for the taxa and the marker lookup for the queries.

    Colors are assigned per rank (phylum, order, class, family, genus) and
    collected in one dictionary, so a name that occurs in several ranks keeps
    the color of the rank processed last. Markers are assigned to the query
    ids in shuffled order and repeat if there are more queries than markers.

    :param result_data -> needs the five taxonomy columns and 'qseqid'
        :type pd.DataFrame

    :return color_dict -> taxon to hex color
        :type dict
    :return marker_dict -> query id to bokeh marker name
        :type dict
    """
    try:
        # prepare distinct colors for each taxonomic unit
        # The 'pastel' palette repeats its colors once more colors are
        # requested than it holds. It is therefore only used while it is large
        # enough; beyond that the 'husl' palette supplies as many evenly
        # spaced colors as needed.
        pastel_size = len(sns.color_palette('pastel'))
        color_dict = {}
        for tax_unit in ['phylum','order','class','family','genus']:
            taxa = result_data[tax_unit].unique()
            palette_name = 'pastel' if len(taxa) <= pastel_size else 'husl'
            clrs = sns.color_palette(palette_name, n_colors=len(taxa)).as_hex()
            color_dict.update(dict(zip(taxa, clrs)))

        # prepare custom marker for each query sequence
        #just use colorable marker types
        # Filtering (instead of list.remove) does not fail if one of these
        # names is not part of MarkerType.
        not_colorable = {"x", "y", "dot", "dash", "cross", "asterisk"}
        marker = [m for m in MarkerType if m not in not_colorable]
        shuffle(marker)

        marker_dict = {
            query: marker[i % len(marker)]
            for i, query in enumerate(result_data['qseqid'].unique())
        }

        return color_dict, marker_dict
    except Exception as e:
        raise Exception("[-] ERROR creating marker and color data for RBH result plot with exception: {}".format(e)) from e

In [None]:
def create_initial_bokeh_result_data(result_data:pd.DataFrame, taxonomic_unit:str)->tuple:
    """Prepare the RBH result table for plotting with bokeh.

    Keeps the plot-relevant columns, sorts the rows by ``taxonomic_unit`` and
    appends four helper columns: 'x' (bitscore), 'y' (pident), 'color' (colour
    of the row's taxon) and 'marker' (marker of the row's query sequence).

    Args:
        result_data: RBH result table; it is not modified.
        taxonomic_unit: taxonomy column used for sorting and colouring.

    Returns:
        (plot dataframe, colour dictionary).

    Raises:
        Exception: wraps any error raised while preparing the data.
    """
    try:
        plot_columns = ['order','class','phylum','genus','family', 'bitscore', 'pident', 'stitle',
                        'scomnames', 'staxids', 'qseqid', 'sacc_transformed','slen']
        plot_frame = result_data.loc[:, plot_columns].sort_values(by=taxonomic_unit)

        color_dict, marker_dict = create_color_and_marker_dictionaries_for_bokeh_dataframe(plot_frame)

        # initial axes are bitscore vs. pident; colour follows the taxon,
        # marker follows the query sequence
        plot_frame = plot_frame.assign(
            x=plot_frame['bitscore'],
            y=plot_frame['pident'],
            color=plot_frame[taxonomic_unit].apply(lambda taxon: color_dict[taxon]),
            marker=plot_frame['qseqid'].apply(lambda query: marker_dict[query]),
        )

        return plot_frame, color_dict

    except Exception as e:
        raise Exception("[-] ERROR creating result dataframe for bokeh RBH result plot with exception: {}".format(e))

In [None]:
def create_initial_bokeh_data_selection(result_data:pd.DataFrame,taxonomic_unit:str):
    """Select the rows shown when the plot is first rendered and count their organisms.

    The initial selection consists of the rows that belong to the first two
    distinct taxa of ``taxonomic_unit`` (or the only one) and to the first
    distinct 'qseqid'.

    Args:
        result_data: plot dataframe with at least the columns
            ``taxonomic_unit``, 'qseqid' and 'staxids'.
        taxonomic_unit: taxonomy column the selection is based on.

    Returns:
        (data_selection, taxcount_df). ``taxcount_df`` has the columns
        'value' and ``taxonomic_unit`` and holds, per taxon of the selection,
        the number of distinct 'staxids'; rows are ordered by descending count.

    Raises:
        Exception: wraps any error, e.g. when ``result_data`` is empty.
    """
    try:
        unique_tax = list(result_data[taxonomic_unit].unique())
        unique_qseqids = list(result_data['qseqid'].unique())

        if len(unique_tax) > 1:
            data_selection = result_data[
                (result_data[taxonomic_unit] == unique_tax[0]) | (result_data[taxonomic_unit] == unique_tax[1])
            ]
        else:
            data_selection = result_data[result_data[taxonomic_unit] == unique_tax[0]]

        # Only the first query sequence is part of the initial selection.
        # NOTE(review): create_unlinked_bokeh_plot pre-selects the first TWO
        # qseqids in its menu - confirm which of the two is intended.
        data_selection = data_selection[data_selection['qseqid'] == unique_qseqids[0]]

        # One row per distinct subject taxid; the first hit of a taxid decides
        # which taxon it is counted for. Built without relying on the column
        # name produced by value_counts(), which changed in pandas 2.0
        # ('staxids' -> 'count') and made the former lookup fail.
        organisms = data_selection.dropna(subset=['staxids']).drop_duplicates(subset='staxids')
        organisms_per_taxon = organisms[taxonomic_unit].value_counts()

        taxcount_df = pd.DataFrame({
            'value': organisms_per_taxon.to_numpy(),
            taxonomic_unit: organisms_per_taxon.index.to_numpy(),
        })

        return data_selection, taxcount_df
    except Exception as e:
        raise Exception("[-] ERROR creating initial result dataframe selection for bokeh RBH result plot with exception: {}".format(e))

In [None]:
def create_initial_bokeh_database_data(database:pd.DataFrame, data_selection:pd.DataFrame, taxcount_df:pd.DataFrame ,taxonomic_unit:str):
    """Build the summary table displayed next to the RBH plot.

    Args:
        database: table of database entries containing ``taxonomic_unit``.
        data_selection: currently selected plot rows containing ``taxonomic_unit``.
        taxcount_df: frame with the columns 'value' and ``taxonomic_unit``.
        taxonomic_unit: taxonomy column the three inputs are joined on.

    Returns:
        DataFrame with one row per taxon found in any input and the columns
        '#RBHs' (rows of ``data_selection``), '# Different Organisms In DB'
        (rows of ``database``), '# Different Organisms In Selection'
        ('value' of ``taxcount_df``) and '# TaxName'. Missing counts are 0.

    Raises:
        Exception: wraps any error raised while building the table.
    """
    try:
        def rows_per_taxon(frame:pd.DataFrame)->pd.DataFrame:
            # occurrences of every taxon as a two-column frame (taxon, 'value')
            return frame[taxonomic_unit].value_counts().rename_axis(taxonomic_unit).reset_index(name='value')

        in_database = rows_per_taxon(database)
        in_selection = rows_per_taxon(data_selection)

        # outer joins keep taxa that are missing from one of the inputs;
        # the first join yields value_x (selection) and value_y (database)
        hits_and_db = in_selection.merge(in_database, on=taxonomic_unit, how='outer')
        summary = taxcount_df.merge(hits_and_db, on=taxonomic_unit, how='outer')

        summary = summary[['value_x', 'value_y', 'value', taxonomic_unit]]
        summary.columns = ['#RBHs', '# Different Organisms In DB', '# Different Organisms In Selection',
                           '# TaxName']

        return summary.fillna(0)
    except Exception as e:
        raise Exception("[-] ERROR creating database dataframe for bokeh RBH result plot with exception: {}".format(e))

In [None]:
#TODO add taxonomy menu callback:
def create_qseqid_menu_callback(Overall:ColumnDataSource,Curr:ColumnDataSource,DbData:ColumnDataSource,
                                taxonomic_unit:str,tax_menu:MultiSelect,xaxis_menu:Select,yaxis_menu:Select,
                                color_menu:Select,color_dict:dict,taxonomy_menus:list)->CustomJS:
    """Create the JavaScript callback that runs when the query-sequence menu changes.

    The callback empties every column of ``Curr`` and refills it from
    ``Overall`` with the rows whose 'qseqid' is among the selected menu values
    and whose phylum/class/order/family/genus are all among the values of
    ``taxonomy_menus[0]`` .. ``taxonomy_menus[4]``. For the kept rows 'color'
    is looked up in ``color_dict`` via the column named by ``color_menu.value``
    and 'x'/'y' are taken from the columns named by the axis menus. Afterwards
    the table source ``DbData`` is rebuilt: hits and distinct 'staxids' are
    counted per taxon of ``taxonomic_unit``, the '# Different Organisms In DB'
    values are carried over unchanged.

    Args:
        Overall: source holding all plot rows (read only in the callback).
        Curr: source of the plotted rows (rewritten by the callback).
        DbData: source of the summary table (rewritten by the callback).
        taxonomic_unit: taxonomy column used for the table counts.
        tax_menu: not used; it is not handed to the CustomJS arguments.
        xaxis_menu: menu whose value names the column plotted on x.
        yaxis_menu: menu whose value names the column plotted on y.
        color_menu: menu whose value names the column used for colouring.
        color_dict: mapping taxon name -> colour.
        taxonomy_menus: five menus in the order phylum, class, order, family, genus.

    Returns:
        The CustomJS callback.

    Raises:
        Exception: wraps any error raised while creating the callback.
    """
    try:
        # NOTE(review): the table counters are keyed by the '# TaxName' entries
        # currently in DbData but incremented via the fixed `tax_unit` column;
        # if another callback has put taxa of a different rank into the table,
        # the keys will not match - confirm this is intended.
        # The JavaScript also logs table_data.data to the browser console twice.
        menu_qseqid_callback = CustomJS(args=dict(source=Overall, sc=Curr, table_data=DbData,
                                                  tax_unit=taxonomic_unit, 
                                                  xaxis_menu=xaxis_menu,yaxis_menu=yaxis_menu,
                                                  color_menu=color_menu,color_dict=color_dict,taxonomy_menus=taxonomy_menus), code="""
        var tab_dict = {};
        var tab_dict_static = {};
        var tab_dict_org_count = {};
        console.log(table_data.data)
        for(var i = 0;i<table_data.get_length();i++){
            tab_dict[table_data.data['# TaxName'][i]] = 0
            tab_dict_org_count[table_data.data['# TaxName'][i]] = 0
            tab_dict_static[table_data.data['# TaxName'][i]] = table_data.data['# Different Organisms In DB'][i]
        }
        var call_back_object = cb_obj.value


        let keys = Object.keys(sc.data)
        for(var i = 0; i < keys.length; i++){
            sc.data[keys[i]] = []
        }
        
        var taxid_arr = []
        for(var i = 0; i < source.get_length(); i++){
            for(var j = 0; j < call_back_object.length; j++){
                
                
                if(source.data['qseqid'][i] == call_back_object[j]){

                     if(taxonomy_menus[0].value.includes(source.data['phylum'][i]) == true){
                        if(taxonomy_menus[1].value.includes(source.data['class'][i]) == true){
                            if(taxonomy_menus[2].value.includes(source.data['order'][i]) == true){
                                if(taxonomy_menus[3].value.includes(source.data['family'][i]) == true){
                                     if(taxonomy_menus[4].value.includes(source.data['genus'][i]) == true){
                                       for(var x = 0; x < keys.length; x++){
                                            if((keys[x] != 'x') && (keys[x] != 'y') && (keys[x] != 'color')){
                                                sc.data[keys[x]].push(source.data[keys[x]][i])
                                            }
                                        }

                                        sc.data['color'].push(color_dict[source.data[color_menu.value][i]])
                                        sc.data['x'].push(source.data[xaxis_menu.value][i])
                                        sc.data['y'].push(source.data[yaxis_menu.value][i])
                                        tab_dict[source.data[tax_unit][i]]+=1
                                        if(taxid_arr.includes(source.data['staxids'][i]) == false){
                                            taxid_arr.push(source.data['staxids'][i])
                                            tab_dict_org_count[source.data[tax_unit][i]]+=1
                                        }
                                     }                               
                                }
                            }
                        }
                     }

                   
                    
                    
                  
                }
            }
        }
        table_data.data['#RBHs'] = []
        table_data.data['# Different Organisms In DB'] = []
        table_data.data['# TaxName'] = []
        table_data.data['# Different Organisms In Selection'] = []
        table_data.data['index'] = []
        var counter = 1
        for(let key in tab_dict){
            console.log(key)
            if(key != '# TaxName'){
                table_data.data['#RBHs'].push(tab_dict[key])
                table_data.data['# Different Organisms In DB'].push(tab_dict_static[key])
                table_data.data['# TaxName'].push(key)
                table_data.data['# Different Organisms In Selection'].push(tab_dict_org_count[key])
                table_data.data['index'].push(counter)
                counter += 1            
            }
        }
        
        console.log(table_data.data)
        table_data.change.emit();
        sc.change.emit();
        """)
        return menu_qseqid_callback
    except Exception as e:
        raise Exception("[-] ERROR creating the custom js callback for the qseqid menu with exception: {}".format(e))

In [None]:
def create_unlinked_bokeh_plot(logfile:str,result_data: pd.DataFrame,database:pd.DataFrame, taxonomic_unit: str, project_id:int)->int:
    """Assemble the interactive bokeh layout for the RBH result plot.

    Creates a scatter plot of the hits (initially bitscore on x and pident on
    y, coloured by ``taxonomic_unit``, one marker per query sequence), a
    summary table and the widgets that drive them: query-sequence menu,
    taxonomy menus for phylum/class/order/family/genus, axis menus, legend
    colour menu, colour palette menu, bitscore range slider, marker size and
    line spinners and a download button for the lasso selection.

    Args:
        logfile: path that is opened for writing; nothing is written to it
            here, so the file is only created/truncated.
        result_data: RBH result table (see create_initial_bokeh_result_data
            for the columns that are used).
        database: table of database entries used for the per-taxon DB counts.
        taxonomic_unit: taxonomy column used for the initial colouring,
            legend and summary table.
        project_id: not used (only referenced in commented-out code).

    Returns:
        The bokeh grid layout created by ``gridplot``.
        NOTE(review): the return annotation says ``int`` although the layout
        object is returned.

    Raises:
        Exception: wraps any error raised while building the layout.
    """
    try:
        with open(logfile,'w') as log:
            #path_to_static_dir = "static/images/result_images/" + str(project_id) + "/"
            # log.write("INFO:checking if static dir: {} exists\n".format(path_to_static_dir))
            #path_to_bokeh_plot = path_to_static_dir + taxonomic_unit + "_bokeh_plot.html"
            
            ########
            
            #create bokeh dataframes for plots and tables
            data_all,color_dict = create_initial_bokeh_result_data(result_data, taxonomic_unit)
            # selection subset for initial plot data
            data_selection, taxcount_df = create_initial_bokeh_data_selection(data_all, taxonomic_unit)
            db_df = create_initial_bokeh_database_data(database, data_selection, taxcount_df, taxonomic_unit)
            
            # one summary table per taxonomic rank, used by the legend colour callback
            # NOTE(review): the data_selection_<rank> frames are never used; every table
            # is built from `data_selection` of `taxonomic_unit` - confirm this is intended.
            data_selection_phylum, taxcount_df_phylum = create_initial_bokeh_data_selection(data_all, 'phylum')
            db_df_phylum = create_initial_bokeh_database_data(database, data_selection, taxcount_df_phylum, 'phylum')
            data_selection_class, taxcount_df_class = create_initial_bokeh_data_selection(data_all, 'class')
            db_df_class = create_initial_bokeh_database_data(database, data_selection, taxcount_df_class, 'class')
            data_selection_order, taxcount_df_order = create_initial_bokeh_data_selection(data_all, 'order')
            db_df_order = create_initial_bokeh_database_data(database, data_selection, taxcount_df_order, 'order')
            data_selection_family, taxcount_df_family = create_initial_bokeh_data_selection(data_all, 'family')
            db_df_family = create_initial_bokeh_database_data(database, data_selection, taxcount_df_family, 'family')
            data_selection_genus, taxcount_df_genus = create_initial_bokeh_data_selection(data_all, 'genus')
            db_df_genus = create_initial_bokeh_database_data(database, data_selection, taxcount_df_genus, 'genus')
            ########
            
            # setup bokeh classes
            # Overall: all rows; Curr: rows currently plotted; DbData: rows of the summary table
            Overall = ColumnDataSource(data=data_all)
            Curr = ColumnDataSource(data=data_selection)
            DbData = ColumnDataSource(data=db_df)
            
            table_data_dict = {
                'phylum':ColumnDataSource(data=db_df_phylum),
                'class':ColumnDataSource(data=db_df_class),
                'order':ColumnDataSource(data=db_df_order),
                'family':ColumnDataSource(data=db_df_family),
                'genus':ColumnDataSource(data=db_df_genus)
            }
            
            # plot and the menu is linked with each other by this callback function
            unique_tax = list(data_all[taxonomic_unit].unique())
            unique_qseqids = list(data_all['qseqid'].unique())
            
            # selection subset for initial plot data
            # (tax_menu is only handed to the lasso selection callback below; it is not part of the layout)
            if len(unique_tax) > 1:
                tax_menu = MultiSelect(options=unique_tax, value=[unique_tax[0], unique_tax[1]],
                                   title='Select: ' + taxonomic_unit.capitalize())  # drop down menu

            else:
                tax_menu = MultiSelect(options=unique_tax, value=[unique_tax[0]],
                                   title='Select: ' + taxonomic_unit.capitalize())  # drop down menu
            
            # summary table next to the plot
            table = DataTable(source=DbData, width=390, height=275,
                              sizing_mode="scale_both", reorderable=True, sortable=True, fit_columns=True,
                              columns=[
                                  TableColumn(field='#RBHs', title='#RBHs'),
                                  TableColumn(field='# Different Organisms In DB', title='# Different Organisms In DB'),
                                  TableColumn(field='# Different Organisms In Selection',
                                              title='# Different Organisms In Selection'),
                                  TableColumn(field='# TaxName', title='# TaxName'),
                              ])
            
            # pre-select the first two query sequences (or the only one) in the menu
            qseq_values = []
            if len(unique_qseqids) > 1:
                qseq_values.append(unique_qseqids[0])
                qseq_values.append(unique_qseqids[1])
            else:
                qseq_values.append(unique_qseqids[0])
                
            menu_qseqids = MultiSelect(options=unique_qseqids, value=qseq_values,
                                       title='Select target query sequence')  # drop down menu
           
            #menu_callback = create_taxonomy_menu_callback(Overall,
            #                                     Curr,
            #                                     DbData,
            #                                     taxonomic_unit
            #                                     ,menu_qseqids)

        
            
            # hover tooltips; "@name" refers to a column of the plotted source
            TOOLTIPS = [
                ("stitle", "@stitle"),
                ("bitscore,pident", "@bitscore, @pident"),
                ("sacc RBH to qseqid", "@sacc_transformed RBH to @qseqid "),
                ("scomname", "@scomnames"),
            ]

            # x_range=(0, result_data['bitscore'].max() + result_data['bitscore'].min())
            # NOTE(review): plot_height/plot_width look like the pre-3.0 bokeh keyword
            # names - confirm against the bokeh version in use.
            p = figure(x_axis_label='bitscore', y_axis_label='pident',
                       plot_height=700, plot_width=900,
                       tooltips=TOOLTIPS,
                       tools="lasso_select, reset,save, box_zoom,undo,redo,wheel_zoom, pan", title="Number of RBHs - pident vs bitscore",
                       )  # ,tools="box_select, reset" creating figure object
            
            # the legend is placed outside the plot area, on the left
            p.add_layout(Legend(), 'left')

            circle = p.scatter(x='x', y='y', color='color', marker='marker', size=10, line_width=1,
                               line_color='black',
                               source=Curr, legend_field=taxonomic_unit)  # plotting the data using glyph circle

            p.legend.glyph_width = 40
            p.legend.glyph_height = 40
            
            # legend colour menu: the callback re-colours the plotted rows by the chosen
            # rank, switches the legend field and rebuilds the summary table for that rank
            color_menu = Select(options=['phylum','class','order','family','genus'],
                        value=taxonomic_unit,title="Select Legend Color")
            color_callback = CustomJS(
                            args=dict(legend=p.legend.items[0],sc=Curr,source=Overall,color_dict=color_dict,
                            table_data=DbData,table_data_dict=table_data_dict, menu_qseqids=menu_qseqids),code='''
                
                                var tax_unit = cb_obj.value
                                             
                                var tab_dict = {};
                                var tab_dict_org_count = {}
                                var tab_dict_static = {}; //numbers dont change
                                for(var i = 0;i<table_data_dict[tax_unit].get_length();i++){
                                    tab_dict[table_data_dict[tax_unit].data['# TaxName'][i]] = 0
                                    tab_dict_org_count[table_data_dict[tax_unit].data['# TaxName'][i]] = 0
                                    tab_dict_static[table_data_dict[tax_unit].data['# TaxName'][i]] = table_data_dict[tax_unit].data['# Different Organisms In DB'][i]
                                }
                                
                                legend.label = {'field':tax_unit}
                                var length = sc.get_length()
                                sc.data['color']=[]
                                for(var i = 0; i < length; i++){
                                    sc.data['color'].push(color_dict[sc.data[tax_unit][i]])
                                }
                                
                                var taxid_arr = []
                                for(var i = 0; i < source.get_length(); i++){
                                    for(var k = 0; k < menu_qseqids.value.length; k++){
                                        if(source.data['qseqid'][i] == menu_qseqids.value[k]){
                                            if(taxid_arr.includes(source.data['staxids'][i]) == false){
                                                taxid_arr.push(source.data['staxids'][i])
                                                tab_dict_org_count[source.data[tax_unit][i]]+=1
                                            }
                                            tab_dict[source.data[tax_unit][i]]+=1
                                        }
                                    }
                                }

                                table_data.data['#RBHs'] = []
                                table_data.data['# Different Organisms In DB'] = []
                                table_data.data['# TaxName'] = []
                                table_data.data['# Different Organisms In Selection'] = []
                                table_data.data['index'] = []
                                var counter = 1
                                for(let key in tab_dict){
                                    table_data.data['#RBHs'].push(tab_dict[key])
                                    table_data.data['# Different Organisms In DB'].push(tab_dict_static[key])
                                    table_data.data['# TaxName'].push(key)
                                    table_data.data['# Different Organisms In Selection'].push(tab_dict_org_count[key])
                                    table_data.data['index'].push(counter)
                                    counter += 1
                                }
                                table_data.change.emit();
                                sc.change.emit();
            ''')
            color_menu.js_on_change('value', color_callback)
            
            # one menu per taxonomic rank
            phylum_menu = build_taxonomy_menu(data_all,'phylum')
            class_menu = build_taxonomy_menu(data_all,'class')
            order_menu = build_taxonomy_menu(data_all,'order')
            family_menu = build_taxonomy_menu(data_all,'family')
            genus_menu = build_taxonomy_menu(data_all,'genus')
            
            unique_phylum = list(result_data['phylum'].unique())
            unique_class = list(result_data['class'].unique())
            unique_order = list(result_data['order'].unique())
            unique_family = list(result_data['family'].unique())
            unique_genus = list(result_data['genus'].unique())
            
            taxonomy_table_callback_dict = {
                'phylum':unique_phylum,
                'class':unique_class,
                'order':unique_order,
                'family':unique_family,
                'genus':unique_genus
            }
            
            
            tax_menus = [phylum_menu,class_menu,order_menu,family_menu,genus_menu]
            
            x_axis_menu = create_x_axis_menu(circle,p.axis,Curr)
            y_axis_menu = create_y_axis_menu(circle,p.axis,Curr)
            
            # every taxonomy menu gets a callback that also receives the menus of all lower ranks
            tax_selection_dict = {'class':class_menu,'order':order_menu,'family':family_menu,'genus':genus_menu}
            phylum_menu_callback = build_json_callback_for_taxonomy(Curr, Overall,DbData,'phylum',
                                                                    tax_selection_dict, menu_qseqids,
                                                                    x_axis_menu,y_axis_menu,color_menu,
                                                                    color_dict,taxonomy_table_callback_dict)
            phylum_menu.js_on_change('value', phylum_menu_callback)

            tax_selection_dict = {'order':order_menu,'family':family_menu,'genus':genus_menu}
            class_menu_callback = build_json_callback_for_taxonomy(Curr, Overall,DbData,'class',tax_selection_dict,
                                                                   menu_qseqids,x_axis_menu,y_axis_menu,color_menu,
                                                                   color_dict,taxonomy_table_callback_dict)
            class_menu.js_on_change('value', class_menu_callback)

            tax_selection_dict = {'family':family_menu,'genus':genus_menu}
            order_menu_callback = build_json_callback_for_taxonomy(Curr, Overall,DbData,'order',tax_selection_dict,
                                                                   menu_qseqids,x_axis_menu,y_axis_menu,
                                                                   color_menu,color_dict,taxonomy_table_callback_dict)
            order_menu.js_on_change('value',order_menu_callback)

            tax_selection_dict = {'genus':genus_menu}
            family_menu_callback = build_json_callback_for_taxonomy(Curr, Overall,DbData,'family',tax_selection_dict,
                                                                    menu_qseqids,x_axis_menu,y_axis_menu,
                                                                    color_menu,color_dict,taxonomy_table_callback_dict)
            family_menu.js_on_change('value',family_menu_callback)

            genus_menu_callback = build_json_callback_for_taxonomy(Curr, Overall,DbData,'genus',{},
                                                                   menu_qseqids,x_axis_menu,y_axis_menu,
                                                                   color_menu,color_dict,taxonomy_table_callback_dict)
            genus_menu.js_on_change('value',genus_menu_callback)

            # phylum_menu is passed as the `tax_menu` argument of the qseqid callback factory
            menu_qseqid_callback = create_qseqid_menu_callback(Overall,
                                                               Curr,
                                                               DbData,
                                                               taxonomic_unit,
                                                               phylum_menu,
                                                               x_axis_menu,y_axis_menu,
                                                              color_menu,color_dict,tax_menus)
            
            menu_qseqids.js_on_change('value', menu_qseqid_callback)

            # slider bounds are derived from the bitscore column; it drives the x range of the plot
            range_slider = RangeSlider(start=0, end=result_data['bitscore'].max() + result_data['bitscore'].min(),
                                       value=(result_data['bitscore'].min(), result_data['bitscore'].max()), step=1,
                                       title="Bitscore Range Slider")

            circle_size_spinner = Spinner(title="Circle size",
                                          low=0, high=60, step=5,
                                          value=circle.glyph.size,
                                          width=200
                                          )

            line_size_spinner = Spinner(title="Circle line size",
                                        low=0, high=20, step=1,
                                        value=circle.glyph.line_width,
                                        width=200
                                        )
            line_color_picker = ColorPicker(color='black', title="Line Color")

            # slider value is a (start, end) pair: element 0 -> x_range.start, element 1 -> x_range.end
            range_slider.js_link("value", p.x_range, "start", attr_selector=0)
            range_slider.js_link("value", p.x_range, "end", attr_selector=1)

            line_size_spinner.js_link("value", circle.glyph, "line_width")
            circle_size_spinner.js_link("value", circle.glyph, "size")
            # line_color_picker is linked to the glyph but is not added to the layout below
            line_color_picker.js_link('color', circle.glyph, 'line_color')



            # lasso selection: recount hits and distinct staxids of the selected points per
            # taxon of the rank chosen in color_menu and rewrite the summary table
            selection_callback = CustomJS(
                args=dict(sc=Curr, source=Overall, table_data=DbData, menu=tax_menu,
                          qseqids=menu_qseqids,color_menu=color_menu), code="""
                var call_back_object = cb_obj.indices
                var tax_unit = color_menu.value
                var tab_dict = {};
                var tab_dict_static = {};
                var tab_dict_org_counter = {};
                for(var i = 0;i<table_data.get_length();i++){
                    tab_dict[table_data.data['# TaxName'][i]] = 0
                    tab_dict_org_counter[table_data.data['# TaxName'][i]] = 0
                    tab_dict_static[table_data.data['# TaxName'][i]] = table_data.data['# Different Organisms In DB'][i]
                }
                var taxid_arr = []
                for(var i = 0; i < call_back_object.length; i++){
                    tab_dict[sc.data[tax_unit][call_back_object[i]]]+=1
                    if(taxid_arr.includes(sc.data['staxids'][[call_back_object[i]]]) == false){
                        taxid_arr.push(sc.data['staxids'][[call_back_object[i]]])
                        tab_dict_org_counter[sc.data[tax_unit][[call_back_object[i]]]]+=1
                    }  
                }
                table_data.data['#RBHs'] = []
                table_data.data['# Different Organisms In DB'] = []
                table_data.data['# TaxName'] = []
                table_data.data['# Different Organisms In Selection'] = []
                for(let key in tab_dict){
                    table_data.data['#RBHs'].push(tab_dict[key])
                    table_data.data['# Different Organisms In DB'].push(tab_dict_static[key])
                    table_data.data['# TaxName'].push(key)
                    table_data.data['# Different Organisms In Selection'].push(tab_dict_org_counter[key])
                }
                table_data.change.emit();
                """)

            Curr.selected.js_on_change('indices', selection_callback)


            # download button: builds a CSV (qseqid, sacc_transformed, staxids) of the selected
            # points in the browser and navigates to it as a blob URL
            download_selection_callback = CustomJS(args=dict(sc=Curr, tax_unit=taxonomic_unit), code="""
                var temp = []
                var csvFileData = []
                for(var i = 0; i < sc.selected.indices.length; i++){
                    temp = [sc.data['qseqid'][sc.selected.indices[i]],
                            sc.data['sacc_transformed'][sc.selected.indices[i]],
                            sc.data['staxids'][sc.selected.indices[i]]]
                    csvFileData.push(temp)
                }
                //define the heading for each row of the data  
                var csv = `qseqid,sacc,staxids\n`;  
                //merge the data with CSV  
                csvFileData.forEach(function(row) {  
                        csv += row.join(',');  
                        csv += `\n`;  
                });  
                var json = JSON.stringify(csv);
                var blob = new Blob([csv], {type: "octet/stream"});
                var url  = window.URL.createObjectURL(blob);
                window.location.assign(url);
            """)

            download_selection_button = Button(label="Download Selection")
            download_selection_button.js_on_click(download_selection_callback)
            
            color_palette = create_color_palette_selection()
            color_palette_callback = create_color_palette_selection_callback(Curr,color_menu,taxonomy_table_callback_dict)
            color_palette.js_on_change('value',color_palette_callback) 
            
            # layout: plot | query menu, spinners, slider, download, table, axis menus | taxonomy and colour menus
            grid = gridplot([[column(p),
                              column( menu_qseqids, row(circle_size_spinner, line_size_spinner),
                                     range_slider,download_selection_button, table, x_axis_menu, y_axis_menu),
                             column(phylum_menu,class_menu,order_menu,family_menu,genus_menu,color_menu, color_palette)]],
                            toolbar_location='right')

            #output_file(filename=path_to_bokeh_plot,
            #           title="Interactive Graph Percent Identity vs. Bitscore linked to {} database entries".format(
            #               taxonomic_unit))
            #save(grid)
            
        return grid
    except Exception as e:
        raise Exception("ERROR in producing bokeh plots for database statistics with exception: {}".format(e))

In [None]:
def build_taxonomy_menu(bokeh_dataframe:pd.DataFrame,taxonomic_unit:str):
    """Create the multi-select menu for one taxonomic rank.

    Args:
        bokeh_dataframe: plot dataframe containing the column ``taxonomic_unit``.
        taxonomic_unit: taxonomy column whose distinct values become the options.

    Returns:
        MultiSelect titled 'Select: <Rank>' that offers every distinct value of
        the column and pre-selects the first two of them (fewer when the
        column has fewer distinct values; nothing for an empty column).

    Raises:
        Exception: wraps any error raised while creating the menu.
    """
    try:
        unique_tax = list(bokeh_dataframe[taxonomic_unit].unique())
        # slicing covers two-or-more, one and zero distinct taxa alike, so an
        # empty column yields an empty menu instead of an IndexError
        return MultiSelect(options=unique_tax, value=unique_tax[:2],
                           title='Select: ' + taxonomic_unit.capitalize())
    except Exception as e:
        raise Exception("[-] ERROR creating taxonomy menu for bokeh plot with exception: {}".format(e))

In [None]:
def create_y_axis_menu(circle, axis, data_column):
    """Create the menu that chooses which column is plotted on the y axis.

    Args:
        circle: glyph renderer of the scatter plot (passed to the callback as ``gl``).
        axis: axes of the figure; element 1 gets the new y-axis label.
        data_column: data source of the plot; its 'y' column is replaced by
            the chosen column.

    Returns:
        Select menu (initial value 'pident') with the callback attached.
    """
    # Offer only columns that exist in the data source: choosing a missing
    # column (e.g. 'evalue' when the plot data was built without it) would
    # set the y data to undefined in the browser.
    available_columns = [name for name in ['bitscore','pident','evalue','slen']
                         if name in data_column.data]
    y_axis_menu = Select(options=available_columns,
                          value='pident',
                          title="Select Y axis elements")
    
    y_axis_menu_callback = CustomJS(args=dict(gl=circle,plot=axis, data=data_column),code='''
           var call_back_object = cb_obj.value;
           data.data['y'] = data.data[call_back_object]
           plot[1].axis_label = call_back_object;
           data.change.emit();

    ''')

    y_axis_menu.js_on_change('value', y_axis_menu_callback)
    return y_axis_menu

def create_x_axis_menu(circle, axis, data_column):
    """Create the menu that chooses which column is plotted on the x axis.

    Args:
        circle: glyph renderer of the scatter plot (passed to the callback as ``gl``).
        axis: axes of the figure; element 0 gets the new x-axis label.
        data_column: data source of the plot; its 'x' column is replaced by
            the chosen column.

    Returns:
        Select menu (initial value 'bitscore') with the callback attached.
    """
    # Offer only columns that exist in the data source: choosing a missing
    # column (e.g. 'evalue' when the plot data was built without it) would
    # set the x data to undefined in the browser.
    available_columns = [name for name in ['bitscore','pident','evalue','slen']
                         if name in data_column.data]
    x_axis_menu = Select(options=available_columns,
                          value='bitscore',
                          title="Select X axis elements")
    
    x_axis_menu_callback = CustomJS(args=dict(gl=circle,plot=axis, data=data_column),code='''
           var call_back_object = cb_obj.value;
           data.data['x'] = data.data[call_back_object]
           plot[0].axis_label = call_back_object;
           data.change.emit();

    ''')

    x_axis_menu.js_on_change('value', x_axis_menu_callback)
    return x_axis_menu

In [None]:
#todo pass taxonomy dictionary as input for table data
def build_json_callback_for_taxonomy(column_dat:ColumnDataSource,static_dat:ColumnDataSource,
                                     table_dat:ColumnDataSource,taxonomic_unit:str,tax_selection:dict,
                                     menu_qseqid:MultiSelect,xaxis_menu:Select,yaxis_menu:Select,color_menu:Select,color_dict:dict,
                                     taxonomy_table_callback_dict:dict)->CustomJS:
    """Build the CustomJS callback that re-filters the plot when a taxonomy menu changes.

    This function only wires Python objects into the JavaScript below; all
    behaviour happens in the browser. When triggered, the script:

    1. Reads the selected taxa from ``cb_obj.value`` and, for every
       ``'# TaxName'`` entry of `table_dat`, resets two counters to 0 and
       remembers its ``'# Different Organisms In DB'`` value.
    2. Empties every column of `column_dat`.
    3. Walks all rows of `static_dat`. For every combination of a selected
       taxon and an entry of ``menu_qseqid.value`` where the row's ``'qseqid'``
       and its `taxonomic_unit` value both match, it
       - records the row's 'order', 'class', 'family' and 'genus' values,
       - appends the row to `column_dat`: every column except 'x', 'y' and
         'color' is copied from the same-named column of `static_dat`; 'x' and
         'y' come from the columns named by ``xaxis_menu.value`` and
         ``yaxis_menu.value``; 'color' is looked up in `color_dict` using the
         row's value in the column named by ``color_menu.value``,
       - increments the hit counter of the matching entry of
         ``taxonomy_table_callback_dict[color_menu.value]`` and, the first
         time a ``'staxids'`` value is seen, that entry's organism counter.
    4. For the keys 'order', 'class', 'family' and 'genus' present in
       `tax_selection`, sets both ``options`` and ``value`` of the stored
       widget to the values recorded in step 3.
    5. Rebuilds the '#RBHs', '# Different Organisms In DB', '# TaxName',
       '# Different Organisms In Selection' and 'index' columns of `table_dat`
       from the counters, keeping only names listed in
       ``taxonomy_table_callback_dict[color_menu.value]``, then emits ``change``
       on `table_dat` and `column_dat`.

    Parameters
    ----------
    column_dat : ColumnDataSource
        Source that is cleared and refilled (``sc`` in the script).
    static_dat : ColumnDataSource
        Source that is only read (``source`` in the script).
    table_dat : ColumnDataSource
        Table source read in step 1 and rewritten in step 5.
    taxonomic_unit : str
        Column of `static_dat` compared against the selected taxa.
    tax_selection : dict
        Maps rank names to objects exposing ``options`` and ``value``
        (presumably the MultiSelect menus of the other ranks; confirm at the caller).
    menu_qseqid : MultiSelect
        Widget whose current ``value`` restricts the rows by 'qseqid'.
    xaxis_menu, yaxis_menu, color_menu : Select
        Widgets whose current ``value`` names the column used for x, y and
        the colour/table grouping.
    color_dict : dict
        Maps values of the colour column to the colour written to 'color'.
    taxonomy_table_callback_dict : dict
        Maps a column name (as selected in `color_menu`) to an array of names.

    Returns
    -------
    CustomJS
        The callback; attaching it to a widget is left to the caller.

    Notes
    -----
    The German note inside the script reads: "here I need to know whether
    color_menu.value belongs to source.data[tax_unit][i]".

    NOTE(review): the counters are only initialised for names currently in
    ``table_dat['# TaxName']``; a name that appears in
    ``taxonomy_table_callback_dict[color_menu.value]`` but not in the table
    would be incremented from an undefined value. Confirm whether the caller
    guarantees the two always agree.
    """
    tax_menu_callback = CustomJS(args=dict(sc=column_dat,
                                           source=static_dat,
                                           tax_unit=taxonomic_unit,
                                           selected_taxonomy=tax_selection,
                                           menu_qseqids=menu_qseqid,
                                           table_data=table_dat,
                                           xaxis_menu=xaxis_menu,yaxis_menu=yaxis_menu,
                                           color_menu=color_menu,color_dict=color_dict,
                                           tax_dict=taxonomy_table_callback_dict), code="""
                    
                        var call_back_object = cb_obj.value 
                        var tab_dict = {};
                        var tab_dict_static = {};
                        var tab_dict_org_counter = {};
                        
                        for(var i = 0;i<table_data.get_length();i++){ 
                            tab_dict[table_data.data['# TaxName'][i]] = 0
                            tab_dict_org_counter[table_data.data['# TaxName'][i]] = 0
                            tab_dict_static[table_data.data['# TaxName'][i]] = table_data.data['# Different Organisms In DB'][i]                                                  
                        }
                        
                        
                        var unique_class = []
                        var unique_order = []
                        var unique_family = []
                        var unique_genus = []
                        
                        let keys = Object.keys(sc.data)
                        for(var i = 0; i < keys.length; i++){
                            sc.data[keys[i]] = []
                        }
                        
                        var taxid_arr = []
                        for(var i = 0; i < source.get_length(); i++){
                            for(var j = 0; j < call_back_object.length; j++){  
                                for(var k = 0; k < menu_qseqids.value.length; k++){
                                    if(source.data['qseqid'][i] == menu_qseqids.value[k]){
                                        if(source.data[tax_unit][i] == call_back_object[j]){

                                            if(unique_order.includes(source.data['order'][i]) == false){
                                                    unique_order.push(source.data['order'][i])
                                                }

                                            if(unique_class.includes(source.data['class'][i]) == false){
                                                unique_class.push(source.data['class'][i])
                                            }


                                            if(unique_family.includes(source.data['family'][i]) == false){
                                                unique_family.push(source.data['family'][i])
                                            }

                                            if(unique_genus.includes(source.data['genus'][i]) == false){
                                                unique_genus.push(source.data['genus'][i])
                                            }
                                            
                                            for(var x = 0; x < keys.length; x++){
                                                if((keys[x] != 'x') && (keys[x] != 'y') && (keys[x] != 'color')){
                                                    sc.data[keys[x]].push(source.data[keys[x]][i])

                                                }
                                            }
                                            
                                            sc.data['color'].push(color_dict[source.data[color_menu.value][i]])
                                            sc.data['x'].push(source.data[xaxis_menu.value][i])
                                            sc.data['y'].push(source.data[yaxis_menu.value][i])
                                            //tab_dict[call_back_object[j]]+=1
                                            for(var l = 0;l<tax_dict[color_menu.value].length;l++){
                                                if(source.data[color_menu.value][i] == tax_dict[color_menu.value][l]){
                                                    tab_dict[tax_dict[color_menu.value][l]]+=1
                                                    //hier muss ich wissen ob color_menu.value zu source.data[tax_unit][i] gehört
                                                    if(taxid_arr.includes(source.data['staxids'][i]) == false){
                                                        taxid_arr.push(source.data['staxids'][i])
                                                        tab_dict_org_counter[tax_dict[color_menu.value][l]]+=1
                                                    }                                           
                                                }                                           
                                            }

                                         
                                        }            
                                    }
                                }
                            }
                        }

                        for(var key in selected_taxonomy) {
                            if(key == 'order'){
                                selected_taxonomy[key].options = unique_order
                                selected_taxonomy[key].value = unique_order                                                        
                            }
                            
                            if(key == 'class'){
                                selected_taxonomy[key].options = unique_class
                                selected_taxonomy[key].value = unique_class                                                        
                            }
                            
                            if(key == 'family'){
                                selected_taxonomy[key].options = unique_family
                                selected_taxonomy[key].value = unique_family                                                        
                            }
                            
                            if(key == 'genus'){
                                selected_taxonomy[key].options = unique_genus
                                selected_taxonomy[key].value = unique_genus                                                        
                            }
                        }

                        table_data.data['#RBHs'] = []
                        table_data.data['# Different Organisms In DB'] = []
                        table_data.data['# TaxName'] = []
                        table_data.data['# Different Organisms In Selection'] = []
                        table_data.data['index'] = []
                        var counter = 1
                        for(let key in tab_dict){
                            for(var l = 0;l<tax_dict[color_menu.value].length;l++){
                                if(key == tax_dict[color_menu.value][l]){
                                    table_data.data['#RBHs'].push(tab_dict[key])
                                    table_data.data['# Different Organisms In DB'].push(tab_dict_static[key])
                                    table_data.data['# TaxName'].push(key)
                                    table_data.data['# Different Organisms In Selection'].push(tab_dict_org_counter[key])
                                    table_data.data['index'].push(counter)
                                    counter += 1                               
                                }
                            }
                        }
                        
                        table_data.change.emit();
                        sc.change.emit();
                        """)
    return tax_menu_callback

In [None]:
def create_color_palette_selection():
    """Create the drop-down used to pick the size of the Spectral colour palette.

    Returns
    -------
    Select
        Widget with one entry per palette size from 3 through 11. Each entry's
        value is the size as a string (e.g. ``'3'``) and its label the palette
        name (e.g. ``'Spectral3'``); size 3 is selected initially.

    Raises
    ------
    Exception
        If the widget cannot be constructed. The message includes the original
        error, which is also attached as ``__cause__``.
    """
    try:
        # (value, label) pairs: the value is what a change callback receives.
        options = [(str(val),"Spectral"+str(val)) for val in range(3,12)]
        return Select(options=options,
                      value=str(3),
                      title="Select a color palette")
    except Exception as e:
        # Report the cause like the other helpers in this notebook do.
        raise Exception("[-] ERROR couldnt create color palette selection with exception: {}".format(e)) from e

In [None]:
def create_color_palette_selection_callback(curr:ColumnDataSource,color_menu:Select,taxonomy_table_callback_dict:dict)->CustomJS:
    """Build the CustomJS callback that recolours the plotted points from a Spectral palette.

    When triggered, the script collects the distinct values of the column named
    by ``color_menu.value`` in `curr`. If there are no more distinct values
    than colours in the palette selected by ``cb_obj.value``, it assigns one
    colour per value in order of first appearance, rewrites the 'color' column
    of `curr` and emits a change event. With more distinct values than colours
    the script does nothing.

    Parameters
    ----------
    curr : ColumnDataSource
        Source whose 'color' column is rewritten (``sc`` in the script).
    color_menu : Select
        Widget whose current ``value`` names the column that drives the colours.
    taxonomy_table_callback_dict : dict
        Handed to the script as ``tax_menu``; the script does not reference it.

    Returns
    -------
    CustomJS
        The callback; attaching it to the palette widget is left to the caller.

    Raises
    ------
    Exception
        If the callback cannot be constructed. The message includes the
        original error, which is also attached as ``__cause__``.
    """
    try:
        # NOTE(review): inside the size guard the `i == pals[...].length`
        # wrap-around branch of the script can never be taken.
        c_palette_callback = CustomJS(args=dict(sc=curr,color_menu=color_menu, tax_menu=taxonomy_table_callback_dict, pals=Spectral),code="""
                                    // the callback value is a number 3,4,5,6,7,8,9,10,11,12
                                    var call_back_object = cb_obj.value
                                    
                                    var unique_organisms = []

                                    
                                    for(var i = 0; i<sc.get_length(); i++){
                                        if(unique_organisms.includes(sc.data[color_menu.value][i]) == false){
                                            unique_organisms.push(sc.data[color_menu.value][i])
                                        }
                                    }

                                    if(unique_organisms.length <= pals[call_back_object].length){
                                        var color_dict = {}
                                        for(var i = 0; i < unique_organisms.length; i++){
                                            if(i == pals[call_back_object].length){
                                                i = 0
                                            }
                                            color_dict[unique_organisms[i]] = pals[call_back_object][i]
                                        }
                                        sc.data['color'] = []
                                        for(var i = 0; i<sc.data[color_menu.value].length; i++){
                                            sc.data['color'].push(color_dict[sc.data[color_menu.value][i]])
                                        }
                                     sc.change.emit();
                                     }

        """)
        return c_palette_callback
    except Exception as e:
        # Report the cause like the other helpers in this notebook do.
        raise Exception("[-] ERROR couldnt create color palette selection callback with exception: {}".format(e)) from e

In [None]:
def _mask_equal_to_any(column: pd.Series, candidates: list) -> pd.Series:
    """Return a boolean mask marking the rows of `column` equal to any candidate."""
    mask = pd.Series(False, index=column.index, dtype=bool)
    for candidate in candidates:
        mask = mask | (column == candidate)
    return mask


def create_initial_bokeh_data_selection(result_data:pd.DataFrame,taxonomic_unit:str):
    """Select the rows shown initially and count the organisms per taxon among them.

    The initial selection keeps the rows whose `taxonomic_unit` value is one of
    the first two distinct taxa of `result_data` and whose 'qseqid' is one of
    the first two distinct query ids of `result_data` (fewer if fewer exist).

    Parameters
    ----------
    result_data : pd.DataFrame
        Frame with the columns 'qseqid', 'staxids' and `taxonomic_unit`.
    taxonomic_unit : str
        Name of the taxonomy column to select and group by.

    Returns
    -------
    data_selection : pd.DataFrame
        The selected rows of `result_data`, with their original index.
    taxcount_df : pd.DataFrame
        One row per taxon in the selection with the columns 'value' (number of
        distinct 'staxids' assigned to that taxon) and `taxonomic_unit`,
        ordered by descending 'value' on a fresh 0..n-1 index. A taxid is
        assigned to the taxon of its first row in the selection.

    Raises
    ------
    Exception
        If a required column is missing or the computation fails. The original
        error is attached as ``__cause__``.
    """
    try:
        first_taxa = list(result_data[taxonomic_unit].unique())[:2]
        first_qseqids = list(result_data['qseqid'].unique())[:2]

        # An empty candidate list produces an all-False mask, so an empty
        # input yields an empty selection instead of an IndexError.
        data_selection = result_data[_mask_equal_to_any(result_data[taxonomic_unit], first_taxa)]
        data_selection = data_selection[_mask_equal_to_any(data_selection['qseqid'], first_qseqids)]

        # Only the index of value_counts() is used: the name of its values
        # differs between pandas 1.x ('staxids') and pandas >= 2.0 ('count').
        staxids_by_hits = data_selection['staxids'].value_counts().index

        # One lookup table (taxid -> taxon of its first row) replaces the
        # per-taxid re-filtering of the whole selection.
        first_taxon_of_staxid = (
            data_selection.drop_duplicates(subset='staxids')
            .set_index('staxids')[taxonomic_unit]
        )
        taxon_counts = first_taxon_of_staxid.reindex(staxids_by_hits).value_counts()

        taxcount_df = pd.DataFrame({
            'value': taxon_counts.to_numpy(),
            taxonomic_unit: taxon_counts.index.to_numpy(),
        })

        return data_selection, taxcount_df
    except Exception as e:
        raise Exception("[-] ERROR creating initial result dataframe selection for bokeh RBH result plot with exception: {}".format(e)) from e

In [None]:
# Inputs for the plot cells below.
# Alternative data sets, kept for reference (uncomment a pair to switch):
#result_df = pd.read_csv('../data/database_statistics/bokeh/big/reciprocal_results_with_taxonomy.csv',index_col=0,header=0)
#database = pd.read_csv('../data/database_statistics/bokeh/big/HIGH_QUALITY_BACTERIA_DATABASE_with_taxonomic_information.csv', index_col=0,header=0)
#result_df = pd.read_csv('../data/database_statistics/altair/cyano/reciprocal_results_with_taxonomy.csv',index_col=0,header=0)
#database = pd.read_csv('../data/database_statistics/altair/cyano/HIGH_QUALITY_CYANOBACTERIA_DATABASE_with_taxonomic_information.csv', index_col=0,header=0)
#result_df = pd.read_csv('../data/database_statistics/bokeh/small/reciprocal_results_with_taxonomy.csv',index_col=0,header=0)
#database = pd.read_csv('../data/database_statistics/bokeh/small/HIGH_QUALITY_CYANOBACTERIA_DATABASE_with_taxonomic_information.csv', index_col=0,header=0)
#result_df = pd.read_csv("../data/mep_pathway_regulation/rec_blast/reciprocal_results_with_taxonomy.csv")

# Active data set: both tables use their first column as index and their first row as header.
database = pd.read_csv(
    '../data/database_statistics/bokeh/big_with_len/HIGH_QUALITY_DATABASE_with_taxonomic_information.csv',
    header=0,
    index_col=0,
)
result_df = pd.read_csv(
    '../data/database_statistics/bokeh/big_with_len/reciprocal_results_with_taxonomy.csv',
    header=0,
    index_col=0,
)

# Remaining arguments handed to the plot builder.
logfile = 'database_statistics.log'
taxonomic_unit = 'phylum'
project_id = 1

In [None]:
# Build the object displayed by `show` below from the loaded tables and settings
# (create_unlinked_bokeh_plot is presumably defined in an earlier cell; verify).
grid = create_unlinked_bokeh_plot(logfile,result_df,database,taxonomic_unit,project_id)

In [None]:
# Render the layout once; the second, identical call only displayed it a second time.
show(grid)

In [None]:
# Total number of database rows counted by value_counts() for the 'class' column.
database['class'].value_counts().sum()