## Convert phyloprofile to 2D dataframe

In [1]:
import pandas as pd


# Path of the phyloprofile file that is converted into a matrix below.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
cellulase = '/home/ingo/applications/uGene/examples/cellulase-euks/cellulase_euks-fasF075-presence_absence.phyloprofile'

def phyloprofile2matrix(path):
    """
    Convert a phyloprofile file into a 2D gene x taxon matrix.

    The file is read once. The first line is treated as a header and skipped;
    every following non-blank line must consist of exactly five
    whitespace-separated fields: ``gene taxon ortholog fasF fasB``.

    Parameters
    ----------
    path : str or path-like
        Location of the phyloprofile file to read.

    Returns
    -------
    pandas.DataFrame
        Genes as index, taxa as columns (both in order of first appearance in
        the file). Each cell holds ``int(fasF)`` of the last line seen for that
        gene/taxon pair, or 0 when the pair does not occur in the file.

    Raises
    ------
    ValueError
        If a data line does not have exactly five fields or its fasF field is
        not an integer.
    """
    # Dicts double as insertion-ordered sets and give each label its position.
    gene_pos = {}
    taxon_pos = {}
    entries = []

    with open(path) as fh:
        next(fh, None)  # skip header; an empty file simply yields an empty matrix
        for line in fh:
            fields = line.split()
            if not fields:
                continue  # tolerate blank / trailing lines
            gene, taxon, _ortholog, fasf, _fasb = fields
            row = gene_pos.setdefault(gene, len(gene_pos))
            col = taxon_pos.setdefault(taxon, len(taxon_pos))
            entries.append((row, col, int(fasf)))

    # Fill a plain nested list first: far cheaper than one df.loc write per line.
    matrix = [[0] * len(taxon_pos) for _ in range(len(gene_pos))]
    for row, col, value in entries:
        matrix[row][col] = value  # later lines overwrite earlier ones

    return pd.DataFrame(matrix, index=list(gene_pos), columns=list(taxon_pos))


# Build the gene x taxon matrix (genes as index, taxa as columns) and show it.
df = phyloprofile2matrix(cellulase)
display(df)

Unnamed: 0,Nelumbo_nucifera,Onychostruthus_taczanowskii,Cucumis_sativus,Apaloderma_vittatum,Nomia_melanderi,Notechis_scutatus,Cercospora_kikuchii,Heterostelium_album_PN500,Epinephelus_lanceolatus,Piliocolobus_tephrosceles,...,Saccharomyces_cerevisiae_S288C,Macaca_nemestrina,Protomyces_lactucae-debilis,Megalops_cyprinoides,Arabidopsis_thaliana9,Caenorhabditis_briggsae,Charadrius_vociferus,Talpa_occidentalis,Venustampulla_echinocandica,Nomascus_leucogenys
GH12_QRW23340.1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GH45_1_QRW26230.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
GH5_5_QRW24137.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GH51_2_QRW22189.1,1,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
GH74_QRW16773.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GH6_QRW27543.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
GH16_2_QRW19070.1,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
GH9_QRW16024.1,1,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
GH3_QRW22340.1,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
GH131_QRW22550.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


## Download taxstrings

In [2]:
# Tab-separated table with one header line and `taxid<TAB>name` rows.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
taxpath = '/home/ingo/transfer/ncbi_euks.txt.2name'

# species name -> taxid; names are normalised (' ' and '/' become '_') so that
# they can be looked up with the column names of `df`.
name2taxid = {}
with open(taxpath) as fh:
    next(fh)  # skip header
    for line in fh:
        line = line.strip()
        if not line:
            continue  # tolerate blank / trailing lines
        taxid, name = line.split('\t')
        name = name.replace(' ', '_').replace('/', '_')
        name2taxid[name] = taxid

# Strip the trailing '9' from the two affected column names. `rename` ignores
# keys that are absent, so no guard is needed (the former guard on the
# Arabidopsis column would have skipped the Cannabis rename when only the
# latter was present).
df = df.rename(columns={'Arabidopsis_thaliana9': 'Arabidopsis_thaliana', 'Cannabis_sativa9': 'Cannabis_sativa'})


# Sanity check: list the Galendromus columns, if any.
for name in df.columns:
    if name.startswith('Galendromus'):
        print(name)

# Fail with an informative message instead of a bare KeyError on the first miss.
missing_names = [name for name in df.columns if name not in name2taxid]
if missing_names:
    raise KeyError(
        f'{len(missing_names)} species of the profile have no taxid in {taxpath}; '
        f'first ones: {missing_names[:10]}'
    )

taxids4download = [name2taxid[name] for name in df.columns]
print(len(taxids4download))

# Sanity check for one specific taxid.
# NOTE(review): presumably the taxid of the Galendromus species above - confirm.
if '34638' in taxids4download:
    print('yes')

1239


In [3]:
from ete3 import NCBITaxa
# Handle to the NCBI taxonomy database provided by ete3.
ncbi = NCBITaxa()
# Uncomment to refresh the taxonomy database:
# ncbi.update_taxonomy_database()

# taxid -> list of the names of all nodes on the lineage of that taxon.
# Duplicate taxids collapse into one entry, so the printed count can be
# smaller than len(taxids4download).
taxid2lineage = {
    taxid: list(ncbi.get_taxid_translator(ncbi.get_lineage(taxid)).values())
    for taxid in taxids4download
}

print(len(taxid2lineage))

1237


## Dimensionality reduction function

In [25]:
from sklearn.decomposition import PCA
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.manifold import TSNE
from collections import Counter
pd.set_option('mode.chained_assignment', None)


def dimension_reduced_phyloprofile(
    df, taxid2lineage, name2taxid, cladelist, 
    to_summarize='', method='PCA', jitter='',
    n_components=2, scaler=StandardScaler(), transpose=False, seed=42
):
    """
    Take a 2D representation of a phylogenetic profile and apply dimensionality reduction.

    Parameters
    ----------
    df : pandas.DataFrame
        Presence/absence matrix. After the optional transpose every row becomes
        one point of the embedding. The per-row family summary and the clade
        labels are meant for the layout "species as rows, genes as columns".
    taxid2lineage : dict
        taxid -> iterable with the names of all clades on the taxon's lineage.
    name2taxid : dict
        species name -> taxid (same taxid values as the keys of `taxid2lineage`).
    cladelist : list of str
        Clades used as labels. They must be mutually exclusive; species that
        belong to none of them are labelled 'other'.
    to_summarize : dict or ''
        Optional ``{new_label: [labels to merge]}`` mapping applied to the clade
        labels. Falsy values disable the step.
    method : str
        'PCA' or 'tSNE'.
    jitter : float or ''
        Standard deviation of Gaussian noise added to every component to
        separate overlapping points. Falsy values disable the step.
    n_components : int
        Number of output dimensions (used for both methods).
    scaler : object or None
        Anything with a ``fit_transform`` method; falsy values skip scaling.
        The default instance is shared between calls but re-fitted every time.
    transpose : bool
        Transpose `df` before doing anything else.
    seed : int
        Random state for the embedding and for the jitter, so that repeated
        calls give the same coordinates.

    Returns
    -------
    pandas.DataFrame
        Columns ``PC1..PCn``, ``label`` (row label of `df`), ``num_cellulases``
        (row sum), ``cellulase_list`` (sorted family labels present in the row),
        ``cellulases`` (the same list joined with ', ') and - only when the
        species are the rows of `df` - ``clade``.

    Raises
    ------
    ValueError
        Unknown `method`, overlapping clades in `cladelist`, or no species name
        of `name2taxid` found in either axis of `df`.
    """
    def species2clade(taxid2lineage, name2taxid, cladelist):
        """Return ({species name: clade label}, set of all clade names seen)."""
        allclades = set()
        for lineage in taxid2lineage.values():
            allclades.update(lineage)

        # Iterate over names rather than inverting name2taxid: several names
        # may share one taxid and an inverted dict would keep only one of them.
        name2cladelabel = {}
        for name, taxid in name2taxid.items():
            if taxid not in taxid2lineage:
                continue
            lineage = set(taxid2lineage[taxid])
            clade_of_interest = [clade for clade in cladelist if clade in lineage]
            if len(clade_of_interest) > 1:
                raise ValueError(f'Overlapping clades of interest. {name} is: {clade_of_interest}')
            name2cladelabel[name] = clade_of_interest[0] if clade_of_interest else 'other'

        return name2cladelabel, allclades

    def find_species_orientation(df, name2taxid):
        """Return 'columns' or 'index', whichever axis of df holds species names."""
        known_names = set(name2taxid)
        if not known_names.isdisjoint(df.columns):
            return 'columns'
        if not known_names.isdisjoint(df.index):
            return 'index'
        raise ValueError('Could not find any species names from name2taxid in dataframe columns or index')

    def family_key_and_label(column_name):
        """
        Map a column such as 'GH45_1_QRW26230.1' to (sort key, 'GH45').

        Numeric families sort numerically and first; a prefix that is not
        'GH<number>' is kept verbatim and sorted alphabetically afterwards
        instead of crashing the whole call.
        """
        prefix = str(column_name).split('_')[0]
        try:
            number = int(prefix.replace('GH', ''))
        except ValueError:
            return (1, prefix), prefix
        return (0, number), f'GH{number}'

    if method not in ('PCA', 'tSNE'):
        raise ValueError(f'Unknown method "{method}". Choose "PCA" or "tSNE"')

    # Standardize the features
    if transpose:
        df = df.transpose()
    scaled_data = scaler.fit_transform(df) if scaler else df

    # reduce dimensions
    if method == 'PCA':
        reducer = PCA(n_components=n_components, random_state=seed)
    else:
        reducer = TSNE(n_components=n_components, random_state=seed)
    result = reducer.fit_transform(scaled_data)

    # store result in dataframe
    pc_columns = [f'PC{i}' for i in range(1, n_components + 1)]
    red_df = pd.DataFrame(data=result, columns=pc_columns)
    red_df['label'] = df.index.to_numpy()
    red_df['num_cellulases'] = df.sum(axis=1).values

    # store information which cellulases are found in each row; positional
    # lookup avoids label alignment, which fails once two columns map to the
    # same family label
    column_families = [family_key_and_label(column) for column in df.columns]
    presence = df.astype('bool').to_numpy()
    cellulase_lists = []
    for row_mask in presence:
        found = sorted(column_families[i] for i in np.flatnonzero(row_mask))
        cellulase_lists.append([label for _key, label in found])
    red_df['cellulase_list'] = pd.Series(cellulase_lists, index=red_df.index, dtype=object)
    red_df['cellulases'] = [', '.join(families) for families in cellulase_lists]

    # juggle taxonomic info
    name2cladelabel, allclades = species2clade(taxid2lineage, name2taxid, cladelist)
    for clade in cladelist:
        if clade not in allclades:
            print(f'Warning: "{clade}" cannot be found in any lineage')

    # add clade labels (only meaningful when every row is a species)
    orient = find_species_orientation(df, name2taxid)
    if orient == 'index':
        unlabelled = [name for name in red_df['label'] if name not in name2cladelabel]
        if unlabelled:
            print(f'Warning: {len(unlabelled)} rows have no lineage information and are labelled "other"')
        # .get keeps the column as long as the frame even when a name is unknown
        red_df['clade'] = [name2cladelabel.get(name, 'other') for name in red_df['label']]

    # summarize different clade labels
    if to_summarize and 'clade' in red_df.columns:
        for newterm, sumterms in to_summarize.items():
            for sumterm in sumterms:
                if sumterm not in cladelist and sumterm != 'other':
                    print(f'Warning: {sumterm} should be summarized to {newterm} but was not found in cladelist. Nothing done.')
                # single .loc assignment; chained indexing would write to a temporary copy
                red_df.loc[red_df['clade'] == sumterm, 'clade'] = newterm

    # add jitter (seeded, so the output is reproducible)
    if jitter:
        rng = np.random.default_rng(seed)
        noise = rng.normal(loc=0, scale=jitter, size=(len(red_df), len(pc_columns)))
        red_df[pc_columns] = red_df[pc_columns] + noise

    return red_df



# No clade labels are merged unless the dict further down is uncommented.
to_summarize = ''
######################################################################################################

# PCA or tSNE
method = 'tSNE'

# clades to color
cladelist = [
    'Viridiplantae', 
    'Arthropoda', 
    'Ascomycota', 
    'Basidiomycota', 
    'Chordata', 
    'Cnidaria', 
    'Oomycota', 
    'Amoebozoa', 
    #'Apusozoa', 
    'Spiralia', 
    #'Haptista'
]

# clades to summarize
# to_summarize = {'apu_spir': ['Apusozoa', 'Spiralia']}

# include jitter to visualize overlapping dots
jitter = 0.3

# random seed for t-SNE
seed = 42

######################################################################################################

# transpose=True turns the gene x taxon matrix into one row per species.
red_df = dimension_reduced_phyloprofile(
    df, taxid2lineage, name2taxid, cladelist, 
    to_summarize=to_summarize, method=method, jitter=jitter,
    n_components=2, scaler=StandardScaler(), transpose=True, seed=seed
)
red_df.to_csv(f'{method}_cellulase.csv', index=False)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# display() only adds a stray "None" to the cell output.
red_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239 entries, 0 to 1238
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PC1             1239 non-null   float64
 1   PC2             1239 non-null   float64
 2   label           1239 non-null   object 
 3   num_cellulases  1239 non-null   int64  
 4   cellulase_list  1239 non-null   object 
 5   cellulases      1239 non-null   object 
 6   clade           1239 non-null   object 
dtypes: float64(2), int64(1), object(4)
memory usage: 67.9+ KB


None

In [23]:
# Families to look for. A row is kept when at least one entry of its
# `cellulase_list` equals one of these terms exactly; whole-element comparison
# is used on purpose so that e.g. 'GH1' does not also select 'GH10'.
search_terms = ['GH45']

filtered_df = red_df[
    red_df['cellulase_list'].apply(
        lambda families: any(family in search_terms for family in families)
    )
]
display(filtered_df)

Unnamed: 0,PC1,PC2,label,num_cellulases,cellulase_list,cellulases,clade
12,37.934564,-2.712388,Verticillium_nonalfalfae,13,"[GH1, GH2, GH3, GH3, GH5, GH6, GH7, GH10, GH12...","GH1, GH2, GH3, GH3, GH5, GH6, GH7, GH10, GH12,...",Ascomycota
16,38.579146,-4.614774,Fusarium_subglutinans,14,"[GH1, GH2, GH3, GH3, GH5, GH5, GH6, GH7, GH10,...","GH1, GH2, GH3, GH3, GH5, GH5, GH6, GH7, GH10, ...",Ascomycota
20,39.749156,-5.853824,Fusarium_venenatum,13,"[GH1, GH2, GH3, GH3, GH5, GH5, GH6, GH7, GH10,...","GH1, GH2, GH3, GH3, GH5, GH5, GH6, GH7, GH10, ...",Ascomycota
22,15.925061,-5.241167,Ustilago_maydis_521,8,"[GH3, GH3, GH5, GH9, GH10, GH16, GH45, GH51]","GH3, GH3, GH5, GH9, GH10, GH16, GH45, GH51",Basidiomycota
27,30.059046,-0.029015,Pyrenophora_tritici-repentis_Pt-1C-BFP,11,"[GH1, GH2, GH3, GH3, GH7, GH10, GH12, GH16, GH...","GH1, GH2, GH3, GH3, GH7, GH10, GH12, GH16, GH4...",Ascomycota
...,...,...,...,...,...,...,...
1191,30.451231,1.919259,Aaosphaeria_arxii_CBS_175.79,11,"[GH1, GH2, GH3, GH3, GH5, GH7, GH10, GH16, GH4...","GH1, GH2, GH3, GH3, GH5, GH7, GH10, GH16, GH45...",Ascomycota
1205,31.780628,4.653298,Rasamsonia_emersonii_CBS_393.64,12,"[GH1, GH2, GH3, GH3, GH5, GH6, GH7, GH10, GH12...","GH1, GH2, GH3, GH3, GH5, GH6, GH7, GH10, GH12,...",Ascomycota
1207,36.266535,-2.745260,Phaeoacremonium_minimum_UCRPA7,12,"[GH1, GH2, GH3, GH3, GH5, GH7, GH10, GH12, GH4...","GH1, GH2, GH3, GH3, GH5, GH7, GH10, GH12, GH45...",Ascomycota
1215,20.415588,-13.429669,Letharia_lupina,6,"[GH3, GH3, GH12, GH16, GH30, GH45]","GH3, GH3, GH12, GH16, GH30, GH45",Ascomycota


## Plot

In [5]:
from pathlib import Path

# No clade labels are merged unless the dict further down is uncommented.
to_summarize = ''
######################################################################################################

# PCA or tSNE
method = 'tSNE'

# clades to color
cladelist = [
    'Viridiplantae', 
    'Arthropoda', 
    'Ascomycota', 
    'Basidiomycota', 
    'Chordata', 
    'Cnidaria', 
    'Oomycota', 
    'Amoebozoa', 
    #'Apusozoa', 
    'Spiralia', 
    #'Haptista'
]

# clades to summarize
# to_summarize = {'apu_spir': ['Apusozoa', 'Spiralia']}

# include jitter to visualize overlapping dots
jitter = 0.3

# random seed for t-SNE
seed = 42

######################################################################################################

# transpose=True turns the gene x taxon matrix into one row per species.
red_df = dimension_reduced_phyloprofile(
    df, taxid2lineage, name2taxid, cladelist, 
    to_summarize=to_summarize, method=method, jitter=jitter,
    n_components=2, scaler=StandardScaler(), transpose=True, seed=seed
)
# to_csv does not create missing directories, so make sure `data/` exists.
Path('data').mkdir(parents=True, exist_ok=True)
red_df.to_csv(f'data/{method}_cellulase.csv', index=False)


# Plot the result: one dot per row of red_df, sized by the number of
# cellulases and coloured by clade; the raw coordinates are hidden on hover.
fig = px.scatter(
    red_df, x='PC1', y='PC2', #title=f'{method} plot', 
    labels={'PC1': f'{method} 1', 'PC2': f'{method} 2'}, 
    hover_data={'clade':True, 'num_cellulases': True, 'cellulases': True, 'label': True, 'PC1': False, 'PC2': False}, width=1000, height=1000,
    size='num_cellulases',
    color='clade',
    color_discrete_sequence=px.colors.qualitative.Vivid
)

fig.show()
fig.write_html(f"cellulases_{method}_2D.html")

ValueError: cannot reindex on an axis with duplicate labels

## Dash

In [None]:
from dash import dcc, html, Input, Output, Dash
import plotly.express as px
import pandas as pd

# Assuming red_df is your DataFrame and method is defined
#red_df = pd.read_csv('/home/felixl/PycharmProjects/cellulases/data/tSNE_cellulase.csv')
# method = ...

# Create the Dash application object (plain `dash.Dash`).
app = Dash(__name__)

# Page layout: two free-text inputs followed by the scatter plot. The
# component ids used here are the ones the callback below is wired to.
app.layout = html.Div([
    dcc.Input(id='search-bar-cellulases', type='text', placeholder='Search Cellulases...'),
    dcc.Input(id='search-bar-species', type='text', placeholder='Search Species...'),
    dcc.Graph(id='scatter-plot'),
])

@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('search-bar-cellulases', 'value'),
     Input('search-bar-species', 'value')]
)
def update_figure(search_value_cellulases, search_value_species):
    """
    Rebuild the scatter plot for the current content of the two search boxes.

    Parameters
    ----------
    search_value_cellulases : str or None
        Comma-separated family names (e.g. "GH5, GH45"). A row is kept only if
        it contains ALL of them. Names are compared as whole entries and
        case-insensitively, so "GH1" does not match "GH10".
    search_value_species : str or None
        Literal, case-insensitive substring of the species label. Spaces are
        treated as underscores, matching how the labels are spelled.

    Returns
    -------
    plotly figure built from the rows of the global `red_df` that pass both
    filters (all rows when both boxes are empty).
    """
    filtered_df = red_df

    if search_value_cellulases:
        wanted = {term.strip().upper() for term in search_value_cellulases.split(',')}
        wanted.discard('')  # ignore stray commas such as "GH5, "
        if wanted:
            # `cellulases` is the ', '-joined family list; split it again so
            # that terms are compared with whole entries, not substrings.
            keep = [
                wanted.issubset(family.strip().upper() for family in str(cell).split(','))
                for cell in filtered_df['cellulases'].fillna('')
            ]
            filtered_df = filtered_df.loc[pd.Series(keep, index=filtered_df.index, dtype=bool)]
    
    if search_value_species:
        needle = search_value_species.strip().replace(' ', '_')
        if needle:
            # regex=False: user input such as "(" must not be parsed as a pattern
            filtered_df = filtered_df[
                filtered_df['label'].str.contains(needle, case=False, na=False, regex=False)
            ]
    
    # Update the scatter plot based on the filtered DataFrame
    fig = px.scatter(
        filtered_df, x='PC1', y='PC2', title=f'{method} plot',
        labels={'PC1': 'Axis 1', 'PC2': 'Axis 2'}, 
        hover_data={'clade': True, 'num_cellulases': True, 'cellulases': True, 'label': True, 'PC1': False, 'PC2': False},
        width=1000, height=1000,
        size='num_cellulases',
        color='clade'
    )
    return fig

# Start the Dash server when the cell is executed. The commented-out lines are
# alternative ways of launching the app that were tried before.
if __name__ == '__main__':
    #app.run_server(mode='inline')
    #app.run(jupyter_mode="jupyterlab")
    app.run(jupyter_mode="external")


## Panel

In [27]:
import panel as pn
import plotly.express as px
import pandas as pd

# Load Panel's plotly extension before any plotly figure is handed to Panel.
pn.extension('plotly')

# This cell reads the globals `red_df` (filtered and plotted below).
# To run it without the cells above, load the exported table instead:
#red_df = pd.read_csv('/home/felixl/PycharmProjects/cellulases/data/tSNE_cellulase.csv')
# method = ...

# Define the function to update the plot
def update_plot(search_value_cellulases, search_value_species):
    """
    Build the scatter plot for the given search strings.

    Parameters
    ----------
    search_value_cellulases : str or None
        Comma-separated family names (e.g. "GH5, GH45"). A row is kept only if
        it contains ALL of them. Names are compared as whole entries and
        case-insensitively, so "GH1" does not match "GH10".
    search_value_species : str or None
        Literal, case-insensitive substring of the species label. Spaces are
        treated as underscores, matching how the labels are spelled.

    Returns
    -------
    plotly figure built from the rows of the global `red_df` that pass both
    filters (all rows when both strings are empty).
    """
    filtered_df = red_df

    if search_value_cellulases:
        wanted = {term.strip().upper() for term in search_value_cellulases.split(',')}
        wanted.discard('')  # ignore stray commas such as "GH5, "
        if wanted:
            # `cellulases` is the ', '-joined family list; split it again so
            # that terms are compared with whole entries, not substrings.
            keep = [
                wanted.issubset(family.strip().upper() for family in str(cell).split(','))
                for cell in filtered_df['cellulases'].fillna('')
            ]
            filtered_df = filtered_df.loc[pd.Series(keep, index=filtered_df.index, dtype=bool)]

    if search_value_species:
        needle = search_value_species.strip().replace(' ', '_')
        if needle:
            # regex=False: user input such as "(" must not be parsed as a pattern
            filtered_df = filtered_df[
                filtered_df['label'].str.contains(needle, case=False, na=False, regex=False)
            ]

    fig = px.scatter(
        filtered_df, x='PC1', y='PC2', #title=f'{method} plot',
        labels={'PC1': 'tSNE 1', 'PC2': 'tSNE 2'}, 
        hover_data={'clade': True, 'num_cellulases': True, 'cellulases': True, 'label': True, 'PC1': False, 'PC2': False},
        width=1000, height=1000,
        size='num_cellulases',
        color='clade'
    )
    return fig

# Create interactive widgets
# Two text inputs, one per filter.
search_bar_cellulases = pn.widgets.TextInput(placeholder='Enter cellulases...', name='Search Cellulases')
search_bar_species = pn.widgets.TextInput(placeholder='Enter species...', name='Search Species')


# Re-evaluated whenever the value of either input changes; the current values
# arrive as positional arguments in the order given to pn.depends.
@pn.depends(search_bar_cellulases.param.value, search_bar_species.param.value)
def get_plot(cellulase_query, species_query):
    """Return the figure for the current content of the two search inputs."""
    return update_plot(cellulase_query, species_query)


# Search inputs side by side, plot underneath.
controls = pn.Row(search_bar_cellulases, search_bar_species)
layout = pn.Column(controls, get_plot)

# Mark the layout as servable (and render it as the cell output).
layout.servable()
