In [None]:
#%%appyter init
# Presumably enables the `%%appyter` cell magics used by later cells (verify against the appyter docs).
from appyter import magic
# magic.init is handed a zero-argument callable; calling it returns this notebook's globals().
magic.init(lambda _=globals: _())

# Long Non-coding RNA (lncRNA) Appyter 

Using lncRNA-gene co-expression, this Appyter can predict the biological functions of ~5000 lncRNAs.

In [2]:
import pandas as pd 
import numpy as np
import h5py as h5
from plotly.offline import iplot
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from IPython.display import display,FileLink, Markdown, HTML
import ssl
import os, json
import urllib.request
import s3fs
import time
import requests
import itertools
import random
import re
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Select, Legend, Paragraph, LinearColorMapper, ColorBar, CategoricalColorMapper
from bokeh.layouts import row, column
from bokeh.palettes import Pastel2, Set2, Set1, Colorblind
from bokeh.models import Arrow, NormalHead
import hashlib
import colorcet as cc

# Configure bokeh so that later show() calls render their output inline in the notebook
output_notebook()

In [None]:
%%appyter hide_code_exec
{% do SectionField(name='section1', title = '1. Input a Gene Symbol or Ensembl ID', subtitle = '', img = 'lncRNA_appyter_logo.png')%}
{% do SectionField(name='section2', title = '2. Options', subtitle = '', img = 'lncRNA_appyter_logo.png')%}


In [None]:
%%appyter code_exec
{% set query = StringField(name='gene_symbol', label='Gene Symbol/Ensembl ID', default='HOTAIR', description='',section = 'section1') %}
{% set options_fast_compute = BoolField(name='fast_compute', label='Fast Compute', default='true', description='Fast Compute will retrieve pre-computed results for a faster run time. Select \'No\' to run whole analysis.', section='section2')%}

In [None]:
%%appyter code_exec
query = {{ query }}

In [None]:
# lncRNA of interest
# Upper-case the user input before it is compared with the matrix labels
# (presumably symbols / Ensembl IDs are stored upper-case -- verify against the h5 metadata).
query = query.upper()

%%appyter markdown
### Import gene-lncRNA co-expression matrix

This lncRNA-gene matrix was computed using Pearson Correlation on 10,000 randomly selected bulk RNA-seq samples from Recount3[1]. 

In [3]:
# Import lncRNA-gene co-expression matrix
# Anonymous access to the Appyter storage endpoint
s3 = s3fs.S3FileSystem(
    anon=True,
    client_kwargs={'endpoint_url': 'https://s3.appyters.maayanlab.cloud'},
)
# `f` is kept open because `corr` is a dataset handle into the file, not an in-memory copy
f = h5.File(s3.open('storage/lncRNA_Appyter/Recount3_lncRNA_pcorr.h5', 'rb'), 'r')
corr = f["data/correlation"]

# Labels are stored as bytes; decode them to str
col_genes = [label.decode('UTF-8') for label in f["meta/columns/genes"]]
row_genes = [label.decode('UTF-8') for label in f["meta/rows/genes"]]
row_genes_ensembl = [label.decode('UTF-8') for label in f["meta/rows/ensembl"]]

In [None]:
# Convert input Ensembl ID to gene symbol
ensembl_2_genes = dict(zip(row_genes_ensembl,row_genes))
if query in ensembl_2_genes:
    # The input is an Ensembl ID: switch to the matching row label of the matrix
    query_new = ensembl_2_genes[query]
    if query != query_new:
        print('Predicting functions for ' + query_new + '(' + query + ')')
        query = query_new
    else:
        print('Predicting functions for ' + query)
elif query in row_genes:
    print('Predicting functions for ' + query )
else:
    # Fail here with a readable message instead of an IndexError in the next cell
    raise ValueError(
        "'" + query + "' was not found among the lncRNA gene symbols or Ensembl IDs "
        "of the co-expression matrix."
    )

%%appyter markdown
### Top correlated genes with {{query.raw_value}}

Similarly to Geneshot[2], gene-gene similarities are predicted using co-expression. All genes are ranked by Pearson Correlation with the input lncRNA.

In [None]:
# Find the genes and lncRNAs most correlated with the input lncRNA
if not os.path.exists("gene_correlations/"):
    os.makedirs("gene_correlations/", exist_ok=True)

# Row position of the lncRNA of interest in the correlation matrix
idx_query = np.flatnonzero(np.asarray(row_genes) == query)[0]

# One-column table indexed by gene, ordered from most to least correlated
lncRNA_coexp = pd.DataFrame(
    corr[idx_query, :],
    index=col_genes,
    columns=['Pearson Correlation'],
).sort_values(by='Pearson Correlation', ascending=False)
print(lncRNA_coexp.head(20))

# Write the complete ranking to a csv file
correlated_genes_csv = 'gene_correlations/' + query + '_correlated_genes.csv'
lncRNA_coexp.to_csv(correlated_genes_csv)

In [None]:
# Download link for the ranked gene table
table1_path = 'gene_correlations/{}_correlated_genes.csv'.format(query)
display(FileLink(table1_path, result_html_prefix='Download Table 1: '))

In [None]:
%%appyter markdown
### Enrichment results for the top 200 correlated genes with {{query.raw_value}}

The top 200 correlated genes are submitted to Enrichr[3-5] for enrichment analysis. Note: Only genes with gene symbols are submitted. Ensembl IDs are dropped.

In [None]:
# Get Enrichr link
def Enrichr_API(enrichr_gene_list, description, timeout=60):
    """Upload a gene list to Enrichr and return the URL of its results page.

    Parameters
    ----------
    enrichr_gene_list : iterable of str
        Gene symbols; they are joined with newlines into the 'list' form field.
    description : str
        Free-text label sent in the 'description' form field.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60), so an
        unresponsive server cannot block the notebook indefinitely.

    Returns
    -------
    str
        'https://maayanlab.cloud/Enrichr/enrich?dataset=<shortId>', where
        shortId is read from the JSON response.

    Raises
    ------
    Exception
        If the server answers with a non-success status code.
    requests.exceptions.Timeout
        If no answer arrives within `timeout` seconds.
    """
    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/addList'
    genes_str = '\n'.join(enrichr_gene_list)
    # (None, value) tuples make requests send plain multipart form fields rather than file uploads
    payload = {
        'list': (None, genes_str),
        'description': (None, description)
    }

    response = requests.post(ENRICHR_URL, files=payload, timeout=timeout)
    if not response.ok:
        raise Exception('Error analyzing gene list')

    data = json.loads(response.text)
    short_id = data["shortId"]
    return('https://maayanlab.cloud/Enrichr/enrich?dataset='+ str(short_id))

In [None]:
# Submit top 200 gene symbols to Enrichr
# Walk the ranking in order, skip labels that are bare Ensembl IDs, stop after 200 symbols
symbols_only = (gene for gene in lncRNA_coexp.index if not gene.startswith('ENSG'))
top200genes = list(itertools.islice(symbols_only, 200))
enrichr_link = Enrichr_API(top200genes, 'Top 200 correlated genes with the lncRNA: ' + query)

In [None]:
# Show the Enrichr results URL as a clickable link
enrichr_anchor = "<a href='{0}'>{0}</a>".format(enrichr_link)
display(HTML("Access your enrichment analysis results here: " + enrichr_anchor))

%%appyter markdown
### Top correlated lncRNAs with {{query.raw_value}}

lncRNAs are ranked by Pearson Correlation with the input lncRNA.

In [None]:
# Download most correlated lncRNAs
# Keep only the rows whose label is one of the matrix's lncRNAs, then re-sort
lncRNA_lncRNA_coexp = lncRNA_coexp.loc[row_genes]
lncRNA_lncRNA_coexp = lncRNA_lncRNA_coexp.sort_values(by='Pearson Correlation', ascending=False)
print(lncRNA_lncRNA_coexp[0:20])

# save lncRNA correlations to csv file
# Write the lncRNA-only table: writing `lncRNA_coexp` here would make this file
# a copy of the all-genes table saved as '<query>_correlated_genes.csv'.
lncRNA_lncRNA_coexp.to_csv('gene_correlations/' + query + '_correlated_lncRNAs.csv')

In [None]:
# Download link for the ranked lncRNA table
table2_path = 'gene_correlations/{}_correlated_lncRNAs.csv'.format(query)
display(FileLink(table2_path, result_html_prefix='Download Table 2: '))

In [None]:
%%appyter markdown
### Predicted Biological Functions of {{query.raw_value}}

For each Enrichr library, the mean Pearson Correlation is calculated between each gene set and the lncRNA of interest. Terms with a high mean Pearson Correlation are prioritized and predicted to be associated with the lncRNA.

In [None]:
%%appyter code_exec
{% if options_fast_compute.raw_value == False %}
# Load Enrichr libraries
def loadLibrary(library: str, overwrite: bool = False) -> str:
    """Return the local path of an Enrichr gene-set library, downloading it when needed.

    Parameters
    ----------
    library : str
        Enrichr library name; used both in the download URL and as the file stem.
    overwrite : bool, optional
        When True the file is downloaded again even if a copy already exists.

    Returns
    -------
    str
        "gmts/<library>.gmt", relative to the current working directory.
    """
    # NOTE(security): this switches off HTTPS certificate verification for every
    # later urllib request in this process. Kept so existing runs behave the same.
    ssl._create_default_https_context = ssl._create_unverified_context
    library_path = "gmts/" + library + ".gmt"
    # `overwrite` has to be tested outside os.path.exists(); inside the call the
    # non-empty path string always wins the `or` and the flag is never consulted.
    if overwrite or not os.path.exists(library_path):
        os.makedirs("gmts", exist_ok=True)
        print("Download Enrichr geneset library")
        urllib.request.urlretrieve("https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName="+library, library_path)
    else:
        print("File cached. To reload use loadLibrary(\""+library+"\", overwrite=True) instead.")
    return(library_path)

# Predict functions based on mean pearson correlation for each term in a library
def predict_functions(library, matrix, query):
    """Score every term of an Enrichr library by mean correlation with the query.

    Parameters
    ----------
    library : str
        Enrichr library name, passed to loadLibrary to obtain the GMT file.
    matrix : pandas.DataFrame
        Indexed by gene, with a 'Pearson Correlation' column.
    query : str
        Gene label left out of every gene set before averaging.

    Returns
    -------
    pandas.DataFrame
        Columns 'Term' and 'Mean Pearson Correlation', sorted by descending
        score. A term with no gene present in `matrix` gets NaN.
    """
    library_path = loadLibrary(library)

    # Each GMT line: term, one ignored column, then the genes (tab separated)
    library_dict = {}
    with open(library_path, 'r') as open_gmt:
        for line in open_gmt:
            fields = line.strip().split('\t')
            if not fields[0]:
                # blank line: would otherwise register a term named ''
                continue
            library_dict[fields[0]] = fields[2:]

    # The genes available in the matrix are the same for every term
    matrix_genes = set(matrix.index)

    all_terms = []
    all_scores = []
    for lib_term, gene_set in library_dict.items():
        lib_term_set = [gene for gene in set(gene_set) & matrix_genes if gene != query]
        all_terms.append(lib_term)
        all_scores.append(np.mean(matrix.loc[lib_term_set]['Pearson Correlation']))

    df_results = pd.DataFrame({'Term':all_terms,'Mean Pearson Correlation':all_scores})
    df_results = df_results.sort_values(by ='Mean Pearson Correlation',ascending=False)
    return(df_results)

{% endif %}

In [None]:
# Plot the top terms for each prediction library
def plot_results(library_names, results_dfs, top_results=20):
    """Draw two side-by-side horizontal bar charts of the leading terms.

    library_names -- two panel titles; entry i labels results_dfs[i].
    results_dfs   -- two DataFrames with 'Term' and 'Mean Pearson Correlation'
                     columns; the first `top_results` rows of each are plotted.
    top_results   -- number of rows taken from the head of each frame.

    The figure is shown with iplot; nothing is returned.
    """
    score_col = 'Mean Pearson Correlation'
    hover_template = '<b>Term: {Term}</b><br><b>Mean Pearson Correlation</b>: <i>{Mean Pearson Correlation:.3}</i>'
    title_open = '<span style="color: black; font-size: 15pt; font-weight: 600;">'

    fig = make_subplots(rows=1, cols=2, print_grid=False, shared_xaxes=False)
    max_scores = []
    for panel in range(2):
        # Head of the table, re-sorted ascending for the horizontal bar layout
        panel_df = results_dfs[panel][0:top_results].sort_values(by=score_col)
        panel_name = library_names[panel]
        max_scores.append(np.max(panel_df[score_col]))

        hover_labels = []
        term_labels = []
        for _, record in panel_df.iterrows():
            hover_labels.append(hover_template.format(**record))
            term_labels.append('<b>{}</b>'.format(record['Term']))

        bar = go.Bar(
            x=panel_df[score_col],
            y=panel_df['Term'],
            orientation='h',
            name=panel_name,
            showlegend=False,
            hovertext=hover_labels,
            hoverinfo='text',
            marker={'color': 'dodgerblue'},
        )
        fig.append_trace(bar, 1, panel + 1)

        # Term names are written over the bars, starting slightly right of the axis
        term_text = go.Scatter(
            x=[max(bar['x'])/50 for _ in range(len(bar['y']))],
            y=bar['y'],
            mode='text',
            hoverinfo='none',
            showlegend=False,
            text=term_labels,
            textposition="middle right",
            textfont={'color': 'black', 'size': 8},
        )
        fig.append_trace(term_text, 1, panel + 1)

    def _panel_title(x_center, label):
        # Library name centred above one panel, positioned in paper coordinates
        return {'x': x_center, 'y': 1.1, 'text': title_open + label + '</span>', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'}

    annotations = [_panel_title(0.25, library_names[0]), _panel_title(0.75, library_names[1])]
    fig['layout'].update(height=500, hovermode='closest', annotations=annotations)
    fig.update_layout(title='', height=500, title_font_size=25, title_x=0.5)

    # Each x axis gets half of the width and runs from 0 to 1% past its largest score
    x_axes = (('xaxis1', [0, 0.49], max_scores[0]), ('xaxis2', [0.51, 1], max_scores[1]))
    for axis_key, axis_domain, top_score in x_axes:
        fig['layout'][axis_key].update(domain=axis_domain, title='Mean Pearson Correlation', range=(0, top_score + top_score*.01))
    for axis_key in ('yaxis1', 'yaxis2'):
        fig['layout'][axis_key].update(showticklabels=False)
    fig['layout']['margin'].update(l=30, t=65, r=30, b=35)

    iplot(fig)

In [None]:
%%appyter code_exec
{% if options_fast_compute.raw_value == False %}
# Make function predictions
prediction_libraries = ['MGI_Mammalian_Phenotype_Level_4_2021','GO_Biological_Process_2021','KEGG_2021_Human','DisGeNET']
# Split into groups of two libraries: plot_results is called once per group
n_groups = int(np.ceil(len(prediction_libraries)/2))
prediction_libraries = np.array_split(prediction_libraries, n_groups)
for i_group, group in enumerate(prediction_libraries):
    predictions = [predict_functions(pred_library, lncRNA_coexp, query) for pred_library in group]
    library_names = [pred_library.replace('_', ' ') for pred_library in group]
    plot_results(library_names, predictions)

    # Save Predictions
    if not os.path.exists("predicted_functions/"):
        os.makedirs("predicted_functions/", exist_ok=True)
    for library_name, prediction in zip(library_names, predictions):
        predictions_csv = "predicted_functions/" + library_name + '_' + query + '.csv'
        prediction.to_csv(predictions_csv)
        display(FileLink(predictions_csv, result_html_prefix='Download predictions: '))
{% endif %}

In [None]:
%%appyter code_exec
{% if options_fast_compute.raw_value == True %}
# Import pre-computed lncRNA functions
prediction_libraries = ['MGI_Mammalian_Phenotype_Level_4_2021','GO_Biological_Process_2021','KEGG_2021_Human','DisGeNET']
# Split into groups of two libraries: plot_results is called once per group
n_groups = int(np.ceil(len(prediction_libraries)/2))
prediction_libraries = np.array_split(prediction_libraries, n_groups)
for i_group, group in enumerate(prediction_libraries):
    predictions = []
    library_names = []
    for pred_library in group:
        precomputed_csv = 'storage/lncRNA_Appyter/' + pred_library + '_lncRNA_avg_coexpression.csv'
        precomputed_avg_coexp = pd.read_csv(s3.open(precomputed_csv, 'rb'), index_col=0)
        # Take the query's row, rank its entries, and rename to the columns plot_results reads
        precomputed_avg_coexp = (
            pd.DataFrame(precomputed_avg_coexp.loc[query])
            .sort_values(by=query, ascending=False)
            .reset_index()
            .rename({'index': 'Term', query: 'Mean Pearson Correlation'}, axis='columns')
        )
        predictions.append(precomputed_avg_coexp)
        library_names.append(pred_library.replace('_', ' '))
    plot_results(library_names, predictions)

    # Save Predictions
    if not os.path.exists("predicted_functions/"):
        os.makedirs("predicted_functions/", exist_ok=True)
    for library_name, prediction in zip(library_names, predictions):
        predictions_csv = "predicted_functions/" + library_name + '_' + query + '.csv'
        prediction.to_csv(predictions_csv)
        display(FileLink(predictions_csv, result_html_prefix='Download predictions: '))
{% endif %}

In [None]:
%%appyter markdown
### Tissue and Cell Line Specific Expression: {{query.raw_value}}
#### Return the Z-score (Normalized Median Expression) for the lncRNA in various tissues and cell lines.

Method: Samples from Recount3[1] were automatically labelled by tissue type or cell line. Tissue and cell line samples were log2 transformed and quantile normalized separately. For each lncRNA, the median expression was then calculated within each tissue type/cell line. Tissues/cell lines with fewer than 20 samples were removed. Z-score was then applied along the lncRNA axis to compare expression across all tissues and cell lines.

In [7]:
# Import z-score data
def _load_zscore_table(s3_path):
    """Read a z-score csv from storage; row labels are cut at their first comma."""
    zscores = pd.read_csv(s3.open(s3_path, 'rb'), header=0, index_col=0)
    zscores.index = [label.split(',')[0] for label in zscores.index]
    return zscores

tissue_expr_zscore = _load_zscore_table('storage/lncRNA_Appyter/lncRNA_zscore_median_expr_by_tissue_filtered.csv')
cell_line_expr_zscore = _load_zscore_table('storage/lncRNA_Appyter/lncRNA_zscore_median_expr_by_cell_line_filtered.csv')

# Output folder for the tissue and cell line specific expression tables
if not os.path.exists("tissue_and_cell_line_expression/"):
    os.makedirs("tissue_and_cell_line_expression/", exist_ok=True)

In [None]:
# Rank tissues by z-score(median expression)
tissue_specific_lncRNA = (
    pd.DataFrame(tissue_expr_zscore.loc[query])
    .sort_values(by=query, ascending=False)
)
tissue_specific_lncRNA

In [None]:
# Save the tissue ranking and offer it for download
tissue_zscore_csv = "tissue_and_cell_line_expression/" + query + '_tissue_zscore.csv'
tissue_specific_lncRNA.to_csv(tissue_zscore_csv)
display(FileLink(tissue_zscore_csv, result_html_prefix='Download table for z-score(median expression)in tissue types: '))

In [None]:
# Rank cell lines by z-score(median expression)
cell_line_specific_lncRNA = (
    pd.DataFrame(cell_line_expr_zscore.loc[query])
    .sort_values(by=query, ascending=False)
)
cell_line_specific_lncRNA.head(20)

In [None]:
# Save the cell line ranking and offer it for download
cell_line_zscore_csv = "tissue_and_cell_line_expression/" + query + '_cell_line_zscore.csv'
cell_line_specific_lncRNA.to_csv(cell_line_zscore_csv)
display(FileLink(cell_line_zscore_csv, result_html_prefix='Download table for z-score(median expression)in cell lines: '))

In [None]:
%%appyter markdown
### Visualizing the lncRNA similarity network in Tissues: {{query.raw_value}}

UMAP[6] showing 5,050 lncRNAs. Color lncRNAs by tissue type with the highest expression or color by median expression and z-score in a specific tissue.

In [4]:
def str_to_int(string, mod):
    """Map a label to a reproducible integer in range(mod).

    Innermost parenthesised parts are removed (one pass) and surrounding
    whitespace is stripped; the SHA-256 digest of what remains, read as a
    hexadecimal number, is reduced modulo `mod`.
    """
    cleaned = re.sub(r"\([^()]*\)", "", string).strip()
    digest_hex = hashlib.sha256(cleaned.encode("utf8")).hexdigest()
    return int(digest_hex, 16) % mod

def _assign_category_colors(category_names):
    """Return one palette colour per category name, in the order given.

    The preferred colour is chosen from a hash of the name; when that colour is
    already taken the first still-unused colour is used instead, and once every
    colour has been handed out a random palette colour is repeated.
    """
    palette = list(itertools.chain(*zip(Pastel2[8], Set2[8], Set1[9], Colorblind[8])))
    unused_color = list(palette)
    colors = []
    for category_name in category_names:
        color_for_category = palette[str_to_int(category_name, len(palette))]
        if color_for_category not in unused_color:
            if len(unused_color) > 0:
                color_for_category = unused_color[0]
            else:
                color_for_category = random.sample(palette,1)[0]
        colors.append(color_for_category)
        if color_for_category in unused_color:
            unused_color.remove(color_for_category)
    return colors


def plot_scatter(umap_df, values_dict, option_list, sample_names, caption_text, category_list_dict=None, location='right', category=True, dropdown=False, figure_counter=0, additional_info=None, color_by_title='',highlight_query=None):
    """Show an interactive bokeh scatter plot of UMAP coordinates.

    Parameters
    ----------
    umap_df : pandas.DataFrame
        Needs "x" and "y" columns; its index is searched for `highlight_query`.
    values_dict : dict
        Option name -> per-point values used to colour the points.
    option_list : list
        Option names; the first one is displayed initially and all of them
        fill the dropdown.
    sample_names : list
        Per-point labels shown as "lncRNA" in the hover tooltip.
    caption_text : str
        Caption prefix; the name of the displayed option is appended.
    category_list_dict : dict, optional
        Option name -> list of category names. Required when `category` is true.
    location : str
        Side on which the legend is attached ('below' is additionally mapped to
        the 'bottom_left' legend location).
    category : bool
        True colours by category with a legend; False uses a linear colour map
        with a colour bar.
    dropdown : bool
        True adds a "Color by" selector that switches between the options.
    figure_counter : int
        Number printed in the caption.
    additional_info : dict, optional
        Option name -> per-point values exposed to the tooltip as "p-value".
        NOTE(review): the `info` column is only filled in by the dropdown
        callback, so the tooltip field is empty until an option is selected.
    color_by_title : str
        Text inserted into the selector title.
    highlight_query : optional
        Index label of `umap_df` to point an arrow at; ignored when the label
        is not present in the index.

    Returns
    -------
    None. The layout is displayed with bokeh's show().
    """
    first_option = option_list[0]

    # init plot
    source = ColumnDataSource(data=dict(x=umap_df["x"], y=umap_df["y"], values=values_dict[first_option], names=sample_names))

    # node size: smaller markers for crowded plots
    node_size = 4 if umap_df.shape[0] > 1000 else 6
    plot = figure(plot_width=1000, plot_height=800)

    if category:
        unique_category_dict = {option: sorted(set(values_dict[option])) for option in option_list}

        # map category to color
        # the two "all ..." views get one distinct colour per category,
        # every other view shows a single category in red against grey 'other'
        factors_dict = dict()
        colors_dict = dict()
        for key in values_dict.keys():
            factors_dict[key] = category_list_dict[key]
            if key in ['all tissues','all cell lines']:
                colors_dict[key] = _assign_category_colors(factors_dict[key])
            else:
                colors_dict[key] = ['#cccccc' if category_name == 'other' else '#e41a1c' for category_name in factors_dict[key]]

        color_mapper = CategoricalColorMapper(factors=factors_dict[first_option], palette=colors_dict[first_option])
        legend = Legend()

        plot.add_layout(legend, location)
        scatter = plot.scatter('x', 'y', size=node_size, source=source, color={'field': 'values', 'transform': color_mapper}, legend_field="values")
        plot.legend.label_width = 30
        plot.legend.click_policy='hide'
        plot.legend.spacing = 1
        if location == 'below':
            location = 'bottom_left'
        plot.legend.location = location
        plot.legend.label_text_font_size = '10pt'
    else:
        # every option shares the same diverging palette
        colors_dict = {key: cc.CET_D1A for key in values_dict.keys()}

        # NOTE(review): low/high come from the first option only and are not
        # updated by the dropdown callback -- confirm this is intended.
        color_mapper = LinearColorMapper(palette=colors_dict[first_option] , low=min(values_dict[first_option]), high=max(values_dict[first_option]))
        color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12)
        plot.add_layout(color_bar, 'right')
        scatter = plot.scatter('x', 'y', size=node_size,  source=source, color={'field': 'values', 'transform': color_mapper})

    tooltips = [
        ("lncRNA", "@names"),
        ("Label", "@values"),
    ]
    if additional_info is not None:
        tooltips.append(("p-value", "@info"))

    # Point an arrow at the highlighted lncRNA when it is part of this UMAP
    if highlight_query is not None and highlight_query in umap_df.index:
        arrow_loc = umap_df.loc[highlight_query]
        plot.add_layout(Arrow(end=NormalHead(size=10,fill_color="black"),x_start=arrow_loc['x']+0.2, y_start=arrow_loc['y']+0.6, x_end=arrow_loc['x'], y_end=arrow_loc['y']))

    plot.add_tools(HoverTool(tooltips=tooltips))
    plot.output_backend = "webgl"

    plot.xaxis.axis_label = "UMAP 1"
    plot.xaxis.axis_label_text_font_size = "12pt"
    plot.yaxis.axis_label = "UMAP 2"
    plot.yaxis.axis_label_text_font_size = "12pt"

    plot.xaxis.major_tick_line_color = None  # turn off x-axis major ticks
    plot.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
    plot.yaxis.major_tick_line_color = None  # turn off y-axis major ticks
    plot.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks
    plot.xaxis.major_label_text_font_size = '0pt'  # preferred method for removing tick labels
    plot.yaxis.major_label_text_font_size = '0pt'  # preferred method for removing tick labels
    default_text = "Figure {}. {}{}"
    pre = Paragraph(text = default_text.format(figure_counter, caption_text, first_option), width=500, height=100, style={"font-family":'Helvetica', "font-style": "italic"})
    # figure_counter is deliberately NOT incremented before it is handed to the
    # callbacks: the caption must keep its number when another option is selected.

    if dropdown:
        if category:
            callback_adt = CustomJS(args=dict(source=source,
                                              pre=pre,
                                              values_dict=values_dict,
                                              figure_counter=figure_counter,
                                              color_mapper=color_mapper,
                                              unique_category_dict=unique_category_dict,
                                              category_list_dict=category_list_dict,
                                              additional_info=additional_info,
                                              factors_dict=factors_dict,
                                              colors_dict=colors_dict,
                                              plot=plot,
                                              scatter=scatter,
                                              caption_text=caption_text
                                             ), code="""        
                const val = cb_obj.value;                    
                source.data.values = values_dict[val]  
                if (additional_info != null) {
                    source.data.info = additional_info[val]
                }
                color_mapper.factors = category_list_dict[val]
                color_mapper.palette = colors_dict[val]
                plot.legend = unique_category_dict[val]
                pre.text = "Figure "+figure_counter+". "+caption_text+val+"."; 
                source.change.emit();
            """)
        else:
            callback_adt = CustomJS(args=dict(source=source,
                                              pre=pre,
                                              values_dict=values_dict,
                                              additional_info=additional_info,
                                              figure_counter=figure_counter,
                                              color_mapper=color_mapper,
                                              colors_dict=colors_dict,
                                              scatter=scatter,
                                              caption_text=caption_text), code="""        
                    const val = cb_obj.value;    
                    source.data.values = values_dict[val]
                    if (additional_info != null) {
                        source.data.info = additional_info[val]
                    }
                    
                    pre.text = "Figure "+figure_counter+". "+caption_text+val+".";  
                    source.change.emit();
                """)

        # init dropdown menu
        select = Select(title="Color by " + color_by_title + ':', value=first_option, options=option_list)
        select.js_on_change('value', callback_adt)

        col = column(select, row(column(plot, pre)))
    else:
        col = column(plot, pre)
    show(col)

In [None]:
# Tissue UMAP table and per-tissue median expression table (first column used as index)
umap_tissue_results = pd.read_csv(
    s3.open('storage/lncRNA_Appyter/umap_tissues.csv', 'rb'),
    header=0,
    index_col=0,
)
tissue_expr_median_expr = pd.read_csv(
    s3.open('storage/lncRNA_Appyter/lncRNA_median_expr_by_tissue_filtered.csv', 'rb'),
    header=0,
    index_col=0,
)

In [None]:
# Tissue with the largest z-score for every lncRNA
lncRNA_2_tissue = {gene: tissue_expr_zscore.loc[gene].idxmax() for gene in tissue_expr_zscore.index}

# Per-point label, in the row order of the UMAP table
umap_tissue_lncRNAs = list(umap_tissue_results.index)
top_tissue_per_point = [lncRNA_2_tissue[x] for x in umap_tissue_lncRNAs]
tissue_names = np.unique(top_tissue_per_point)

values_dict_th = {'all tissues': top_tissue_per_point}
category_list_dict_th = {'all tissues': sorted(tissue_names)}
# One extra view per tissue: that tissue against everything else ("other")
for t in tissue_names:
    values_dict_th[t] = [top_tissue if top_tissue == t else "other" for top_tissue in top_tissue_per_point]
    category_list_dict_th[t] = ['other', t]

plot_scatter(
    umap_df=umap_tissue_results,
    values_dict=values_dict_th,
    option_list=['all tissues'] + list(tissue_names),
    sample_names=umap_tissue_lncRNAs,
    caption_text='UMAP of lncRNAs colored by highest expression in ',
    category_list_dict=category_list_dict_th,
    dropdown=True,
    figure_counter=2,
    color_by_title='Highest Expression',
    highlight_query=query,
)

In [None]:
# One colouring option per tissue: the column of median expression values
# NOTE(review): values are taken in the row order of tissue_expr_median_expr and
# paired positionally with the UMAP rows -- confirm both tables share that order.
unique_tissues = np.unique(tissue_expr_median_expr.columns)
values_dict_tm = {t: list(tissue_expr_median_expr[t]) for t in unique_tissues}

plot_scatter(
    umap_df=umap_tissue_results,
    values_dict=values_dict_tm,
    option_list=list(unique_tissues),
    sample_names=list(umap_tissue_results.index),
    caption_text='UMAP of lncRNAs colored by median expression in ',
    figure_counter=4,
    category_list_dict=None,
    category=False,
    dropdown=True,
    color_by_title='Median Expression',
    highlight_query=query,
)

In [None]:
# One colouring option per tissue: the column of z-scores
values_dict_tz = dict()
unique_tissues = np.unique(tissue_expr_zscore.columns)
for t in unique_tissues:
    values_dict_tz[t] = list(tissue_expr_zscore[t])

# figure_counter=5: the median-expression UMAP just above is already "Figure 4"
# and the cell line UMAPs continue with 6, 7 and 8.
plot_scatter(umap_df=umap_tissue_results, values_dict=values_dict_tz,option_list=list(unique_tissues) ,sample_names=list(umap_tissue_results.index),caption_text='UMAP of lncRNAs colored by z-score in ', figure_counter=5,category_list_dict=None, category=False,dropdown=True,color_by_title='Z-score',highlight_query=query)

In [None]:
%%appyter markdown
### Visualizing the lncRNA similarity network in Cell Lines: {{query.raw_value}}

UMAP[6] showing 5,050 lncRNAs. Color lncRNAs by cell line with the highest expression or color by median expression and z-score in a specific cell line.

In [5]:
# Cell line UMAP table and per-cell-line median expression table (first column used as index)
umap_cell_line_results = pd.read_csv(
    s3.open('storage/lncRNA_Appyter/umap_cell_lines.csv', 'rb'),
    header=0,
    index_col=0,
)
cell_line_expr_median_expr = pd.read_csv(
    s3.open('storage/lncRNA_Appyter/lncRNA_median_expr_by_cell_line_filtered.csv', 'rb'),
    header=0,
    index_col=0,
)

In [11]:
# Cell line with the largest z-score for every lncRNA
lncRNA_2_cell_line = {gene: cell_line_expr_zscore.loc[gene].idxmax() for gene in cell_line_expr_zscore.index}

# Per-point label, in the row order of the UMAP table
umap_cell_line_lncRNAs = list(umap_cell_line_results.index)
top_cell_line_per_point = [lncRNA_2_cell_line[x] for x in umap_cell_line_lncRNAs]
cell_line_names = np.unique(top_cell_line_per_point)

values_dict_ch = {'all cell lines': top_cell_line_per_point}
category_list_dict_ch = {'all cell lines': sorted(cell_line_names)}
# One extra view per cell line: that cell line against everything else ("other")
for t in cell_line_names:
    values_dict_ch[t] = [top_cell_line if top_cell_line == t else "other" for top_cell_line in top_cell_line_per_point]
    category_list_dict_ch[t] = ['other', t]

plot_scatter(
    umap_df=umap_cell_line_results,
    values_dict=values_dict_ch,
    option_list=['all cell lines'] + list(cell_line_names),
    sample_names=umap_cell_line_lncRNAs,
    caption_text='UMAP of lncRNAs colored by highest expression in ',
    category_list_dict=category_list_dict_ch,
    dropdown=True,
    figure_counter=6,
    color_by_title='Highest Expression',
    highlight_query=query,
)

In [None]:
# One colouring option per cell line: the column of median expression values
# NOTE(review): values are paired positionally with the UMAP rows -- confirm
# that cell_line_expr_median_expr and umap_cell_line_results share row order.
unique_cell_lines = np.unique(cell_line_expr_median_expr.columns)
values_dict_cm = {t: list(cell_line_expr_median_expr[t]) for t in unique_cell_lines}

plot_scatter(
    umap_df=umap_cell_line_results,
    values_dict=values_dict_cm,
    option_list=list(unique_cell_lines),
    sample_names=list(umap_cell_line_results.index),
    caption_text='UMAP of lncRNAs colored by median expression in ',
    figure_counter=7,
    category_list_dict=None,
    category=False,
    dropdown=True,
    color_by_title='Median Expression',
    highlight_query=query,
)

In [None]:
# One colouring option per cell line: the column of z-scores
unique_cell_lines = np.unique(cell_line_expr_zscore.columns)
values_dict_cz = {t: list(cell_line_expr_zscore[t]) for t in unique_cell_lines}

plot_scatter(
    umap_df=umap_cell_line_results,
    values_dict=values_dict_cz,
    option_list=list(unique_cell_lines),
    sample_names=list(umap_cell_line_results.index),
    caption_text='UMAP of lncRNAs colored by z-score in ',
    figure_counter=8,
    category_list_dict=None,
    category=False,
    dropdown=True,
    color_by_title='Z-score',
    highlight_query=query,
)

In [None]:
# close h5 file
# `corr` is a dataset handle into this file, so it cannot be read after this point
f.close()

### References
[1] Wilks C, Zheng SC, Chen FY, Charles R, Solomon B, Ling JP, Imada EL, Zhang D, Joseph L, Leek JT: recount3: summaries and queries for large-scale RNA-seq expression and splicing. bioRxiv 2021:2021.05.21.445138.

[2] Lachmann A, Schilder BM, Wojciechowicz ML, Torre D, Kuleshov MV, Keenan AB, Ma’ayan A: Geneshot: search engine for ranking genes from arbitrary text queries. Nucleic Acids Research 2019, 47(W1):W571-W577.

[3] Xie Z, Bailey A, Kuleshov MV, Clarke DJB, Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM: Gene Set Knowledge Discovery with Enrichr. Current Protocols 2021, 1(3):e90.

[4] Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma’ayan A: Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics 2013, 14(1):128.

[5] Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A: Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research 2016, 44(W1):W90-W97.

[6] McInnes L, Healy J, Melville J: UMAP: Uniform manifold approximation and projection for dimension reduction. arXiv preprint arXiv:1802.03426 2018.
