In [None]:
#%%appyter init
from appyter import magic
# The lambda stores the built-in `globals` as a default argument and calls it
# on demand, so appyter receives a callable that returns this notebook's
# global namespace.
# NOTE(review): presumably this is what enables the %%appyter cell magics used
# in the cells below - confirm against the appyter documentation.
magic.init(lambda _=globals: _())

In [None]:
%%appyter hide_code_exec
# Declare the two form sections; the input fields declared in the next cell
# attach to them by name through their `section` argument.
{% do SectionField(name='section1', title = '1. Please input a valid gene symbol or Ensembl ID', subtitle = '', img = 'lncRNA_appyter_logo.png')%}
{% do SectionField(name='section2', title = '2. Options', subtitle = '', img = 'lncRNA_appyter_logo.png')%}

In [None]:
%%appyter code_exec
# `query`: the gene symbol / Ensembl ID to analyse (choices come from lncRNAs.json).
# `options_fast_compute`: selects between the precomputed and the on-the-fly
# function-prediction cells further down (they test its raw_value).
{% set query = AutocompleteField(name='gene',label='Gene Symbol/Ensembl ID',description='',default='HOTAIR',required=True, choices = load_static("lncRNAs.json"), section='section1') %}
{% set options_fast_compute = BoolField(name='fast_compute', label='Precompute', default='true', description='Precompute will retrieve precomputed results from the example files below for a faster run time. Select \'No\' to run whole analysis.', section='section2')%}


In [None]:
%%appyter code_exec
# The template expression on the right is filled in from the `query` form field
# when the notebook is rendered, giving a plain Python assignment.
# NOTE(review): the later cells treat `query` as a string - confirm the field renders as a string literal.
query = {{ query }}

In [None]:
%%appyter markdown
# Report about the Long Non-coding RNA (lncRNA) {{query.raw_value}}

Based on lncRNA-gene co-expression, this report provides predictions on the biological functions of {{query.raw_value}}, displays the median expression of {{query.raw_value}} across tissues and cell-lines, and predicts small molecules that may specifically up- or down-regulate the expression of {{query.raw_value}}.

In [None]:
# Import libraries 
import pandas as pd 
import numpy as np
import h5py as h5
from IPython.display import display,FileLink, HTML, Markdown
import os
from utils import *
import s3fs
from bokeh.io import output_notebook
import json
# Configure Bokeh (imported above from bokeh.io) to render its output inline in this notebook.
output_notebook()

In [None]:
# Write the query, as entered and before any Ensembl-to-symbol conversion,
# to gene.json as a one-element JSON list.
with open('gene.json', 'w') as gene_file:
    gene_file.write(json.dumps([query]))

In [None]:
%%appyter markdown
### Import lncRNA-gene co-expression matrix

This lncRNA-gene co-expression matrix was generated by computing the Pearson correlation coefficients between lncRNAs and genes across 10,000 randomly selected bulk RNA-seq samples from Recount3 [1]. NOTE: If an Ensembl ID is entered, it is converted to its corresponding gene symbol when one is available.

In [None]:
%%appyter code_exec
# Open the lncRNA-gene co-expression matrix straight from the public S3 bucket.
# The HDF5 handle `f` stays open because `corr` is read from it in later cells;
# it is closed in the last code cell of the notebook.
s3 = s3fs.S3FileSystem(
    anon=True,
    client_kwargs={'endpoint_url': 'https://s3.appyters.maayanlab.cloud'},
)
f = h5.File(s3.open('storage/lncRNA_Appyter/v0.0.6/Recount3_lncRNA_pcorr_cols.h5', 'rb'), 'r')
corr = f["data/correlation"]


def _decode_labels(dataset_path):
    """Return the byte-string labels stored at `dataset_path` as UTF-8 text."""
    return [raw_label.decode('UTF-8') for raw_label in f[dataset_path]]


# Column labels (looked up later to locate the query) and row labels, each as
# gene symbols and as Ensembl IDs.
col_genes = _decode_labels("meta/columns/genes")
col_genes_ensembl = _decode_labels("meta/columns/ensembl")
row_genes = _decode_labels("meta/rows/genes")
row_genes_ensembl = _decode_labels("meta/rows/ensembl")

In [None]:
# Convert input Ensembl ID to gene symbol
# Lookup tables between Ensembl IDs and gene symbols, built from the row labels.
ensembl_2_genes = dict(zip(row_genes_ensembl, row_genes))
genes_2_ensembl = dict(zip(row_genes, row_genes_ensembl))

# An Ensembl ID that matches a matrix column is replaced by its gene symbol.
# When no distinct symbol exists the lookup returns the ID itself and the
# query is left unchanged.
if query in col_genes_ensembl:
    query_new = ensembl_2_genes[query]
    if query != query_new:
        print('The Ensembl ID ' + query + ' has been converted to the gene symbol ' + query_new )
        query = query_new

# Every later cell looks the query up among the matrix columns, so fail here
# with an explicit message instead of an opaque IndexError further down.
if query not in col_genes:
    raise ValueError(
        "'" + str(query) + "' was not found among the lncRNAs of the co-expression matrix. "
        "Please enter a valid lncRNA gene symbol or Ensembl ID."
    )

print('Predicting functions for ' + query + '(' + genes_2_ensembl[query] + ')')

In [None]:
%%appyter markdown
### Top genes correlated with {{query.raw_value}}

In [None]:
%%appyter markdown
Using the loaded lncRNA-gene correlation matrix, we report the genes that are most correlated with {{query.raw_value}}.

In [None]:
# Rank every row gene by its correlation with the query lncRNA
gene_corr_dir = "gene_correlations/"
if not os.path.exists(gene_corr_dir):
    os.makedirs(gene_corr_dir, exist_ok=True)

# Position of the query among the matrix columns (first match)
idx_query = np.flatnonzero(np.asarray(col_genes) == query)[0]

# One correlation value per row gene, sorted from highest to lowest
corr_column = "Pearson's Correlation Coefficient"
lncRNA_coexp = pd.DataFrame(
    corr[:, idx_query],
    index=row_genes,
    columns=[corr_column],
).sort_values(by=corr_column, ascending=False)

# Keep the full ranking on disk for download
lncRNA_coexp.to_csv(gene_corr_dir + query + '_correlated_genes.csv')

In [None]:
# Table 1: the 20 highest-ranked rows, the caption, and a link to the full CSV
display(lncRNA_coexp.iloc[:20])
table1_caption = f"*Table 1. The Top 20 genes correlated with {query} ranked by Pearson’s correlation coefficients.*"
display(Markdown(table1_caption))
table1_csv = 'gene_correlations/' + query + '_correlated_genes.csv'
display(FileLink(table1_csv, result_html_prefix='Download Table 1: '))

In [None]:
%%appyter markdown
### Interactive network visualization of the top 100 genes correlated with {{query.raw_value}}

In [None]:
%%appyter markdown

Interactive network visualization of the top 100 genes correlated with {{query.raw_value}}. Each node represents a gene and is colored by chromosome location, except for the bright red node which represents the lncRNA {{query.raw_value}}. The thickness of the edges corresponds to Pearson correlation coefficients. Clicking on a gene node will highlight its corresponding edges in orange. Hovering over a node will display the gene name and chromosome location.

Network Methods: All pairwise correlations between the top 100 genes correlated with {{query.raw_value}} are extracted. The 3 edges with the highest correlation per gene node are used to initialize the network. Edges with weights < 0.3 are dropped. To further prune the network, the edge with the lowest weight for each hub node is dropped. At the start, a hub node is defined as a node with > 10 edges. The pruning process is repeated until the network has an average of < 3 edges per node. The top 5 edges for {{query.raw_value}} are shown regardless of their weights.

In [None]:
%%appyter code_exec
# Build the co-expression network for the top correlated genes and save it
network_dir = "coexpression_network/"
if not os.path.exists(network_dir):
    os.makedirs(network_dir, exist_ok=True)

g = network_vis(query, lncRNA_coexp, genes_2_ensembl, row_genes)

# g[1] and g[2] are written out as the node and edge metadata tables
for result_position, file_suffix in ((1, '_network_node_metadata.csv'), (2, '_network_edge_metadata.csv')):
    g[result_position].to_csv(network_dir + query + file_suffix)

# g[0] is the network object; kept as the cell's final expression so that
# whatever show() returns is rendered as the cell output.
g[0].show('coexpression_network/' + query + '_network.html')

In [None]:
# Caption for the network figure rendered by the previous cell
figure1_caption = f"*Figure 1. Interactive network visualization of the top 100 genes correlated with {query}.*"
display(Markdown(figure1_caption))

In [None]:
# Download links for the network figure and its node / edge metadata tables
network_downloads = (
    ('_network.html', 'Download Figure 1: '),
    ('_network_node_metadata.csv', 'Download Node metadata: '),
    ('_network_edge_metadata.csv', 'Download Edge metadata: '),
)
for file_suffix, link_prefix in network_downloads:
    display(FileLink('coexpression_network/' + query + file_suffix, result_html_prefix=link_prefix))

In [None]:
%%appyter markdown
### Enrichment analysis applied to the top 200 genes most correlated with {{query.raw_value}}

In [None]:
%%appyter markdown
The top 200 genes most correlated with {{query.raw_value}} are submitted to Enrichr [2-4] for enrichment analysis. NOTE: Only genes with official Entrez gene symbols are submitted to Enrichr. Ensembl IDs that do not map to an official gene symbol are dropped.

In [None]:
# Submit the top n gene symbols to Enrichr
# For users running the notebook locally: To adjust the number of genes submitted to Enrichr, change the value of n below
n = 200

# lncRNA_coexp is already sorted by correlation; rows still labelled with an
# Ensembl ID ('ENSG...') are skipped before taking the first n.
top_n_genes = [x for x in lncRNA_coexp.index if not x.startswith('ENSG')][:n]

# The list description follows n, so it stays correct when n is changed.
enrichr_link = Enrichr_API(top_n_genes, 'Top ' + str(n) + ' correlated genes with the lncRNA: ' + query)

# Save enrichr link to text file; the context manager closes the file even if
# the write fails.
os.makedirs("enrichment_analysis/", exist_ok=True)
with open('enrichment_analysis/'+query+'_top_'+str(n)+'_correlated_genes_Enrichr_link.txt','w') as link_file:
    link_file.write(enrichr_link)

In [None]:
# Link to the Enrichr results. The gene count is taken from n (set in the
# submission cell) so the sentence matches what was actually submitted.
display(HTML("Access the enrichment analysis results for the top {n} genes most correlated with {query} here: <a href='{href}'>{link}</a>".format(n=n, href=enrichr_link, link=enrichr_link, query=query)))

In [None]:
%%appyter markdown
### Top lncRNAs correlated with {{query.raw_value}}

In [None]:
%%appyter markdown
Below we list the top 20 lncRNAs, out of all 15,862 lncRNAs within our database, that correlate most with {{query.raw_value}} based on their Pearson correlation coefficients.

In [None]:
# Restrict the ranking to the matrix's column genes (the lncRNAs) and re-sort
corr_column_name = "Pearson's Correlation Coefficient"
lncRNA_lncRNA_coexp = (
    lncRNA_coexp
    .loc[col_genes]
    .sort_values(by=corr_column_name, ascending=False)
)

# Keep the full lncRNA ranking on disk for download
lncRNA_lncRNA_coexp.to_csv('gene_correlations/' + query + '_correlated_lncRNAs.csv')

In [None]:
# Table 2: the 20 highest-ranked lncRNAs, the caption, and a link to the full CSV
display(lncRNA_lncRNA_coexp.iloc[:20])
table2_caption = f"*Table 2. Top 20 lncRNAs that correlate most with {query} ranked by Pearson correlation coefficients.*"
display(Markdown(table2_caption))
table2_csv = 'gene_correlations/' + query + '_correlated_lncRNAs.csv'
display(FileLink(table2_csv, result_html_prefix='Download Table 2: '))

In [None]:
%%appyter markdown
### Predicted biological functions of {{query.raw_value}}

In [None]:
%%appyter markdown
For each Enrichr [2-4] library, we compute the mean Pearson correlation coefficients for each gene set by averaging the Pearson correlation coefficients between each gene in the gene set and {{query.raw_value}}. Terms with high mean Pearson correlation coefficients are prioritized. These terms are predicted to be associated with {{query.raw_value}}.

In [None]:
%%appyter code_exec
{% if options_fast_compute.raw_value == False %}

# Folder for biological function predictions
if not os.path.exists("predicted_functions/"):
    os.makedirs("predicted_functions/", exist_ok=True)

# Make function predictions
prediction_libraries = ['MGI_Mammalian_Phenotype_Level_4_2021','GO_Biological_Process_2021','KEGG_2021_Human','DisGeNET']
prediction_libraries = np.array_split(prediction_libraries, int(np.ceil(len(prediction_libraries)/2))) 
for i_group,group in enumerate(prediction_libraries):
    predictions = []
    library_names = []
    for pred_library in group:
        predictions.append(predict_functions(pred_library,lncRNA_coexp,query))
        library_names.append(pred_library.replace('_',' '))
    if i_group == 0:
        plot_results(library_names=library_names, results_dfs=predictions,file_name='predicted_functions/'+query+'_biological_function_predictions_figure2')
        display(Markdown(f"*Figure 2. Predicted MGI Mammalian Phenotypes and GO Biological Processes for the lncRNA {query}. Terms are ranked by averaging the mean Pearson correlation coefficients between each gene in a gene set and {query}.*" ))
    if i_group == 1:
        plot_results(library_names=library_names, results_dfs=predictions,file_name='predicted_functions/'+query+'_biological_function_predictions_figure3')
        display(Markdown(f"*Figure 3. Predicted KEGG pathways and DisGeNET disease terms for the lncRNA {query}. Terms are ranked by averaging the Pearson correlation coefficients between each gene in a gene set and {query}.*" ))

    
    # Save Predictions 
    for ii,prediction in enumerate(predictions):
        prediction.to_csv("predicted_functions/" + library_names[ii]+'_' + query + '.csv')
        display(FileLink("predicted_functions/" + library_names[ii]+'_' + query + '.csv', result_html_prefix=str('Download predictions: ')))

{% endif %}

In [None]:
%%appyter code_exec
{% if options_fast_compute.raw_value == True %}

# Folder for biological function predictions
if not os.path.exists("predicted_functions/"):
    os.makedirs("predicted_functions/", exist_ok=True)

# Import pre-computed lncRNA functions
prediction_libraries = ['MGI_Mammalian_Phenotype_Level_4_2021','GO_Biological_Process_2021','KEGG_2021_Human','DisGeNET','ChEA_2016','ENCODE_TF_ChIP-seq_2015']
versions = ['v0.0.6','v0.0.6','v0.0.6','v0.0.6','v0.0.8','v0.0.8']
predlib_2_version = dict(zip(prediction_libraries,versions))
prediction_libraries = np.array_split(prediction_libraries, int(np.ceil(len(prediction_libraries)/2))) 
for i_group,group in enumerate(prediction_libraries):
    predictions = []
    library_names = []
    for pred_library in group:
        precomputed_df_indexes = list(pd.read_csv(s3.open('storage/lncRNA_Appyter/'+predlib_2_version[pred_library]+'/'+ pred_library + '_lncRNA_avg_coexpression_gene_index_order.txt','rb'),header=None)[0])
        query_idx = np.where(np.asarray(precomputed_df_indexes) == query)[0][0] 
        precomputed_avg_coexp = pd.read_csv(s3.open('storage/lncRNA_Appyter/'+predlib_2_version[pred_library]+'/'+ pred_library + '_lncRNA_avg_coexpression.csv','rb'),index_col=0,header=None, skiprows=query_idx+1,nrows=1)
        cols = list(pd.read_csv(s3.open('storage/lncRNA_Appyter/'+predlib_2_version[pred_library]+'/'+ pred_library + '_lncRNA_avg_coexpression.csv','rb'),header=0, index_col = 0, nrows=1).columns)
        precomputed_avg_coexp.columns = cols
        precomputed_avg_coexp = precomputed_avg_coexp.T
        precomputed_avg_coexp = precomputed_avg_coexp.sort_values(by=query,ascending = False).reset_index().rename({'index': 'Term', query: 'Mean Pearson Correlation'}, axis='columns') 
        cols=0
        predictions.append(precomputed_avg_coexp)
        library_names.append(pred_library.replace('_',' '))
    if i_group == 0:
        plot_results(library_names=library_names, results_dfs=predictions,file_name='predicted_functions/'+query+'_biological_function_predictions_figure2')
        display(Markdown(f"*Figure 2. Predicted MGI Mammalian Phenotypes and GO Biological Processes for the lncRNA {query}. Terms are ranked by averaging the mean Pearson correlation coefficients between each gene in a gene set and {query}.*" ))
    if i_group == 1:
        plot_results(library_names=library_names, results_dfs=predictions,file_name='predicted_functions/'+query+'_biological_function_predictions_figure3')
        display(Markdown(f"*Figure 3. Predicted KEGG pathways and DisGeNET disease terms for the lncRNA {query}. Terms are ranked by averaging the mean Pearson correlation coefficients between each gene in a gene set and {query}.*" ))
    if i_group == 2:
        plot_results(library_names=library_names, results_dfs=predictions,file_name='predicted_functions/'+query+'_biological_function_predictions_figure4')
        display(Markdown(f"*Figure 4. Predicted ChEA and ENCODE terms for the lncRNA {query}. Terms are ranked by averaging the mean Pearson correlation coefficients between each gene in a gene set and {query}.*" ))

    # Save Predictions 
    for ii,prediction in enumerate(predictions):
        prediction.to_csv("predicted_functions/" + library_names[ii]+'_' + query + '.csv')
        display(FileLink("predicted_functions/" + library_names[ii]+'_' + query + '.csv', result_html_prefix=str('Download predictions: ')))

{% endif %}

In [None]:
%%appyter markdown
### Expression of {{query.raw_value}} across tissues and cell lines
This part of the report provides the Z-score (Normalized Median Expression) for the lncRNA {{query.raw_value}} in various tissues and cell lines.

In [None]:
%%appyter markdown
Samples from Recount3 [1] were automatically labelled by tissue type or cell line of origin. Tissue and cell line samples were log2 transformed and quantile normalized separately. Tissues and cell lines with fewer than 20 samples were removed, and z-scores were computed along the lncRNA axis to compare expression levels across all tissues and cell lines. The median expression of {{query.raw_value}} was then calculated for each tissue type and cell line.

In [None]:
%%appyter code_exec
# Load the z-score tables (tissue and cell line) and tidy their labels


def _text_before_first_comma(labels):
    """Return each label cut at its first comma."""
    return [label.split(',')[0] for label in labels]


tissue_expr_zscore = pd.read_csv(
    s3.open('storage/lncRNA_Appyter/v0.0.6/lncRNA_zscore_median_expr_by_tissue_filtered.csv', 'rb'),
    header=0,
    index_col=0,
)
tissue_expr_zscore.index = _text_before_first_comma(tissue_expr_zscore.index)

cell_line_expr_zscore = pd.read_csv(
    s3.open('storage/lncRNA_Appyter/v0.0.6/lncRNA_zscore_median_expr_by_cell_line_filtered.csv', 'rb'),
    header=0,
    index_col=0,
)
cell_line_expr_zscore.index = _text_before_first_comma(cell_line_expr_zscore.index)
# Cell-line column names are upper-cased and stripped of the ' CELL' substring
cell_line_expr_zscore.columns = [
    column_name.upper().replace(' CELL','') for column_name in cell_line_expr_zscore.columns
]

# Output folder for the tissue and cell-line expression figures and tables
if not os.path.exists("tissue_and_cell_line_expression/"):
    os.makedirs("tissue_and_cell_line_expression", exist_ok=True)

In [None]:
# The query's row of the tissue table as a one-column frame, highest z-score first
tissue_specific_lncRNA = pd.DataFrame(tissue_expr_zscore.loc[query]).sort_values(
    by=query,
    ascending=False,
)

In [None]:
# Figure 5: bar plot across tissues, its caption, and the table behind it
tissue_figure_path = 'tissue_and_cell_line_expression/'+query+'_zscore_tissue_expression'
plot_bar(tissue_specific_lncRNA, query, 'Tissues', 'Z-score (Median Expression)', tissue_figure_path)
display(Markdown(f"*Figure 5. Z-score (median expression) for the lncRNA {query} in various tissue types.*"))

tissue_table_csv = "tissue_and_cell_line_expression/" + query + '_tissue_zscore' + '.csv'
tissue_specific_lncRNA.to_csv(tissue_table_csv)
tissue_link_prefix = 'Download table with z-score (median expression) values for ' + query + ' in various tissue types: '
display(FileLink(tissue_table_csv, result_html_prefix=tissue_link_prefix))

In [None]:
# The query's row of the cell-line table as a one-column frame, highest z-score first
cell_line_specific_lncRNA = pd.DataFrame(cell_line_expr_zscore.loc[query]).sort_values(
    by=query,
    ascending=False,
)

In [None]:
# Figure 6: bar plot of the 30 top-ranked cell lines; the saved table keeps all of them
cell_line_figure_path = 'tissue_and_cell_line_expression/'+query+'_zscore_cell_line_expression'
plot_bar(cell_line_specific_lncRNA.iloc[:30], query, 'Cell Lines', 'Z-score (Median Expression)', cell_line_figure_path)
display(Markdown(f"*Figure 6. Z-score (median expression) for the lncRNA {query} in the top 30 cell lines.*"))

cell_line_table_csv = "tissue_and_cell_line_expression/" + query + '_cell_line_zscore' + '.csv'
cell_line_specific_lncRNA.to_csv(cell_line_table_csv)
cell_line_link_prefix = 'Download table with z-score (median expression) values for ' + query + ' in various cell lines: '
display(FileLink(cell_line_table_csv, result_html_prefix=cell_line_link_prefix))

In [None]:
%%appyter markdown
### Visualizing all lncRNAs based on their gene expression similarity across tissues

In [None]:
%%appyter markdown
We applied UMAP [5] to visualize lncRNA expression across 3,000 randomly selected samples (with tissue type labels) from Recount3 [1]. Samples were first log2 transformed and quantile normalized along the gene axis, then UMAP was applied to the lncRNA expression data with samples as features. Each data point represents a single lncRNA (n=15,862). Use the drop-down menu to color lncRNAs by expression z-score in a specific tissue. The black arrow points to the location of {{query.raw_value}}.

In [None]:
%%appyter code_exec
# Table consumed by the tissue UMAP scatter plot in the next cell
umap_tissue_results = pd.read_csv(
    s3.open('storage/lncRNA_Appyter/v0.0.6/umap_tissues.csv', 'rb'),
    header=0,
    index_col=0,
)
# NOTE(review): this re-reads the file already loaded as tissue_expr_zscore and
# the result is not referenced by any later cell of this notebook - confirm it
# is still needed.
tissue_expr_median_expr = pd.read_csv(
    s3.open('storage/lncRNA_Appyter/v0.0.6/lncRNA_zscore_median_expr_by_tissue_filtered.csv', 'rb'),
    header=0,
    index_col=0,
)

In [None]:
# Column values of the tissue z-score table, keyed by tissue name (sorted, unique)
unique_tissues = np.unique(tissue_expr_zscore.columns)
values_dict_tz = {tissue: list(tissue_expr_zscore[tissue]) for tissue in unique_tissues}

# Create the UMAP output folders for both the tissue and the cell-line figures.
# They are created unconditionally: a check on "umap/" alone would skip
# creation whenever the top-level folder exists but a sub-folder is missing.
os.makedirs("umap/cell_lines/figures/static", exist_ok=True)
os.makedirs("umap/tissues/figures/static", exist_ok=True)

# Figure 7: the plot opens on the query's top-ranked tissue; static images are
# requested for its two top-ranked tissues.
plot_dynamic_scatter(
    umap_df=umap_tissue_results,
    values_dict=values_dict_tz,
    option_list=list(unique_tissues),
    sample_names=list(umap_tissue_results.index),
    caption_text='UMAP was applied to 3,000 randomly selected samples (with tissue type labels) from Recount3. Each data point represents a lncRNA (n=15,862) and are colored by z-score (median expression) in ',
    figure_counter=7,
    category_list_dict=None,
    category=False,
    dropdown=True,
    color_by_title='Z-score',
    highlight_query=query,
    first_selection=list(tissue_specific_lncRNA.index)[0],
    static_images_save=list(tissue_specific_lncRNA.index[0:2]),
    file_path='umap/tissues/figures/',
)

In [None]:
%%appyter markdown
### Visualizing all lncRNAs based on their gene expression similarity across cell lines 

In [None]:
%%appyter markdown
We applied UMAP [5] to visualize lncRNA expression across 3,000 randomly selected samples (with cell line labels) from Recount3 [1]. Samples were first log2 transformed and quantile normalized along the gene axis, then UMAP was applied to the lncRNA expression data with samples as features. Each data point represents a single lncRNA (n=15,862). Use the drop-down menu to color lncRNAs by expression z-score in a specific cell line. The black arrow points to the location of {{query.raw_value}}.

In [None]:
%%appyter code_exec
# Table consumed by the cell-line UMAP scatter plot in the next cell
umap_cell_line_results = pd.read_csv(
    s3.open('storage/lncRNA_Appyter/v0.0.6/umap_cell_lines.csv', 'rb'),
    header=0,
    index_col=0,
)
# NOTE(review): this re-reads the file already loaded as cell_line_expr_zscore
# and the result is not referenced by any later cell of this notebook - confirm
# it is still needed.
cell_line_expr_median_expr = pd.read_csv(
    s3.open('storage/lncRNA_Appyter/v0.0.6/lncRNA_zscore_median_expr_by_cell_line_filtered.csv', 'rb'),
    header=0,
    index_col=0,
)

In [None]:
# Column values of the cell-line z-score table, keyed by cell-line name (sorted, unique)
unique_cell_lines = np.unique(cell_line_expr_zscore.columns)
values_dict_cz = {cell_line: list(cell_line_expr_zscore[cell_line]) for cell_line in unique_cell_lines}

# Make sure this cell's own output folder exists instead of relying on the
# tissue UMAP cell having created it.
os.makedirs("umap/cell_lines/figures/static", exist_ok=True)

# Figure 8: the plot opens on the query's top-ranked cell line; static images
# are requested for its two top-ranked cell lines.
plot_dynamic_scatter(
    umap_df=umap_cell_line_results,
    values_dict=values_dict_cz,
    option_list=list(unique_cell_lines),
    sample_names=list(umap_cell_line_results.index),
    caption_text='UMAP was applied to 3,000 randomly selected samples (with cell line labels) from Recount3. Each data point represents a lncRNA (n=15,862) and are colored by z-score (median expression) in ',
    figure_counter=8,
    category_list_dict=None,
    category=False,
    dropdown=True,
    color_by_title='Z-score',
    highlight_query=query,
    first_selection=list(cell_line_specific_lncRNA.index)[0],
    static_images_save=list(cell_line_specific_lncRNA.index[0:2]),
    file_path='umap/cell_lines/figures/',
)

In [None]:
%%appyter markdown
### L1000 small molecules predicted to modulate {{query.raw_value}}

In [None]:
%%appyter markdown
~1.4 million Level 5 L1000 chemical perturbation gene expression signatures were downloaded from SigCom LINCS (https://maayanlab.cloud/sigcom-lincs) [6]. For each unique signature and lncRNA pair, a mean Pearson correlation coefficient was computed by taking the average Pearson coefficient between the lncRNA and all genes in the signature. All 15,862 lncRNAs were then ranked by mean Pearson correlation coefficient, and the top 1,000 lncRNAs with the highest coefficients were retained for each signature. The top 500 lncRNA-L1000 signature associations are reported here for {{query.raw_value}}, separated by direction. If {{query.raw_value}} is highly correlated with the up-regulated genes for a specific small molecule, then this small molecule is predicted to up-regulate {{query.raw_value}}.

In [None]:
%%appyter markdown
### L1000 small molecules predicted to up-regulate {{query.raw_value}}

In [None]:
%%appyter markdown
The prioritized small molecules below are predicted to specifically up-regulate {{query.raw_value}}.

In [None]:
%%appyter code_exec
# Load the L1000 signature associations and split the query's entries by direction
l1000_prediction_file = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.0.6/l1000_sm_lncRNAs_final.tsv', 'rb'),sep='\t',header=None, index_col=0)


def _l1000_results_table(signature_ids, correlations):
    """Build the results table by splitting each signature ID on '_' and ' '."""
    return pd.DataFrame(dict([
        ('L1000 Signature ID', signature_ids),
        ('Drug', [sig_id.split('_')[4] for sig_id in signature_ids]),
        ('Up/Down', [sig_id.split(' ')[1] for sig_id in signature_ids]),
        ('Dose', [sig_id.split(' ')[0].split('_')[-1] for sig_id in signature_ids]),
        ('Cell line', [sig_id.split('_')[1] for sig_id in signature_ids]),
        ('Time point', [sig_id.split('_')[2] for sig_id in signature_ids]),
        ('Mean Pearson Correlation', correlations),
    ]))


if query in list(l1000_prediction_file.index):
    # Keep only the query's row and drop its empty cells
    l1000_prediction_file = l1000_prediction_file.loc[query]
    l1000_prediction_file = list(l1000_prediction_file.dropna(axis=0))

    drugs_up, drugs_up_corr = [], []
    drugs_down, drugs_down_corr = [], []
    for sig in l1000_prediction_file:
        # Each entry is "<signature id>,<correlation>"; the word after the
        # space in the signature id gives the direction.
        entry_fields = sig.split(',')
        sig_id = entry_fields[0]
        sig_id_corr = float(entry_fields[1])
        if sig_id.split(' ')[1] != 'down':
            drugs_up.append(sig_id)
            drugs_up_corr.append(sig_id_corr)
        else:
            drugs_down.append(sig_id)
            drugs_down_corr.append(sig_id_corr)

    up_results = _l1000_results_table(drugs_up, drugs_up_corr)
    down_results = _l1000_results_table(drugs_down, drugs_down_corr)
else:
    # No entries for this lncRNA: the display cells below test for empty frames
    up_results = pd.DataFrame()
    down_results = pd.DataFrame()


In [None]:
# Table 3: predicted up-regulating signatures, or a notice when there are none
if len(up_results) == 0:
    display(Markdown(f"**There are no small molecules predicted to specifically up-regulate the expression of {query}.**"))
else:
    display(up_results.iloc[:20])
    display(Markdown(f"*Table 3. L1000 small molecules predicted to up-regulate the lncRNA {query}.*"))

    # Output folder for the L1000 prediction tables
    if not os.path.exists("l1000_sm_predictions/"):
        os.makedirs("l1000_sm_predictions/", exist_ok=True)
    up_table_csv = "l1000_sm_predictions/" + query + '_l1000_sm_predictions_up' + '.csv'
    up_results.to_csv(up_table_csv)
    display(FileLink(up_table_csv, result_html_prefix='Download table of L1000 small molecules predicted to up-regulate ' + query + ': '))

In [None]:
%%appyter markdown
### L1000 small molecules predicted to down-regulate {{query.raw_value}}

In [None]:
%%appyter markdown
The prioritized small molecules below are predicted to specifically down-regulate {{query.raw_value}}.

In [None]:
# Table 4: predicted down-regulating signatures, or a notice when there are none
if len(down_results) == 0:
    display(Markdown(f"**There are no small molecules predicted to specifically down-regulate the expression of {query}.**"))
else:
    display(down_results.iloc[:20])
    display(Markdown(f"*Table 4. L1000 small molecules predicted to down-regulate the lncRNA {query}.*"))

    # Output folder for the L1000 prediction tables
    if not os.path.exists("l1000_sm_predictions/"):
        os.makedirs("l1000_sm_predictions/", exist_ok=True)
    down_table_csv = "l1000_sm_predictions/" + query + '_l1000_sm_predictions_down' + '.csv'
    down_results.to_csv(down_table_csv)
    display(FileLink(down_table_csv, result_html_prefix='Download table of L1000 small molecules predicted to down-regulate ' + query + ': '))

In [None]:
# close h5 file
# `f` is the co-expression matrix handle opened when the matrix was imported;
# `corr` is read from it, so it must not be used after this point.
f.close()

### References
[1] Wilks C, Zheng SC, Chen FY, Charles R, Solomon B, Ling JP, Imada EL, Zhang D, Joseph L, Leek JT: recount3: summaries and queries for large-scale RNA-seq expression and splicing. bioRxiv 2021:2021.05.21.445138.

[2] Xie Z, Bailey A, Kuleshov MV, Clarke DJB, Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM: Gene Set Knowledge Discovery with Enrichr. Current Protocols 2021, 1(3):e90.

[3] Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma’ayan A: Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics 2013, 14(1):128.

[4] Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A: Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research 2016, 44(W1):W90-W97.

[5] McInnes L, Healy J, Melville J: UMAP: Uniform manifold approximation and projection for dimension reduction. arXiv preprint arXiv:1802.03426 2018.

[6] Evangelista et al. SigCom LINCS: Data and Metadata Search Engine for Gene Expression Signatures. 2021. In preparation.
