In [None]:
#%%appyter init
# Bootstrap appyter: magic.init receives a callable that returns this notebook's globals().
# NOTE(review): presumably this is what makes the %%appyter cell magics used below available — confirm against the appyter docs.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
%%appyter hide_code_exec
{# Declare the two input-form sections; the fields declared in the next cell attach to them through their `section` argument. #}
{% do SectionField(name='section1', title = '1. Please input a valid gene symbol or Ensembl ID', subtitle = '', img = 'lncRNA_appyter_logo.png')%}
{% do SectionField(name='section2', title = '2. Options', subtitle = '', img = 'lncRNA_appyter_logo.png')%}

In [None]:
%%appyter code_exec
{# `query`: required gene symbol / Ensembl ID, with choices loaded from the static file lncRNAs.json (default HOTAIR). #}
{% set query = AutocompleteField(name='gene',label='Gene Symbol/Ensembl ID',description='',default='HOTAIR',required=True, choices = load_static("lncRNAs.json"), section='section1') %}
{# `options_fast_compute`: selects between the precomputed and the full function-prediction cells further down. NOTE(review): the default is the string 'true' rather than a boolean, while those cells compare raw_value to True/False; confirm BoolField coerces it. #}
{% set options_fast_compute = BoolField(name='fast_compute', label='Precompute', default='true', description='Precompute will retrieve precomputed results from the example files below for a faster run time. Select \'No\' to run whole analysis.', section='section2')%}


In [None]:
%%appyter code_exec
# Jinja substitutes the `query` field declared in the previous cell; later cells treat the result as a str.
# NOTE(review): presumably the field renders as a quoted Python string literal — confirm.
query = {{ query }}

In [None]:
%%appyter markdown
# Report about the Long Non-coding RNA (lncRNA) {{query.raw_value}}
Based on lncRNA-gene co-expression, this report contains predictions regarding the biological functions of {{query.raw_value}} and small molecules that may specifically up- or down-regulate {{query.raw_value}} expression. This report also contains the gene coordinates for {{query.raw_value}}, canonical/alternative transcript sequences, publications per year, median expression of {{query.raw_value}} across tissues and cell-lines, and lncRNA-lncRNA expression similarities.

In [None]:
# Import libraries
import pandas as pd 
import numpy as np
import h5py as h5
from IPython.display import display,FileLink, HTML, Markdown
import os
# NOTE(review): the wildcard import hides where helpers come from. autorif_plot, network_vis,
# Enrichr_API, predict_functions, get_bf_pvalues, plot_results, plot_bar and plot_dynamic_scatter
# are called below but not defined in this notebook; presumably they come from utils — confirm
# and consider importing them by name.
from utils import *
import s3fs
from bokeh.io import output_notebook
import json
# Configure Bokeh to render its figures inline in the notebook output.
output_notebook()

In [None]:
# Persist the queried gene to gene.json as a one-element JSON list
with open('gene.json', mode='w') as f:
    f.write(json.dumps([query]))

In [None]:
# Import gene mapping data
s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(endpoint_url='https://s3.appyters.maayanlab.cloud'))
# The HDF5 handle is deliberately left open: a later cell reads f["data/correlation"] from it.
f = h5.File(s3.open('storage/lncRNA_Appyter/v0.0.6/Recount3_lncRNA_pcorr_cols.h5', 'rb'), 'r')
# Matrix columns and rows each carry a parallel gene-symbol and Ensembl-ID list (stored as bytes).
col_genes = [x.decode('UTF-8') for x in f["meta/columns/genes"]]
col_genes_ensembl = [x.decode('UTF-8') for x in f["meta/columns/ensembl"]]
row_genes = [x.decode('UTF-8') for x in f["meta/rows/genes"]]
row_genes_ensembl = [x.decode('UTF-8') for x in f["meta/rows/ensembl"]]

# Convert input Ensembl ID to gene symbol
ensembl_2_genes = dict(zip(row_genes_ensembl,row_genes))
genes_2_ensembl = dict(zip(row_genes,row_genes_ensembl))
if query in col_genes_ensembl:
    query_new = ensembl_2_genes[query]
    if query != query_new:
        print('The Ensembl ID ' + query + ' has been converted to the gene symbol ' + query_new )
        query = query_new
elif query not in genes_2_ensembl:
    # Fail here with a clear message instead of letting later cells break in obscure ways
    # (e.g. the column lookup in the correlation cell) when the input is not recognised.
    raise ValueError("'" + query + "' is neither a gene symbol nor an Ensembl ID present in the co-expression matrix.")

# Reached only for a recognised query; at this point `query` is the name used as a key of genes_2_ensembl.
print('Predicting functions for ' + query + '(' + genes_2_ensembl[query] + ')')

In [None]:
%%appyter markdown
### Genomic coordinates for {{query.raw_value}} 
The genomic coordinates for {{query.raw_value}} are provided from GENCODE (gencode.v38.long_noncoding_RNAs.gtf)[1].

In [None]:
# Load the coordinate table and add a temporary column holding the gene_id text before its first '.'
gene_coordinates = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.1.0/lncrna_coordinates.csv'),header=0,index_col=0)
gene_coordinates = gene_coordinates.assign(ensembl_id=[gene_id.split('.')[0] for gene_id in gene_coordinates['gene_id']])

# Match on the Ensembl ID when the query looks like one, otherwise on the gene name
match_column = 'ensembl_id' if query.startswith('ENSG') else 'gene_name'
is_query_row = gene_coordinates[match_column]==query
gene_coordinates = gene_coordinates[is_query_row].reset_index(drop=True).drop(columns=['ensembl_id'])

# Write the matching rows to gene_info/
gene_info_dir = "gene_info/"
if not os.path.exists(gene_info_dir):
    os.makedirs(gene_info_dir, exist_ok=True)
gene_coordinates.to_csv(gene_info_dir + query.replace('/','-') + '_gene_coordinates.csv')

In [None]:
%%appyter code_exec
# Show Table 1 with its caption and a download link to the CSV written by the previous cell
coordinates_csv = 'gene_info/' + query.replace('/','-') + '_gene_coordinates.csv'
display(gene_coordinates)
display(Markdown(f"*Table 1. Genomic coordinates for {query}.*"))
display(FileLink(coordinates_csv, result_html_prefix='Download Table 1: '))

In [None]:
%%appyter markdown
###  Transcript sequences for {{query.raw_value}}
The canonical and alternative cDNA sequences for {{query.raw_value}} are provided from Ensembl (Homo_sapiens.GRCh38.ncrna.fa)[2].

In [None]:
%%appyter code_exec
# Look up and show the canonical transcript sequence of the query lncRNA (Table 2).
# "Not found" (gene missing from the mapping, or no canonical transcript row) is reported as such;
# any other failure is reported separately, so a download or parsing problem is not mistaken for a missing sequence.
try:
    # Get canonical transcript ID for input lncRNA
    canonical_metadata = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.1.0/gene_canonical_metadata.csv'),header=0,index_col=0)
    is_query_gene = canonical_metadata['Gene stable ID']==genes_2_ensembl[query]
    # .values[0] raises IndexError when the gene has no canonical transcript row
    canonical_transcript_id = canonical_metadata[is_query_gene]['Transcript stable ID version'].values[0]

    # Import all canonical cDNA sequences
    canonical_sequences = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.1.0/lncrna_canonical_sequences.csv'),header=0,index_col=0)

    # Get canonical sequence for input lncRNA
    canonical_sequences = canonical_sequences[canonical_sequences['id']==canonical_transcript_id].reset_index(drop=True)

    if len(canonical_sequences) > 0:
        canonical_csv = 'gene_info/' + query.replace('/','-') + '_canonical_sequence.csv'
        canonical_sequences.to_csv(canonical_csv)

        display(canonical_sequences)
        display(Markdown(f"*Table 2. Canonical transcript sequence for {query}.*"))
        display(FileLink(canonical_csv, result_html_prefix='Download Table 2: '))
    else:
        display(Markdown(f"**No canonical transcript sequence found for {query}.**"))

except (KeyError, IndexError):
    display(Markdown(f"**No canonical transcript sequence found for {query}.**"))
except Exception as err:
    # Keep the report running, but say that this was a failure rather than an absent sequence
    display(Markdown(f"**Could not retrieve the canonical transcript sequence for {query} ({type(err).__name__}: {err}).**"))

In [None]:
%%appyter code_exec
# Look up and show the alternative transcript sequences of the query lncRNA (Table 3).
# "Not found" (gene missing from the mapping) is reported as such; any other failure is reported
# separately, so a download or parsing problem is not mistaken for missing sequences.
try:
    # Get alternative transcript IDs for input lncRNA
    alternative_metadata = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.1.0/gene_alternative_metadata.csv'),header=0,index_col=0)
    is_query_gene = alternative_metadata['Gene stable ID']==genes_2_ensembl[query]
    alternative_transcript_id = alternative_metadata[is_query_gene]['Transcript stable ID version'].values

    # Import all alternative cDNA sequences
    alternative_sequences = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.1.0/lncrna_alternative_sequences.csv'),header=0,index_col=0)

    # Get alternative sequences for input lncRNA
    alternative_sequences = alternative_sequences[alternative_sequences['id'].isin(alternative_transcript_id)].reset_index(drop=True)

    if len(alternative_sequences) > 0:
        alternative_csv = 'gene_info/' + query.replace('/','-') + '_alternative_sequence.csv'
        alternative_sequences.to_csv(alternative_csv)

        display(alternative_sequences)
        display(Markdown(f"*Table 3. Alternative transcript sequence(s) for {query}.*"))
        display(FileLink(alternative_csv, result_html_prefix='Download Table 3: '))
    else:
        display(Markdown(f"**No alternative transcript sequence(s) found for {query}.**"))
except (KeyError, IndexError):
    display(Markdown(f"**No alternative transcript sequence(s) found for {query}.**"))
except Exception as err:
    # Keep the report running, but say that this was a failure rather than absent sequences
    display(Markdown(f"**Could not retrieve the alternative transcript sequence(s) for {query} ({type(err).__name__}: {err}).**"))

In [None]:
%%appyter markdown
### Publications that mention {{query.raw_value}} 
The PubMed API was used to generate AutoRIF data for {{query.raw_value}}. All PubMed IDs and dates were automatically collected for articles mentioning the lncRNA {{query.raw_value}}. The Ensembl ID[2], lncRNA gene symbol from GENCODE[1], and any previous symbols found in the HGNC database[3] along with the terms ‘lncRNA’ or ‘long non-coding RNA’ were used to query PubMed (e.g., “(ENSG00000228630 OR HOTAIR) AND (lncRNA OR long non-coding RNA)”)

In [None]:
%%appyter code_exec
# Get AutoRIF data for input lncRNA
# Each row is tab-separated: the first field starts with the lncRNA name (text from "(" onwards is ignored)
# and every remaining field is a "pmid,date" pair.
# A single pass over the file replaces the old `while read == 'start'` loop, which never terminated
# when the query had no row because readlines() returns an empty list on every later pass.
autorif_results = []  # stays empty when the lncRNA has no AutoRIF row
with s3.open('storage/lncRNA_Appyter/v0.1.0/lncRNA_autorif_final.tsv','rb') as autorif_file:
    for raw_line in autorif_file.readlines():
        fields = raw_line.strip().decode('UTF-8').split('\t')
        if fields[0].split("(")[0] == query:
            # No break: as before, the last matching row wins if the name appears more than once
            autorif_results = fields[1:]

# Save autorif data
os.makedirs("autorif/", exist_ok=True)

if len(autorif_results) > 0:
    autorif_prefix = 'autorif/' + query.replace('/','-') + '_autorif'
    pmid_date_pairs = [entry.split(',') for entry in autorif_results]

    autorif_results_df = pd.DataFrame({'pmid':[pair[0] for pair in pmid_date_pairs],'date':[pair[1] for pair in pmid_date_pairs]})
    autorif_results_df = autorif_results_df.sort_values(by='date',ascending=False)
    autorif_results_df = autorif_results_df[~autorif_results_df.date.str.contains("2022-")].reset_index(drop=True)# remove early 2022 publications
    total_pubs = len(autorif_results_df)
    autorif_results_df.to_csv(autorif_prefix + '_results.csv')

    autorif_plot(autorif_results_df,query,autorif_prefix)
    display(Markdown(f"*Figure 1. Publications mentioning the lncRNA {query}. {total_pubs} total publications mentioned {query} from 1992 to 2021*"))
    display(FileLink(autorif_prefix + '.png', result_html_prefix='Download Figure 1 (PNG): '))
    display(FileLink(autorif_prefix + '.svg', result_html_prefix='Download Figure 1 (SVG): '))
    display(FileLink(autorif_prefix + '.pdf', result_html_prefix='Download Figure 1 (PDF): '))
    display(FileLink(autorif_prefix + '_results.csv', result_html_prefix='Download AutoRIF data: '))
else:
    display(Markdown(f"**No publications found for {query}.**"))

In [None]:
%%appyter markdown
### Import lncRNA-gene co-expression matrix

This lncRNA-gene co-expression matrix was generated by computing the Pearson correlation coefficients for 10,000 randomly selected bulk RNA-seq samples from Recount3[4]. NOTE: If an Ensembl ID was entered, it will be converted to its corresponding gene symbol if available. 

In [None]:
# Import lncRNA-gene co-expression matrix
# `f` is the HDF5 file opened in the gene-mapping cell. This only binds the dataset object;
# the column for the query is sliced out of it in a later cell (corr[:,idx_query]).
corr =f["data/correlation"]

In [None]:
%%appyter markdown
### Top genes correlated with {{query.raw_value}}

In [None]:
%%appyter markdown
Using the loaded lncRNA-gene correlation matrix, we report the genes that are most positively and negatively correlated with {{query.raw_value}}.

In [None]:
# Output folder for the correlation tables
correlations_dir = "gene_correlations/"
if not os.path.exists(correlations_dir):
    os.makedirs(correlations_dir, exist_ok=True)

# Column position of the query lncRNA in the correlation matrix
idx_query = np.flatnonzero(np.asarray(col_genes) == query)[0]

# One correlation per row gene, ranked from most positive and from most negative
score_column = "Pearson's Correlation Coefficient"
lncRNA_coexp = pd.DataFrame(corr[:,idx_query], index=row_genes, columns=[score_column])
lncRNA_coexp = lncRNA_coexp.sort_values(by=score_column, ascending=False)
lncRNA_neg_coexp = lncRNA_coexp.sort_values(by=score_column, ascending=True)

# Write both rankings to csv
correlations_prefix = correlations_dir + query.replace('/','-')
lncRNA_coexp.to_csv(correlations_prefix + '_positively_correlated_genes.csv')
lncRNA_neg_coexp.to_csv(correlations_prefix + '_negatively_correlated_genes.csv')

In [None]:
%%appyter code_exec
# Show the first 20 rows of the positive ranking with caption and download link
positive_csv = 'gene_correlations/' + query.replace('/','-') + '_positively_correlated_genes.csv'
display(lncRNA_coexp.iloc[:20])
display(Markdown(f"*Table 4. The Top 20 genes positively correlated with {query} ranked by Pearson’s correlation coefficients.*"))
display(FileLink(positive_csv, result_html_prefix='Download Table 4: '))

In [None]:
%%appyter code_exec
# Show the first 20 rows of the negative ranking with caption and download link
negative_csv = 'gene_correlations/' + query.replace('/','-') + '_negatively_correlated_genes.csv'
display(lncRNA_neg_coexp.iloc[:20])
display(Markdown(f"*Table 5. The Top 20 genes negatively correlated with {query} ranked by Pearson’s correlation coefficients.*"))
display(FileLink(negative_csv, result_html_prefix='Download Table 5: '))

In [None]:
%%appyter markdown
### Interactive network visualization of the top 100 genes positively correlated with {{query.raw_value}}

In [None]:
%%appyter markdown

Interactive network visualization of the top 100 genes positively correlated with {{query.raw_value}}. Each node represents a gene and is colored by chromosome location, except for the bright red node which represents the lncRNA {{query.raw_value}}. The thickness of the edges corresponds to Pearson correlation coefficients. Clicking on a gene node will highlight its corresponding edges in orange. Hovering over a node will display the gene name and chromosome location.

Network Methods: All pairwise correlations between the top 100 genes positively correlated with {{query.raw_value}} are extracted. The 3 edges with the highest correlation per gene node are used to initialize the network. Edges with weights < 0.3 are dropped. To further prune the network, the edge with the lowest weight for each hub node is dropped. At the start, a hub node is defined as a node with > 10 edges. The pruning process is repeated until the network has an average of < 3 edges per node. The top 5 edges for {{query.raw_value}} are shown regardless of their weights.

In [None]:
# Build the co-expression network for the top positively correlated genes and write its outputs
network_dir = "coexpression_network/"
if not os.path.exists(network_dir):
    os.makedirs(network_dir, exist_ok=True)
network_prefix = network_dir + query.replace('/','-') + '_network'

# g[0] is the object rendered to HTML; g[1] and g[2] are saved as node and edge metadata tables
g = network_vis(query,lncRNA_coexp,genes_2_ensembl,row_genes)
g[1].to_csv(network_prefix + '_node_metadata.csv')
g[2].to_csv(network_prefix + '_edge_metadata.csv')
g[0].show(network_prefix + '.html')

In [None]:
%%appyter code_exec
# Caption and download links for the network files written by the previous cell
network_prefix = 'coexpression_network/' + query.replace('/','-') + '_network'
display(Markdown(f"*Figure 2. Interactive network visualization of the top 100 genes positively correlated with {query}.*"))
display(FileLink(network_prefix + '.html', result_html_prefix='Download Figure 2: '))
display(FileLink(network_prefix + '_node_metadata.csv', result_html_prefix='Download Node metadata: '))
display(FileLink(network_prefix + '_edge_metadata.csv', result_html_prefix='Download Edge metadata: '))

In [None]:
%%appyter markdown
### Enrichment analysis applied to the top genes most positively and negatively correlated with {{query.raw_value}}

In [None]:
%%appyter markdown
The top genes most positively and negatively correlated with {{query.raw_value}} are submitted to Enrichr[5-7] for enrichment analysis. NOTE: Only genes with gene symbols are submitted to Enrichr. Ensembl IDs that do not map to an official gene symbol were dropped.

In [None]:
%%appyter code_exec
# Submit the top-n positively and negatively correlated genes to Enrichr for several list sizes,
# saving each returned link to a text file and displaying it.
n_genes = [25, 50, 100, 200, 300, 500]
top_genes = [x for x in list(lncRNA_coexp.index) if not x.startswith('ENSG')] # only keep genes with gene symbols
top_genes = [x for x in top_genes if x != query] # drop the query itself so it is not submitted as its own correlate
top_neg_genes = top_genes[::-1] # reverse list for top negatively correlated genes

# Create the output folder once, before either loop writes into it
os.makedirs("enrichment_analysis/", exist_ok=True)
enrichment_prefix = 'enrichment_analysis/' + query.replace('/','-') + '_top_'

# Get enrichment results for top n positively correlated genes with input lncRNA
display(HTML("<font size=4> <b>Enrichment analysis results for positively correlated genes with {query} </b></font>".format(query=query)))
for n in n_genes:
    # Describe the list with its actual size (previously every list was labelled "Top 200")
    enrichr_link_pos = Enrichr_API(top_genes[0:n],'Top ' + str(n) + ' positively correlated genes with the lncRNA: ' + query)

    # Save enrichr link to text file
    with open(enrichment_prefix + str(n) + '_positively_correlated_genes_Enrichr_link.txt','w') as open_file:
        open_file.write(enrichr_link_pos)

    display(HTML("Access the enrichment analysis results for the top  <b>{n} positively</b> correlated genes with {query} here: <a href='{href}'>{link}</a>".format(href=enrichr_link_pos, link = enrichr_link_pos, query=query, n=n)))

display(HTML("<font size=4> \n\n\n <b>Enrichment analysis results for negatively correlated genes with {query} </b></font>".format(query=query)))

# Get enrichment results for top n negatively correlated genes with input lncRNA
for n in n_genes:
    enrichr_link_neg = Enrichr_API(top_neg_genes[0:n],'Top ' + str(n) + ' negatively correlated genes with the lncRNA: ' + query)

    # Save enrichr link to text file
    with open(enrichment_prefix + str(n) + '_negatively_correlated_genes_Enrichr_link.txt','w') as open_file:
        open_file.write(enrichr_link_neg)

    display(HTML("Access the enrichment analysis results for the top <b>{n} negatively</b> correlated genes with {query} here: <a href='{href}'>{link}</a>".format(href=enrichr_link_neg, link = enrichr_link_neg, query=query, n=n)))

In [None]:
%%appyter markdown
### Top lncRNAs positively correlated with {{query.raw_value}}

In [None]:
%%appyter markdown
Below we list the top 20 lncRNAs, out of all 15,862 lncRNAs within our database, that correlate most with {{query.raw_value}} based on their Pearson correlation coefficients.

In [None]:
# Restrict the gene ranking to the matrix's column genes (the lncRNAs) and re-rank it
lncRNA_lncRNA_coexp = (
    lncRNA_coexp
    .loc[col_genes]
    .sort_values(by="Pearson's Correlation Coefficient", ascending=False)
)

# Write the lncRNA ranking to csv
lncRNA_ranking_csv = 'gene_correlations/' + query.replace('/','-') + '_positively_correlated_lncRNAs.csv'
lncRNA_lncRNA_coexp.to_csv(lncRNA_ranking_csv)

In [None]:
%%appyter code_exec
# Show the first 20 rows of the lncRNA ranking with caption and download link
lncRNA_ranking_csv = 'gene_correlations/' + query.replace('/','-') + '_positively_correlated_lncRNAs.csv'
display(lncRNA_lncRNA_coexp.iloc[:20])
display(Markdown(f"*Table 6. Top 20 lncRNAs that correlate most with {query} ranked by Pearson correlation coefficients.*"))
display(FileLink(lncRNA_ranking_csv, result_html_prefix='Download Table 6: '))

In [None]:
%%appyter markdown
### Predicted biological functions of {{query.raw_value}}

In [None]:
%%appyter markdown
For each Enrichr[5-7] library, we compute the mean Pearson correlation coefficient for each gene set by averaging the Pearson correlation coefficients between each gene in the gene set and {{query.raw_value}}. Terms with high mean Pearson correlation coefficients are prioritized. These terms are predicted to be associated with {{query.raw_value}}.

In [None]:
%%appyter code_exec
{% if options_fast_compute.raw_value == False %}

# Compute function predictions for the query from scratch, two Enrichr libraries per figure.
# Store biological function predictions
os.makedirs("predicted_functions/", exist_ok=True)

# Make function predictions
prediction_libraries = ['MGI_Mammalian_Phenotype_Level_4_2021','GO_Biological_Process_2021','KEGG_2021_Human','DisGeNET','ChEA_2016','ENCODE_TF_ChIP-seq_2015']
prediction_libraries = np.array_split(prediction_libraries, int(np.ceil(len(prediction_libraries)/2)))

# Caption subject of each library pair, in the same order as the groups above.
# Group i is drawn as Figure i+3 and saved with the file suffix _f(i+1).
figure_subjects = ['MGI Mammalian Phenotypes and GO Biological Processes','KEGG pathways and DisGeNET disease terms','ChEA and ENCODE terms']
file_query = query.replace('/','-')

for i_group,group in enumerate(prediction_libraries):
    predictions = []
    library_names = []
    for pred_library in group:
        predictions.append(predict_functions(pred_library,lncRNA_coexp,query))
        library_names.append(pred_library.replace('_',' '))

    figure_number = i_group + 3
    figure_file = 'predicted_functions/' + file_query + '_biological_function_predictions_f' + str(i_group + 1)
    plot_results(library_names=library_names, results_dfs=predictions,file_name=figure_file)
    display(Markdown(f"*Figure {figure_number}. Predicted {figure_subjects[i_group]} for the lncRNA {query}. Terms are ranked by averaging the mean Pearson correlation coefficients between each gene in a gene set and {query}.*"))
    for ext in ['png','svg','pdf']:
        display(FileLink(figure_file + '.' + ext, result_html_prefix='Download Figure ' + str(figure_number) + ' (' + ext.upper() + '): '))

    # Save Predictions
    # Files are named <query>_<library>.csv, matching the precomputed branch and the other report outputs
    for library_name,prediction in zip(library_names,predictions):
        predictions_csv = "predicted_functions/" + file_query + '_' + library_name + '.csv'
        prediction.to_csv(predictions_csv)
        display(FileLink(predictions_csv, result_html_prefix='Download predictions: '))

{% endif %}

In [None]:
%%appyter code_exec
{% if options_fast_compute.raw_value == True %}

# Load precomputed function predictions for the query, two Enrichr libraries per figure.
predictions_dir = "predicted_functions/"
if not os.path.exists(predictions_dir):
    os.makedirs(predictions_dir, exist_ok=True)

# Import pre-computed lncRNA function predictions
prediction_libraries = ['MGI_Mammalian_Phenotype_Level_4_2021','GO_Biological_Process_2021','KEGG_2021_Human','DisGeNET','ChEA_2016','ENCODE_TF_ChIP-seq_2015']
prediction_libraries = np.array_split(prediction_libraries, int(np.ceil(len(prediction_libraries)/2)))

# Caption subject of each library pair, in the same order as the groups above.
# Group i is drawn as Figure i+3 and saved with the file suffix _f(i+1).
figure_subjects = ['MGI Mammalian Phenotypes and GO Biological Processes','KEGG pathways and DisGeNET disease terms','ChEA and ENCODE terms']
storage_prefix = 'storage/lncRNA_Appyter/v0.1.0/'
file_query = query.replace('/','-')

for i_group,group in enumerate(prediction_libraries):
    predictions = []
    library_names = []
    for pred_library in group:
        precomputed_avg_coexp = get_bf_pvalues(storage_prefix + pred_library + '_lncRNA_avg_coexpression.h5', storage_prefix + pred_library + '_lncRNA_pvalue.h5' , query)
        predictions.append(precomputed_avg_coexp)
        library_names.append(pred_library.replace('_',' '))

    figure_number = i_group + 3
    figure_file = predictions_dir + file_query + '_biological_function_predictions_f' + str(i_group + 1)
    plot_results(library_names=library_names, results_dfs=predictions,file_name=figure_file,pvalue=True)
    display(Markdown(f"*Figure {figure_number}. Predicted {figure_subjects[i_group]} for the lncRNA {query}. Terms are ranked by averaging the mean Pearson correlation coefficients between each gene in a gene set and {query}.*"))
    for ext in ['png','svg','pdf']:
        display(FileLink(figure_file + '.' + ext, result_html_prefix='Download Figure ' + str(figure_number) + ' (' + ext.upper() + '): '))

    # Save each library's predictions next to the figures
    for library_name,prediction in zip(library_names,predictions):
        predictions_csv = predictions_dir + file_query + '_' + library_name + '.csv'
        prediction.to_csv(predictions_csv)
        display(FileLink(predictions_csv, result_html_prefix='Download predictions: '))

{% endif %}

In [None]:
%%appyter markdown
### Expression of {{query.raw_value}} across tissues and cell lines
This part of the report provides the Z-score (Normalized Median Expression) for the lncRNA {{query.raw_value}} in various tissues and cell lines.

In [None]:
%%appyter markdown
Samples from Recount3[4] were automatically labelled by tissue type or cell line of origin. Tissue and cell line samples were log2 transformed and quantile normalized separately. Tissues and cell lines with less than 20 samples were removed, and z-scores were computed along the lncRNA axis to compare expression levels across all tissues and cell lines. The median expression of {{query.raw_value}} was then calculated for each tissue type and cell line.

In [None]:
# Load the per-tissue z-score table; row labels keep only the text before their first comma
tissue_expr_zscore = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.0.6/lncRNA_zscore_median_expr_by_tissue_filtered.csv', 'rb'),header=0, index_col=0)
tissue_expr_zscore.index = [label.partition(',')[0] for label in tissue_expr_zscore.index]

# Same for the per-cell-line table; column names are upper-cased with ' CELL' removed
cell_line_expr_zscore = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.0.6/lncRNA_zscore_median_expr_by_cell_line_filtered.csv', 'rb'),header=0, index_col=0)
cell_line_expr_zscore.index = [label.partition(',')[0] for label in cell_line_expr_zscore.index]
cell_line_expr_zscore.columns = [name.upper().replace(' CELL','') for name in cell_line_expr_zscore.columns]

# Output folder for tissue and cell line specific expression
expression_dir = "tissue_and_cell_line_expression/"
if not os.path.exists(expression_dir):
    os.makedirs("tissue_and_cell_line_expression", exist_ok=True)
expression_prefix = expression_dir + query.replace('/','-')

# Tissues ranked by the query's z-score (median expression), highest first
tissue_specific_lncRNA = pd.DataFrame(tissue_expr_zscore.loc[query]).sort_values(by=query,ascending=False)
tissue_specific_lncRNA.to_csv(expression_prefix + '_tissue_zscore' + '.csv')

# Cell lines ranked by the query's z-score (median expression), highest first
cell_line_specific_lncRNA = pd.DataFrame(cell_line_expr_zscore.loc[query]).sort_values(by=query,ascending=False)
cell_line_specific_lncRNA.to_csv(expression_prefix + '_cell_line_zscore' + '.csv')

In [None]:
%%appyter code_exec
# Bar plot of the query's z-score per tissue (Figure 6), followed by download links
expression_prefix = 'tissue_and_cell_line_expression/' + query.replace('/','-')
tissue_figure = expression_prefix + '_zscore_tissue_expression'
plot_bar(tissue_specific_lncRNA, query, 'Tissue', 'Z-score (Median Expression)', tissue_figure)
display(Markdown(f"*Figure 6. Z-score (median expression) for the lncRNA {query} in various tissue types.*"))
for ext in ['png','svg','pdf']:
    display(FileLink(tissue_figure + '.' + ext, result_html_prefix='Download Figure 6 (' + ext.upper() + '): '))
display(FileLink(expression_prefix + '_tissue_zscore' + '.csv', result_html_prefix='Download table with z-score (median expression) values for ' + query + ' in various tissue types: '))

In [None]:
%%appyter code_exec
# Bar plot of the query's z-score in the 30 top-ranked cell lines (Figure 7), followed by download links
expression_prefix = 'tissue_and_cell_line_expression/' + query.replace('/','-')
cell_line_figure = expression_prefix + '_zscore_cell_line_expression'
plot_bar(cell_line_specific_lncRNA.iloc[:30], query, 'Cell Line', 'Z-score (Median Expression)', cell_line_figure)
display(Markdown(f"*Figure 7. Z-score (median expression) for the lncRNA {query} in the top 30 cell lines.*"))
for ext in ['png','svg','pdf']:
    display(FileLink(cell_line_figure + '.' + ext, result_html_prefix='Download Figure 7 (' + ext.upper() + '): '))
display(FileLink(expression_prefix + '_cell_line_zscore' + '.csv', result_html_prefix='Download table with z-score (median expression) values for ' + query + ' in various cell lines: '))

In [None]:
%%appyter markdown
### Visualizing all lncRNAs based on their gene expression similarity across tissues

In [None]:
%%appyter markdown
We applied UMAP[8] to 3,000 randomly selected samples from Recount3[4] to visualize lncRNA expression patterns. Samples were first log2 transformed and quantile normalized, then UMAP was applied to the lncRNA expression data with samples as features. Each data point represents a single lncRNA (n=15,862). Use the drop-down menu to color lncRNAs by expression z-score in a specific tissue. The black arrow is pointing to the location of {{query.raw_value}}.

In [None]:
# Import UMAP coordinates
umap_results_df = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.1.0/umap_tissues.csv', 'rb'),header=0, index_col=0)

# Find z-score (median expression) for each lncRNA in each tissue
# NOTE(review): this re-downloads the CSV that tissue_expr_zscore was loaded from and is not used in
# this cell; kept in case a later cell reads it — confirm and remove if unused.
tissue_expr_median_expr = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.0.6/lncRNA_zscore_median_expr_by_tissue_filtered.csv', 'rb'),header=0, index_col=0)

# One list of z-scores per tissue, keyed by tissue name (drives the colour drop-down)
unique_tissues = np.unique(tissue_expr_zscore.columns)
values_dict_tz = {t: list(tissue_expr_zscore[t]) for t in unique_tissues}

# Create the UMAP output folders for both the tissue and the cell-line figures.
# No existence guard: exist_ok=True already makes the calls safe to repeat, and the previous guard on
# "umap/" skipped creating the sub-folders whenever the parent folder existed without them.
os.makedirs("umap/cell_lines/figures/static", exist_ok=True)
os.makedirs("umap/tissues/figures/static", exist_ok=True)

# Initial colouring and the saved static image both use the tissue ranked first for the query
plot_dynamic_scatter(
    umap_df=umap_results_df,
    values_dict=values_dict_tz,
    option_list=list(unique_tissues),
    sample_names=list(umap_results_df.index),
    caption_text='UMAP was applied to 3,000 randomly selected samples from Recount3. Each data point represents a lncRNA (n=15,862) and are colored by z-score (median expression) in ',
    figure_counter=8,
    category_list_dict=None,
    category=False,
    dropdown=True,
    color_by_title='Z-score',
    highlight_query=query,
    first_selection=list(tissue_specific_lncRNA.index)[0],
    static_images_save=list(tissue_specific_lncRNA.index[0:1]),
    file_path='umap/tissues/figures/',
)

In [None]:
%%appyter markdown
### Visualizing all lncRNAs based on their gene expression similarity across cell lines 

In [None]:
%%appyter markdown
We applied UMAP[8] to 3,000 randomly selected samples from Recount3[4] to visualize lncRNA expression patterns. Samples were first log2 transformed and quantile normalized, then UMAP was applied to the lncRNA expression data with samples as features. Each data point represents a single lncRNA (n=15,862). Use the drop-down menu to color lncRNAs by expression z-score in a specific cell line. The black arrow is pointing to the location of {{query.raw_value}}.

In [None]:
%%appyter code_exec
# Find z-score (median expression) for each lncRNA in each cell line
cell_line_expr_median_expr = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.0.6/lncRNA_zscore_median_expr_by_cell_line_filtered.csv', 'rb'),header=0, index_col=0) 

values_dict_cz = dict()
unique_cell_lines = np.unique(cell_line_expr_zscore.columns)
for t in unique_cell_lines:
    values_dict_cz[t] = list(cell_line_expr_zscore[t])
    
plot_dynamic_scatter(umap_df=umap_results_df, values_dict=values_dict_cz,option_list=list(unique_cell_lines) ,sample_names=list(umap_results_df.index),caption_text='UMAP was applied to 3,000 randomly selected samples from Recount3. Each data point represents a lncRNA (n=15,862) and are colored by z-score (median expression) in ', figure_counter=9,category_list_dict=None, category=False,dropdown=True,color_by_title='Z-score',highlight_query=query,first_selection=list(cell_line_specific_lncRNA.index)[0], static_images_save = list(cell_line_specific_lncRNA.index[0:1]), file_path='umap/cell_lines/figures/')

In [None]:
%%appyter markdown
### L1000 small molecules predicted to modulate {{query.raw_value}}

In [None]:
%%appyter markdown
~1.4 million Level 5 L1000 chemical perturbation gene expression signatures were downloaded from SigCom LINCS (https://maayanlab.cloud/sigcom-lincs)[9]. For each unique signature and lncRNA pair, a mean Pearson correlation coefficient was computed by taking the average Pearson coefficient between the lncRNA and all genes in the signature. All 15,862 lncRNAs were then ranked by mean Pearson correlation coefficient, and the top 1,000 lncRNAs with the highest coefficients were retained for each signature. The top 500 lncRNA-L1000 signature associations are reported here for {{query.raw_value}}, separated by direction. If {{query.raw_value}} is highly correlated with the up-regulated genes for a specific small molecule, then this small molecule is predicted to up-regulate {{query.raw_value}}.

In [None]:
%%appyter markdown
### L1000 small molecules predicted to up-regulate {{query.raw_value}}

In [None]:
%%appyter markdown
The prioritized small molecules below are predicted to specifically up-regulate {{query.raw_value}}.

In [None]:
%%appyter code_exec
# Load predicted small molecules to modulate the lncRNA
l1000_prediction_file = pd.read_csv(s3.open('storage/lncRNA_Appyter/v0.1.0/l1000_sm_lncRNAs_pvalues.tsv', 'rb'),sep='\t',header=None, index_col=0)
if query in list(l1000_prediction_file.index):
    l1000_prediction_file = l1000_prediction_file.loc[query]
    l1000_prediction_file = list(l1000_prediction_file.dropna(axis=0))
    drugs_up = []
    drugs_up_corr = []
    drugs_up_pval = []
    drugs_down = []
    drugs_down_corr=[]
    drugs_down_pval = []
    for sig in l1000_prediction_file:
        sig_id = sig.split(',')[0]
        sig_id_corr = float(sig.split(',')[1])
        sig_pval = float(sig.split(',')[2])
        if sig_id.split(' ')[1] == 'down':
            drugs_down.append(sig_id)
            drugs_down_corr.append(sig_id_corr)
            drugs_down_pval.append(sig_pval)
        else:
            drugs_up.append(sig_id)
            drugs_up_corr.append(sig_id_corr)
            drugs_up_pval.append(sig_pval)

    up_results = pd.DataFrame({'L1000 Signature ID':drugs_up,'Drug': [x.split('_')[4] for x in drugs_up],'Up/Down':[x.split(' ')[1] for x in drugs_up],'Dose':[x.split(' ')[0].split('_')[-1] for x in drugs_up],'Cell line':[x.split('_')[1] for x in drugs_up],'Time point':[x.split('_')[2] for x in drugs_up],'Mean Pearson Correlation':drugs_up_corr,'P-value':drugs_up_pval})
    down_results = pd.DataFrame({'L1000 Signature ID':drugs_down,'Drug': [x.split('_')[4] for x in drugs_down],'Up/Down':[x.split(' ')[1] for x in drugs_down],'Dose':[x.split(' ')[0].split('_')[-1] for x in drugs_down],'Cell line':[x.split('_')[1] for x in drugs_down],'Time point':[x.split('_')[2] for x in drugs_down],'Mean Pearson Correlation':drugs_down_corr,'P-value':drugs_down_pval})
else:
    up_results = pd.DataFrame()
    down_results = pd.DataFrame()

In [None]:
%%appyter code_exec
# L1000 small molecules predicted to up-regulate the lncRNA of interest
if len(up_results) > 0:
    display(up_results[0:20])
    display(Markdown(f"*Table 7. L1000 small molecules predicted to up-regulate the lncRNA {query}.*"))

    # Create folder for tissue and cell line specific expression
    if not os.path.exists("l1000_sm_predictions/"):
        os.makedirs("l1000_sm_predictions/", exist_ok=True)
    up_results.to_csv("l1000_sm_predictions/" + query.replace('/','-') + '_l1000_sm_predictions_up' + '.csv')
    display(FileLink("l1000_sm_predictions/" + query.replace('/','-') + '_l1000_sm_predictions_up' + '.csv', result_html_prefix=str('Download Table 7: ')))
else:
    display(Markdown(f"**There are no small molecules predicted to specifically up-regulate the expression of {query}.**"))

In [None]:
%%appyter markdown
### L1000 small molecules predicted to down-regulate {{query.raw_value}}

In [None]:
%%appyter markdown
The prioritized small molecules below are predicted to specifically down-regulate {{query.raw_value}}.

In [None]:
%%appyter code_exec
# L1000 small molecules predicted to down-regulate the lncRNA of interest
if len(down_results) > 0:
    display(down_results[0:20])
    display(Markdown(f"*Table 8. L1000 small molecules predicted to down-regulate the lncRNA {query}.*"))
    # Create folder for tissue and cell line specific expression
    if not os.path.exists("l1000_sm_predictions/"):
        os.makedirs("l1000_sm_predictions/", exist_ok=True)
    down_results.to_csv("l1000_sm_predictions/" + query.replace('/','-') + '_l1000_sm_predictions_down' + '.csv')
    display(FileLink("l1000_sm_predictions/" + query.replace('/','-') + '_l1000_sm_predictions_down' + '.csv', result_html_prefix=str('Download Table 8: ')))
else:
    display(Markdown(f"**There are no small molecules predicted to specifically down-regulate the expression of {query}.**"))

In [None]:
# Close the HDF5 handle `f` that was opened with h5.File near the top of the notebook
# (the gene-mapping file read through s3.open); no later cell reads from it.
# NOTE(review): only the h5py handle is closed here - the object returned by s3.open
# is not closed explicitly; confirm whether that matters.
f.close()

### References
[1] Frankish A, Diekhans M, Jungreis I, Lagarde J, Loveland JE, Mudge JM, Sisu C, Wright JC, Armstrong J, Barnes I: GENCODE 2021. Nucleic Acids Research 2021, 49(D1):D916-D923.

[2] Howe KL, Achuthan P, Allen J, Allen J, Alvarez-Jarreta J, Amode MR, Armean IM, Azov AG, Bennett R, Bhai J: Ensembl 2021. Nucleic Acids Research 2021, 49(D1):D884-D891.

[3] Tweedie S, Braschi B, Gray K, Jones TEM, Seal RL, Yates B, Bruford EA: Genenames.org: the HGNC and VGNC resources in 2021. Nucleic Acids Research 2021, 49(D1):D939-D946.

[4] Wilks C, Zheng SC, Chen FY, Charles R, Solomon B, Ling JP, Imada EL, Zhang D, Joseph L, Leek JT: recount3: summaries and queries for large-scale RNA-seq expression and splicing. bioRxiv 2021:2021.05.21.445138.

[5] Xie Z, Bailey A, Kuleshov MV, Clarke DJB, Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM: Gene Set Knowledge Discovery with Enrichr. Current Protocols 2021, 1(3):e90.

[6] Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma’ayan A: Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics 2013, 14(1):128.

[7] Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A: Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research 2016, 44(W1):W90-W97.

[8] McInnes L, Healy J, Melville J: Umap: Uniform manifold approximation and projection for dimension reduction. arXiv preprint arXiv:1802.03426 2018.

[9] Evangelista et al. SigCom LINCS: Data and Metadata Search Engine for Gene Expression Signatures. 2021. In preparation.