In [None]:
#%%appyter init
# Bootstrap the appyter magics used by the `%%appyter ...` cells below.
from appyter import magic
# The lambda (with `globals` bound as its default argument) returns this notebook's globals() when called.
magic.init(lambda _=globals: _())

In [None]:
%%appyter hide_code_exec
# Declares the form section named 'section1'; the species/gene input field refers to it through section = 'section1'.
{% do SectionField(name='section1', title = '1. Please input a valid gene symbol or Ensembl ID', subtitle = '', img = 'lncRNA_appyter_logo.png')%}

In [None]:
%%appyter code_exec
# Declares the tabbed input: one tab per species ('Human' / 'Mouse'), each holding a single
# autocomplete field for the gene of interest. The field names ('Homo_sapiens' / 'Mus_musculus')
# are what later cells compare against to branch on species.
{% set query = TabField(
    name = 'species_input',
    label = 'Species of Interest',
    default = 'Human',
    description = 'Select the species of interest.',
    section = 'section1',
    choices = {
        'Human': [
            AutocompleteField(
                name = 'Homo_sapiens',
                label = 'Human Gene of Interest',
                default = 'HOTAIR',
                description = 'Enter the lncRNA symbol or ENSEMBL id of interest (human).',
                file_path = 'https://appyters.maayanlab.cloud/storage/lncRNA_Appyter/v0.1.3/human_lncRNAs.json'
            )
        ],
        'Mouse': [
            AutocompleteField(
                name = 'Mus_musculus',
                label = 'Mouse Gene of Interest',
                default = 'Crnde',
                description = 'Enter the lncRNA symbol or ENSEMBL id of interest (mouse).',
                file_path = 'https://appyters.maayanlab.cloud/storage/lncRNA_Appyter/v0.1.3/mouse_lncRNAs.json'
            )
        ]
    }
)%}


In [None]:
%%appyter code_exec
# Name of the selected tab's field ('Homo_sapiens' or 'Mus_musculus'), passed through the jsonify filter.
species = {{ query.value[0]["args"]["name"]|jsonify }}
# Rendered value of the selected gene field.
# NOTE(review): presumably renders as a Python string literal holding the gene symbol / Ensembl ID — confirm against appyter's field rendering.
query = {{ query.value[0] }}

In [None]:
%%appyter code_exec

# Species-dependent settings. `path` is the prefix inserted into storage file names by later
# cells ('mm_' for mouse, empty for human); `num_lnc` is the lncRNA count quoted in the report.
# `num_lnc` is set twice per branch: once as a Python variable and once as a template variable.
{% if query.value[0]["args"]["name"] == "Mus_musculus" %}
path = 'mm_'
num_lnc = '11,274'
{% set num_lnc = "11,274" %}
{% else %}
path = ''
num_lnc = '18,705'
{% set num_lnc = "18,705" %}
{% endif %}

In [None]:
%%appyter markdown
# Report about the Long Non-coding RNA (lncRNA) {{query.value[0].raw_value}}
Based on lncRNA-gene co-expression, this report contains predictions regarding 
the biological functions of {{query.value[0].raw_value}} and small molecules that may specifically up- or 
down-regulate {{query.value[0].raw_value}} expression. This report also contains the gene coordinates for 
{{query.value[0].raw_value}}, canonical/alternative transcript sequences, publications per year, 
expression statistics of {{query.value[0].raw_value}} across tissues and cell-lines, subcellular 
localization across cell types and lncRNA-lncRNA expression similarities.


In [None]:
# Import libraries 
import pandas as pd 
import numpy as np
import h5py as h5
from IPython.display import display,FileLink, HTML, Markdown, IFrame
import os
from utils import *
import s3fs
from bokeh.io import output_notebook
import json
output_notebook()

In [None]:
# Persist the queried gene as a one-element JSON list
with open('gene.json', 'w') as gene_file:
    gene_file.write(json.dumps([query]))

In [None]:
# Import gene mapping data
s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(endpoint_url='https://s3.appyters.maayanlab.cloud'))
f = h5.File(s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}ARCHS4_lncRNA_corr.h5', 'rb'), 'r')

# Gene symbols and Ensembl IDs stored for the columns and rows of the correlation matrix
# (stored as UTF-8 byte strings, hence the decode)
col_genes = [x.decode('UTF-8') for x in f["meta/columns/genes"]]
col_genes_ensembl = [x.decode('UTF-8') for x in f["meta/columns/ensembl"]]
row_genes =  [x.decode('UTF-8') for x in f["meta/rows/genes"]]
row_genes_ensembl = [x.decode('UTF-8') for x in f["meta/rows/ensembl"]]

# Convert input Ensembl ID to gene symbol
ensembl_2_genes = dict(zip(row_genes_ensembl,row_genes))
genes_2_ensembl = dict(zip(row_genes,row_genes_ensembl))
if query in col_genes_ensembl:
    query_new = ensembl_2_genes[query]
    if query != query_new:
        print('The Ensembl ID ' + query + ' has been converted to the gene symbol ' + query_new )
        query = query_new
elif query not in genes_2_ensembl:
    # Every later cell looks the query up in genes_2_ensembl, so an unknown gene cannot
    # produce a report; stop here with a readable message instead of a bare KeyError later.
    raise ValueError("'" + query + "' was not found among the gene symbols or Ensembl IDs of the co-expression matrix.")

print('Predicting functions for ' + query + '(' + genes_2_ensembl[query] + ')')

In [None]:
%%appyter markdown
### Genomic coordinates for {{query.value[0].raw_value}} 
{% if query.value[0]["args"]["name"] == "Homo_sapiens" %}
The genomic coordinates for {{query.value[0].raw_value}} are provided from GENCODE (gencode.v41.long_noncoding_RNAs.gtf)[1].
{% else %}
The genomic coordinates for {{query.value[0].raw_value}} are provided from GENCODE (gencode.vM30.long_noncoding_RNAs.gtf)[1].
{% endif %}

In [None]:
# Load the coordinate table and keep only the row(s) of the queried lncRNA
# (a query that is itself an 'ENSG...' ID is matched directly, otherwise via its Ensembl ID)
target_ensembl_id = query if query.startswith('ENSG') else genes_2_ensembl[query]
gene_coordinates = pd.read_csv(s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}lncrna_coordinates.csv'),header=0,index_col=0)
# Version-less Ensembl ID (text before the first '.') used only for matching
gene_coordinates['ensembl_id'] = [gene_id.split('.')[0] for gene_id in gene_coordinates['gene_id']]
gene_coordinates = (
    gene_coordinates[gene_coordinates['ensembl_id'] == target_ensembl_id]
    .reset_index(drop=True)
    .drop(columns=['ensembl_id'])
)

# Write the coordinates to the gene_info/ output folder
if not os.path.exists("gene_info/"):
    os.makedirs("gene_info/", exist_ok=True)
gene_coordinates.to_csv('gene_info/' + query.replace('/','-') + '_gene_coordinates.csv')

In [None]:
%%appyter code_exec
display(gene_coordinates)
display(Markdown(f"*Table 1. Genomic coordinates for {query}.*"))
display(FileLink('gene_info/' + query.replace('/','-') + '_gene_coordinates.csv', result_html_prefix=str('Download Table 1: ')))

In [None]:
%%appyter markdown
###  Transcript sequences for {{query.value[0].raw_value}}
The canonical and alternative cDNA sequences for {{query.value[0].raw_value}} are provided from Ensembl ({{ query.value[0]["args"]["name"] }}.GRCh38.ncrna.fa)[2].

In [None]:
%%appyter code_exec
try:
    # Get canonical transcript ID for input lncRNA
    canonical_transcript_id = pd.read_csv(s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}gene_canonical_metadata.csv'),header=0,index_col=0)
    canonical_transcript_id  = canonical_transcript_id[canonical_transcript_id['Gene stable ID']==genes_2_ensembl[query]]['Transcript stable ID version'].values[0]
    # Import all canonical cDNA sequences
    canonical_sequences = pd.read_csv(s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}lncrna_canonical_sequences.csv'),header=0,index_col=0)
    # Get canonical sequence for input lncRNA
    canonical_sequences = canonical_sequences[canonical_sequences['id']==canonical_transcript_id].reset_index(drop=True)

    if len(canonical_sequences) > 0:
        canonical_sequences.to_csv('gene_info/' + query.replace('/','-') + '_canonical_sequence.csv')

        display(canonical_sequences)
        display(Markdown(f"*Table 2. Canonical transcript sequence for {query}.*"))
        display(FileLink('gene_info/' + query.replace('/','-') + '_canonical_sequence.csv', result_html_prefix=str('Download Table 2: ')))
    else:
        display(Markdown(f"**No canonical transcript sequence found for {query}.**"))

except:
    display(Markdown(f"**No canonical transcript sequence found for {query}.**"))

In [None]:
%%appyter code_exec
try:
    # Get alternative transcript IDs for input lncRNA
    alternative_transcript_id = pd.read_csv(s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}gene_alternative_metadata.csv'),header=0,index_col=0)
    alternative_transcript_id = alternative_transcript_id[alternative_transcript_id['Gene stable ID']==genes_2_ensembl[query]]['Transcript stable ID version'].values

    # Import all alternative cDNA sequences
    alternative_sequences = pd.read_csv(s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}lncrna_alternative_sequences.csv'),header=0,index_col=0)

    # Get alternative sequences for input lncRNA
    alternative_sequences = alternative_sequences[alternative_sequences['id'].isin(alternative_transcript_id)].reset_index(drop=True)

    if len(alternative_sequences) > 0:
        alternative_sequences.to_csv('gene_info/' + query.replace('/','-') + '_alternative_sequence.csv')

        display(alternative_sequences)
        display(Markdown(f"*Table 3. Alternative transcript sequence(s) for {query}.*"))
        display(FileLink('gene_info/' + query.replace('/','-') + '_alternative_sequence.csv', result_html_prefix=str('Download Table 3: ')))
    else:
        display(Markdown(f"**No alternative transcript sequence(s) found for {query}.**"))
except:
      display(Markdown(f"**No alternative transcript sequence(s) found for {query}.**"))

In [None]:
%%appyter markdown
### Publications that mention {{query.value[0].raw_value}} 
The PubMed API was used to generate AutoRIF data for {{query.value[0].raw_value}}. All PubMed IDs and dates were automatically collected for articles mentioning the lncRNA {{query.value[0].raw_value}}. The Ensembl ID[2], lncRNA gene symbol from GENCODE[1], and any previous symbols found in the HGNC database[3] along with the terms ‘lncRNA’ or ‘long non-coding RNA’ were used to query PubMed (e.g., “(ENSG00000228630 OR HOTAIR) AND (lncRNA OR long non-coding RNA)”).

In [None]:
#%%appyter code_exec
# Get AutoRIF data for input lncRNA

autorif_results = []
query_ensembl = genes_2_ensembl[query]  # loop-invariant, so looked up once

# Read through a context manager so the remote file handle is released even on error
with s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}lncRNA-autorif_final.tsv', 'rb') as autorif_file:
    autorif_lines = autorif_file.readlines()[:-2]  # the last two lines of the file are skipped

for line in autorif_lines:
    line = line.strip().decode('UTF-8')
    # Fields are space-separated; empty strings from repeated spaces are discarded
    line = [x for x in line.split(' ') if x != '']
    # The first field carries the Ensembl ID in parentheses
    lncrna = line[0].split("(")[1].replace(")",'')

    if lncrna == query_ensembl:
        # Remaining fields are "pmid,date" pairs
        autorif_results = line[1:]
        break

# Save autorif data
os.makedirs("autorif/", exist_ok=True)
autorif_prefix = 'autorif/' + query.replace('/','-') + '_autorif'

if len(autorif_results) > 0:

    autorif_results_df = pd.DataFrame({'pmid':[x.split(',')[0] for x in autorif_results],'date':[x.split(',')[1] for x in autorif_results]})
    autorif_results_df = autorif_results_df.sort_values(by='date',ascending=False)
    autorif_results_df = autorif_results_df[~autorif_results_df.date.str.contains("2022-")].reset_index(drop=True)# remove early 2022 publications
    total_pubs = len(autorif_results_df)
    autorif_results_df.to_csv(autorif_prefix + '_results.csv')

    autorif_plot(autorif_results_df,query,autorif_prefix)
    display(Markdown(f"*Figure 1. Publications mentioning the lncRNA {query}. {total_pubs} total publications mentioned {query} from 1992 to 2021*"))
    display(FileLink(autorif_prefix + '.png', result_html_prefix=str('Download Figure 1 (PNG): ')))
    display(FileLink(autorif_prefix + '.svg', result_html_prefix=str('Download Figure 1 (SVG): ')))
    display(FileLink(autorif_prefix + '.pdf', result_html_prefix=str('Download Figure 1 (PDF): ')))
    display(FileLink(autorif_prefix + '_results.csv', result_html_prefix=str('Download AutoRIF data: ')))
else:
    display(Markdown(f"**No publications found for {query}.**"))

In [None]:
%%appyter markdown
### Import lncRNA-gene co-expression matrix
This lncRNA-gene co-expression matrix was generated by computing the Pearson correlation coefficients for 6,000 randomly selected bulk RNA-seq samples from ARCHS4[4]. NOTE: If an Ensembl ID was entered, it will be converted to its corresponding gene symbol if available. 

In [None]:
# Import lncRNA-gene co-expression matrix
# `corr` refers to the "data/correlation" dataset of the HDF5 file opened earlier as `f`;
# later cells index it as corr[:, column] to pull one lncRNA's correlations.
corr =f["data/correlation"]

In [None]:
%%appyter markdown
### Top genes correlated with {{query.value[0].raw_value}}

In [None]:
%%appyter markdown
Using the loaded lncRNA-gene correlation matrix, we report the genes that are most positively and negatively correlated with {{query.value[0].raw_value}}.

In [None]:
# Make sure the output folder for the correlation tables exists
if not os.path.exists("gene_correlations/"):
    os.makedirs("gene_correlations/", exist_ok=True)

correlation_column = "Pearson's Correlation Coefficient"
correlations_prefix = 'gene_correlations/' + query.replace('/','-')

# Column position of the lncRNA of interest in the correlation matrix
idx_query = np.flatnonzero(np.asarray(col_genes) == query)[0]

# One row per gene, ranked from most positively to most negatively correlated
lncRNA_coexp = pd.DataFrame(
    corr[:,idx_query],
    index=row_genes,
    columns=[correlation_column],
).sort_values(by=correlation_column, ascending=False)

# Same table ranked the other way round
lncRNA_neg_coexp = lncRNA_coexp.sort_values(by=correlation_column, ascending=True)

# Write both rankings to CSV; the first row of the positive ranking is left out
lncRNA_coexp.iloc[1:].to_csv(correlations_prefix + '_positively_correlated_genes.csv')
lncRNA_neg_coexp.to_csv(correlations_prefix + '_negatively_correlated_genes.csv')

In [None]:
%%appyter code_exec
display(lncRNA_coexp[1:21])
display(Markdown(f"*Table 4. The Top 20 genes positively correlated with {query} ranked by Pearson’s correlation coefficients.*"))
display(FileLink('gene_correlations/' + query.replace('/','-') + '_positively_correlated_genes.csv', result_html_prefix=str('Download Table 4: ')))

In [None]:
%%appyter code_exec
display(lncRNA_neg_coexp[0:20])
display(Markdown(f"*Table 5. The Top 20 genes negatively correlated with {query} ranked by Pearson’s correlation coefficients.*"))
display(FileLink('gene_correlations/' + query.replace('/','-') + '_negatively_correlated_genes.csv', result_html_prefix=str('Download Table 5: ')))

In [None]:
%%appyter markdown
### Interactive network visualization of the top 100 genes positively correlated with {{query.value[0].raw_value}}
Interactive network visualization of the top 100 genes positively correlated with {{query.value[0].raw_value}}. 
Each node represents a gene and is colored by chromosome location, except for the bright red node which 
represents the lncRNA {{query.value[0].raw_value}}. The thickness of the edges corresponds to Pearson correlation coefficients.
Clicking on a gene node will highlight its corresponding edges in orange. Hovering over a node will display the gene 
name and chromosome location.
Network Methods: All pairwise correlations between the top 100 genes positively correlated with 
{{query.value[0].raw_value}} are extracted. The 3 edges with the highest correlation per gene node are used to 
initialize the network. Edges with weights < 0.3 are dropped. To further prune the network, the edge 
with the lowest weight for each hub node is dropped. At the start, a hub node is defined as a node with > 10 edges. 
The pruning process is repeated until the network has an average of < 3 edges per node. The top 5 edges for 
{{query.value[0].raw_value}} are shown regardless of their weights.

In [None]:
# Build the co-expression network for the top positively correlated genes and save its parts
if not os.path.exists("coexpression_network/"):
    os.makedirs("coexpression_network/", exist_ok=True)

network_prefix = "coexpression_network/" + query.replace('/','-')

# network_vis returns an indexable result: [0] the network object, [1] node metadata, [2] edge metadata
g = network_vis(query,lncRNA_coexp,genes_2_ensembl,row_genes, path)
network_graph, node_metadata, edge_metadata = g[0], g[1], g[2]

node_metadata.to_csv(network_prefix + '_network_node_metadata.csv')
edge_metadata.to_csv(network_prefix + '_network_edge_metadata.csv')

network_html = network_graph.generate_html(notebook=True)
with open(network_prefix + '_network.html', "w") as out:
    out.write(network_html)

# Last expression of the cell: embeds the saved HTML in the notebook output
IFrame(network_prefix + '_network.html', width="1000px", height="600px")

In [None]:
%%appyter code_exec
display(Markdown(f"*Figure 2. Interactive network visualization of the top 100 genes positively correlated with {query}.*"))
display(FileLink('coexpression_network/' + query.replace('/','-') + '_network.html', result_html_prefix=str('Download Figure 2: ')))
display(FileLink("coexpression_network/"+query.replace('/','-')+'_network_node_metadata.csv', result_html_prefix=str('Download Node metadata: ')))
display(FileLink("coexpression_network/"+query.replace('/','-')+'_network_edge_metadata.csv', result_html_prefix=str('Download Edge metadata: ')))

In [None]:
%%appyter markdown
### Enrichment analysis applied to the top genes most positively and negatively correlated with {{query.value[0].raw_value}}

In [None]:
%%appyter markdown
The top genes most positively and negatively correlated with {{query.value[0].raw_value}} are submitted to Enrichr[6-8] for enrichment analysis. NOTE: Only genes with gene symbols are submitted to Enrichr. Ensembl IDs that do not map to an official gene symbol were dropped.

In [None]:
%%appyter code_exec
# Get positively and negatively correlated genes
n_genes = [25, 50, 100, 200, 300, 500]
top_genes = [x for x in list(lncRNA_coexp.index) if not x.startswith('ENSG')] # only keep genes with gene symbols
top_genes = [x for x in top_genes if x != query] # remove query from end of list 
top_neg_genes = top_genes[::-1] # reverse list for top negatively correlated genes

# Get enrichemnt reuslts for top n positively correlated genes with input lncRNA
display(HTML("<font size=4> <b>Enrichment analysis reuslts for positively correlated genes with {query} </b></font>".format(query=query)))
for n in n_genes:
        enrichr_link_pos = Enrichr_API(top_genes[0:n],str('Top 200 positively correlated genes with the lncRNA: ' + query))
        
        # Save enrichr link to text file
        if not os.path.exists("enrichment_analysis/"):
                os.makedirs("enrichment_analysis/", exist_ok=True)
        if enrichr_link_pos != 'Error':
                open_file = open('enrichment_analysis/'+query.replace('/','-')+'_top_'+str(n)+'_positively_correlated_genes_Enrichr_link.txt','w')
                open_file.write(enrichr_link_pos)
                open_file.close()

                display(HTML("Access the enrichment analysis results for the top  <b>{n} positively</b> correlated genes with {query} here: <a href='{href}'>{link}</a>".format(href=enrichr_link_pos, link = enrichr_link_pos, query=query, n=n)))

display(HTML("<font size=4> \n\n\n <b>Enrichment analysis reuslts for negatively correlated genes with {query} </b></font>".format(query=query)))

# Get enrichemnt reuslts for top n negatively correlated genes with input lncRNA
for n in n_genes:
        enrichr_link_neg = Enrichr_API(top_neg_genes[0:n],str('Top 200 negatively correlated genes with the lncRNA: ' + query))
        if enrichr_link_neg != 'Error':
                open_file = open('enrichment_analysis/'+query.replace('/','-')+'_top_'+str(n)+'_negatively_correlated_genes_Enrichr_link.txt','w')
                open_file.write(enrichr_link_neg)
                open_file.close()
        
                display(HTML("Access the enrichment analysis results for the top <b>{n} negatively</b> correlated genes with {query} here: <a href='{href}'>{link}</a>".format(href=enrichr_link_neg, link = enrichr_link_neg, query=query, n=n)))

In [None]:
%%appyter markdown
### Top lncRNAs positively correlated with {{query.value[0].raw_value}}

In [None]:
%%appyter markdown
Below we list the top 20 lncRNAs, out of all {{num_lnc}} lncRNAs within our database, that correlate most with {{query.value[0].raw_value}} based on their Pearson correlation coefficients.

In [None]:
# Keep only the rows that are lncRNAs (the matrix columns) and rank them by correlation
correlation_column = "Pearson's Correlation Coefficient"
lncRNA_lncRNA_coexp = (
    lncRNA_coexp
    .loc[col_genes]
    .sort_values(by=correlation_column, ascending=False)
)

# Write the ranking to CSV, leaving out the first row
lncRNA_lncRNA_coexp.iloc[1:].to_csv('gene_correlations/' + query.replace('/','-') + '_positively_correlated_lncRNAs.csv')

In [None]:
%%appyter code_exec
display(lncRNA_lncRNA_coexp[1:21])
display(Markdown(f"*Table 6. Top 20 lncRNAs that correlate most with {query} ranked by Pearson correlation coefficients.*"))
display(FileLink('gene_correlations/' + query.replace('/','-') + '_positively_correlated_lncRNAs.csv', result_html_prefix=str('Download Table 6: ')))

In [None]:
%%appyter markdown
### lncATLAS localization information and predicted localization of {{query.value[0].raw_value}}

In [None]:
%%appyter markdown
Cell localization information was sourced from lncATLAS[5]. If lncATLAS does not contain information for the entered lncRNA, predicted localization scores will be shown. Predicted localizations were calculated with unsupervised learning data using ranked correlations from ARCHS4 and the available data from lncATLAS[5].

In [None]:
%%appyter code_exec
if not os.path.exists("localization/"):
        os.makedirs("localization/", exist_ok=True)

lncAtlas_ensembl = json.load(s3.open('storage/lncRNA_Appyter/v0.1.3/lnc_celltype_ranked_ensem.json'))

lncAtlas_localization = {'cell type': [], 'CN RCI': []}
for cell_type in lncAtlas_ensembl:
    if genes_2_ensembl[query] in lncAtlas_ensembl[cell_type]:
        lncAtlas_localization['CN RCI'].append(lncAtlas_ensembl[cell_type][genes_2_ensembl[query]])
        lncAtlas_localization['cell type'].append(cell_type)

if len(lncAtlas_localization['cell type']) > 0: 
    plt.Figure()
    df = pd.DataFrame(lncAtlas_localization)
    df.plot.bar(x='cell type', y='CN RCI')
    plt.title(f'{query}({genes_2_ensembl[query]}) Cytoplasmic/Nuclear Localization: RCI')
    plt.ylabel("<--- Nucleus   Cytoplasam --->")
    plt.xlabel("Cell Type")
    ax = plt.gca()
    ax.set_axisbelow(True)
    ax.yaxis.grid(color='gray', linestyle='dashed')
    ax.set_ylim([-4, 4])
else:
    display(Markdown(f"**No lncATLAS localization information found for {query}.**"))
{% if query.value[0]["args"]["name"] == 'Homo_sapiens'%}
    predicted = json.load(s3.open('storage/lncRNA_Appyter/v0.1.3/scores_normalized_trunc.json'))
    df = pd.DataFrame({'cell type': ['A549', 'H1.hESC', 'IMR.90', 'MCF.7', 'HUVEC'], 'Predicted localization': [predicted['A549'][query], predicted['H1.hESC'][query], predicted['IMR.90'][query], predicted['MCF.7'][query], predicted['HUVEC'][query]]})
    df.plot.bar(x='cell type', y='CN RCI')
    plt.ylabel("<--- Nucleus   Cytoplasam --->")
    plt.xlabel("Predicted Localization")
    ax = plt.gca()
    ax.set_axisbelow(True)
    ax.yaxis.grid(color='gray', linestyle='dashed')
    ax.set_ylim([-.5, .5])
# Save gene coordinates
plt.show()
df.to_csv('localization/' + query.replace('/','-') + '_localization.csv')
plt.savefig('localization/' + query.replace('/','-') + '_localization.png')
{% endif %}

In [None]:
%%appyter markdown
### Predicted biological functions of {{query.value[0].raw_value}}

In [None]:
%%appyter markdown
For each Enrichr[6-8] library, we compute the mean Pearson correlation coefficient for each gene set by averaging 
the Pearson correlation coefficients between each gene in the gene set and {{query.value[0].raw_value}}. Terms are ranked 
by the right-tailed and left-tailed p-values of their mean Pearson correlation coefficients. For each library, 
terms with significant right-tailed (red bar graphs) or left-tailed (blue bar graphs) p-values are predicted to 
be associated with {{query.value[0].raw_value}}.

In [None]:
# Store biological function predictions
if not os.path.exists("predicted_functions/"):
    os.makedirs("predicted_functions/", exist_ok=True)

# Shared pieces of the input and output file names
storage_prefix = 'storage/lncRNA_Appyter/v0.1.3/' + path
output_prefix = 'predicted_functions/' + query.replace('/','-')
tail = 'right-tailed-pvalue'

# Per library pair: figure number, figure file tag and the subject named in the caption
figure_specs = {
    0: ('3', 'f1', 'MGI Mammalian Phenotypes and GO Biological Processes'),
    1: ('4', 'f2', 'KEGG pathways and DisGeNET disease terms'),
    2: ('5', 'f3', 'ChEA and ENCODE terms'),
}

# Import pre-computed lncRNA function predictions, two libraries per figure
prediction_libraries = ['MGI_Mammalian_Phenotype_Level_4_2021','GO_Biological_Process_2021','KEGG_2021_Human','DisGeNET','ChEA_2022','ENCODE_TF_ChIP-seq_2015']
prediction_libraries = np.array_split(prediction_libraries, int(np.ceil(len(prediction_libraries)/2)))
for i_group,group in enumerate(prediction_libraries):
    predictions = [
        get_bf_pvalues(storage_prefix + pred_library + '_lncRNA_avg_coexpression.h5', storage_prefix + pred_library + '_lncRNA_pvalue.h5' , query)
        for pred_library in group
    ]
    library_names = [pred_library.replace('_',' ') for pred_library in group]

    if i_group in figure_specs:
        figure_number, file_tag, caption_subject = figure_specs[i_group]
        figure_file = output_prefix + '_biological_function_predictions_' + file_tag
        plot_results(library_names=library_names, results_dfs=predictions,file_name=figure_file,direction=tail)
        display(Markdown(f"*Figure {figure_number}. Predicted {caption_subject} for the lncRNA {query}. Terms are ranked by the right-tailed p-value for the mean Pearson correlation coefficient calculated between each gene set and {query}.*" ))
        for extension in ('png', 'svg', 'pdf'):
            display(FileLink(figure_file + '_' + tail + '.' + extension, result_html_prefix='Download Figure ' + figure_number + ' (' + extension.upper() + '): '))

    # Save Predictions
    for library_name, prediction in zip(library_names, predictions):
        prediction_csv = output_prefix + '_' + library_name + '_' + tail + '.csv'
        prediction.to_csv(prediction_csv)
        display(FileLink(prediction_csv, result_html_prefix='Download predictions: '))

In [None]:
# Shared pieces of the input and output file names
storage_prefix = 'storage/lncRNA_Appyter/v0.1.3/' + path
output_prefix = 'predicted_functions/' + query.replace('/','-')
tail = 'left-tailed-pvalue'

# Per library pair: figure number, figure file tag and the subject named in the caption
figure_specs = {
    0: ('6', 'f4', 'MGI Mammalian Phenotypes and GO Biological Processes'),
    1: ('7', 'f5', 'KEGG pathways and DisGeNET disease terms'),
    2: ('8', 'f6', 'ChEA and ENCODE terms'),
}

# Import pre-computed lncRNA function predictions, two libraries per figure
prediction_libraries = ['MGI_Mammalian_Phenotype_Level_4_2021','GO_Biological_Process_2021','KEGG_2021_Human','DisGeNET','ChEA_2022','ENCODE_TF_ChIP-seq_2015']
prediction_libraries = np.array_split(prediction_libraries, int(np.ceil(len(prediction_libraries)/2)))
for i_group,group in enumerate(prediction_libraries):
    predictions = [
        get_bf_pvalues(storage_prefix + pred_library + '_lncRNA_avg_coexpression.h5', storage_prefix + pred_library + '_lncRNA_pvalue_lt.h5' , query)
        for pred_library in group
    ]
    library_names = [pred_library.replace('_',' ') for pred_library in group]

    if i_group in figure_specs:
        figure_number, file_tag, caption_subject = figure_specs[i_group]
        figure_file = output_prefix + '_biological_function_predictions_' + file_tag
        plot_results(library_names=library_names, results_dfs=predictions,file_name=figure_file,direction=tail)
        display(Markdown(f"*Figure {figure_number}. Predicted {caption_subject} for the lncRNA {query}. Terms are ranked by the left-tailed p-value for the mean Pearson correlation coefficient calculated between each gene set and {query}.*" ))
        for extension in ('png', 'svg', 'pdf'):
            display(FileLink(figure_file + '_' + tail + '.' + extension, result_html_prefix='Download Figure ' + figure_number + ' (' + extension.upper() + '): '))

    # Save Predictions
    for library_name, prediction in zip(library_names, predictions):
        prediction_csv = output_prefix + '_' + library_name + '_' + tail + '.csv'
        prediction.to_csv(prediction_csv)
        display(FileLink(prediction_csv, result_html_prefix='Download predictions: '))



In [None]:
%%appyter markdown
### Expression of {{query.value[0].raw_value}} across tissues and cell lines
This part of the report provides expression statistics for the lncRNA {{query.value[0].raw_value}} in various tissues and cell lines.

In [None]:
%%appyter markdown
Samples from ARCHS4[4] were automatically labelled by tissue type or cell line of origin. Tissues and cell lines with 
fewer than 20 samples were removed, and expression statistics were computed across all remaining tissues and cell 
lines. The median expression of {{query.value[0].raw_value}} was then calculated for each tissue type and cell line.

In [None]:
# Create folder for tissue and cell line specific expression
os.makedirs("tissue_and_cell_line_expression", exist_ok=True)


def _expression_stats_for_gene(expr, gene):
    """Return the rows of `expr` whose first index level equals `gene`.

    Columns are ordered by ascending value of the (`gene`, 'mean') row. A new frame is
    returned; sorting a filtered frame with inplace=True is avoided because pandas may
    flag it as a write to a copy (SettingWithCopyWarning).
    """
    gene_rows = expr[expr.index.get_level_values(0) == gene]
    return gene_rows.sort_values((gene, 'mean'), axis=1)


# Import tissue expression data
tissue_expr = pd.read_csv(s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}lncRNA_expr_by_tissue_filtered.csv', 'rb'),header=0, index_col=[0, 1])
tissue_expr.columns = [x.replace('tissue-','').replace('cell-', '') for x in tissue_expr.columns]
tissue_expr_query = _expression_stats_for_gene(tissue_expr, query)
# The saved table is ordered from highest to lowest mean
tissue_expr_query.sort_values((query, 'mean'), axis=1, ascending=False).to_csv("tissue_and_cell_line_expression/" + query.replace('/','-') + '_tissue_median_expr' + '.csv')

# Plot at most the 30 columns with the highest mean (a negative-start slice keeps all
# columns when there are 30 or fewer)
tissue_expr_query_plot = tissue_expr_query.iloc[:, -30:]

# Import cell line expression data
cell_line_expr = pd.read_csv(s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}lncRNA_expr_by_cell_line_filtered.csv', 'rb'),header=0, index_col=[0, 1])
cell_line_expr.columns = [x.replace('cell line-', '') for x in cell_line_expr.columns]
cell_line_expr_query = _expression_stats_for_gene(cell_line_expr, query)
cell_line_expr_query.sort_values((query, 'mean'), axis=1, ascending=False).to_csv("tissue_and_cell_line_expression/" + query.replace('/','-') + '_cell_line_median_expr' + '.csv')

cell_line_expr_query_plot = cell_line_expr_query.iloc[:, -30:]

In [None]:
%%appyter code_exec
display(Markdown("## RNA-Seq Expression Count Statistics"))

# Each lookup selects one statistic row for the query lncRNA; the result has one value per plotted tissue.
tissue_q1 = tissue_expr_query_plot.loc[(query, '25%')]
tissue_q3 = tissue_expr_query_plot.loc[(query, '75%')]
IQR = tissue_q3 - tissue_q1

fig = go.Figure()
fig.add_trace(go.Box(
    # Whiskers extend 1.5*IQR beyond the quartiles but never past the observed min/max.
    lowerfence=np.maximum(
        tissue_expr_query_plot.loc[(query, 'min')],
        tissue_q1 - (1.5*IQR),
    ),
    q1=tissue_q1,
    median=tissue_expr_query_plot.loc[(query, '50%')],
    q3=tissue_q3,
    upperfence=np.minimum(
        tissue_expr_query_plot.loc[(query, 'max')],
        tissue_q3 + (1.5*IQR),
    ),
    mean=tissue_expr_query_plot.loc[(query, 'mean')],
    sd=tissue_expr_query_plot.loc[(query, 'std')],
    y=tissue_expr_query_plot.columns,
    name='Background',
    orientation='h'
))
fig.update_layout(title= query + " Tissue and Cell Type Expression", height=1200)
fig.show()

# Use the same '/' -> '-' sanitisation as every other output file so that a
# query containing '/' is not interpreted as a sub-directory.
tissue_fig_stem = 'tissue_and_cell_line_expression/' + query.replace('/','-') + '_tissue_expression'
for fig_ext in ('png', 'svg', 'pdf'):
    fig.write_image(tissue_fig_stem + '.' + fig_ext)

display(Markdown(f"*Figure 9. Expression statistics for the lncRNA {query} in various tissue types.*"))
display(FileLink("tissue_and_cell_line_expression/" + query.replace('/','-') + '_tissue_median_expr' + '.csv', result_html_prefix=str('Download table with expression statistics for ' + query + ' in various tissue types: ')))

In [None]:
%%appyter code_exec
display(Markdown("## RNA-Seq Expression Count Statistics"))

# Each lookup selects one statistic row for the query lncRNA; the result has one value per plotted cell line.
cell_line_q1 = cell_line_expr_query_plot.loc[(query, '25%')]
cell_line_q3 = cell_line_expr_query_plot.loc[(query, '75%')]
IQR = cell_line_q3 - cell_line_q1

fig = go.Figure()
fig.add_trace(go.Box(
    # Whiskers extend 1.5*IQR beyond the quartiles but never past the observed min/max.
    lowerfence=np.maximum(
        cell_line_expr_query_plot.loc[(query, 'min')],
        cell_line_q1 - (1.5*IQR),
    ),
    q1=cell_line_q1,
    median=cell_line_expr_query_plot.loc[(query, '50%')],
    q3=cell_line_q3,
    upperfence=np.minimum(
        cell_line_expr_query_plot.loc[(query, 'max')],
        cell_line_q3 + (1.5*IQR),
    ),
    mean=cell_line_expr_query_plot.loc[(query, 'mean')],
    sd=cell_line_expr_query_plot.loc[(query, 'std')],
    y=cell_line_expr_query_plot.columns,
    name='Background',
    orientation='h'
))
fig.update_layout(title= query + " Cell Line Expression", height=1200)
fig.show()

# Use the same '/' -> '-' sanitisation as every other output file so that a
# query containing '/' is not interpreted as a sub-directory.
cell_line_fig_stem = 'tissue_and_cell_line_expression/' + query.replace('/','-') + '_cell_line_expression'
for fig_ext in ('png', 'svg', 'pdf'):
    fig.write_image(cell_line_fig_stem + '.' + fig_ext)

display(Markdown(f"*Figure 10. Expression statistics for the lncRNA {query} in the top 30 cell lines.*"))
display(FileLink("tissue_and_cell_line_expression/" + query.replace('/','-') + '_cell_line_median_expr' + '.csv', result_html_prefix=str('Download table with expression statistics for ' + query + ' in various cell lines: ')))

In [None]:
%%appyter markdown
### Visualizing all lncRNAs based on their gene expression similarity across tissues

In [None]:
%%appyter markdown
We applied UMAP[9] to 3,000 randomly selected samples from ARCHS4[4] to visualize lncRNA expression patterns. 
Samples were first log2 transformed and quantile normalized, then UMAP was applied to the lncRNA expression data 
with samples as features. Each data point represents a single lncRNA (n={{num_lnc}}). Use the drop-down menu to color 
lncRNAs by median expression in a specific tissue. The black arrow is pointing to the location of {{query.value[0].raw_value}}.

In [None]:
# Import UMAP coordinates (closed explicitly so the S3 handle is not left open)
with s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}umap_tissues.csv', 'rb') as umap_handle:
    umap_results_df = pd.read_csv(umap_handle, header=0, index_col=0)

unique_tissues = np.unique(tissue_expr.columns)

# Keep only the median ('50%') row of every lncRNA and log-transform it; the
# statistic level is then dropped so rows are labelled by the first index level only.
medians = np.log(tissue_expr[tissue_expr.index.get_level_values(1) == '50%'] + 1)
medians.index = medians.index.get_level_values(0)
lncRNAs_tissues = tissue_expr.index.get_level_values(0)

# The plotting frame is sorted by ascending mean, so its last column is the
# tissue where the query has the highest mean expression.
first_tissue = tissue_expr_query_plot.columns[-1]

# One list of log-median values per tissue, in the row order of `tissue_expr`.
# NOTE(review): plot_dynamic_scatter presumably pairs these with the rows of
# `umap_results_df` by position - confirm both tables share the same lncRNA order.
values_dict_tz = {tissue: medians[tissue].values.tolist() for tissue in unique_tissues}

# Create the UMAP output folders. This is done unconditionally so that missing
# sub-folders are still created when "umap/" already exists from a previous run.
os.makedirs("umap/cell_lines/figures/static", exist_ok=True)
os.makedirs("umap/tissues/figures/static", exist_ok=True)


plot_dynamic_scatter(umap_df=umap_results_df, values_dict=values_dict_tz,option_list=list(unique_tissues),
                     sample_names=list(umap_results_df.index),caption_text=f'UMAP was applied to 3,000 randomly selected samples from ARCHS4. Each data point represents a lncRNA (n={num_lnc}) and are colored by log median expression in ', 
                     figure_counter=11,category_list_dict=None, category=False,dropdown=True,color_by_title='log Median expression',
                     highlight_query=query,first_selection=first_tissue, static_images_save= [first_tissue], 
                     file_path='umap/tissues/figures/')

In [None]:
%%appyter markdown
### Visualizing all lncRNAs based on their gene expression similarity across cell lines 

In [None]:
%%appyter markdown
We applied UMAP[9] to 3,000 randomly selected samples from ARCHS4[4] to visualize lncRNA expression patterns. 
Samples were first log2 transformed and quantile normalized, then UMAP was applied to the lncRNA expression data 
with samples as features. Each data point represents a single lncRNA (n={{num_lnc}}). Use the drop-down menu to color
lncRNAs by median expression in a specific cell line. The black arrow is pointing to the location of {{query.value[0].raw_value}}.

In [None]:
%%appyter code_exec

unique_cell_lines = np.unique(cell_line_expr.columns)

# Colour by the median ('50%') row, matching the figure caption, the section
# text and the tissue UMAP above (this previously selected the 'mean' row).
medians = np.log(cell_line_expr[cell_line_expr.index.get_level_values(1) == '50%'] + 1)
medians.index = medians.index.get_level_values(0)
lncRNAs_tissues = cell_line_expr.index.get_level_values(0)

# The plotting frame is sorted by ascending mean, so its last column is the
# cell line where the query has the highest mean expression.
first_tissue = cell_line_expr_query_plot.columns[-1]

# One list of log-median values per cell line, in the row order of `cell_line_expr`.
values_dict_cz = dict((cell_line, medians[cell_line].values.tolist()) for cell_line in unique_cell_lines)

# Make sure the output folder exists even if the tissue UMAP cell was not run first.
os.makedirs("umap/cell_lines/figures/static", exist_ok=True)

plot_dynamic_scatter(umap_df=umap_results_df, values_dict=values_dict_cz,option_list=list(unique_cell_lines),
                           sample_names=list(umap_results_df.index),caption_text=f'UMAP was applied to 3,000 randomly selected samples from ARCHS4. Each data point represents a lncRNA (n={num_lnc}) and are colored by log median expression in ', 
                           figure_counter=12,category_list_dict=None, category=False,dropdown=True,color_by_title='log Median Expression',
                           highlight_query=query,first_selection=first_tissue, static_images_save= [first_tissue], 
                           file_path='umap/cell_lines/figures/')

In [None]:
%%appyter markdown
### L1000 small molecules predicted to modulate {{query.value[0].raw_value}}

In [None]:
%%appyter markdown
10,850 L1000 consensus chemical perturbation and 10,424 L1000 CRISPR KO consensus gene expression signatures were downloaded from Enrichr[6-8] and were created from the level 5 L1000 chemical perturbations from SigCom LINCS (https://maayanlab.cloud/sigcom-lincs)[10]. For each unique signature and lncRNA pair, a mean Pearson correlation coefficient was computed by taking the average Pearson coefficient between the lncRNA and all genes in the signature. All {{num_lnc}} lncRNAs were then ranked by mean Pearson correlation coefficient, and the top 1,000 lncRNAs with the highest coefficients were retained for each signature. The top 500 lncRNA-L1000 signature associations are reported here for {{query.value[0].raw_value}}, separated by direction. If {{query.value[0].raw_value}} is highly correlated with the up-regulated genes for a specific small molecule, then this small molecule is predicted to up-regulate {{query.value[0].raw_value}}.

In [None]:
%%appyter code_exec
# Load predicted small molecules to modulate the lncRNA.
# The handle is deliberately not called `f`: the notebook's last cell calls
# `f.close()` on the h5 file, and a `with ... as f` here would rebind that name.
with s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}lnc-1000-cp.tsv', 'r') as cp_file:
    lines = cp_file.readlines()

# Row `idx` of the file belongs to the query lncRNA (same order as `col_genes`).
idx = col_genes.index(query)

# Tab-separated row: the first field is skipped, every following field is one
# prediction encoded as "signature,correlation,p-value". The newline is removed
# before filtering so a trailing tab cannot leave an empty field behind.
l1000_cp_preds = [field for field in lines[idx].rstrip('\n').split('\t') if field != '']
l1000_prediction_terms = l1000_cp_preds[1:]

result_columns = ['Drug', 'Up/Down', 'Mean Pearson Correlation', 'P-value']
up_rows = []
down_rows = []
for sig in l1000_prediction_terms:
    sig_fields = sig.split(',')
    # The signature name is "<drug> <direction>"; only the first two
    # space-separated tokens are used.
    # NOTE(review): a drug name containing a space would be truncated here - confirm
    # against the signature library.
    name_tokens = sig_fields[0].split(' ')
    row = (name_tokens[0], name_tokens[1], float(sig_fields[1]), float(sig_fields[2]))
    if name_tokens[1] == 'Down':
        down_rows.append(row)
    else:
        up_rows.append(row)

# Both frames are empty (length 0) when the lncRNA has no predictions; a single
# prediction is kept rather than being discarded.
up_results = pd.DataFrame(up_rows, columns=result_columns)
down_results = pd.DataFrame(down_rows, columns=result_columns)

In [None]:
%%appyter markdown
### L1000 small molecules predicted to up-regulate {{query.value[0].raw_value}}
The prioritized small molecules below are predicted to specifically up-regulate {{query.value[0].raw_value}}.

In [None]:
%%appyter code_exec
# Report the small molecules predicted to up-regulate the lncRNA of interest:
# either a short notice when there are none, or the first 20 rows plus a CSV download.
if len(up_results) == 0:
    display(Markdown(f"**There are no small molecules predicted to specifically up-regulate the expression of {query}.**"))
else:
    sm_up_csv = "l1000_sm_predictions/" + query.replace('/','-') + '_l1000_sm_predictions_up' + '.csv'

    display(up_results[:20])
    display(Markdown(f"*Table 7. L1000 small molecules predicted to up-regulate the lncRNA {query}.*"))

    # Create the folder for the L1000 small molecule prediction tables
    if not os.path.exists("l1000_sm_predictions/"):
        os.makedirs("l1000_sm_predictions/", exist_ok=True)

    # The full table (not only the displayed rows) is written for download
    up_results.to_csv(sm_up_csv)
    display(FileLink(sm_up_csv, result_html_prefix=str('Download Table 7: ')))

In [None]:
%%appyter markdown
### L1000 small molecules predicted to down-regulate {{query.value[0].raw_value}}
The prioritized small molecules below are predicted to specifically down-regulate {{query.value[0].raw_value}}.

In [None]:
%%appyter code_exec
# Report the small molecules predicted to down-regulate the lncRNA of interest:
# either a short notice when there are none, or the first 20 rows plus a CSV download.
if len(down_results) == 0:
    display(Markdown(f"**There are no small molecules predicted to specifically down-regulate the expression of {query}.**"))
else:
    sm_down_csv = "l1000_sm_predictions/" + query.replace('/','-') + '_l1000_sm_predictions_down' + '.csv'

    display(down_results[:20])
    display(Markdown(f"*Table 8. L1000 small molecules predicted to down-regulate the lncRNA {query}.*"))

    # Create the folder for the L1000 small molecule prediction tables
    if not os.path.exists("l1000_sm_predictions/"):
        os.makedirs("l1000_sm_predictions/", exist_ok=True)

    # The full table (not only the displayed rows) is written for download
    down_results.to_csv(sm_down_csv)
    display(FileLink(sm_down_csv, result_html_prefix=str('Download Table 8: ')))

In [None]:
%%appyter code_exec
# Load predicted CRISPR KO genes to modulate the lncRNA.
# The handle is deliberately not called `f`: the notebook's last cell calls
# `f.close()` on the h5 file, and a `with ... as f` here would rebind that name.
with s3.open(f'storage/lncRNA_Appyter/v0.1.3/{path}lnc-1000-crispr.tsv', 'r') as crispr_file:
    lines = crispr_file.readlines()

# Row `idx` of the file belongs to the query lncRNA (same order as `col_genes`).
idx = col_genes.index(query)

# Tab-separated row: the first field is skipped, every following field is one
# prediction encoded as "signature,correlation,p-value". The newline is removed
# before filtering so a trailing tab cannot leave an empty field behind.
l1000_crispr_preds = [field for field in lines[idx].rstrip('\n').split('\t') if field != '']

# Use the CRISPR predictions themselves to decide whether there is anything to
# report (this previously tested the small-molecule list from an earlier cell).
l1000_prediction_terms = l1000_crispr_preds[1:]

# The column is still called 'Drug' to keep the same table layout as the
# small-molecule results, although the entries here are knocked-out genes.
result_columns = ['Drug', 'Up/Down', 'Mean Pearson Correlation', 'P-value']
up_rows = []
down_rows = []
for sig in l1000_prediction_terms:
    sig_fields = sig.split(',')
    # The signature name is "<gene> <direction>"; only the first two
    # space-separated tokens are used.
    name_tokens = sig_fields[0].split(' ')
    row = (name_tokens[0], name_tokens[1], float(sig_fields[1]), float(sig_fields[2]))
    if name_tokens[1] == 'Down':
        down_rows.append(row)
    else:
        up_rows.append(row)

# Both frames are empty (length 0) when the lncRNA has no predictions; a single
# prediction is kept rather than being discarded.
up_results = pd.DataFrame(up_rows, columns=result_columns)
down_results = pd.DataFrame(down_rows, columns=result_columns)

In [None]:
%%appyter markdown
### L1000 CRISPR KO genes predicted to up-regulate {{query.value[0].raw_value}}

In [None]:
%%appyter markdown
The CRISPR KO genes below are predicted to specifically up-regulate {{query.value[0].raw_value}}.

In [None]:
%%appyter code_exec
# L1000 CRISPR KO genes predicted to up-regulate the lncRNA of interest.
# Shows the first 20 rows and offers the full table as a CSV download. The table
# is numbered 9 because Tables 7 and 8 are the small-molecule tables above.
if len(up_results) > 0:
    display(up_results[0:20])
    display(Markdown(f"*Table 9. L1000 CRISPR KO genes predicted to up-regulate the lncRNA {query}.*"))

    # Create the folder for the L1000 CRISPR KO prediction tables
    os.makedirs("l1000_crispr_predictions/", exist_ok=True)
    crispr_up_csv = "l1000_crispr_predictions/" + query.replace('/','-') + '_l1000_crispr_predictions_up' + '.csv'
    up_results.to_csv(crispr_up_csv)
    display(FileLink(crispr_up_csv, result_html_prefix=str('Download Table 9: ')))
else:
    display(Markdown(f"**There are no CRISPR KO genes predicted to specifically up-regulate the expression of {query}.**"))

In [None]:
%%appyter markdown
### L1000 CRISPR KO genes predicted to down-regulate {{query.value[0].raw_value}}

In [None]:
%%appyter markdown
The CRISPR KO genes below are predicted to specifically down-regulate {{query.value[0].raw_value}}.

In [None]:
%%appyter code_exec
# L1000 CRISPR KO genes predicted to down-regulate the lncRNA of interest.
# Shows the first 20 rows and offers the full table as a CSV download. The table
# is numbered 10 because Tables 7 and 8 are the small-molecule tables above and
# Table 9 is the CRISPR KO up-regulation table.
# Everything in this cell must use `down_results`: the guard and the CSV export
# previously used `up_results`, which wrote the up-regulation table into the
# "_down.csv" file and could display an empty table.
if len(down_results) > 0:
    display(down_results[0:20])
    display(Markdown(f"*Table 10. L1000 CRISPR KO genes predicted to down-regulate the lncRNA {query}.*"))

    # Create the folder for the L1000 CRISPR KO prediction tables
    os.makedirs("l1000_crispr_predictions/", exist_ok=True)
    crispr_down_csv = "l1000_crispr_predictions/" + query.replace('/','-') + '_l1000_crispr_predictions_down' + '.csv'
    down_results.to_csv(crispr_down_csv)
    display(FileLink(crispr_down_csv, result_html_prefix=str('Download Table 10: ')))
else:
    display(Markdown(f"**There are no CRISPR KO genes predicted to specifically down-regulate the expression of {query}.**"))

In [None]:
# Close the h5 file handle bound to `f`.
# NOTE(review): this only closes the h5 file if `f` still refers to it at this
# point; any intervening `with ... as f:` statement rebinds the name, in which
# case this call closes that other (already closed) handle instead - confirm.
f.close()

### References
[1] Frankish A, Diekhans M, Jungreis I, Lagarde J, Loveland Jane E, Mudge JM, Sisu C, Wright JC, Armstrong J, Barnes I: GENCODE 2021. Nucleic Acids Research 2021, 49(D1):D916-D923.

[2] Howe KL, Achuthan P, Allen J, Allen J, Alvarez-Jarreta J, Amode MR, Armean IM, Azov AG, Bennett R, Bhai J: Ensembl 2021. Nucleic Acids Research 2021, 49(D1):D884-D891.

[3] Tweedie S, Braschi B, Gray K, Jones TEM, Seal Ruth L, Yates B, Bruford EA: Genenames.org: the HGNC and VGNC resources in 2021. Nucleic Acids Research 2021, 49(D1):D939-D946.

[4] Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma'ayan A: Massive mining of publicly available RNA-seq data from human and mouse. Nature Communications 2018, 10;9(1):1366.

[5] Mas-Ponte D, Carlevaro-Fita J, Palumbo E, Pulido TH, Guigo R, Johnson R. LncATLAS database for subcellular localization of long noncoding RNAs. RNA. 2017 Jul 1;23(7):1080-7.

[6] Xie Z, Bailey A, Kuleshov MV, Clarke DJB, Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM: Gene Set Knowledge Discovery with Enrichr. Current Protocols 2021, 1(3):e90.

[7] Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma’ayan A: Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics 2013, 14(1):128.

[8] Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A: Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research 2016, 44(W1):W90-W97.

[9] McInnes L, Healy J, Melville J: UMAP: Uniform manifold approximation and projection for dimension reduction. arXiv preprint arXiv:1802.03426 2018.

[10] Evangelista JE, Clarke DJB, Xie Z, Lachmann A, Jeon M, Chen K, Jagodnik KM, Jenkins SL, Kuleshov MV, Wojciechowicz ML, Schürer SC, Medvedovic M, Ma'ayan A. SigCom LINCS: data and metadata search engine for a million gene expression signatures. Nucleic Acids Research 2022, 50(W1):W697–709.