In [None]:
#%%appyter init
from appyter import magic
# Hand appyter a zero-argument callable that returns this notebook's globals()
# (presumably so the %%appyter cell magics used below can read/write them — confirm against appyter docs).
magic.init(lambda _=globals: _())

In [None]:
%%appyter code_exec

{% do SectionField(
    name='Datasets',
    title='Coding/Noncoding Gene Selection',
    img='TCGA.png'
)%}

# `gene` holds the symbol picked in the form field below; the later cells read it as a global.
gene = {{ AutocompleteField(
    name='gene_input',
    file_path="https://appyters.maayanlab.cloud/storage/ncRNA_predictions/TCGA.json",
    label='autocomplete',
    title='Search Gene Symbol:',
    description='Select a gene among 38,550 genes found in TCGA',
    section='Datasets',
    default='TSPAN6'
)}}

In [None]:
import h5py
import pandas as pd
import numpy as np
from tqdm import trange, tqdm
from scipy.stats import zscore
import urllib.request
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot
from IPython.display import clear_output, display_html, HTML 
import requests
import os
from maayanlab_bioinformatics.harmonization import ncbi_genes
import s3fs
import base64
import json

# Fetch and load data

Fetch the prepared TCGA correlation matrix (38,550 coding and noncoding genes) and load the selected gene's row into a DataFrame.

In [None]:
# Anonymous S3 filesystem client pointed at the appyters storage endpoint; the loading cells below reuse it.
s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': 'https://appyters.maayanlab.cloud/storage'})

In [None]:
# Load only the selected gene's row of the TCGA correlation matrix from remote storage.
with s3.open("ncRNA_predictions/tcga_cor.h5") as s3f:
    with h5py.File(s3f, 'r') as f: 
        # str(...)[2:-1] drops the first two and the last character of each entry's string form
        # (presumably the b'...' wrapper of a bytes value — TODO confirm the dataset dtype).
        tcga_genes = np.transpose([str(g[0])[2:-1] for g in f['tcga_genes']])
        # Position of the first exact match; raises IndexError if `gene` is not among tcga_genes.
        gene_idx = np.where(tcga_genes == gene)[0][0]
        # Single column named after the gene, indexed by every TCGA gene symbol.
        tcga_cor = pd.DataFrame(f['corr_matrix'][gene_idx], index=tcga_genes, columns=[gene])

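Fetch the prepared ARCHS4 human correlation matrix (26,415 coding genes) and load the selected gene's row into a DataFrame. If the gene is not present in ARCHS4, an empty DataFrame is used instead.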

In [None]:
# Load the selected gene's row of the ARCHS4 correlation matrix, or fall back to an empty frame.
with s3.open("ncRNA_predictions/human_correlation.h5") as s3f:
    with h5py.File(s3f, 'r') as f:
        # Same string-form trimming as for the TCGA gene list (presumably strips a b'...' wrapper — TODO confirm).
        archs4_genes = np.transpose([str(g[0])[2:-1] for g in f['genes']])
        if gene in set(archs4_genes):
            gene_idx = np.where(archs4_genes == gene)[0][0]
            filtered_genes = [str(g[0])[2:-1] for g in f['filtered_genes']] # only genes found in gene set libraries
            archs4_cor = pd.DataFrame(f['human_correlation'][gene_idx], index=filtered_genes, columns=[gene])
            # Set the gene's own entry to 0; this adds the row if `gene` is not in filtered_genes.
            archs4_cor.loc[gene] = 0
        # Downstream code tests `archs4_cor.empty` to detect that the gene is absent from ARCHS4.
        else: archs4_cor = pd.DataFrame()

Fetch reference prediction matrices prepared from Enrichr [gene set libraries](https://amp.pharm.mssm.edu/Enrichr/#stats).

In [None]:
# Enrichr gene set libraries to analyse. The order matters: later cells select
# entries by position (libraries[0] ... libraries[5] and libraries[-1]).
libraries = ['GO_Biological_Process_2018', 'ChEA_2016', 'MGI_Mammalian_Phenotype_Level_4_2019', 'KEGG_2019_Human',
              'KEA_2015', 'Human_Phenotype_Ontology', 'WikiPathways_2019_Human']

In [None]:
# Collect the stored 'gslib' matrix of every configured library that exists in the
# remote HDF5 file; libraries missing from the file are simply left out of the dict.
gene_set_libraries = {}
with s3.open("ncRNA_predictions/gene_set_libraries") as s3f, h5py.File(s3f, 'r') as f:
    stored_names = list(f.keys())
    for library in tqdm(libraries):
        if library not in stored_names:
            continue
        gene_set_libraries[library] = np.array(f[library]['gslib'])

Fetch NCBI data to determine gene types.

In [None]:
# Build a symbol -> NCBI gene type lookup.
# The table is re-indexed with set_index instead of adding a column to a
# `df[[...]]` selection, which is the pattern that triggers pandas'
# SettingWithCopyWarning.
ncbi_table = pd.DataFrame(ncbi_genes.ncbi_genes_fetch())
all_symbols = ncbi_table.Symbol.values
# One-column frame ("type_of_gene") indexed by gene symbol.
ncbi = ncbi_table.set_index("Symbol")[["type_of_gene"]]
# If a symbol occurs more than once, the last occurrence wins.
symbol_to_gene_type = ncbi["type_of_gene"].to_dict()

# Predicting the top correlated functions and genes

In this section, we define a pipeline with functions that will determine the top functions correlated to the user-inputted gene for each Enrichr library, and the top correlated genes, for both TCGA and ARCHS4.

Given a gene set library's name, pull the library from Enrichr and return a dictionary with functions as keys and lists of genes as values, together with a sorted list of all genes in the library.

In [None]:
def gene_set_dictionaries(library):
    """Download an Enrichr gene set library and parse it.

    Parameters
    ----------
    library : str
        Enrichr library name, appended verbatim to the download URL.

    Returns
    -------
    tuple
        ``(function_to_genes, genes)`` where ``function_to_genes`` maps each
        term (first tab-separated field of a line) to its list of genes
        (fields from the third onwards), in file order, and ``genes`` is the
        sorted list of every gene seen in the library.
    """
    print("Creating dictionary from %s." % library)
    enrichr_url = 'https://amp.pharm.mssm.edu/Enrichr/geneSetLibrary?mode=text&libraryName='
    function_to_genes = {}
    gene_set = set()
    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(enrichr_url + library) as data:
        for line in data:
            # Decode the bytes instead of slicing their repr: the repr escapes
            # non-ASCII bytes, quotes and backslashes, which corrupted terms.
            # Every line still yields one entry, so the number and order of
            # terms is unchanged (gene_set_library relies on that alignment).
            lst = line.strip().decode('utf-8', errors='replace').split('\t')
            function = lst[0]
            genes = lst[2:]  # field 1 is skipped, as in the file layout this parser expects
            function_to_genes[function] = genes
            gene_set.update(genes)
    clear_output()
    print("Completed creating dictionary from %s." % library)
    return function_to_genes, sorted(gene_set)

Given the function-to-gene dictionary and set of genes from the above function, create a binary matrix with genes as rows and functions as columns (value of 1 means the gene is associated with the given function).

In [None]:
def gs_binary_matrix(function_to_genes, gene_set):
    """Build a gene x function membership matrix.

    Rows follow ``gene_set``, columns follow the key order of
    ``function_to_genes``; a cell is 1.0 when the gene is listed for that
    function and 0.0 otherwise.
    """
    print("Creating binary matrix.")
    functions = list(function_to_genes.keys())
    membership = pd.DataFrame(
        data=np.zeros((len(gene_set), len(functions))),
        index=gene_set,
        columns=functions,
    )
    for function in membership.columns:
        members = function_to_genes[function]
        membership.loc[members, function] += 1
    clear_output()
    print("Completed creating binary matrix.")
    return membership

Given the library name as well as the function-to-gene dictionary and gene set from above, load the gene set prediction matrix as a DataFrame with genes as rows and functions as columns. Each (gene, function) cell of this prediction matrix holds the average correlation between that gene and the genes of that gene set (function).

In [None]:
def gene_set_library(library, function_to_genes, gene_set):
    """Wrap the preloaded prediction matrix of ``library`` in a labelled DataFrame.

    The stored array is read from the module-level ``gene_set_libraries`` dict;
    rows are labelled with ``gene_set`` and columns with the keys of
    ``function_to_genes``.
    """
    clear_output()
    stored_matrix = gene_set_libraries[library]
    return pd.DataFrame(stored_matrix, index=gene_set, columns=function_to_genes.keys())

In [None]:
def new_gene_set_library(binary_matrix, function_to_genes):
    """Compute a gene set prediction matrix from a binary membership matrix.

    Parameters
    ----------
    binary_matrix : pandas.DataFrame
        Genes as rows, functions as columns (membership indicators).
    function_to_genes : dict
        Maps each function to the list of its member genes; every gene must
        be a row label of ``binary_matrix``.

    Returns
    -------
    pandas.DataFrame
        Genes as rows, one column per function (in dictionary order). Each
        cell is the mean row-wise correlation between the gene and the
        function's member genes. An empty mapping yields a frame with the
        gene index and no columns.
    """
    gene_index = binary_matrix.index
    # Nothing to aggregate: pd.concat would raise on an empty list.
    if not function_to_genes:
        return pd.DataFrame(index=gene_index)
    # Gene-by-gene correlation of the membership profiles (rows are the variables).
    cor = pd.DataFrame(np.corrcoef(binary_matrix), index=gene_index, columns=gene_index)
    preds = [cor.loc[:, members].mean(axis=1) for members in function_to_genes.values()]
    gslib = pd.concat(preds, axis=1)
    gslib.columns = list(function_to_genes.keys())
    return gslib

Generate a new prediction matrix combining the gene set prediction matrix above and the TCGA or ARCHS4 correlation matrix. The following formula is used to make predictions about the chosen gene using the TCGA or ARCHS4 correlation matrix.

<div style=margin-top:20px> 
    <img src="static/geneshot_formula.png" width="50%" height="50%" style='border:0.1px solid lightgray' />
    <div style=text-align:center;margin-top:20px> 
        Fig. 1: Formula found in the <a href='https://academic.oup.com/nar/article/47/W1/W571/5494749'>Geneshot paper</a> to calculate the prediction matrix. G is the correlation matrix, GF is the gene set prediction matrix, and GF' is the new prediction matrix to make predictions about the given dataset. Note: the denominator should be GF instead of G.
    </div>
</div>

In [None]:
def prediction_matrix(gene, cor_matrix, gslib):
    """Score every function of ``gslib`` for ``gene`` from its correlation profile.

    Parameters
    ----------
    gene : str
        Column label used for the result.
    cor_matrix : pandas.DataFrame
        Correlations of ``gene`` with other genes; genes as rows.
    gslib : pandas.DataFrame
        Gene set prediction matrix, genes as rows and functions as columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per function, a single column named ``gene``. Each score is
        the correlation-weighted sum over the genes present in both inputs,
        divided by the function's column sum in ``gslib``.
    """
    shared = [g for g in gslib.index if g in cor_matrix.index]
    smaller_cor = cor_matrix.loc[shared]
    smaller_gslib = gslib.loc[shared]
    weighted = np.dot(smaller_cor.to_numpy().T, smaller_gslib.to_numpy())
    # The query gene's own row is left out of the normalisation.
    normaliser = gslib.drop(index=gene) if gene in smaller_gslib.index else gslib
    # NOTE(review): the sums run over all remaining rows of gslib, not only the
    # genes shared with cor_matrix that enter the numerator - confirm this is intended.
    # Explicit axis=0: np.sum(DataFrame) relies on the deprecated axis=None
    # reduction, which pandas plans to change to a scalar over both axes.
    function_sums = normaliser.sum(axis=0).to_numpy()
    # Out-of-place division also works when the product has an integer dtype.
    scores = weighted / function_sums
    return pd.DataFrame(data=np.transpose(scores), index=gslib.columns, columns=[gene])

Return the top 50 most correlated functions and 100 most correlated genes associated with the given gene.

In [None]:
def top_ranked(matrix, gene): 
    top = matrix.sort_values(ascending=False)
    z_scores = pd.DataFrame(zscore(top), index=top.index)
    genes = pd.DataFrame(list(top.index), index=top.index) 
    top = pd.concat([genes, top, z_scores], axis=1)
    top.index = list(range(1, len(top)+1))
    return top

In [None]:
def get_top_functions(matrix, gene, name):
    """Return the 50 highest-scoring annotations in column ``gene`` of ``matrix``.

    The three columns are labelled with ``name`` as a prefix
    (Annotation, Score, Z-Score).
    """
    scores = matrix.loc[:, gene]
    top = top_ranked(scores, gene).head(50)
    top.columns = ["%s - Annotation" % name, "%s - Score" % name, "%s - Z-Score" % name]
    return top

In [None]:
def get_top_genes(matrix, gene, name):
    """Return the genes most correlated with ``gene``, annotated with their NCBI gene type.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Correlation frame with genes as rows and a column named ``gene``.
        May be empty (the gene is absent from the dataset).
    gene : str
        The query gene.
    name : str
        Dataset label used as column prefix. The noncoding subset is only
        produced when it is ``"TCGA"``.

    Returns
    -------
    tuple
        ``(top, top_ncRNA)``: the 100 highest-ranked genes, and - for TCGA -
        the 100 highest-ranked genes whose type is one of ncRNA, rRNA,
        snoRNA, snRNA or siRNA (``None`` for any other ``name``).
    """
    symbol_col = "%s - Symbol" % name
    annotation_col = "%s - Annotation" % name
    score_col = "%s - Score" % name
    zscore_col = "%s - Z-Score" % name

    # An empty frame has no column for the gene; return correctly labelled
    # empty tables instead of raising KeyError.
    if matrix.empty:
        empty = pd.DataFrame(columns=[symbol_col, annotation_col, score_col, zscore_col])
        return empty, (empty.copy() if name == "TCGA" else None)

    top = top_ranked(matrix.T.loc[gene], gene)
    top.columns = [symbol_col, score_col, zscore_col]
    # Symbols unknown to the NCBI lookup get None.
    gene_types = [symbol_to_gene_type.get(symbol) for symbol in top[symbol_col]]
    top.insert(1, annotation_col, gene_types)
    top_ncRNA = None
    if name == "TCGA":
        # `== ('ncRNA' or 'rRNA' or ...)` evaluates to `== 'ncRNA'` only;
        # isin matches every listed noncoding type.
        noncoding_types = ['ncRNA', 'rRNA', 'snoRNA', 'snRNA', 'siRNA']
        top_ncRNA = top.loc[top[annotation_col].isin(noncoding_types)].head(100)
    return top.head(100), top_ncRNA

Graph the AUROC for the TCGA and ARCHS4 predictions.

In [None]:
def auc(binary_matrix, tcga_probs, archs4_probs, gene):
    """Plot ROC curves, annotated with the AUROC, for the TCGA and ARCHS4 predictions.

    Parameters
    ----------
    binary_matrix : pandas.DataFrame
        Gene x function membership matrix; row ``gene`` supplies the true labels.
    tcga_probs, archs4_probs : pandas.DataFrame
        Prediction scores per function. ``archs4_probs`` may be empty, in
        which case only the TCGA panel is drawn.
    gene : str
        The query gene (used for the labels and the panel titles).

    Prints a message and returns without plotting when ``gene`` has no row in
    ``binary_matrix``.
    """
    if gene not in binary_matrix.index:
        print("Not enough gene annotations available.")
        return
    y_true = binary_matrix.loc[gene]
    # Constant scores produce the diagonal "no skill" reference line.
    ns_fpr, ns_tpr, _ = roc_curve(y_true, [0] * len(y_true))

    panels = [("TCGA: ", tcga_probs)]
    # The original guarded only fillna for an empty ARCHS4 frame and then
    # passed the empty frame to roc_curve, which fails; skip the panel instead.
    if not archs4_probs.empty:
        panels.append(("ARCHS4: ", archs4_probs))

    # Compute everything before opening the figure so a failure leaves no empty plot.
    curves = []
    for title_prefix, probs in panels:
        probs = probs.fillna(probs.mean())  # missing scores -> mean score
        fpr, tpr, _ = roc_curve(y_true, probs)
        curves.append((title_prefix, fpr, tpr, roc_auc_score(y_true, probs)))

    pyplot.figure(figsize=(11,5))
    for position, (title_prefix, fpr, tpr, score) in enumerate(curves, start=1):
        # Always a 1x2 grid so the TCGA panel keeps its size when ARCHS4 is missing.
        pyplot.subplot(1,2,position)
        pyplot.plot(ns_fpr, ns_tpr, linestyle='--')
        pyplot.plot(fpr, tpr, marker='.')
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        pyplot.title(title_prefix + gene)
        pyplot.text(0.75, 0.05, 'AUC: %.3f' % score, fontsize=12)
    pyplot.tight_layout()
    pyplot.show()

Process to determine top correlated functions using the above functions, display DataFrames alongside each other to facilitate comparison and a download link for the data.

In [None]:
def functional_comparison(current_lib):
    """Predict and display the top functions of the selected gene for one Enrichr library.

    Reads the module-level ``gene``, ``tcga_cor``, ``archs4_cor`` and
    ``gene_set_libraries``. Displays a CSV download link for the top 50
    functions and a table of the first 10 rows.

    Returns ``(binary_matrix, tcga_pred_matrix, archs4_pred_matrix)``; the
    ARCHS4 matrix is an empty DataFrame when ``archs4_cor`` is empty.
    """
    function_to_genes, gene_set = gene_set_dictionaries(current_lib)
    binary_matrix = gs_binary_matrix(function_to_genes, gene_set)
    # Use the preloaded prediction matrix when one was found; otherwise derive it from the binary matrix.
    if current_lib in gene_set_libraries: gslib = gene_set_library(current_lib, function_to_genes, gene_set)
    else: gslib = new_gene_set_library(binary_matrix, function_to_genes)
    tcga_pred_matrix = prediction_matrix(gene, tcga_cor, gslib)
    clear_output()
    print("Completed creating TCGA prediction matrix.")
    top_tcga_functions = get_top_functions(tcga_pred_matrix, gene, "TCGA")
    if not archs4_cor.empty:
        archs4_pred_matrix = prediction_matrix(gene, archs4_cor, gslib)
        clear_output()
        print("Completed creating ARCHS4 prediction matrix.")
        top_archs4_functions = get_top_functions(archs4_pred_matrix, gene, "ARCHS4")
        # Both tables share the 1..50 rank index, so they line up side by side.
        combined_df = pd.concat([top_tcga_functions, top_archs4_functions], axis=1)
    else:
        # Gene absent from ARCHS4: show the TCGA results alone.
        archs4_pred_matrix = pd.DataFrame()
        combined_df = top_tcga_functions
    # Drop the progress messages before showing the results.
    clear_output()
    display(download_link(combined_df, "Download top 50 predicted functions from %s" % current_lib, "T50_functions_%s.csv" % current_lib)) 
    display_df(combined_df.head(10))
    return binary_matrix, tcga_pred_matrix, archs4_pred_matrix

In [None]:
def display_df(df):
    """Render ``df`` as an HTML table limited to 95% width and centred."""
    styled = df.style.set_table_attributes("style='max-width:95%;margin:auto'")
    markup = styled._repr_html_()
    display_html(markup, raw=True)

In [None]:
def download_link(content, title, filename):
    """Return an HTML anchor that downloads ``content`` as a CSV file.

    ``content`` is serialised without its index and embedded in the link as a
    base64 ``data:`` URI; ``title`` is the link text and ``filename`` the
    suggested download name.
    """
    csv_bytes = content.to_csv(index=False).encode()
    payload = base64.b64encode(csv_bytes).decode()
    template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(template.format(payload=payload,title=title,filename=filename))

In [None]:
def predictions():
    """Display a heading and an introductory line for every configured library.

    Underscores in the library name are shown as spaces.
    """
    for library in libraries:
        heading = '<h1> %s </h1>' % library.replace("_", " ")
        display(HTML(heading))
        display(HTML('<div> Side-by-side comparison of the top </div>'))

In [None]:
def get_enrichr_link(genes, name):
    """Submit a gene list to Enrichr and display a link to the resulting analysis.

    Parameters
    ----------
    genes : iterable
        Gene symbols. Entries that are not non-empty strings (for example the
        NaN padding of a concatenated table) are ignored.
    name : str
        Label inserted into the link text.

    Raises
    ------
    Exception
        If Enrichr answers with an error status.
    """
    # HTTPS, consistent with the other Enrichr URLs in this notebook.
    ENRICHR_URL = 'https://amp.pharm.mssm.edu/Enrichr/addList'
    # '\n'.join raises TypeError on NaN/None, so keep real symbols only.
    symbols = [g for g in genes if isinstance(g, str) and g]
    if not symbols:
        print("No %s genes available to submit to Enrichr." % name)
        return
    genes_str = '\n'.join(symbols)
    payload = {'list': (None, genes_str)}
    # A timeout keeps the notebook from hanging indefinitely on an unresponsive server.
    response = requests.post(ENRICHR_URL, files=payload, timeout=60)
    if not response.ok:
        raise Exception('Error analyzing gene list')
    dataset = json.loads(response.text)['shortId']
    url = 'https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=' + dataset
    html = '<a href="%s">Learn more about the top 100 correlated %s genes in Enrichr</a>' % (url, name)
    display(HTML(html))

# Predicted biological processes (GO)

Side-by-side comparison of the top predicted GO biological processes.

In [None]:
# libraries[0] is 'GO_Biological_Process_2018'.
binary_matrix, tcga_pred_matrix, archs4_pred_matrix = functional_comparison(libraries[0])

AUROC for the TCGA and ARCHS4-generated predictions.

In [None]:
# Uses the three matrices assigned by the functional_comparison cell directly above.
auc(binary_matrix, tcga_pred_matrix, archs4_pred_matrix, gene)

# Predicted upstream transcription factors (ChEA)

Side-by-side comparison of the top predicted upstream transcription factors.

In [None]:
# libraries[1] is 'ChEA_2016'; this overwrites the matrices of the previous library.
binary_matrix, tcga_pred_matrix, archs4_pred_matrix = functional_comparison(libraries[1])

AUROC for the TCGA and ARCHS4-generated predictions.

In [None]:
# Uses the three matrices assigned by the functional_comparison cell directly above.
auc(binary_matrix, tcga_pred_matrix, archs4_pred_matrix, gene)

# Predicted mouse phenotypes (MGI)

Side-by-side comparison of the top predicted mouse phenotypes.

In [None]:
# libraries[2] is 'MGI_Mammalian_Phenotype_Level_4_2019'; this overwrites the matrices of the previous library.
binary_matrix, tcga_pred_matrix, archs4_pred_matrix = functional_comparison(libraries[2])

AUROC for the TCGA and ARCHS4-generated predictions.

In [None]:
# Uses the three matrices assigned by the functional_comparison cell directly above.
auc(binary_matrix, tcga_pred_matrix, archs4_pred_matrix, gene)

# Predicted pathways (KEGG)

Side-by-side comparison of the top predicted pathways.

In [None]:
# libraries[3] is 'KEGG_2019_Human'; this overwrites the matrices of the previous library.
binary_matrix, tcga_pred_matrix, archs4_pred_matrix = functional_comparison(libraries[3])

AUROC for the TCGA and ARCHS4-generated predictions.

In [None]:
# Uses the three matrices assigned by the functional_comparison cell directly above.
auc(binary_matrix, tcga_pred_matrix, archs4_pred_matrix, gene)

# Predicted kinase interactions (KEA)

Side-by-side comparison of the top predicted kinase interactions.

In [None]:
# libraries[4] is 'KEA_2015'; this overwrites the matrices of the previous library.
binary_matrix, tcga_pred_matrix, archs4_pred_matrix = functional_comparison(libraries[4])

AUROC for the TCGA and ARCHS4-generated predictions.

In [None]:
# Uses the three matrices assigned by the functional_comparison cell directly above.
auc(binary_matrix, tcga_pred_matrix, archs4_pred_matrix, gene)

# Predicted human phenotypes (Human Phenotype Ontology)

Side-by-side comparison of the top predicted human phenotypes.

In [None]:
# libraries[5] is 'Human_Phenotype_Ontology'; this overwrites the matrices of the previous library.
binary_matrix, tcga_pred_matrix, archs4_pred_matrix = functional_comparison(libraries[5])

AUROC for the TCGA and ARCHS4-generated predictions.

In [None]:
# Uses the three matrices assigned by the functional_comparison cell directly above.
auc(binary_matrix, tcga_pred_matrix, archs4_pred_matrix, gene)

# Predicted Biological Pathways (WikiPathways)

Side-by-side comparison of the top predicted biological pathways from WikiPathways.

In [None]:
# libraries[-1] is the last entry, 'WikiPathways_2019_Human'; this overwrites the matrices of the previous library.
binary_matrix, tcga_pred_matrix, archs4_pred_matrix = functional_comparison(libraries[-1])

AUROC for the TCGA and ARCHS4-generated predictions.

In [None]:
# Uses the three matrices assigned by the functional_comparison cell directly above.
auc(binary_matrix, tcga_pred_matrix, archs4_pred_matrix, gene)

# Top coding and non-coding genes
Side-by-side comparison of the top correlated genes.

The top 100 most correlated genes.

In [None]:
# Rank the genes most correlated with the selected gene in each dataset and
# show the two top-100 tables side by side.
top_tcga_genes, top_ncRNA = get_top_genes(tcga_cor, gene, "TCGA")
if archs4_cor.empty:
    # The gene is absent from ARCHS4: keep the ARCHS4 columns (empty) so the
    # combined table and the later cells still find them.
    top_archs4_genes = pd.DataFrame(
        columns=["ARCHS4 - Symbol", "ARCHS4 - Annotation", "ARCHS4 - Score", "ARCHS4 - Z-Score"]
    )
else:
    top_archs4_genes, _ = get_top_genes(archs4_cor, gene, "ARCHS4")
combined_genes = pd.concat([top_tcga_genes, top_archs4_genes], axis=1)
display(download_link(combined_genes, "Download top 100 most correlated genes", "most_correlated_genes.csv"))
# The concat pads the shorter table with NaN; only real symbols are sent to Enrichr.
get_enrichr_link(combined_genes['TCGA - Symbol'].dropna(), 'TCGA')
archs4_symbols = combined_genes['ARCHS4 - Symbol'].dropna()
if not archs4_symbols.empty:
    get_enrichr_link(archs4_symbols, 'ARCHS4')
display_df(combined_genes)

The gene types of the top 100 most correlated genes.

In [None]:
# Count how many of the top correlated genes fall into each NCBI gene type, per dataset.
type_counts = {}
unannotated = {}
for source in ("TCGA", "ARCHS4"):
    symbols = combined_genes["%s - Symbol" % source]
    annotations = combined_genes["%s - Annotation" % source]
    # dropna removes both the None of symbols without an NCBI type and any NaN padding.
    type_counts[source] = annotations.dropna().value_counts()
    # Listed genes without an annotation; equals 100 - annotated for a full top-100 table.
    unannotated[source] = int(symbols.notna().sum() - annotations.notna().sum())

gene_types = pd.DataFrame(type_counts, columns=["TCGA", "ARCHS4"]).fillna(0).astype(int)
# The original tested gene_types.columns, so an existing "unknown" row (a gene
# type literally named "unknown", if the annotations contain one) was
# overwritten. Test the row index and add to any existing count instead.
if "unknown" not in gene_types.index:
    gene_types.loc["unknown"] = 0
gene_types.loc["unknown"] += [unannotated["TCGA"], unannotated["ARCHS4"]]
display_df(gene_types)

The top 100 most correlated noncoding genes. 

In [None]:
# top_ncRNA is the second value returned by get_top_genes(..., "TCGA") in the top-genes cell above.
display(download_link(top_ncRNA, "Download top 100 ncRNA", "top_100_ncRNA.csv"))
get_enrichr_link(top_ncRNA['TCGA - Symbol'], 'ncRNA')
display_df(top_ncRNA)