In [21]:
import h5py
import pandas as pd
import numpy as np
from tqdm import trange, tqdm
from scipy.stats import zscore
import urllib.request
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot
from IPython.display import display_html, HTML 
import requests
import os
from maayanlab_bioinformatics.harmonization import ncbi_genes

In [8]:
libraries = ['GO_Biological_Process_2018', 'ChEA_2016', 'MGI_Mammalian_Phenotype_Level_4_2019', 'KEGG_2019_Human',
              'KEA_2015', 'Human_Phenotype_Ontology', 'COVID-19_Related_Gene_Sets']

In [9]:
f = h5py.File("tcga.hdf5", "r")
tcga_genes = np.transpose([str(g[0])[2:-1] for g in f['meta']['genes']])
tcga_cor = pd.DataFrame(np.array(f['full correlation matrix']), index=tcga_genes, columns=tcga_genes)

In [10]:
archs4_cor = pd.read_feather("human_correlation")

In [32]:
def top_rank_genes(rank): 
    for lib in libraries:
        title = " ".join(lib.split("_"))
        display(HTML('<h1>%s</h1>' % title))
        function_to_genes, gene_set = gene_set_dictionaries(lib)
        binary_matrix = gs_binary_matrix(function_to_genes, gene_set)
        if lib not in list(h5py.File("gene_set_libraries", "r").keys()):
            gslib = new_gene_set_library(binary_matrix, function_to_genes)
        else: gslib = gene_set_library(lib, function_to_genes, gene_set)
        lib_sums = np.sum(binary_matrix.T)
        t50 = pd.DataFrame(lib_sums, columns=["Rank"]).sort_values(ascending=False, by="Rank").head(rank)
        display(HTML('<h2>Top 50 most annotated genes in %s</h2>' % title))
        display(t50)
        t50_pred = gslib.loc[t50.index]
        common = set(tcga_cor.index) & set(t50_pred.index)
        smaller_tcga = tcga_cor.loc[common].drop(columns=common)
        mean_cor = pd.DataFrame(np.sum(smaller_tcga)/50, columns=["Mean Correlation"]).sort_values(ascending=False, by="Mean Correlation")
        symbol = mean_cor.index
        mean_cor.insert(1, column="Rank", value=list(range(1, len(mean_cor)+1)))
        total_cor, ncRNA = get_top_genes(mean_cor)
        total_cor = total_cor.loc[:, ["Rank", "Annotation", "Mean Correlation"]]
        display(HTML('<h2>Top 100 most correlated genes from TCGA</h2>'))
        display(total_cor.head(100))
        ncRNA = ncRNA.loc[:, ["Rank", "Annotation", "Mean Correlation"]]
        display(HTML('<h2>Top 100 most correlated ncRNA from TCGA</h2>'))
        display(ncRNA.head(100))

In [24]:
def gene_set_dictionaries(library):
    enrichr_url = 'https://amp.pharm.mssm.edu/Enrichr/geneSetLibrary?mode=text&libraryName='
    data = urllib.request.urlopen(enrichr_url + library)
    function_to_genes = {}
    gene_set = set() 
    for line in data:
        lst = (str(line.strip())[2:-1]).split(r'\t')
        function = lst[0]
        genes = lst[2:]
        function_to_genes[function] = genes
        gene_set.update(set(genes))
    return function_to_genes, sorted(gene_set)

In [25]:
def gs_binary_matrix(function_to_genes, gene_set):
    binary_matrix = np.zeros((len(gene_set), len(function_to_genes)))
    binary_matrix = pd.DataFrame(data=binary_matrix, index=gene_set, columns=list(function_to_genes.keys()))
    for function in binary_matrix.columns: 
        gene_list = function_to_genes[function]
        binary_matrix.loc[gene_list, function] += 1
    return binary_matrix

In [26]:
def gene_set_library(library, function_to_genes, gene_set):
    f = h5py.File("gene_set_libraries", "r")
    gslib = pd.DataFrame(np.array(f[library]['gslib']), index=gene_set, columns=list(function_to_genes.keys())) 
    f.close()
    return gslib

In [27]:
def new_gene_set_library(binary_matrix, function_to_genes):
    cor = pd.DataFrame(np.corrcoef(binary_matrix), index=binary_matrix.index, columns=binary_matrix.index)
    gslib = np.zeros((len(binary_matrix.index)))
    gslib = pd.DataFrame(data=gslib, index=list(binary_matrix.index))
    preds = []
    for function in function_to_genes:
        genes = function_to_genes[function]
        preds.append(cor.loc[:, genes].mean(axis=1))
    gslib = pd.concat(preds, axis=1)
    gslib.columns = list(function_to_genes.keys())
    return gslib

In [28]:
def total_prediction(cor_matrix, gslib):
    common = set(cor_matrix.index) & set(gslib.index)
    smaller_cor = np.array(cor_matrix.loc[:, common])
    smaller_gslib = np.array(gslib.loc[common])
    pred_matrix = np.matmul(smaller_cor, smaller_gslib) 
    pheno_sums = np.sum(gslib)
    for i in trange(len(gslib)):
        for j in range(len(gslib.columns)):
            subtract = pred_matrix[i][j]
            denominator = pheno_sums[j]
            pred_matrix[i][j] /= (denominator-subtract)
    return pd.DataFrame(data=pred_matrix, index=cor_matrix.index, columns=gslib.columns)

In [29]:
def get_top_genes(matrix):
    ncbi = pd.DataFrame(ncbi_genes.ncbi_genes_fetch())
    all_symbols = ncbi.Symbol.values
    ncbi = ncbi[["type_of_gene"]]
    ncbi["Symbol"] = all_symbols
    ncbi = ncbi.set_index("Symbol")
    symbol_to_gene_type = ncbi.to_dict()["type_of_gene"]
    def id_to_type(key):
        if (key in symbol_to_gene_type):
            return symbol_to_gene_type[key]
        else:
            return None
    gene_types = np.transpose([ id_to_type(key) for key in matrix.index])
    matrix["Annotation"] = gene_types
    top_ncRNA = matrix.loc[matrix["Annotation"] == ('ncRNA' or 'rRNA' or 'snoRNA' or 'snRNA' or 'siRNA')]
    return matrix, top_ncRNA

In [33]:
top_rank_genes(50)

Unnamed: 0,Rank
,1945.0
TGFB1,270.0
SIRT1,215.0
TNF,202.0
AKT1,198.0
BMP4,195.0
INS,186.0
VEGFA,181.0
RPS27A,179.0
UBB,178.0


Unnamed: 0,Rank,Annotation,Mean Correlation
AXL,1,protein-coding,0.211688
CDH5,2,protein-coding,0.211078
SYNPO,3,protein-coding,0.210342
CCN1,4,protein-coding,0.210316
LRRC32,5,protein-coding,0.208231
...,...,...,...
ENSG00000254810,96,,0.181193
LMOD1,97,protein-coding,0.181069
NOVA2,98,protein-coding,0.181056
A2M,99,protein-coding,0.180960


Unnamed: 0,Rank,Annotation,Mean Correlation
LINC01197,142,ncRNA,0.173903
LINC02202,250,ncRNA,0.163812
NR2F1-AS1,276,ncRNA,0.161897
MAGI2-AS3,318,ncRNA,0.158816
MIR100HG,337,ncRNA,0.157606
...,...,...,...
LINC01550,2507,ncRNA,0.086410
SLC44A3-AS1,2537,ncRNA,0.086008
LINC02777,2554,ncRNA,0.085642
FGF13-AS1,2561,ncRNA,0.085400


Unnamed: 0,Rank
DUSP6,185.0
IER5,182.0
CITED2,182.0
TLE3,180.0
PTMA,178.0
MEIS1,177.0
KLF6,177.0
PPP1R15B,176.0
JARID2,176.0
HES1,174.0


Unnamed: 0,Rank,Annotation,Mean Correlation
HEG1,1,protein-coding,0.153219
PRDM1,2,protein-coding,0.147201
ELK3,3,protein-coding,0.144958
PLEKHG1,4,protein-coding,0.143551
LTBP2,5,protein-coding,0.142887
...,...,...,...
EGR3,96,protein-coding,0.113662
TGFB2,97,protein-coding,0.113629
RASSF9,98,protein-coding,0.113602
KCTD10,99,protein-coding,0.113432


Unnamed: 0,Rank,Annotation,Mean Correlation
TEX26-AS1,132,ncRNA,0.108971
LINC02408,136,ncRNA,0.108724
LOC100129434,302,ncRNA,0.099225
LOC112268148,341,ncRNA,0.097688
C10orf55,361,ncRNA,0.096586
...,...,...,...
PLBD1-AS1,2709,ncRNA,0.058039
MGAT3-AS1,2713,ncRNA,0.058016
HID1-AS1,2726,ncRNA,0.057898
LNCSRLR,2810,ncRNA,0.056972


Unnamed: 0,Rank
GT(ROSA)26SOR,252.0
LEPR,228.0
TRP53,216.0
FGFR2,214.0
LMNA,214.0
KIT,207.0
NOS3,199.0
LEP,194.0
ESR1,184.0
GLI3,181.0


Unnamed: 0,Rank,Annotation,Mean Correlation
HEG1,1,protein-coding,0.202967
PDGFRB,2,protein-coding,0.191515
FAT4,3,protein-coding,0.190804
ANTXR1,4,protein-coding,0.190546
AXL,5,protein-coding,0.190509
...,...,...,...
KCNE4,96,protein-coding,0.169507
OLFML1,97,protein-coding,0.169288
ARHGEF12,98,protein-coding,0.169249
A2M,99,protein-coding,0.169156


Unnamed: 0,Rank,Annotation,Mean Correlation
MSC-AS1,160,ncRNA,0.161673
MAGI2-AS3,208,ncRNA,0.157480
LINC01197,347,ncRNA,0.146157
LINC00473,364,ncRNA,0.145229
LOC100129434,415,ncRNA,0.141005
...,...,...,...
LINC00639,2538,ncRNA,0.083407
LINC00504,2570,ncRNA,0.083008
HOXC-AS1,2584,ncRNA,0.082809
FRMD6-AS1,2590,ncRNA,0.082748


Unnamed: 0,Rank
MAPK1,103.0
MAPK3,102.0
PIK3CA,86.0
PIK3R2,85.0
PIK3CD,85.0
PIK3CB,85.0
PIK3R3,84.0
PIK3R1,84.0
AKT2,82.0
AKT3,82.0


Unnamed: 0,Rank,Annotation,Mean Correlation
FLNA,1,protein-coding,0.099806
HSPG2,2,protein-coding,0.099383
TLN1,3,protein-coding,0.097164
SUSD6,4,protein-coding,0.096399
DYNC1LI2,5,protein-coding,0.096397
...,...,...,...
TBC1D9B,96,protein-coding,0.078090
FMNL3,97,protein-coding,0.077907
ZNF827,98,protein-coding,0.077870
CYFIP1,99,protein-coding,0.077792


Unnamed: 0,Rank,Annotation,Mean Correlation
LOC100507071,402,ncRNA,0.066325
LINC01801,527,ncRNA,0.063248
PWAR5,772,ncRNA,0.059092
LINC02202,798,ncRNA,0.058786
LOC100507516,808,ncRNA,0.058632
...,...,...,...
LOC100506403,3539,ncRNA,0.039301
LOC101927751,3564,ncRNA,0.039175
ADAMTS9-AS1,3615,ncRNA,0.038921
LOC101927359,3637,ncRNA,0.038828


Unnamed: 0,Rank
TP53,60.0
MAPT,50.0
IRS1,41.0
CREB1,36.0
ESR1,35.0
CTNNB1,32.0
STAT3,31.0
BAD,29.0
SRC,29.0
VIM,28.0


Unnamed: 0,Rank,Annotation,Mean Correlation
COL16A1,1,protein-coding,0.149929
CDH3,2,protein-coding,0.149520
AFAP1,3,protein-coding,0.149228
PLXNA1,4,protein-coding,0.149180
NIBAN2,5,protein-coding,0.148826
...,...,...,...
GAS1,96,protein-coding,0.123887
HR,97,protein-coding,0.123808
ITGA2,98,protein-coding,0.123803
NAV1,99,protein-coding,0.123759


Unnamed: 0,Rank,Annotation,Mean Correlation
LOC100507516,78,ncRNA,0.126707
MYOSLID,381,ncRNA,0.103966
AGAP2-AS1,433,ncRNA,0.101496
LINC00519,506,ncRNA,0.099159
LINC01615,510,ncRNA,0.098954
...,...,...,...
LOC102724301,2821,ncRNA,0.062246
MANCR,2838,ncRNA,0.062072
LINC01775,2840,ncRNA,0.062058
CT62,2849,ncRNA,0.061972


Unnamed: 0,Rank
FGFR2,223.0
FLNA,195.0
FGFR3,178.0
COL2A1,170.0
LMNA,163.0
PTEN,135.0
CHD7,127.0
TP63,123.0
FBN1,123.0
KRAS,119.0


Unnamed: 0,Rank,Annotation,Mean Correlation
DLG5,1,protein-coding,0.203142
ANTXR1,2,protein-coding,0.197206
ZNF532,3,protein-coding,0.191912
MXRA5,4,protein-coding,0.191836
RAB23,5,protein-coding,0.189820
...,...,...,...
C14orf132,96,protein-coding,0.160373
GAS1,97,protein-coding,0.160349
ISLR,98,protein-coding,0.160096
DOCK9,99,protein-coding,0.159981


Unnamed: 0,Rank,Annotation,Mean Correlation
LOC100507516,33,ncRNA,0.174409
NORAD,282,ncRNA,0.143716
TBX5-AS1,352,ncRNA,0.140768
GAS1RR,475,ncRNA,0.135014
FRMD6-AS1,503,ncRNA,0.133842
...,...,...,...
ACOXL-AS1,3272,ncRNA,0.080719
LINC02560,3275,ncRNA,0.080669
HOTAIR,3281,ncRNA,0.080622
LOC112268148,3302,ncRNA,0.080341


Unnamed: 0,Rank
STAT1,50.0
ISG15,48.0
EGR1,43.0
IFITM3,42.0
DUSP1,41.0
NFKBIA,38.0
B2M,38.0
CXCL10,38.0
IFIT2,38.0
IFIT3,38.0


Unnamed: 0,Rank,Annotation,Mean Correlation
EPSTI1,1,protein-coding,0.361948
IFI44L,2,protein-coding,0.351423
CMPK2,3,protein-coding,0.347822
GBP1,4,protein-coding,0.343674
PARP9,5,protein-coding,0.342974
...,...,...,...
HLA-DOB,96,protein-coding,0.246724
APOBEC3G,97,protein-coding,0.246174
LY6E,98,protein-coding,0.246069
LGALS9,99,protein-coding,0.246021


Unnamed: 0,Rank,Annotation,Mean Correlation
HCP5,26,ncRNA,0.295560
PSMB8-AS1,88,ncRNA,0.251527
LINC02528,205,ncRNA,0.221469
PCED1B-AS1,262,ncRNA,0.213564
HLA-DQB1-AS1,306,ncRNA,0.207061
...,...,...,...
LOC101928674,1937,ncRNA,0.114094
CARD8-AS1,1948,ncRNA,0.113777
C11orf45,1953,ncRNA,0.113542
LINC01010,1974,ncRNA,0.112973
