In [50]:
import h5py
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score
from scipy.stats import norm
from matplotlib import pyplot

In [39]:
f = h5py.File("auc_data.hdf5", "r")

In [72]:
curr_name = "mgi_"

In [73]:
data = f['data']
meta = f['meta']
gslib = data['tcga_' + curr_name + 'gslib']
curr_genes = np.array([ str(g[0])[2:-1] for g in meta[curr_name + 'genes'] ])
tcga_genes = np.array([ str(g[0])[2:-1] for g in meta['tcga_genes'] ])
curr_pheno = np.array([ str(p[0])[2:-1] for p in meta[curr_name + 'pheno'] ])
curr_gslib = data[curr_name + 'gslib']
binary_matrix = data[curr_name + "bin_mat"]

In [74]:
common = list(set(tcga_genes) & set(curr_genes))
common[0]

'G6PC3'

In [75]:
auc_list = []
for g in common:
    tcga_idx = np.where(tcga_genes == g)[0][0]
    mgi_idx = np.where(np.transpose(curr_genes) == g)[0][0]
    y_true = binary_matrix[mgi_idx]
    y_probs = gslib[tcga_idx]
    # calculate AUC
    auc = roc_auc_score(y_true, y_probs)
    if auc >= 0.90: 
        auc_list.append(g)

In [77]:
n = 0
auc_list[n]

'PLK1'

In [78]:
def rank_genes(gene, gslib, gene_list, phenotypes, rank):
    row = np.where(gene_list == gene)[0][0]
    tups = list(dict(zip(range(len(gslib[0])), gslib[row])).items())
    tups.sort(key = lambda t: t[1], reverse = True)
    top_rank = [ t[0] for t in tups[: rank] ]
    top_pheno = [ phenotypes[i] for i in top_rank ]
    return top_pheno

In [79]:
rank_genes(auc_list[n], gslib, tcga_genes, curr_pheno, 10)

['MP:0009762 abnormal mitotic spindle assembly checkpoint',
 'MP:0004966 abnormal inner cell mass proliferation',
 'MP:0012174 flat head',
 'MP:0009760 abnormal mitotic spindle morphology',
 'MP:0004965 inner cell mass degeneration',
 'MP:0003707 increased cell nucleus count',
 'MP:0003708 binucleate',
 'MP:0004046 abnormal mitosis',
 'MP:0004024 aneuploidy',
 'MP:0012111 failure of morula compaction']

In [80]:
rank_genes(auc_list[n], curr_gslib, curr_genes, curr_pheno, 10)

['MP:0013292 embryonic lethality prior to organogenesis',
 'MP:0013293 embryonic lethality prior to tooth bud stage',
 'MP:0008714 increased lung carcinoma incidence',
 'MP:0011094 embryonic lethality before implantation, complete penetrance',
 'MP:0004207 increased squamous cell carcinoma incidence',
 'MP:0002032 increased sarcoma incidence',
 'MP:0011100 preweaning lethality, complete penetrance',
 'MP:0012431 increased lymphoma incidence',
 'MP:0004024 aneuploidy',
 'MP:0009760 abnormal mitotic spindle morphology']

In [122]:
len(auc_list)

568

In [116]:
np.where(curr_genes == auc_list[30])[0][0]

11374

In [117]:
curr_genes[11374]

'RSAD2'

In [118]:
row = curr_gslib[11374]

In [119]:
tups = list(dict(zip(range(len(row)), row)).items())

In [120]:
tups.sort(key = lambda t: t[1], reverse=True)

In [123]:
pd.DataFrame(curr_gslib)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5251,5252,5253,5254,5255,5256,5257,5258,5259,5260
0,0.166419,0.000000,0.086941,0.000000,0.000000,0.170768,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.102684,0.000000,0.000000,0.000000,0.000000,0.093394,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.086942,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.116485,0.000000,0.120243,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.074979,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.113224,0.000000,0.119822,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.039319,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.140798,0.208933,0.151365,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.068793,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.073312,0.000000,0.064395,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [125]:
x = np.corrcoef(binary_matrix)

In [127]:
pd.DataFrame(x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13410,13411,13412,13413,13414,13415,13416,13417,13418,13419
0,1.000000,0.108803,0.168620,0.159872,0.121927,0.168792,0.018553,0.217663,0.122054,0.076933,...,-0.001079,-0.001526,-0.001079,-0.001079,-0.001079,-0.001079,-0.001079,-0.001079,-0.001079,-0.001079
1,0.108803,1.000000,0.106788,0.094546,0.090404,0.090016,0.063467,0.117583,0.033302,0.029627,...,-0.001176,-0.001663,-0.001176,-0.001176,-0.001176,-0.001176,-0.001176,-0.001176,-0.001176,-0.001176
2,0.168620,0.106788,1.000000,0.090601,0.079018,0.149343,0.021077,0.092955,0.034581,0.035087,...,-0.002853,-0.004035,-0.002853,-0.002853,-0.002853,-0.002853,-0.002853,-0.002853,-0.002853,-0.002853
3,0.159872,0.094546,0.090601,1.000000,0.028490,0.363648,0.037530,0.230642,0.031100,0.103782,...,-0.001222,-0.001728,-0.001222,-0.001222,-0.001222,-0.001222,-0.001222,-0.001222,-0.001222,-0.001222
4,0.121927,0.090404,0.079018,0.028490,1.000000,0.064516,0.025248,0.055298,0.075766,0.061407,...,-0.001566,-0.002215,-0.001566,-0.001566,-0.001566,-0.001566,-0.001566,-0.001566,-0.001566,-0.001566
5,0.168792,0.090016,0.149343,0.363648,0.064516,1.000000,0.050922,0.242235,0.045076,0.087175,...,-0.000972,-0.001374,-0.000972,-0.000972,-0.000972,-0.000972,-0.000972,-0.000972,-0.000972,-0.000972
6,0.018553,0.063467,0.021077,0.037530,0.025248,0.050922,1.000000,0.031442,-0.000378,0.006811,...,-0.001309,-0.001852,-0.001309,-0.001309,-0.001309,-0.001309,-0.001309,-0.001309,-0.001309,-0.001309
7,0.217663,0.117583,0.092955,0.230642,0.055298,0.242235,0.031442,1.000000,0.038959,0.084659,...,-0.000762,-0.001077,-0.000762,-0.000762,-0.000762,-0.000762,-0.000762,-0.000762,-0.000762,-0.000762
8,0.122054,0.033302,0.034581,0.031100,0.075766,0.045076,-0.000378,0.038959,1.000000,0.028470,...,-0.002061,-0.002915,-0.002061,-0.002061,-0.002061,-0.002061,-0.002061,-0.002061,-0.002061,-0.002061
9,0.076933,0.029627,0.035087,0.103782,0.061407,0.087175,0.006811,0.084659,0.028470,1.000000,...,-0.001590,-0.002248,-0.001590,-0.001590,-0.001590,-0.001590,-0.001590,-0.001590,-0.001590,-0.001590
