In [8]:
import h5py 
import pandas as pd 
import numpy as np
import json
import requests
from scipy.stats import pearsonr

In [21]:
def _decode_labels(dataset):
    """Return the first field of every row of ``dataset`` as a NumPy array of ``str``.

    Byte strings are decoded as UTF-8; any other value goes through ``str``.
    This replaces slicing the ``b'...'`` repr (``str(x)[2:-1]``), which mangles
    labels containing backslashes, non-ASCII bytes or both quote characters
    (the repr escapes them) and chops characters off values that are already
    ``str``.
    """
    return np.array([
        label.decode("utf-8") if isinstance(label, bytes) else str(label)
        for label in (row[0] for row in dataset)
    ])


# NOTE(review): neither file is closed here. `gslib` is indexed again in a later
# cell, so the handles presumably have to stay open for the whole session --
# confirm, and close them explicitly once the analysis is done.
fi = h5py.File("tcga.hdf5", "r")
f = h5py.File("auc_data.hdf5", "r")

# Prefix that selects one gene-set library inside auc_data.hdf5.
curr_name = "go_bp_"
data = f['data']
meta = f['meta']

gslib = data['tcga_' + curr_name + 'gslib']
curr_genes = _decode_labels(meta[curr_name + 'genes'])
tcga_genes = _decode_labels(meta['tcga_genes'])
curr_pheno = _decode_labels(meta[curr_name + 'pheno'])
curr_gslib = data[curr_name + 'gslib']
binary_matrix = data[curr_name + "bin_mat"]

cor = fi['full correlation matrix']
# NOTE(review): this overwrites the `tcga_genes` read from auc_data.hdf5 a few
# lines above with the gene list stored in tcga.hdf5. Later cells use this
# second value as the row labels of `gslib` -- confirm both lists are identical.
tcga_genes = _decode_labels(fi['meta']['genes'])
# gene_correlations1(tcga_genes, cor)

In [24]:
def rank_associations(cor, gene, gene_list, pheno_list, rank):
    """Return the labels of the ``rank`` highest-scoring columns for ``gene``.

    Parameters
    ----------
    cor : row-indexable matrix
        ``cor[i]`` must yield the scores of the i-th entry of ``gene_list``.
    gene : str
        Row label to look up in ``gene_list``.
    gene_list : sequence of str
        Row labels of ``cor`` (a NumPy array or a plain list).
    pheno_list : sequence
        Column labels of ``cor``; indexed with the positions of the top scores.
    rank : int or None
        Number of labels to return; ``None`` returns every column.

    Returns
    -------
    list
        Entries of ``pheno_list`` ordered by descending score. Columns with
        equal scores keep their original left-to-right order.

    Raises
    ------
    IndexError
        If ``gene`` does not occur in ``gene_list``.
    """
    # np.asarray lets callers pass a plain list; `list == str` would otherwise
    # evaluate to a single False and never match.
    matches = np.flatnonzero(np.asarray(gene_list) == gene)
    if matches.size == 0:
        raise IndexError("gene %r not found in gene_list" % (gene,))

    row = cor[matches[0]]
    # A stable sort with reverse=True keeps tied columns in their original order.
    order = sorted(range(len(row)), key=lambda col: row[col], reverse=True)
    # Only pheno_list is indexed with column positions: gene_list labels the
    # rows and may be shorter than the number of columns.
    return [pheno_list[col] for col in order[:rank]]

In [19]:
def geneshot(gene, timeout=30):
    """Fetch the genes that the Geneshot ``associate`` endpoint links to ``gene``.

    Parameters
    ----------
    gene : str
        Gene symbol to query.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the calling loop indefinitely.

    Returns
    -------
    list of (str, simScore) tuples, or None
        One tuple per entry of the response's ``association`` mapping, in
        response order. ``None`` when the mapping is empty or when the first
        entry's ``topGenes`` list starts with the string ``'null'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    GENESHOT_URL = 'http://amp.pharm.mssm.edu/geneshot/api'
    query_string = '/associate/%s/%s'
    similarity_matrix = 'enrichr'
    gene_symbols = ','.join([gene])

    url = GENESHOT_URL + query_string % (similarity_matrix, gene_symbols)
    response = requests.get(url, timeout=timeout)
    # Fail with the HTTP status instead of a confusing JSON/KeyError later on.
    response.raise_for_status()

    associations = list(response.json()['association'].items())
    if not associations:
        # Nothing to inspect; callers treat None as "skip this gene".
        return None
    if associations[0][1]['topGenes'][0] == 'null':
        return None
    return [(name, info['simScore']) for name, info in associations]

In [16]:
def gene_correlations(gene_list, cor):
    """Print the mean absolute correlation between Geneshot and matrix rankings.

    For every gene in ``gene_list`` that Geneshot returns data for, the genes
    common to both sources are compared by their position in the *full*
    Geneshot list and in the *full* ranking derived from ``cor``. The absolute
    Pearson correlation of those positions is collected, and the mean over all
    genes is printed.

    Parameters
    ----------
    gene_list : sequence of str
        Labels of both the rows and the columns of ``cor``.
        Assumes the symbols are unique -- TODO confirm.
    cor : row-indexable matrix
        Passed to ``rank_associations``; assumed square over ``gene_list``.

    Genes with fewer than two common genes are skipped, because ``pearsonr``
    raises for inputs shorter than two.
    """
    correlations = []
    known_genes = set(gene_list)  # loop-invariant, built once
    for gene in gene_list:
        geneshot_data = geneshot(gene)
        if not geneshot_data:
            continue

        geneshot_genes = [tup[0] for tup in geneshot_data]
        # rank_associations needs five arguments. gene_list is passed as the
        # column labels too, so the result is the genes ordered by score.
        tcga_list = rank_associations(cor, gene, gene_list, gene_list, len(gene_list))

        common_genes = known_genes & set(geneshot_genes)
        if len(common_genes) < 2:
            continue

        # Sorting both (position, name) lists by name aligns them gene by gene.
        filtered_geneshot = sorted(
            ((pos, name) for pos, name in enumerate(geneshot_genes) if name in common_genes),
            key=lambda t: t[1],
        )
        filtered_tcga = sorted(
            ((pos, name) for pos, name in enumerate(tcga_list) if name in common_genes),
            key=lambda t: t[1],
        )
        geneshot_indices = [pos for pos, _name in filtered_geneshot]
        tcga_indices = [pos for pos, _name in filtered_tcga]

        corr, _ = pearsonr(geneshot_indices, tcga_indices)
        correlations.append(abs(corr))

    # Same printed value as np.mean([]) (nan), without the empty-slice warning.
    print(np.mean(correlations) if correlations else float('nan'))

In [17]:
def gene_correlations1(gene_list, cor):
    """Print the mean absolute rank correlation between Geneshot and ``cor``.

    Variant of the comparison that first restricts both rankings to their
    common genes and then compares each gene's position *within the filtered
    lists*. The absolute Pearson correlation of those positions is collected
    per query gene, and the mean is printed.

    Parameters
    ----------
    gene_list : sequence of str
        Labels of both the rows and the columns of ``cor``.
        Assumes the symbols are unique -- TODO confirm.
    cor : row-indexable matrix
        Passed to ``rank_associations``; assumed square over ``gene_list``.

    Genes with fewer than two common genes are skipped, because ``pearsonr``
    raises for inputs shorter than two.
    """
    correlations = []
    known_genes = set(gene_list)  # loop-invariant, built once
    for gene in gene_list:
        geneshot_data = geneshot(gene)
        if not geneshot_data:
            continue

        geneshot_genes = [tup[0] for tup in geneshot_data]
        # rank_associations needs five arguments. gene_list is passed as the
        # column labels too, so the result is the genes ordered by score.
        tcga_list = rank_associations(cor, gene, gene_list, gene_list, len(gene_list))

        common_genes = known_genes & set(geneshot_genes)
        if len(common_genes) < 2:
            continue

        filtered_geneshot = [g for g in geneshot_genes if g in common_genes]
        filtered_tcga = [g for g in tcga_list if g in common_genes]

        # Sorting (position, name) pairs by name aligns the two rankings.
        geneshot_indices = [
            pos for pos, _name in sorted(enumerate(filtered_geneshot), key=lambda t: t[1])
        ]
        tcga_indices = [
            pos for pos, _name in sorted(enumerate(filtered_tcga), key=lambda t: t[1])
        ]

        corr, _ = pearsonr(geneshot_indices, tcga_indices)
        correlations.append(abs(corr))

    # Same printed value as np.mean([]) (nan), without the empty-slice warning.
    print(np.mean(correlations) if correlations else float('nan'))

In [20]:
# workflow()

In [27]:
# Ten highest-scoring entries of curr_pheno in the gslib row for KLF1.
rank_associations(
    cor=gslib,
    gene='KLF1',
    gene_list=tcga_genes,
    pheno_list=curr_pheno,
    rank=10,
)

['regulation of DNA-directed DNA polymerase activity (GO:1900262)',
 'positive regulation of DNA-directed DNA polymerase activity (GO:1900264)',
 'strand invasion (GO:0042148)',
 'replication fork protection (GO:0048478)',
 "mRNA 3'-splice site recognition (GO:0000389)",
 'CENP-A containing nucleosome assembly (GO:0034080)',
 'CENP-A containing chromatin organization (GO:0061641)',
 'chromatin remodeling at centromere (GO:0031055)',
 'G-quadruplex DNA unwinding (GO:0044806)',
 'centromere complex assembly (GO:0034508)']