In [7]:

import s3fs
import h5py as h5
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import urllib.request
from scipy import stats
import json
import ssl
import math
import re


ENDPOINT = os.getenv('ENDPOINT', "prismexp")
ENDPOINT_API = os.getenv('ENDPOINT_API', "prismexp")
S3_PREDICTION_URL = os.getenv('S3_PREDICTION_URL', "https://mssm-data.s3.amazonaws.com/px_predictions.2.1.2.h5")

if not os.path.exists("testing/prediction.h5"):
    urllib.request.urlretrieve(S3_PREDICTION_URL, "testing/prediction.h5")


In [2]:

def load_json(url):
    context = ssl._create_unverified_context()
    req = urllib.request.Request(url)
    r = urllib.request.urlopen(req, context=context).read()
    return(json.loads(r.decode('utf-8')))

enrichr_libraries = load_json("https://maayanlab.cloud/speedrichr/api/listlibs")["library"]

def loadGenesS3(library):
    with h5.File(file_path, 'r') as f:
        return np.array([s.decode("UTF-8") for s in np.array(f[library+"/gene"])])

def loadSetsS3(library):
    with h5.File(file_path, 'r') as f:
        return np.array([s.decode("UTF-8") for s in  np.array(f[library+"/set"])])

def loadGenePredictionS3(library, idx):
    with h5.File(file_path, 'r') as f:
        return stats.zscore(np.array(f[library+"/prediction"][idx, :])[0])

def loadGeneAUCS3(library, idx):
    with h5.File(file_path, 'r') as f:
        return np.float16(f[library+"/auc/gene"][idx])

def loadSetAUCS3(library):
    with h5.File(file_path, 'r') as f:
        return np.array(f[library+"/auc/set"])

In [None]:
file_path = "testing/prediction.h5"

def get_predictions(gene_symbol):
    libraries = []
    with h5.File(file_path, 'r') as f:
        libraries = [k for k in f.keys()]
    print(libraries)
    result = {}
    result["gene"] = gene_symbol
    result["predictions"] = {}
    genes = loadGenesS3(libraries[0])
    for lib in libraries:
        sets = loadSetsS3(lib)
        idx = np.where(genes == gene_symbol.upper())[0]
        predictions = loadGenePredictionS3(lib, idx)
        gauc = loadGeneAUCS3(lib, idx)
        sauc = loadSetAUCS3(lib)
        setinfo = []
        for i in range(len(predictions)):
            score = predictions[i]
            term_auc = sauc[i]
            if math.isnan(score):
                score = 0.0
                print("fixed")
            if math.isnan(term_auc):
                term_auc = 0.0
                print("fixed")
            setinfo.append({"term": sets[i], "score": float(predictions[i]), "term_auc": float(sauc[i])})
        setinfo = sorted(setinfo, key=lambda d: d['score'], reverse=True)
        result["predictions"][lib] = {}
        result["predictions"][lib]["auc"] = float(gauc[0])
        result["predictions"][lib]["prediction"] = setinfo
    return result 

def get_predictions(gene_symbol):
    libraries = []
    with h5.File(file_path, 'r') as f:
        libraries = [k for k in f.keys()]
    print(libraries)
    result = {}
    result["gene"] = gene_symbol
    result["predictions"] = {}
    genes = loadGenesS3(libraries[0])
    for lib in libraries:
        sets = loadSetsS3(lib)
        idx = np.where(genes == gene_symbol.upper())[0]
        predictions = loadGenePredictionS3(lib, idx)
        gauc = loadGeneAUCS3(lib, idx)
        sauc = loadSetAUCS3(lib)
        setinfo = []
        for i in range(len(predictions)):
            score = predictions[i]
            term_auc = sauc[i]
            if math.isnan(score):
                score = 0.0
                print("fixed")
            if math.isnan(term_auc):
                term_auc = 0.0
                print("fixed")
            setinfo.append({"term": sets[i], "score": float(predictions[i]), "term_auc": float(sauc[i])})
        setinfo = sorted(setinfo, key=lambda d: d['score'], reverse=True)
        result["predictions"][lib] = {}
        if math.isnan(float(gauc[0])):
            gauc[0] = -1
        result["predictions"][lib]["auc"] = float(gauc[0])
        result["predictions"][lib]["prediction"] = setinfo
    return result 

In [16]:
get_predictions("SOX2")

['ChEA_2022', 'GO_Biological_Process_2021', 'GWAS_Catalog_2019', 'Human_Phenotype_Ontology', 'KEA_2015', 'KEGG_2021_Human', 'MGI_Mammalian_Phenotype_Level_4_2021', 'OMIM_Disease', 'huMAP']
boo


{'gene': 'SOX2',
 'predictions': {'ChEA_2022': {'auc': 0.7060546875,
   'prediction': [{'term': 'RING1B 27294783 CHIP-SEQ NPCS MOUSE',
     'score': 2.220703125,
     'term_auc': 0.8564453125},
    {'term': 'RING1B 27294783 CHIP-SEQ ESCS MOUSE',
     'score': 2.212890625,
     'term_auc': 0.84521484375},
    {'term': 'POU5F1 16153702 CHIP-CHIP HESCS HUMAN',
     'score': 2.12890625,
     'term_auc': 0.83935546875},
    {'term': 'TP53 20018659 CHIP-CHIP R1E MOUSE',
     'score': 2.12109375,
     'term_auc': 0.85498046875},
    {'term': 'ERG 21242973 CHIP-CHIP JURKAT HUMAN',
     'score': 2.099609375,
     'term_auc': 0.83203125},
    {'term': 'RNF2 16625203 CHIP-CHIP MESCS MOUSE',
     'score': 2.0703125,
     'term_auc': 0.8662109375},
    {'term': 'SOX2 16153702 CHIP-CHIP HESCS HUMAN',
     'score': 2.068359375,
     'term_auc': 0.82080078125},
    {'term': 'OLIG2 26023283 CHIP-SEQ AINV15 MOUSE',
     'score': 2.0078125,
     'term_auc': 0.82666015625},
    {'term': 'RNF2 27304074 CHI