In [1]:
import json
import requests


# BigGIM example

swagger api specification: http://biggim.ncats.io/api

In [2]:
base_url = 'http://biggim.ncats.io/api'

In [3]:
#a couple of simple helper functions
def post(endpoint, data={}, base_url=base_url):
    req = requests.post('%s/%s' % (base_url,endpoint), data=data)
    req.raise_for_status()
    return req.json()

def get(endpoint, data={}, base_url=base_url):
    req = requests.get('%s/%s' % (base_url,endpoint), data=data)
    req.raise_for_status()
    return req.json()
    

def jprint(dct):
    print(json.dumps(dct, indent=2))

## Example Metadata

### Get available studies

In [4]:
try:
    studies = get('metadata/study')
    jprint(studies)
except requests.HTTPError as e:
    print(e)

[
  {
    "name": "BioGRID",
    "substudies": [
      {
        "cell_of_origin": null,
        "tissue_hierarchy": null,
        "description": "Default",
        "name": "Default",
        "columns": [
          {
            "datatype": "BOOLEAN",
            "table": {
              "name": "BigGIM_70_v1"
            },
            "interactions_type": "Boolean Flag For Interaction",
            "name": "BioGRID_Interaction"
          },
          {
            "datatype": "STRING",
            "table": {
              "name": "BigGIM_70_v1"
            },
            "interactions_type": "Experimental System Used",
            "name": "BioGRID_Experimental_System"
          },
          {
            "datatype": "STRING",
            "table": {
              "name": "BigGIM_70_v1"
            },
            "interactions_type": "Experimental System Type",
            "name": "BioGRID_Experimental_System_Type"
          },
          {
            "datatype": "STRING",
            

In [5]:
study_names = [s['name'] for s in studies]
study_names

['BioGRID', 'TCGA', 'GIANT', 'GTEx']

### Get a single study

In [6]:

try:
    tcga = get('metadata/study/%s' % ('TCGA'))
    jprint(tcga)
except requests.HTTPError as e:
    print(e)
    
    jprint(e.response.json())

{
  "name": "TCGA",
  "substudies": [
    {
      "cell_of_origin": "ovarian serous carcinoma cell",
      "tissue_hierarchy": "0 - ovarian serous carcinoma cell|1 - ovary cancer cell|2 - ovary|3 - female reproductive gland|3 - internal female genital organ|4 - female reproductive system|4 - endocrine gland|5 - reproductive system|5 - gland|6 - urogenital system|6 - whole body|7 - animal|8 - tissues, cell types and enzyme sources",
      "description": "Ovarian serous cystadenocarcinoma",
      "name": "OV",
      "columns": [
        {
          "datatype": "FLOAT",
          "table": {
            "name": "BigGIM_70_v1"
          },
          "interactions_type": "Spearman Rank Correlation Coefficient",
          "name": "TCGA_OV_Correlation"
        },
        {
          "datatype": "FLOAT",
          "table": {
            "name": "BigGIM_70_v1"
          },
          "interactions_type": "P-value (-log10)",
          "name": "TCGA_OV_Pvalue"
        },
        {
          "dataty

### what an error looks like

In [7]:

try:
    ta = get('metadata/study/%s' % ('TCSG'))
    jprint(studies)
except requests.HTTPError as e:
    print(e)
    
    jprint(e.response.json())

404 Client Error: NOT FOUND for url: http://biggim.ncats.io/api/metadata/study/TCSG
{
  "message": "[TCSG] not a valid study",
  "status": "error"
}


### Get all tables

In [8]:
try:
    tables = get('/metadata/table')
    jprint(tables)
except requests.HTTPError as e:
    print(e)
    
    jprint(e.response.json())

[
  {
    "default": true,
    "name": "BigGIM_70_v1",
    "description": "Gene pairwise associations: correlation metrics from 33 TCGA tumor types, 21 GTEx tissues, functional interaction scores from 145 tissues (from GIANT), and BioGRID interactions. Containing only rows (gene pairs) where at least one of the TCGA or GTEx absolute correlations is higher than 0.7",
    "num_bytes": 146013971730,
    "num_rows": 51090886
  },
  {
    "default": false,
    "name": "BigGIM_80_v1",
    "description": "Gene pairwise associations: correlation metrics from 33 TCGA tumor types, 21 GTEx tissues, functional interaction scores from 145 tissues (from GIANT), and BioGRID interactions. Containing only rows (gene pairs) where at least one of the TCGA or GTEx absolute correlations is higher than 0.8",
    "num_bytes": 41062787154,
    "num_rows": 14210893
  },
  {
    "default": false,
    "name": "BigGIM_90_v1",
    "description": "Gene pairwise associations: correlation metrics from 33 TCGA tumor t

### Get one table

The default table

In [9]:
default_table = [t for t in tables if t['default'] == True][0]['name']


In [10]:
try:
    table = get('/metadata/table/%s' %  (default_table,))
    jprint(table)
except requests.HTTPError as e:
    print(e)
    
    jprint(e.response.json())

{
  "name": "BigGIM_70_v1",
  "description": "Gene pairwise associations: correlation metrics from 33 TCGA tumor types, 21 GTEx tissues, functional interaction scores from 145 tissues (from GIANT), and BioGRID interactions. Containing only rows (gene pairs) where at least one of the TCGA or GTEx absolute correlations is higher than 0.7",
  "num_rows": 51090886,
  "columns": [
    {
      "interactions_type": "Spearman Rank Correlation Coefficient",
      "datatype": "FLOAT",
      "name": "TCGA_ACC_Correlation",
      "substudy": {
        "study": {
          "name": "TCGA",
          "description": " The Cancer Genome Atlas\n        The Cancer Genome Atlas (TCGA) is a collaboration between the National Cancer Institute (NCI) and the National Human Genome Research Institute (NHGRI) that has generated comprehensive, multi-dimensional maps of the key genomic changes in 33 types of cancer. The TCGA dataset, comprising more than two petabytes of genomic data, has been made publically avai

### First 10 columns in default table

In [11]:
jprint(table['columns'][:10])

[
  {
    "interactions_type": "Spearman Rank Correlation Coefficient",
    "datatype": "FLOAT",
    "name": "TCGA_ACC_Correlation",
    "substudy": {
      "study": {
        "name": "TCGA",
        "description": " The Cancer Genome Atlas\n        The Cancer Genome Atlas (TCGA) is a collaboration between the National Cancer Institute (NCI) and the National Human Genome Research Institute (NHGRI) that has generated comprehensive, multi-dimensional maps of the key genomic changes in 33 types of cancer. The TCGA dataset, comprising more than two petabytes of genomic data, has been made publically available, and this genomic information helps the cancer research community to improve the prevention, diagnosis, and treatment of cancer.\n\n        https://cancergenome.nih.gov/\n        "
      },
      "cell_of_origin": "adrenocortical carcinoma cell",
      "tissue_hierarchy": "0 - adrenocortical carcinoma cell|1 - adrenal cortex|2 - adrenal gland|3 - viscus|3 - endocrine gland|4 - whole b

### Get a single column

In [12]:
try:
    column = get('/metadata/table/%s/column/%s' %  (default_table,'TCGA_ACC_Correlation'))
    jprint(column)
except requests.HTTPError as e:
    print(e)
    
    jprint(e.response.json())

{
  "interactions_type": "Spearman Rank Correlation Coefficient",
  "datatype": "FLOAT",
  "name": "TCGA_ACC_Correlation",
  "substudy": {
    "study": {
      "name": "TCGA",
      "description": " The Cancer Genome Atlas\n        The Cancer Genome Atlas (TCGA) is a collaboration between the National Cancer Institute (NCI) and the National Human Genome Research Institute (NHGRI) that has generated comprehensive, multi-dimensional maps of the key genomic changes in 33 types of cancer. The TCGA dataset, comprising more than two petabytes of genomic data, has been made publically available, and this genomic information helps the cancer research community to improve the prevention, diagnosis, and treatment of cancer.\n\n        https://cancergenome.nih.gov/\n        "
    },
    "cell_of_origin": "adrenocortical carcinoma cell",
    "tissue_hierarchy": "0 - adrenocortical carcinoma cell|1 - adrenal cortex|2 - adrenal gland|3 - viscus|3 - endocrine gland|4 - whole body|4 - gland|5 - animal

### list all tissues

In [13]:
tissues = get('metadata/tissue')
jprint(tissues)

{
  "tissues": [
    "B_lymphocyte",
    "B_lymphoma_cell_line",
    "T_lymphocyte",
    "acute_myeloid_leukemia_cell",
    "adenocarcinoma_cell",
    "adipose_tissue",
    "adrenal_cortex",
    "adrenal_gland",
    "adrenal_gland_cancer_cell",
    "adrenocortical_carcinoma_cell",
    "adult_stem_cell",
    "alimentary_canal",
    "amygdala",
    "animal",
    "aorta",
    "artery",
    "astroblast",
    "astrocyte",
    "astrocytoma_cell",
    "astroglia",
    "astroglial_cell",
    "avian_pallium",
    "basal_ganglion",
    "basophil",
    "bile_duct",
    "bile_duct_epithelium",
    "biliary_epithelium",
    "bladder",
    "blast_cell",
    "blastomere",
    "blastula",
    "blood",
    "blood_cancer_cell",
    "blood_plasma",
    "blood_platelet",
    "blood_vessel",
    "blood_vessel_endothelium",
    "bone",
    "bone_cancer_cell",
    "bone_marrow",
    "bone_marrow_cell",
    "brain",
    "brain_stem",
    "brain_ventricle",
    "breast",
    "breast_cancer_cell",
    "breast_e

In [14]:
substudies = get('metadata/tissue/%s' % ("lymphoid_cell",))
jprint(substudies)

{
  "tissue": "lymphoid_cell",
  "substudies": [
    {
      "cell_of_origin": "natural killer cell",
      "tissue_hierarchy": "0 - natural killer cell|1 - null cell|1 - large granular lymphocyte|2 - lymphocyte|3 - leukocyte|3 - lymphoid cell|4 - hematopoietic cell|4 - lymphoid tissue|5 - blood|5 - immune system|5 - hematopoietic system|6 - whole body|7 - animal|8 - tissues, cell types and enzyme sources",
      "description": "natural_killer_cell",
      "name": "natural_killer_cell",
      "columns": [
        {
          "datatype": "FLOAT",
          "table": {
            "name": "BigGIM_70_v1"
          },
          "interactions_type": "Binary Call For Known Functional Interaction",
          "name": "GIANT_natural_killer_cell_KnownFunctionalInteraction"
        },
        {
          "datatype": "FLOAT",
          "table": {
            "name": "BigGIM_70_v1"
          },
          "interactions_type": "Bayesian Posterior Probability Of Functional Interaction",
          "name

## Example query

In [15]:
example_query = {
  "restriction_join": "union",
  "restriction_gt": "TCGA_GBM_Correlation,.2, GTEx_Brain_Correlation,.2",
  "restriction_lt": "TCGA_GBM_Pvalue,.05, GTEx_Brain_Pvalue,.01",
  "table": default_table,
  "ids2": "5111,6996,57697,6815,889,7112,2176,1019,5888,5706,3333,1111,112,3333",
  "columns": "TCGA_GBM_Correlation,TCGA_GBM_Pvalue,GTEx_Brain_Correlation,GTEx_Brain_Pvalue",
  "ids1": "5111,6996,57697,6815,889,7112,2176,1019,5888,5706,5722,1111,112,3333",
  "limit": 100000
}
try:
    query_submit = get('biggim/query', data=example_query)
    jprint(query_submit)
except requests.HTTPError as e:
    print(e)
    
    jprint(e.response.json())

{
  "request_id": "c6f3dd0e-8602-44e2-90fb-b3d46a9b0915",
  "status": "submitted"
}


### Check the status of the query

In [16]:
import time
try:
    while True:
        query_status = get('biggim/status/%s'% (query_submit['request_id'],))
        jprint(query_status)
        if query_status['status'] !='running':
            # query has finished
            break
        else:
            time.sleep(1)
            print("Checking again")
except requests.HTTPError as e:
    print(e)
    
    jprint(e.response.json())

{
  "request_uri": [
    "https://storage.googleapis.com/ncats_bigquery_results/c6f3dd0e-8602-44e2-90fb-b3d46a9b0915000000000000.csv"
  ],
  "size": "2.72 KB",
  "rows": 54,
  "request_id": "c6f3dd0e-8602-44e2-90fb-b3d46a9b0915",
  "status": "complete",
  "processed_data": "0B"
}


### Get the results as dataframe

In [17]:
import pandas
result = pandas.concat(map(pandas.read_csv, query_status['request_uri']))
result

Unnamed: 0,GPID,Gene1,Gene2,TCGA_GBM_Correlation,TCGA_GBM_Pvalue,GTEx_Brain_Correlation,GTEx_Brain_Pvalue
0,21760000005888,5888,2176,0.543,9.83,0.5403,95.62
1,51110000005888,5888,5111,0.5431,9.83,0.551,100.17
2,11110000005888,5888,1111,0.653,15.2,-0.0131,0.19
3,1120000005888,5888,112,0.0019,0.01,0.556,102.35
4,10190000005888,5888,1019,0.5021,8.29,0.3958,47.76
5,10190000005706,5706,1019,0.22,1.8,0.4353,58.63
6,51110000005706,5706,5111,0.3278,3.59,0.7406,218.54
7,8890000005706,5706,889,-0.0382,0.17,0.8163,301.2
8,10190000006996,6996,1019,0.3294,3.62,0.4184,53.8
9,8890000006996,6996,889,0.036,0.16,0.808,290.39


# Full example

Lets get relationships for a certain tissue where any tissue related correlation has a pvalue < .05

 Assume we are interested in issues in lymphoid tissues

In [18]:
# get lymphoid substudies
substudies = get('metadata/tissue/%s' % ("blood",))


# we only want things in a specific table
table = default_table
pvalue_columns = []
column_names = []
for ss in substudies['substudies']:
    for column in ss['columns']:
        # only if column is from our table
        if column['table']['name'] == table:
            # add column to select
            column_names.append(column['name'])
            if column['interactions_type'] == 'P-value (-log10)':
                #add pvalue to where
                pvalue_columns.append(column['name'])
 
# grab columns with lower pvalues
pv = []
for p in pvalue_columns:
    pv.append(p)
    pv.append(str(.05))
query_arg = {}
query_arg['table'] = table
query_arg['columns'] = ','.join(sorted(column_names))
if len(pv):
    query_arg['restriction_lt'] = ','.join(pv)
# get if any of the lymphoid columns have 
query_arg['restriction_join'] = 'union'
query_arg['limit'] = 1000000
jprint(query_arg)

try:
    query_submit = get('biggim/query', data=query_arg)
    jprint(query_submit)
except requests.HTTPError as e:
    print(e)
    
    jprint(e.response.json())

try:
    ctr = 1
    while True:
        
        query_status = get('biggim/status/%s'% (query_submit['request_id'],))
        jprint(query_status)
        if query_status['status'] !='running':
            # query has finished
            break
        else:
            time.sleep(ctr)
            ctr += 1
            #linear backoff
            print("Checking again")
except requests.HTTPError as e:
    print(e)
    
    jprint(e.response.json())
if query_status['request_uri']:
    result = pandas.concat(map(pandas.read_csv, query_status['request_uri']))
    result.head()
else:
    print("Error see above")

{
  "columns": "GIANT_b_lymphocyte_KnownFunctionalInteraction,GIANT_b_lymphocyte_ProbabilityOfFunctionalInteraction,GIANT_basophil_KnownFunctionalInteraction,GIANT_basophil_ProbabilityOfFunctionalInteraction,GIANT_blood_KnownFunctionalInteraction,GIANT_blood_ProbabilityOfFunctionalInteraction,GIANT_blood_plasma_KnownFunctionalInteraction,GIANT_blood_plasma_ProbabilityOfFunctionalInteraction,GIANT_blood_platelet_KnownFunctionalInteraction,GIANT_blood_platelet_ProbabilityOfFunctionalInteraction,GIANT_dendritic_cell_KnownFunctionalInteraction,GIANT_dendritic_cell_ProbabilityOfFunctionalInteraction,GIANT_eosinophil_KnownFunctionalInteraction,GIANT_eosinophil_ProbabilityOfFunctionalInteraction,GIANT_granulocyte_KnownFunctionalInteraction,GIANT_granulocyte_ProbabilityOfFunctionalInteraction,GIANT_hematopoietic_stem_cell_KnownFunctionalInteraction,GIANT_hematopoietic_stem_cell_ProbabilityOfFunctionalInteraction,GIANT_leukocyte_KnownFunctionalInteraction,GIANT_leukocyte_ProbabilityOfFunctional

In [19]:
result.describe()

Unnamed: 0,GPID,Gene1,Gene2,GIANT_b_lymphocyte_KnownFunctionalInteraction,GIANT_b_lymphocyte_ProbabilityOfFunctionalInteraction,GIANT_basophil_KnownFunctionalInteraction,GIANT_basophil_ProbabilityOfFunctionalInteraction,GIANT_blood_KnownFunctionalInteraction,GIANT_blood_ProbabilityOfFunctionalInteraction,GIANT_blood_plasma_KnownFunctionalInteraction,...,GIANT_t_lymphocyte_KnownFunctionalInteraction,GIANT_t_lymphocyte_ProbabilityOfFunctionalInteraction,GIANT_thymocyte_KnownFunctionalInteraction,GIANT_thymocyte_ProbabilityOfFunctionalInteraction,GTEx_Blood_Correlation,GTEx_Blood_Pvalue,TCGA_DLBC_Correlation,TCGA_DLBC_Pvalue,TCGA_LAML_Correlation,TCGA_LAML_Pvalue
count,1000000.0,1000000.0,1000000.0,981572.0,981572.0,981572.0,981572.0,981572.0,981572.0,981572.0,...,981572.0,981572.0,981572.0,981572.0,992398.0,992398.0,903756.0,903756.0,699181.0,699181.0
mean,1113490000000000.0,3162595.0,111349.0,9.1e-05,0.03241,2e-06,0.26731,0.000878,0.050434,0.000184,...,0.000125,0.042617,2.3e-05,0.045766,0.236475,33.171874,0.019585,0.375821,0.039455,1.247562
std,2.864479e+16,17243360.0,2864479.0,0.009522,0.040762,0.001427,0.232961,0.029621,0.044822,0.013578,...,0.011193,0.039923,0.004841,0.059814,0.384304,44.131055,0.166694,0.895111,0.156253,2.369636
min,10000000000.0,13.0,1.0,0.0,1.4e-05,0.0,7.5e-05,0.0,0.000326,0.0,...,0.0,0.000208,0.0,1.1e-05,-0.8897,0.0,-0.8947,0.0,-0.7669,0.0
25%,37680000000000.0,51085.0,3768.0,0.0,0.007402,0.0,0.090176,0.0,0.024408,0.0,...,0.0,0.017192,0.0,0.011122,-0.0031,0.75,-0.014,0.02,-0.0082,0.03
50%,86330000000000.0,83903.0,8633.0,0.0,0.020462,0.0,0.185712,0.0,0.041415,0.0,...,0.0,0.033352,0.0,0.027561,0.1454,9.3,0.0024,0.03,0.0042,0.28
75%,512480000000000.0,171024.0,51248.0,0.0,0.043506,0.0,0.384964,0.0,0.064434,0.0,...,0.0,0.056911,0.0,0.05895,0.6137,57.7175,0.0185,0.3,0.1089,1.46
max,1.053787e+18,109286600.0,105378700.0,1.0,0.996109,1.0,0.999977,1.0,0.996149,1.0,...,1.0,0.99421,1.0,0.999469,0.9626,290.04,0.9203,15.11,0.9339,72.63
