In [None]:
%%bash
declare -A endpoints=(
    ["mygene"]="https://mygene.info/metadata/fields"
    ["mychem"]="https://mychem.info/metadata/fields"
    ["myvariant"]="https://myvariant.info/metadata/fields"
    ["mydisease"]="https://mydisease.info/metadata/fields"
);

for service in "${!endpoints[@]}"; do
    curl -X GET "${endpoints[$service]}" -o "data/schema/${service}_fields.json";
    echo "Downloaded ${service} schema to data/schema/${service}_fields.json";
done

In [3]:
import json
import os

directory = "data/schema"
schemas = {}

for filename in os.listdir(directory):
    if filename.endswith(".json"):
        key = os.path.splitext(filename)[0].replace("_fields", "")
        with open(os.path.join(directory, filename)) as fd:
            schemas[key] = json.load(fd)

for key, fields in schemas.items():
    print(f"{key}: {len(fields)} entries")

mychem: 1082 entries
mygene: 355 entries
mydisease: 264 entries
myvariant: 1615 entries


In [4]:
version_map = {
    "mychem": "v1",
    "mygene": "v3",
    "mydisease": "v1",
    "myvariant": "v1"
}

In [5]:
# fields cannot be searched unless their schema is known, hence delete object fields
def remove_obj_fields(api_name: str, api_schema: dict):
    keys_to_del = []
    for key in api_schema:
        if api_schema[key]["type"] == "object":
            keys_to_del.append(key)

        # NOTE: can update prompt instead of using "type" every line
        # api_schema[key] = {"type": api_schema[key]["type"]}
    
    for key in keys_to_del:
        del api_schema[key]

    print(f"Deleted {len(keys_to_del)} fields, leaving {len(api_schema)} fields for {api_name}")

In [6]:
for api_name, api_schema in schemas.items():
    remove_obj_fields(api_name, api_schema)

print(f"\nMake sure dict was modified in place, len(mygene): {len(schemas["mygene"])}")

Deleted 145 fields, leaving 937 fields for mychem
Deleted 75 fields, leaving 280 fields for mygene
Deleted 40 fields, leaving 224 fields for mydisease
Deleted 246 fields, leaving 1369 fields for myvariant

Make sure dict was modified in place, len(mygene): 280


In [7]:
def minify_response(data: dict):
    assert isinstance(data, dict)
    for k, v in data.items():
        if isinstance(v, list) and len(v) > 1:
            data[k] = v[:1]
    return data

In [8]:
import httpx

endpoint_template = "https://{service}.info/{version}/query?size=1&fields=all&dotfield=true"

# manually curated ids for biggest record for each API, if needed
entity_ids = {
    "mychem": "AFCGFAGUEYAMAO-UHFFFAOYSA-N",  # InChIKey
    "mygene": "2146",  # short for NCBIGene:2146
    "mydisease": "0000845",  # short for MONDO:0000845
    "myvariant": "162741822G",  # short for chr1:g.162741822G>A
}
minified_responses = {}

with httpx.Client() as client:
    for service in schemas:
        url = endpoint_template.format(service=service, version=version_map[service])
        response = client.get(url)
        if response.status_code == 200:
            minified_responses[service] = minify_response(response.json()["hits"][0])
            print(f"Fetched and minified sample for {service}")
        else:
            print(f"Failed to fetch data for {service}, status code: {response.status_code}")

Fetched and minified sample for mychem
Fetched and minified sample for mygene
Fetched and minified sample for mydisease
Fetched and minified sample for myvariant


In [None]:
schema_desc = {}

In [48]:
output_schema = {
    "type": "object",
    "properties": {
        "descriptions": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "field": {"type": "string"},
                    "description": {"type": "string"}
                },
                "required": ["field", "description"]
            }
        }
    },
    "required": ["descriptions"]
}

In [49]:
print(list(schemas["myvariant"].items()))

[('_seqhashed._flag', {'index': True, 'type': 'boolean'}), ('cadd.1000g.af', {'index': True, 'type': 'float'}), ('cadd.1000g.afr', {'index': False, 'type': 'float'}), ('cadd.1000g.amr', {'index': False, 'type': 'float'}), ('cadd.1000g.asn', {'index': False, 'type': 'float'}), ('cadd.1000g.eur', {'index': False, 'type': 'float'}), ('cadd.alt', {'analyzer': 'string_lowercase', 'index': True, 'type': 'text'}), ('cadd.anc', {'index': False, 'type': 'text'}), ('cadd.annotype', {'analyzer': 'string_lowercase', 'index': True, 'type': 'text'}), ('cadd.bstatistic', {'index': False, 'type': 'integer'}), ('cadd.chmm.bivflnk', {'index': False, 'type': 'float'}), ('cadd.chmm.enh', {'index': False, 'type': 'float'}), ('cadd.chmm.enhbiv', {'index': False, 'type': 'float'}), ('cadd.chmm.het', {'index': False, 'type': 'float'}), ('cadd.chmm.quies', {'index': False, 'type': 'float'}), ('cadd.chmm.reprpc', {'index': False, 'type': 'float'}), ('cadd.chmm.reprpcwk', {'index': False, 'type': 'float'}), ('ca

In [50]:
minified_responses["myvariant"]

{'_id': 'chr6:g.30656453A>G',
 '_score': 1.0,
 'cadd._license': 'http://bit.ly/2TIuab9',
 'cadd.alt': 'G',
 'cadd.anc': 'A',
 'cadd.annotype': 'CodingTranscript',
 'cadd.bstatistic': 851,
 'cadd.chmm.bivflnk': 0.0,
 'cadd.chmm.enh': 0.055,
 'cadd.chmm.enhbiv': 0.0,
 'cadd.chmm.het': 0.0,
 'cadd.chmm.quies': 0.0,
 'cadd.chmm.reprpc': 0.0,
 'cadd.chmm.reprpcwk': 0.0,
 'cadd.chmm.tssa': 0.024,
 'cadd.chmm.tssaflnk': 0.22,
 'cadd.chmm.tssbiv': 0.0,
 'cadd.chmm.tx': 0.0,
 'cadd.chmm.txflnk': 0.047,
 'cadd.chmm.txwk': 0.252,
 'cadd.chmm.znfrpts': 0.0,
 'cadd.chrom': 6,
 'cadd.consdetail': 'synonymous',
 'cadd.consequence': 'SYNONYMOUS',
 'cadd.consscore': 5,
 'cadd.cpg': 0.04,
 'cadd.dna.helt': 0.65,
 'cadd.dna.mgw': 0.15,
 'cadd.dna.prot': 2.54,
 'cadd.dna.roll': 1.48,
 'cadd.encode.exp': 251.28,
 'cadd.encode.h3k27ac': 49.96,
 'cadd.encode.h3k4me1': 22.0,
 'cadd.encode.h3k4me3': 38.8,
 'cadd.encode.nucleo': 1.7,
 'cadd.encode.occ': 3,
 'cadd.encode.p_val.comb': 1.44,
 'cadd.encode.p_val.ct

In [71]:
from openai import OpenAI

openai_client = OpenAI()

def describe_fields(fields: dict, sample_record: dict):
    """Call the OpenAI API to describe the keys."""
    
    response = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "You are an expert in computational biology. Use the example given as reference for the task."
            },
            {
                "role": "user",
                "content": "Using your experience, describe the following fields from a biology database that aggregates data from various sources. Strictly output a JSON.\n" + json.dumps(fields) + "\n\nHere's a sample record\n" + json.dumps(sample_record)
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "field_description",
                "schema": output_schema
            }
        }
    )
    return response.choices[0].message.content

In [5]:
d = {
   "category": "Defines the classification of the entry, categorizing it into specific groups.",
   "compounds.annotation_quality": "Indicates the quality or reliability of the compound's annotation, such as 'high' or 'low'.",
   "compounds.cas_number": "The unique numerical identifier assigned to the chemical substance by the Chemical Abstracts Service (CAS).",
   "compounds.description": "A textual description of the compound, including its sources, occurrences, and properties.",
   "compounds.kingdom": "The broader classification of the compound within organic or inorganic categories.",
   "compounds.klass": "A mid-level classification of the compound, often referring to its chemical family.",
   "compounds.moldb_inchi": "The International Chemical Identifier (InChI) representation of the compound's structure.",
   "compounds.moldb_inchikey": "A hashed version of the InChI that serves as a unique identifier for the compound.",
   "compounds.moldb_iupac": "The IUPAC (International Union of Pure and Applied Chemistry) systematic name of the compound.",
   "compounds.moldb_mono_mass": "The monoisotopic mass of the compound, representing its exact atomic weight.",
   "compounds.moldb_smiles": "The SMILES (Simplified Molecular Input Line Entry System) string representation of the compound's molecular structure.",
   "compounds.name": "The common or widely recognized name of the compound.",
   "compounds.orig_contents.avg": "The average concentration of the compound found in a given source (e.g., food item).",
   "compounds.orig_contents.max": "The maximum recorded concentration of the compound in a given source.",
   "compounds.orig_contents.min": "The minimum recorded concentration of the compound in a given source.",
   "compounds.orig_contents.unit": "The unit of measurement for the compound's concentration, such as mg/100 g.",
   "compounds.reference": "The source or reference database from which the compound information was obtained.",
   "compounds.state": "The physical state of the compound (e.g., solid, liquid, gas) under standard conditions.",
   "compounds.subklass": "A more specific subclassification within the broader chemical class.",
   "compounds.superklass": "The highest-level chemical classification that the compound belongs to.",
   "created_at": "The timestamp indicating when the database entry was created.",
   "creator_id": "The unique identifier of the user or system that created the entry.",
   "description": "A general description of the database entry, including its biological or ecological significance.",
   "export_to_afcdb": "A boolean flag indicating whether the entry is included in the AFCDB (a specific food composition database).",
   "export_to_foodb": "A boolean flag indicating whether the entry is included in FooDB (a food chemistry database).",
   "food_group": "The broad category of food to which the entry belongs (e.g., 'Herbs and Spices').",
   "food_subgroup": "A more specific categorization within the food group (e.g., 'Herbs').",
   "food_type": "A classification of the food type, often used for database organization.",
   "itis_id": "The Integrated Taxonomic Information System (ITIS) identifier for the biological species.",
   "legacy_id": "An older identifier used in previous database versions.",
   "name": "The common or vernacular name of the entry.",
   "name_scientific": "The scientific or Latin name of the biological entity.",
   "ncbi_taxonomy_id": "The National Center for Biotechnology Information (NCBI) Taxonomy ID for the species.",
   "picture_content_type": "The MIME type of the associated image file (e.g., 'image/jpeg').",
   "picture_file_name": "The file name of the associated image.",
   "picture_file_size": "The size of the image file in bytes.",
   "picture_updated_at": "The timestamp indicating the last update of the associated image file.",
   "updated_at": "The timestamp indicating the last modification of the database entry.",
   "updater_id": "The unique identifier of the user or system that last updated the entry.",
   "wikipedia_id": "The Wikipedia page title or identifier associated with the entry."
}


In [6]:
f = {
    "category":{
       "index":True,
       "type":"keyword"
    },
    "compounds.annotation_quality":{
       "index":True,
       "type":"keyword"
    },
    "compounds.cas_number":{
       "index":True,
       "type":"text"
    },
    "compounds.description":{
       "index":True,
       "type":"text"
    },
    "compounds.kingdom":{
       "index":True,
       "type":"text"
    },
    "compounds.klass":{
       "index":True,
       "type":"text"
    },
    "compounds.moldb_inchi":{
       "index":True,
       "type":"keyword"
    },
    "compounds.moldb_inchikey":{
       "index":True,
       "type":"keyword"
    },
    "compounds.moldb_iupac":{
       "index":True,
       "type":"text"
    },
    "compounds.moldb_mono_mass":{
       "index":True,
       "type":"keyword"
    },
    "compounds.moldb_smiles":{
       "index":True,
       "type":"keyword"
    },
    "compounds.name":{
       "index":True,
       "type":"text"
    },
    "compounds.orig_contents.avg":{
       "index":True,
       "type":"float"
    },
    "compounds.orig_contents.max":{
       "index":True,
       "type":"float"
    },
    "compounds.orig_contents.min":{
       "index":True,
       "type":"float"
    },
    "compounds.orig_contents.unit":{
       "index":True,
       "type":"text"
    },
    "compounds.reference":{
       "index":True,
       "type":"text"
    },
    "compounds.state":{
       "index":True,
       "type":"keyword"
    },
    "compounds.subklass":{
       "index":True,
       "type":"text"
    },
    "compounds.superklass":{
       "index":True,
       "type":"text"
    },
    "created_at":{
       "index":True,
       "type":"keyword"
    },
    "creator_id":{
       "index":True,
       "type":"integer"
    },
    "description":{
       "index":True,
       "type":"text"
    },
    "export_to_afcdb":{
       "index":True,
       "type":"boolean"
    },
    "export_to_foodb":{
       "index":True,
       "type":"boolean"
    },
    "food_group":{
       "index":True,
       "type":"text"
    },
    "food_subgroup":{
       "index":True,
       "type":"text"
    },
    "food_type":{
       "index":True,
       "type":"text"
    },
    "itis_id":{
       "index":True,
       "type":"keyword"
    },
    "legacy_id":{
       "index":True,
       "type":"integer"
    },
    "name":{
       "index":True,
       "searched_by_default":True,
       "type":"text"
    },
    "name_scientific":{
       "index":True,
       "type":"text"
    },
    "ncbi_taxonomy_id":{
       "index":True,
       "type":"integer"
    },
    "picture_content_type":{
       "index":True,
       "type":"keyword"
    },
    "picture_file_name":{
       "index":True,
       "type":"keyword"
    },
    "picture_file_size":{
       "index":True,
       "type":"integer"
    },
    "picture_updated_at":{
       "index":True,
       "type":"keyword"
    },
    "updated_at":{
       "index":True,
       "type":"keyword"
    },
    "updater_id":{
       "index":True,
       "type":"integer"
    },
    "wikipedia_id":{
       "index":True,
       "type":"text"
    }
 }

In [8]:
f

{'category': {'type': 'keyword',
  'description': 'A general description of the database entry, including its biological or ecological significance.'},
 'compounds.annotation_quality': {'type': 'keyword',
  'description': 'A general description of the database entry, including its biological or ecological significance.'},
 'compounds.cas_number': {'type': 'text',
  'description': 'A general description of the database entry, including its biological or ecological significance.'},
 'compounds.description': {'type': 'text',
  'description': 'A general description of the database entry, including its biological or ecological significance.'},
 'compounds.kingdom': {'type': 'text',
  'description': 'A general description of the database entry, including its biological or ecological significance.'},
 'compounds.klass': {'type': 'text',
  'description': 'A general description of the database entry, including its biological or ecological significance.'},
 'compounds.moldb_inchi': {'type': 'key

In [72]:
import itertools
import time
import tqdm

def get_schema_descriptions(schema, sample_record):
    descriptions = []
    BATCH_SIZE = 100
    
    key_list = list(schema.keys())
    for idx in tqdm.tqdm(range(0, len(key_list), BATCH_SIZE)):
        batch = {k: schema[k] for k in key_list[idx:(idx + BATCH_SIZE)]}
        resp = json.loads(describe_fields(batch, sample_record))
        # resp = describe_fields(batch, sample_record)
        descriptions.extend(resp["descriptions"][:])
    
        time.sleep(1)
    return descriptions

In [79]:
schema_desc["myvariant"] = get_schema_descriptions(schemas["myvariant"], minified_responses["myvariant"])

100%|█████████████████████████████████████████████████████████████| 14/14 [13:27<00:00, 57.69s/it]


In [80]:
print(json.dumps(schema_desc["myvariant"], indent=2))

[
  {
    "field": "_seqhashed._flag",
    "description": "A boolean flag indicating some form of sequence hashing has been performed. It is indexed for fast lookup."
  },
  {
    "field": "cadd.1000g.af",
    "description": "The allele frequency of this variant in the 1000 Genomes Project dataset. It is represented as a floating point and indexed for queries."
  },
  {
    "field": "cadd.1000g.afr",
    "description": "The allele frequency of this variant within the African population according to the 1000 Genomes Project. It is a floating-point value but not indexed."
  },
  {
    "field": "cadd.1000g.amr",
    "description": "The allele frequency of this variant within the American populations in the 1000 Genomes Project. This is a float and not indexed."
  },
  {
    "field": "cadd.1000g.asn",
    "description": "The allele frequency of this variant in the Asian population according to the 1000 Genomes Project. Stored as a float and not indexed."
  },
  {
    "field": "cadd.1000g.e

In [82]:
for api_name in schemas:
    print("Could not summarize the following for", api_name)
    desc_set = {d["field"] for d in schema_desc[api_name]}
    print(set(schemas[api_name].keys()).difference(desc_set))

Could not summarize the following for mychem
set()
Could not summarize the following for mygene
set()
Could not summarize the following for mydisease
set()
Could not summarize the following for myvariant
{'dbnsfp.linsight.rankscore', 'dbnsfp.hg38.start', 'dbnsfp.fitcons.h1-hesc.confidence_value', 'dbnsfp.deogen2.score', 'dbnsfp.exac_nontcga.amr.af', 'dbnsfp.exac_nontcga.adj_af', 'dbnsfp.eve.score', 'dbnsfp.gtex.eqtl.gene', 'dbnsfp.exac_nonpsych.af', 'dbnsfp.exac_nonpsych.afr.ac', 'dbnsfp.fitcons.h1-hesc.score', 'dbnsfp.exac.sas.af', 'dbnsfp.exac_nonpsych.fin.ac', 'dbnsfp.m-cap.pred', 'dbnsfp.eve.class90_pred', 'dbnsfp.exac_nonpsych.eas.af', 'dbnsfp.eigen.raw_coding', 'dbnsfp.exac_nontcga.amr.ac', 'dbnsfp.exac_nontcga.afr.ac', 'dbnsfp.mutpred.aa_change', 'dbnsfp.phastcons.470way_mammalian.rankscore', 'dbnsfp.esm1b.pred', 'dbnsfp.phastcons.17way_primate.rankscore', 'dbnsfp.exac.ac', 'dbnsfp.hgvsc', 'dbnsfp.exac_nontcga.ac', 'dbnsfp.geuvadis_eqtl_target_gene', 'dbnsfp.exac.fin.af', 'dbnsf

In [83]:
import pandas as pd

for api_name in schemas:
    df = pd.DataFrame(schema_desc[api_name], columns=["field", "description"])
    df.to_csv(f"data/schema/{api_name}_field_desc.csv", index=False)