In [4]:
%%bash
declare -A endpoints=(
    ["mygene"]="https://mygene.info/metadata/fields"
    ["mychem"]="https://mychem.info/metadata/fields"
    ["myvariant"]="https://myvariant.info/metadata/fields"
    ["mydisease"]="https://mydisease.info/metadata/fields"
);

for service in "${!endpoints[@]}"; do
    curl -X GET "${endpoints[$service]}" -o "data/schema/${service}_fields.json";
    echo "Downloaded ${service} schema to data/schema/${service}_fields.json";
done

Downloaded mychem schema to data/schema/mychem_fields.json
Downloaded mygene schema to data/schema/mygene_fields.json
Downloaded mydisease schema to data/schema/mydisease_fields.json
Downloaded myvariant schema to data/schema/myvariant_fields.json


In [12]:
import json
import os

directory = "data/schema"
schemas = {}

for filename in os.listdir(directory):
    if filename.endswith(".json"):
        key = os.path.splitext(filename)[0].replace("_fields", "")
        with open(os.path.join(directory, filename)) as fd:
            schemas[key] = json.load(fd)

for key, fields in schemas.items():
    print(f"{key}: {len(fields)} entries")

mychem: 1082 entries
mygene: 355 entries
mydisease: 264 entries
myvariant: 1615 entries


In [13]:
# fields cannot be searched unless their schema is known, hence delete object fields
def remove_obj_fields(api_name: str, api_schema: dict):
    keys_to_del = []
    for key in api_schema:
        if api_schema[key]["type"] == "object":
            keys_to_del.append(key)

        # NOTE: can update prompt instead of using "type" every line
        # api_schema[key] = {"type": api_schema[key]["type"]}
    
    for key in keys_to_del:
        del api_schema[key]

    print(f"Deleted {len(keys_to_del)} fields, leaving {len(api_schema)} fields for {api_name}")

In [17]:
for api_name, api_schema in schemas.items():
    remove_obj_fields(api_name, api_schema)

print(f"\nMake sure dict was modified in place, len(mygene): {len(schemas["mygene"])}")

Deleted 0 fields, leaving 937 fields for mychem
Deleted 0 fields, leaving 280 fields for mygene
Deleted 0 fields, leaving 224 fields for mydisease
Deleted 0 fields, leaving 1369 fields for myvariant

Make sure dict was modified in place, len(mygene): 280


In [5]:
import openai

In [7]:
with open("data/original/minified_gene_response.json") as fd:
    sample_record = fd.read()

In [26]:
output_schema = {
                    "type": "object",
                    "properties": {
                        "descriptions": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "field": {"type": "string"},
                                    "description": {"type": "string"}
                                },
                                "required": ["field", "description"]
                            }
                        }
                    },
                    "required": ["descriptions"]
                }

In [27]:
def describe_fields(fields: dict):
    """Call the OpenAI API to describe the keys."""
    
    response = openai.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[
            {
                "role": "system",
                "content": "You are an expert in computational biology. Use the example given as reference for the task."
            },
            {
                "role": "user",
                "content": "Using your experience, describe the following fields from a biology database that aggregates various sources. Strictly output a JSON.\n" + json.dumps(fields) + "\n\nHere's a sample record\n" + sample_record
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "field_description",
                "schema": output_schema
            }
        }
    )
    return response.choices[0].message.content

In [37]:
import itertools
import time
import tqdm

descriptions = []
BATCH_SIZE = 10

key_list = list(fields.keys())
for idx in tqdm.tqdm(range(0, len(key_list), BATCH_SIZE)):
    batch = {k: fields[k] for k in key_list[idx:(idx + BATCH_SIZE)]}
    resp = json.loads(describe_fields(batch))
    descriptions.extend(resp["descriptions"][:])

    time.sleep(2)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [03:32<00:00,  7.88s/it]


In [39]:
descriptions

[{'field': 'AllianceGenome',
  'description': 'A unique identifier from the Alliance of Genome Resources database, which integrates data from several model organism databases.'},
 {'field': 'AnimalQTLdb',
  'description': 'The Animal QTLdb field contains data regarding Quantitative Trait Loci (QTL) in animals such as livestock, typically used for understanding the genetic basis of complex traits like skeletal or cardiovascular traits.'},
 {'field': 'FLYBASE',
  'description': 'A unique identifier from FlyBase, a database dedicated to the genetics and genomics of Drosophila melanogaster.'},
 {'field': 'HGNC',
  'description': 'A unique identifier provided by the HUGO Gene Nomenclature Committee, which assigns standardized gene symbols and names to human genes.'},
 {'field': 'HPRD',
  'description': 'A unique identifier from the Human Protein Reference Database, a resource for exploring the protein-protein interactions of human proteins.'},
 {'field': 'MGI',
  'description': 'An identifi

In [40]:
print("Could not summarize the following")
desc_set = {d["field"] for d in descriptions}
print(set(fields.keys()).difference(desc_set))

Could not summarize the following
set()


In [11]:
# with open("detailed_descriptions.json", "w+") as json_fd:
#     json.dump(descriptions, json_fd, indent=4)

In [12]:
# with open("data/original/compact_descriptions.json") as fd:
#     desc = json.load(fd)

In [41]:
import pandas as pd
df = pd.DataFrame(descriptions, columns=["field", "description"])
df.head()

Unnamed: 0,field,description
0,AllianceGenome,A unique identifier from the Alliance of Genom...
1,AnimalQTLdb,The Animal QTLdb field contains data regarding...
2,FLYBASE,"A unique identifier from FlyBase, a database d..."
3,HGNC,A unique identifier provided by the HUGO Gene ...
4,HPRD,A unique identifier from the Human Protein Ref...


In [42]:
df.to_csv("data/original/compact_desc_with_context.csv", index=False)