In [None]:
!curl -X GET https://mygene.info/metadata/fields -o fields.json

In [3]:
import json

# Load the downloaded field metadata. JSON interchange is UTF-8, so pin the
# encoding rather than relying on the platform's locale default.
with open("fields.json", encoding="utf-8") as fd:
    fields = json.load(fd)

# Number of metadata entries loaded.
len(fields)

342

In [4]:
# Object-typed fields cannot be searched unless their schema is known, so they
# are dropped; every entry is first reduced to just its "type".
for key in fields:
    fields[key] = {"type": fields[key]["type"]}

# Collect the object-typed field names up front so the dict is not resized
# while it is being iterated.
keys_to_del = [name for name in fields if fields[name]["type"] == "object"]

for key in keys_to_del:
    # pop() removes the entry and hands back its value for the log line.
    print(key, fields.pop(key))

len(fields)

accession {'type': 'object'}
accession.translation {'type': 'object'}
clingen {'type': 'object'}
clingen.clinical_validity {'type': 'object'}
ensembl {'type': 'object'}
ensembl.translation {'type': 'object'}
exac {'type': 'object'}
exac.all {'type': 'object'}
exac.nonpsych {'type': 'object'}
exac.nontcga {'type': 'object'}
exons {'type': 'object'}
exons_hg19 {'type': 'object'}
exons_mm10 {'type': 'object'}
exons_mm9 {'type': 'object'}
generif {'type': 'object'}
genomic_pos {'type': 'object'}
genomic_pos_hg19 {'type': 'object'}
genomic_pos_mm9 {'type': 'object'}
go {'type': 'object'}
go.BP {'type': 'object'}
go.CC {'type': 'object'}
go.MF {'type': 'object'}
homologene {'type': 'object'}
interpro {'type': 'object'}
pantherdb {'type': 'object'}
pantherdb.ortholog {'type': 'object'}
pathway {'type': 'object'}
pathway.biocarta {'type': 'object'}
pathway.humancyc {'type': 'object'}
pathway.kegg {'type': 'object'}
pathway.mousecyc {'type': 'object'}
pathway.netpath {'type': 'object'}
pathway.

270

In [5]:
# Display the pruned mapping: each remaining field name -> {"type": ...}.
fields

{'AllianceGenome': {'type': 'keyword'},
 'AnimalQTLdb': {'type': 'text'},
 'FLYBASE': {'type': 'keyword'},
 'HGNC': {'type': 'keyword'},
 'HPRD': {'type': 'keyword'},
 'MGI': {'type': 'keyword'},
 'MIM': {'type': 'keyword'},
 'RATMAP': {'type': 'keyword'},
 'RGD': {'type': 'keyword'},
 'SGD': {'type': 'keyword'},
 'TAIR': {'type': 'keyword'},
 'Vega': {'type': 'text'},
 'WormBase': {'type': 'keyword'},
 'Xenbase': {'type': 'keyword'},
 'ZFIN': {'type': 'keyword'},
 'accession.genomic': {'type': 'text'},
 'accession.protein': {'type': 'text'},
 'accession.rna': {'type': 'text'},
 'alias': {'type': 'keyword'},
 'biocarta': {'type': 'text'},
 'clingen.clinical_validity.classification': {'type': 'keyword'},
 'clingen.clinical_validity.classification_date': {'type': 'date'},
 'clingen.clinical_validity.disease_label': {'type': 'text'},
 'clingen.clinical_validity.gcep': {'type': 'text'},
 'clingen.clinical_validity.moi': {'type': 'keyword'},
 'clingen.clinical_validity.mondo': {'type': 'key

In [6]:
import os

import openai

# Credentials must never be hardcoded in a notebook: anything committed or
# shared leaks the secret. Read the key from the environment instead; a missing
# variable raises KeyError naming OPENAI_API_KEY.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [7]:
def describe_fields(fields: dict):
    """Ask the OpenAI chat API to describe a batch of fields.

    The ``fields`` mapping is serialized to JSON and sent as the user message
    to ``gpt-4o`` together with a fixed system prompt; the request asks for a
    JSON-object response format.

    Returns the raw message content of the first choice (not parsed here).
    """
    system_message = {
        "role": "system",
        "content": "You are an expert in computational biology. Using your experience, describe the following fields from a table. Strictly output a JSON."
    }
    user_message = {
        "role": "user",
        "content": json.dumps(fields)
    }

    completion = openai.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
        response_format={"type": "json_object"}
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [8]:
import itertools
import time
import tqdm

BATCH_SIZE = 10
descriptions = {}

# Describe the fields BATCH_SIZE at a time and merge each parsed JSON reply
# into `descriptions`.
key_list = list(fields)
for start in tqdm.tqdm(range(0, len(key_list), BATCH_SIZE)):
    batch_keys = key_list[start:start + BATCH_SIZE]
    batch = {name: fields[name] for name in batch_keys}
    descriptions.update(json.loads(describe_fields(batch)))

    # Pause between requests (presumably to stay under API rate limits).
    time.sleep(2)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [04:00<00:00,  8.91s/it]


In [9]:
# Report every field name that has no entry in `descriptions`.
print("Could not summarize the following")
print(set(fields) - set(descriptions))

Could not summarize the following
set()


In [11]:
# Serialize the collected descriptions (4-space indented) and write them out.
with open("detailed_descriptions.json", "w+") as json_fd:
    json_fd.write(json.dumps(descriptions, indent=4))

In [10]:
# Inspect the merged per-field descriptions collected from the model replies.
descriptions

{'AllianceGenome': {'description': 'A consortium focused on providing unified access to genetic and genomic data from different model organism databases.',
  'type': 'keyword'},
 'AnimalQTLdb': {'description': 'A database that curates quantitative trait locus (QTL) data for various animal species, facilitating the study of genetics related to traits in livestock and other animals.',
  'type': 'text'},
 'FLYBASE': {'description': 'A comprehensive database for information on the genetics and molecular biology of the fruit fly Drosophila melanogaster.',
  'type': 'keyword'},
 'HGNC': {'description': 'The HUGO Gene Nomenclature Committee (HGNC) database that provides unique and meaningful names for human genes.',
  'type': 'keyword'},
 'HPRD': {'description': 'The Human Protein Reference Database, a curated database of human proteins.',
  'type': 'keyword'},
 'MGI': {'description': 'The Mouse Genome Informatics database, a comprehensive resource for genetic, genomic, and biological data on

In [12]:
# Read the compact descriptions file and parse it into `desc`.
with open("data/original/compact_descriptions.json") as fd:
    desc = json.loads(fd.read())

In [18]:
import pandas as pd

# One row per (key, value) pair of `desc`, labelled "field" and "description".
rows = list(desc.items())
df = pd.DataFrame(rows, columns=["field", "description"])

# Preview the first rows.
df.head()

Unnamed: 0,field,description
0,AllianceGenome,A keyword representing resources from the Alli...
1,AnimalQTLdb,A database compiling Quantitative Trait Loci (...
2,FLYBASE,A comprehensive database for Drosophila (fruit...
3,HGNC,A keyword representing the HUGO Gene Nomenclat...
4,HPRD,A keyword representing the Human Protein Refer...


In [19]:
# Persist the field/description table as CSV; index=False keeps the row index out of the file.
df.to_csv("data/original/compact_desc.csv", index=False)