In [1]:
import requests
import json
import re
from tqdm import tqdm
import os
import pandas as pd
import time
import uuid
from jsonschema import validate

In [2]:
def fetch_save_read(url, file, reader=pd.read_csv, sep='\t', **kwargs):
  ''' Return the table cached at {file}, downloading it from {url} first if needed.

  When {file} does not exist yet, {url} is loaded with {reader} and written to
  {file} as a {sep}-delimited table without the index (parent directories are
  created on demand). The cached file is then always re-read with
  pandas.read_csv, and {**kwargs} are forwarded to that final read only.
  '''
  if os.path.exists(file):
    # Cache hit: nothing to download.
    return pd.read_csv(file, sep=sep, **kwargs)
  parent_dir = os.path.dirname(file)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)
  reader(url, sep=sep, index_col=None).to_csv(file, sep=sep, index=False)
  return pd.read_csv(file, sep=sep, **kwargs)

In [3]:
# Organism path used both in the NCBI gene_info URL and in the local cache name.
organism = "Mammalia/Homo_sapiens"
url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/%s.gene_info.gz' % organism
file = '%s.gene_info.tsv' % organism

# Downloaded on the first run only; later runs read the cached TSV.
ncbi_gene = fetch_save_read(url, file)


In [4]:
def maybe_split(record):
    ''' Split one NCBI gene_info field into a set of values.

    NCBI stores nulls as '-' and lists as '|'-delimited strings. Missing cells
    (None, or the float NaN that pandas produces for NA-like text) are treated
    as nulls too, instead of failing on `.split`.

    Returns a set of strings; empty for a null field.
    '''
    # NaN is the only float that is not equal to itself.
    if record is None or (isinstance(record, float) and record != record):
        return set()
    if record in {'', '-'}:
        return set()
    return set(record.split('|'))

def supplement_dbXref_prefix_omitted(ids):
    ''' Yield every external ID as given and, when it has a 'Foreign:' prefix, again without it.

    NCBI stores external IDs as Foreign:ID while most datasets just use the ID.
    Only the text up to the first ':' is removed.
    '''
    for xref in ids:
        # original id first
        yield xref
        # then the same id *without* its prefix, if there is one
        _prefix, colon, bare = xref.partition(':')
        if colon:
            yield bare

In [5]:
# For every gene row, collect every name it can be referred to by: symbol,
# nomenclature-authority symbol, GeneID (as text), synonyms, other
# designations, locus tag, and dbXrefs both with and without their prefix.
ncbi_gene['All_synonyms'] = [
    set.union(
      maybe_split(gene_info['Symbol']),
      maybe_split(gene_info['Symbol_from_nomenclature_authority']),
      maybe_split(str(gene_info['GeneID'])),
      maybe_split(gene_info['Synonyms']),
      maybe_split(gene_info['Other_designations']),
      maybe_split(gene_info['LocusTag']),
      set(supplement_dbXref_prefix_omitted(maybe_split(gene_info['dbXrefs']))),
    )
    for _, gene_info in ncbi_gene.iterrows()
  ]

# synonym -> GeneID. The set comprehension de-duplicates (synonym, GeneID)
# pairs; a synonym shared by several genes still appears once per gene, so the
# resulting index is not unique.
# NOTE(review): set iteration order is arbitrary, so the row order of these
# Series is not stable between runs.
synonyms, gene_id = zip(*{
    (synonym, gene_info['GeneID'])
    for _, gene_info in ncbi_gene.iterrows()
    for synonym in gene_info['All_synonyms']
  })
ncbi_lookup_syn = pd.Series(gene_id, index=synonyms)
# Symbol -> GeneID, and upper-cased Symbol -> GeneID; `gene_id` is rebound here.
symbols, cap, gene_id = zip(*{
    (gene_info['Symbol'], gene_info['Symbol'].upper(), gene_info['GeneID'])
    for _, gene_info in ncbi_gene.iterrows()
  })
ncbi_lookup_sym = pd.Series(gene_id, index=symbols)
ncbi_lookup_sym_cap = pd.Series(gene_id, index=cap)

In [6]:
# Synonyms that appear more than once point at several GeneIDs and cannot be
# resolved on their own.
index_values = ncbi_lookup_syn.index.value_counts()
ambiguous = index_values[index_values > 1].index
# Keep a row when its key is unambiguous, or when the key is the gene's own
# GeneID written as text. The index holds strings while the values are GeneIDs
# (presumably integers as parsed by pandas -- confirm), so the values are cast
# to str first; comparing them directly would never match.
is_own_gene_id = (
    ncbi_lookup_syn.index.to_numpy() == ncbi_lookup_syn.astype(str).to_numpy()
)
is_unambiguous = ~ncbi_lookup_syn.index.isin(ambiguous)
ncbi_lookup_syn_disambiguated = ncbi_lookup_syn[is_own_gene_id | is_unambiguous]
# Plain dicts for fast lookups. NOTE: despite its name, `syn_dict_cap` is built
# from the upper-cased *symbol* table.
sym_dict = ncbi_lookup_sym.to_dict()
syn_dict_cap = ncbi_lookup_sym_cap.to_dict()
syn_dict = ncbi_lookup_syn_disambiguated.to_dict()
def gene_lookup(gene):
    ''' Resolve a gene name to its NCBI GeneID, returned as a string.

    Lookup order: exact symbol, then upper-cased symbol, then disambiguated
    synonym. When nothing matches the literal string 'None' is returned
    (callers compare against 'None').
    '''
    gene_id = sym_dict.get(gene)
    if gene_id is not None:
        return str(gene_id)
    # The keys of syn_dict_cap are upper-cased symbols, so the query has to be
    # upper-cased as well for this fallback to be case-insensitive.
    cap_key = gene.upper() if isinstance(gene, str) else gene
    gene_id = syn_dict_cap.get(cap_key)
    if gene_id is not None:
        return str(gene_id)
    return str(syn_dict.get(gene))

In [7]:
gene_lookup('HLA-A')

'3105'

In [30]:
# Manual corrections applied to raw gene labels before lookup.
# Entries that map a name to itself, and the one that maps to "", change nothing
# by themselves; they look like names that were reviewed but not (yet) resolved.
gene_name_mapper = {
    "Seladin": "DHCR24",
    "AA555029_RC": "AA555029_RC",
    "ELOC (TCEB1)": "ELOC",
    "ERBB2 (HER2)": "ERBB2",
    "HLA_A": "HLA-A",
    "ROS1\n": "ROS1",
    "ACVR1 (ALK2)": "",
    "ARHGDB": "ARHGDB",
    "CD274 (PD- L1)": "CD274",
    "CD274 (PD-L1)": "CD274",
    "CHECK1": "CHECK1",
    "DHRF": "DHRF",
    "ER": "ER",
    "ERBB2 (Her2/Neu)": "ERBB2",
    "FDF18": "FDF18",
    "FRAG1": "FRAG1",
    "FRDX2": "FRDX2",
    "FRKCA": "FRKCA",
    "GUS": "GUS",
    "HER2 (ERBB2)": "HER2",
    "KIAA0999": "KIAA0999",
    "KIAA1625": "KIAA1625",
    "KIE20A": "KIE20A",
    "KRTS": "KRTS",
    "Ki67": "Ki67",
    "LOC100131053": "LOC100131053",
    "LOC100288906": "LOC100288906",
    "LOC730018": "LOC730018",
    "MAP2K\n": "MAP2K",
    "MAP2K1 (MEK1)": "MAP2K1",
    "MAP2K2 (MEK2)": "MAP2K2",
    "MDF1": "MDF1",
    "ORCL6L": "ORCL6L",
    "P1K3R3": "P1K3R3",
    "PALB3": "PALB3",
    "PDCD1L G2 (PD-L2)": "PD-L2",
    "PDGRFA": "PDGRFA",
    "PHGCH": "PHGCH",
    "PTNP11": "PTNP11",
    "RPLPO": "RPLPO",
    "SLCA7A5": "SLCA7A5",
    "SMAD4/DPC4": "SMAD4",
    "STK11/LKB1": "STK11",
}


In [31]:
# Names (after manual correction) that could not be resolved to a GeneID.
invalid_genes = set()
def valid_gene(gene):
    ''' Tell whether {gene} resolves to an NCBI GeneID.

    The name is first passed through gene_name_mapper; names that still do not
    resolve are recorded in invalid_genes.
    '''
    resolved = gene_name_mapper.get(gene, gene)
    found = gene_lookup(resolved) != 'None'
    if not found:
        invalid_genes.add(resolved)
    return found

In [35]:
# Gene sets with fewer resolvable genes than this are discarded.
MIN_VALID_GENES = 5

# Each kept entry is [label, description, gene, gene, ...].
gmt = []
with open('data/011724_biomarkers.gmt') as o:
    for line in o:
        # Drop the line terminator first: otherwise the last gene of every line
        # carries a trailing "\n" and can never be resolved.
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) < 2:
            # Blank or malformed line: no label/description pair to unpack.
            continue
        label, description, *genes = fields
        valid_genes = []
        for gene in genes:
            if valid_gene(gene):
                # Store the corrected name that was actually validated, so that
                # later lookups on the stored name succeed as well.
                valid_genes.append(gene_name_mapper.get(gene, gene))
        if len(valid_genes) >= MIN_VALID_GENES:
            gmt.append([label, description, *valid_genes])


In [36]:
# One tab-separated gene set per line; no newline after the last line.
with open('data/011724_filtered_biomarkers.gmt', 'w') as o:
    rows = ["\t".join(fields) for fields in gmt]
    o.write("\n".join(rows))


In [38]:
# Condition terms to look up in the ontology service.
terms = {
    "Breast Cancer",
    "Gastric Cancer",
    "Cell-Free Tumor DNA",
    "Circulating Tumor DNA",
    "Colorectal Cancer",
    "Genetic alterations",
    "Genomic alterations",
    "Mutations, alterations",
    "NSCLC",
    "Pancreatic Cancer",
    "Prostate Cancer",
    "Thyroid Nodules",
    "Tumor profiling",
}

In [57]:
# term -> {"label", "onto_id", "source"} for terms matched in the Disease Ontology.
onto_id_mapper = {}
for i in terms:
    print(i)
    # Pass the query via `params` so spaces and commas in the term are
    # URL-encoded, and bound the wait so a stalled server cannot hang the run.
    res = requests.get(
        "https://www.ebi.ac.uk/ols4/api/v2/entities",
        params={"search": i, "ontologyId": "doid"},
        timeout=30,
    )
    # Fail with the HTTP error rather than a confusing JSON/KeyError later.
    res.raise_for_status()
    results = res.json()
    if results.get('numElements', 0) > 0:
        for e in results.get("elements", []):
            label = e.get("label")
            synonyms = e.get("synonym", [])
            if isinstance(synonyms, str):
                # A lone synonym may not be wrapped in a list -- TODO confirm
                # against the OLS response format.
                synonyms = [synonyms]
            # Accept the first element whose label or one of whose synonyms
            # equals the term, ignoring case. Non-string values are skipped.
            label_matches = isinstance(label, str) and label.lower() == i.lower()
            synonym_matches = i.lower() in [d.lower() for d in synonyms if isinstance(d, str)]
            if label_matches or synonym_matches:
                onto_id_mapper[i] = {
                    "label": e["label"],
                    "onto_id": e["curie"],
                    "source": e["ontologyPreferredPrefix"]
                }
                break

    


Genetic alterations
Pancreatic Cancer
Tumor profiling
Prostate Cancer
NSCLC
Breast Cancer
Gastric Cancer
Circulating Tumor DNA
Colorectal Cancer
Mutations, alterations
Thyroid Nodules
Cell-Free Tumor DNA
Genomic alterations


In [58]:
# Terms for which no ontology entry has been recorded so far.
unmatched = set(terms).difference(onto_id_mapper)

In [59]:
# Second pass: search the still-unmatched terms without an ontology filter,
# accepting exact (case-insensitive) label matches only.
for i in unmatched:
    print(i)
    # Pass the query via `params` so spaces and commas in the term are
    # URL-encoded, and bound the wait so a stalled server cannot hang the run.
    res = requests.get(
        "https://www.ebi.ac.uk/ols4/api/v2/entities",
        params={"search": i},
        timeout=30,
    )
    # Fail with the HTTP error rather than a confusing JSON/KeyError later.
    res.raise_for_status()
    results = res.json()
    if results.get('numElements', 0) > 0:
        for e in results.get("elements", []):
            label = e.get("label")
            if isinstance(label, str) and label.lower() == i.lower():
                onto_id_mapper[i] = {
                    "label": e["label"],
                    "onto_id": e["curie"],
                    "source": e["ontologyPreferredPrefix"]
                }
                break

    


Genetic alterations
Tumor profiling
Mutations, alterations
Circulating Tumor DNA
Thyroid Nodules
Cell-Free Tumor DNA
Genomic alterations


In [60]:
# Terms that still have no ontology entry (displayed as the cell result).
set(terms).difference(onto_id_mapper)

{'Cell-Free Tumor DNA',
 'Circulating Tumor DNA',
 'Genetic alterations',
 'Genomic alterations',
 'Mutations, alterations',
 'Thyroid Nodules',
 'Tumor profiling'}

In [61]:
# Column order of the formatted biomarker table.
columns = [
    "best_biomarker_role",
    "biomarker",
    "recommended_name",
    "assessed_biomarker_entity_id",
    "assessed_entity_type",
    "source",
    "pubmed",
    "evidence_tags",
    "condition_id",
    "condition_name",
    "condition_resource",
    "condition_url",
]

In [67]:
# Empty table with the target columns; rows are added by a later cell.
df = pd.DataFrame(columns=columns, index=[])

In [151]:
# Build one record per (gene set, resolvable gene) and create the table once,
# instead of enlarging the frame row by row. Re-running the cell rebuilds `df`
# from scratch.
records = []
for source, condition, *genes in gmt:
    # Ontology annotations for every '/ '-separated part of the description
    # that was matched earlier.
    condition_id = []
    condition_name = []
    condition_url = []
    for c in condition.split("/ "):
        if c in onto_id_mapper:
            val = onto_id_mapper[c]
            condition_id.append(val["onto_id"])
            condition_name.append(val["label"])
            # NOTE(review): the URL and the "DOID" resource below assume every
            # match comes from the Disease Ontology -- confirm if the
            # all-ontology fallback search ever matches.
            condition_url.append("https://disease-ontology.org/?id=%s"%val["onto_id"])
    for gene in genes:
        # Apply the same manual corrections that were used for validation, so
        # genes accepted via a corrected name are not silently dropped here.
        gene = gene_name_mapper.get(gene, gene)
        gene_id = gene_lookup(gene)
        if gene_id == 'None':
            continue
        biomarker = "%s (%s)"%(source, gene)
        if len(condition_id) == 0:
            # Without a matched condition, keep the free-text description in the label.
            biomarker = "%s (%s %s)"%(source, gene, condition)
        records.append({
            "best_biomarker_role": "",
            "biomarker": biomarker,
            "recommended_name": gene,
            "assessed_biomarker_entity_id": "NCBI:%s"%gene_id,
            "assessed_entity_type": "gene",
            "source": source,
            "pubmed": "",
            # A fresh list per row so rows never share one mutable object.
            "evidence_tags": ["biomarker"],
            "condition_id": "|".join(condition_id),
            "condition_name": "|".join(condition_name),
            "condition_url": "|".join(condition_url),
            "condition_resource": "DOID" if len(condition_id) > 0 else ""
        })
df = pd.DataFrame(records, columns=columns)

In [152]:
# Preview the first rows of the formatted table.
df.head()

Unnamed: 0,best_biomarker_role,biomarker,recommended_name,assessed_biomarker_entity_id,assessed_entity_type,source,pubmed,evidence_tags,condition_id,condition_name,condition_resource,condition_url
0,,Oncotype DX (STK15),STK15,NCBI:6790,gene,Oncotype DX,,[biomarker],DOID:1612,breast cancer,DOID,https://disease-ontology.org/?id=DOID:1612
1,,Oncotype DX (BIRC5),BIRC5,NCBI:332,gene,Oncotype DX,,[biomarker],DOID:1612,breast cancer,DOID,https://disease-ontology.org/?id=DOID:1612
2,,Oncotype DX (CCNB1),CCNB1,NCBI:891,gene,Oncotype DX,,[biomarker],DOID:1612,breast cancer,DOID,https://disease-ontology.org/?id=DOID:1612
3,,Oncotype DX (MYBL2),MYBL2,NCBI:4605,gene,Oncotype DX,,[biomarker],DOID:1612,breast cancer,DOID,https://disease-ontology.org/?id=DOID:1612
4,,Oncotype DX (MMP11),MMP11,NCBI:4320,gene,Oncotype DX,,[biomarker],DOID:1612,breast cancer,DOID,https://disease-ontology.org/?id=DOID:1612


In [153]:
# Create the output directory on a fresh checkout, then write the table as TSV
# without the row index.
os.makedirs("out", exist_ok=True)
df.to_csv("out/biomarker_formatted.tsv", sep="\t", index=False)

In [154]:
# Take the row labelled 0 as the example used to build a sample JSON entry.
vals = df.loc[0]

In [155]:
# Skeleton of one biomarker entry; components are appended in a later cell.
entry = dict(
    biomarker_component=[],
    best_biomarker_role="diagnostic",
)

In [156]:
# One biomarker component built from the example row `vals`.
# The evidence source is a fixed PubMed record, tagged with the row's evidence tags.
biomarker = {
    "biomarker": vals["biomarker"],
    "assessed_biomarker_entity": {
        "recommended_name": vals["recommended_name"]
    },
    "assessed_biomarker_entity_id": vals["assessed_biomarker_entity_id"],
    "assessed_entity_type": vals["assessed_entity_type"],
    "evidence_source": [
        {
            "evidence_id": "33519238",
            "database": "PubMed",
            "url": "https://pubmed.ncbi.nlm.nih.gov/33519238/",
            "tags": [{"tag": t.strip()} for t in vals["evidence_tags"]]
        }
    ],
}

In [157]:
# Attach the component to the entry.
# NOTE: re-running this cell without re-creating `entry` appends a duplicate.
entry["biomarker_component"].append(biomarker)

In [158]:
# Show the assembled entry as indented JSON.
print(json.dumps(entry, indent=4))

{
    "biomarker_component": [
        {
            "biomarker": "Oncotype DX  (STK15)",
            "assessed_biomarker_entity": {
                "recommended_name": "STK15"
            },
            "assessed_biomarker_entity_id": "NCBI:6790",
            "assessed_entity_type": "gene",
            "evidence_source": [
                {
                    "evidence_id": "33519238",
                    "database": "PubMed",
                    "url": "https://pubmed.ncbi.nlm.nih.gov/33519238/",
                    "tags": [
                        {
                            "tag": "biomarker"
                        }
                    ]
                }
            ]
        }
    ],
    "best_biomarker_role": "diagnostic"
}


In [159]:
def _get_json(url, timeout=30):
    ''' Download {url} and return its parsed JSON body.

    Raises requests.HTTPError on a non-2xx response, so a missing file is
    reported as such instead of as a JSON decoding error.
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()

# v0.3 biomarker JSON schema and the matching sample document.
schema = _get_json("https://raw.githubusercontent.com/biomarker-ontology/biomarker-partnership/main/schema/v0.3/biomarker_schema.json")
sample = _get_json("https://raw.githubusercontent.com/biomarker-ontology/biomarker-partnership/main/supplementary_files/sample_data_model_structures/v0.3/sample_biomarker.json")

In [160]:
# Check the example entry, wrapped in a list, against the downloaded schema;
# jsonschema's validate raises if the instance does not conform.
validate(instance=[entry], schema=schema)