In [1]:
import json
import rdflib
import re
import pandas as pd
from rdflib import URIRef, BNode, Literal, Graph, RDF, Namespace, ODRL2, RDFS
import requests
import time
from tqdm import tqdm
import urllib.parse
from maayanlab_bioinformatics.utils import fetch_save_read, merge
from functools import lru_cache

In [2]:
@lru_cache()
def ncbi_genes_fetch(organism='Mammalia/Homo_sapiens', filters=None):
    ''' Load an NCBI gene_info table through `fetch_save_read` and attach an
    `All_synonyms` column containing, per gene, the set of every alias found.

    See ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/ for the directory/file of the organism of interest.

    :param organism: path fragment below GENE_INFO/ (also used for the local file name)
    :param filters: optional callable; it is called with the table and its result is used to index the table
    :return: the (optionally filtered) table with the extra `All_synonyms` column
    '''
    def as_set(field):
        # NCBI writes nulls as '-' and separates list entries with '|'
        return set() if field in {'', '-'} else set(field.split('|'))
    #
    def with_bare_xrefs(xrefs):
        # External ids are stored as Foreign:ID; yield the full form and,
        # when a ':' is present, also everything after the first ':'
        for xref in xrefs:
            yield xref
            if ':' in xref:
                yield xref.split(':', maxsplit=1)[1]
    #
    def all_aliases(gene_info):
        # Union of every column that can carry a name or identifier for the gene
        symbol = gene_info['Symbol']
        return set.union(
            as_set(symbol),
            as_set(symbol.upper()),
            as_set(gene_info['Symbol_from_nomenclature_authority']),
            as_set(str(gene_info['GeneID'])),
            as_set(gene_info['Synonyms']),
            as_set(gene_info['Other_designations']),
            as_set(gene_info['LocusTag']),
            set(with_bare_xrefs(as_set(gene_info['dbXrefs']))),
        )
    #
    source_url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/{}.gene_info.gz'.format(organism)
    local_copy = '{}.gene_info.tsv'.format(organism)
    ncbi = fetch_save_read(source_url, local_copy, sep='\t')
    if filters and callable(filters):
        ncbi = ncbi[filters(ncbi)]
    #
    ncbi['All_synonyms'] = [all_aliases(gene_info) for _, gene_info in ncbi.iterrows()]
    return ncbi

@lru_cache()                  
def ncbi_genes_lookup(organism='Mammalia/Homo_sapiens', filters=lambda ncbi: ncbi['type_of_gene']=='protein-coding'):
    ''' Build two alias lookups from the NCBI gene table of `organism`.

    Returns a tuple `(symbol_getter, id_getter)`:
    - `symbol_getter(alias)` is the `.get` of a dict mapping alias -> official symbol
    - `id_getter(alias)` is the `.get` of a Series mapping alias -> GeneID
    Both return None for an alias that is unknown or that was dropped as ambiguous.

    NOTE(review): `filters` is accepted but never forwarded to `ncbi_genes_fetch`,
    so the protein-coding default currently has no effect -- confirm whether that is intended.

    Usage:
    ```python
    symbol_lookup, id_lookup = ncbi_genes_lookup('Mammalia/Homo_sapiens')
    print(symbol_lookup('STAT3')) # any unambiguous alias gets converted into the official symbol
    ```
    '''
    ncbi_genes = ncbi_genes_fetch(organism=organism)
    # Unique (alias, symbol, id) triples, unzipped into three parallel tuples
    synonyms, symbols, gene_ids = zip(*{
    (synonym, gene_info['Symbol'], gene_info['GeneID'])
    for _, gene_info in ncbi_genes.iterrows()
    for synonym in gene_info['All_synonyms']
    })
    ncbi_lookup = pd.Series(symbols, index=synonyms)
    index_values = ncbi_lookup.index.value_counts()
    # Drop every alias that occurs more than once in the index
    ncbi_lookup_disambiguated = ncbi_lookup.drop(index_values[index_values > 1].index)
    # Re-add any official symbol that is missing after the drop, mapping it to itself
    for i in symbols:
        if i not in ncbi_lookup_disambiguated.index:
            ncbi_lookup_disambiguated[i] = i
    
    # Same procedure for the alias -> GeneID lookup
    ncbi_lookup_id = pd.Series(gene_ids, index=synonyms)
    index_values_id = ncbi_lookup_id.index.value_counts()
    ncbi_lookup_disambiguated_id = ncbi_lookup_id.drop(index_values_id[index_values_id > 1].index)
    ncbi_genes = ncbi_genes.set_index("GeneID")
    for i in gene_ids:
        gene = ncbi_genes.at[i, "Symbol"]
        if gene not in ncbi_lookup_disambiguated_id.index:
            ncbi_lookup_disambiguated_id[gene] = i

    return ncbi_lookup_disambiguated.to_dict().get, ncbi_lookup_disambiguated_id.get

symbol_mapper, id_mapper = ncbi_genes_lookup()
def get_info(gene):
    ''' Resolve a gene alias to `(official_symbol, gene_id)`.

    `gene_id` is returned as a string. When either lookup has no entry the
    corresponding element is None (previously a missing id was returned as the
    truthy string 'None', which defeated `if symbol and gene_id` checks in callers).
    '''
    symbol = symbol_mapper(gene)
    gene_id = id_mapper(gene)
    return (symbol, None if gene_id is None else str(gene_id))

get_info('NOS1'), get_info('HAUS1P1')

(('NOS1', '4842'), ('HAUS1P1', '728921'))

In [3]:
# Per-gene susceptibility scores, indexed by the first CSV column
GENE_SCORES_URL = (
    "https://raw.githubusercontent.com/nih-cfde/ReproToxTables/main/"
    "Susceptibility%20Scores%20and%20GWAS%20Gene%20Lists/"
    "Susceptibility%20Scores/All%20Scores.csv"
)
gene_scores = pd.read_csv(GENE_SCORES_URL, index_col=0)
gene_scores.head()

Unnamed: 0_level_0,pLI,Residual.Variation.Intolerance.Score,Residual.Variation.Intolerance.Score.Percentile,pHI,pTS
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A1BG,9.064924e-05,-0.466531,23.5138,0.1345,0.52075
A1CF,0.003619701,-0.378346,28.013682,0.21197,0.22586
A2M,0.0005401149,0.099179,60.757254,0.9201,0.39196
A2ML1,1.3290220000000001e-22,2.419901,98.525596,0.27137,0.18193
A4GALT,0.1115987,-0.290161,33.339231,0.14511,0.09411


In [4]:
# Predicted drug similarity table, indexed by the first CSV column
drug_scores = pd.read_csv(
    "data/L1000_2021_Signature_Similarity_predicted_drug_table.csv",
    index_col=0,
)
# rank() is ascending, so the largest Similarity_Score receives the largest Rank;
# method="first" breaks ties by order of appearance
similarity = drug_scores["Similarity_Score"]
drug_scores["Rank"] = similarity.rank(method="first")
drug_scores.head()

Unnamed: 0,Similarity_Score,Rank
FTI-276,0.111335,33473.0
BRD-K08703257,0.104131,33472.0
temozolomide,0.102109,33471.0
Gossypetin,0.101613,33470.0
TAK-715,0.100779,33469.0


In [5]:
# Tab-separated frequency table; the second column becomes the index
HPO_FREQ_PATH = "data/HPO_Freq.tsv"
HPO_Freq = pd.read_csv(HPO_FREQ_PATH, sep="\t", index_col=1)
HPO_Freq.head()

Unnamed: 0_level_0,HPO description,KF Frequency
HPO ID,Unnamed: 1_level_1,Unnamed: 2_level_1
HP:0410030,Cleft lip,888
HP:0000776,Congenital diaphragmatic hernia,770
HP:0000175,Cleft palate,704
HP:0010438,Abnormal ventricular septum morphology,629
HP:0001710,Conotruncal defect,545


In [6]:
# Turtle files to ingest, keyed by URL, each mapped to the resource name recorded on its edges
_BIRTH_DEFECTS_BASE = 'https://raw.githubusercontent.com/nih-cfde/ReproToxTables/main/CDC-birth-defects/'
associations = {
    _BIRTH_DEFECTS_BASE + '%s_BirthDefects_%s_Associations.ttl' % (resource_name, entity_kind): resource_name
    for resource_name, entity_kind in [
        ("Geneshot", "Gene"),
        ("Drugshot", "Drug"),
        ("DrugEnrichr", "Drug"),
    ]
}


In [7]:
def _underscore_keys(record):
    ''' Return a copy of `record` whose keys have every "." replaced by "_". '''
    return {key.replace(".", "_"): value for key, value in record.items()}

with open("data/birth_defects.json") as birth_defects_file:
    birth_defects = json.load(birth_defects_file)

# Drug and gene reference records get their property names rewritten without dots
with open("data/drug_ref.json") as drug_ref_file:
    drugs = {
        drug_key: _underscore_keys(record)
        for drug_key, record in json.load(drug_ref_file).items()
    }

with open("data/gene_ref.json") as gene_ref_file:
    genes = {
        gene_key: _underscore_keys(record)
        for gene_key, record in json.load(gene_ref_file).items()
    }

In [8]:
# Rename the dotted RVIS property names to their underscore form.
# The rename is keyed on presence rather than truthiness, so a score of 0 is
# kept instead of being popped and silently discarded.
# NOTE(review): the loading cell already replaces "." with "_" in every gene
# property name, so these dotted keys are not expected to be present any more.
for k, v in genes.items():
    for dotted_key in (
        "Residual.Variation.Intolerance.Score",
        "Residual.Variation.Intolerance.Score.Percentile",
    ):
        if dotted_key in v:
            v[dotted_key.replace(".", "_")] = v.pop(dotted_key)

In [9]:
# (node class, IRI pattern) pairs; the named group carries the identifier or symbol.
# A 'Relationship' mapper for https://semanticscience.org/resource/<id> is intentionally absent.
_IRI_PATTERNS = (
  ('BirthDefect', r'^https://www\.orpha\.net/ORDO/(?P<id>.+)$'),
  ('BirthDefect', r'^https://purl\.obolibrary\.org/obo/(?P<id>.+)$'),
  ('Gene', r'^https://identifiers\.org/hgnc\.symbol/(?P<symbol>.+)$'),
  ('Drug', r'^https://identifiers\.org/lincs\.smallmolecule:(?P<id>.+)$'),
  ('BirthDefect', r'^https://hpo\.jax\.org/app/browse/term/(?P<id>.+)$'),
)
iri_mappers = [
  {'cls': node_class, 'expr': re.compile(pattern)}
  for node_class, pattern in _IRI_PATTERNS
]

# Predicate identifier -> relation name used when building edge ids
relationships = dict(
  SIO_010299="disease",
  SIO_010056="phenotype",
  SIO_000993="chemical-disease",
  SIO_000983="gene-disease",
)

# Cache of ontology terms resolved through the OLS API, keyed by the id that was queried
orpha = {}
def iri_to_node(iri):
    ''' Convert an IRI into a node description `{"type": ..., "properties": {...}}`.

    The IRI is matched against `iri_mappers` in order and the node is built from the
    reference data of the matching class:
    - Gene: id/label from `genes`, score columns from `gene_scores`, and the NCBI
      symbol/id from `get_info` when it resolves (Ensembl URI otherwise)
    - Drug: the record from `drugs` (a PubChem URI is added when its id is not an LSM id)
    - BirthDefect: label from `birth_defects`, else the `orpha` cache, else an OLS query

    Returns None when no mapper matches or the term cannot be resolved.
    Raises KeyError when a gene symbol or drug id is absent from the reference data.
    '''
    # IRIs carry HPO ids as HP_...; the HPO frequency table is indexed by HP:...
    iri = str(iri).replace("HP_", "HP:")
    for mapper in iri_mappers:
        m = mapper['expr'].match(iri)
        if not m:
            continue
        group_dict = m.groupdict()
        cls = mapper['cls']
        if cls == "Gene":
            symbol = group_dict["symbol"]
            properties = {
                "id": genes[symbol]["id"],
                "label": genes[symbol]["label"],
            }
            # gene_scores is indexed by gene symbol; membership must be tested on the
            # index (testing the frame itself checks column names)
            if symbol in gene_scores.index:
                for k, v in gene_scores.loc[symbol].items():
                    properties[k.replace(".", "_")] = v
            gene_symbol, gene_id = get_info(symbol)
            # 'None' guards against a stringified missing id
            if gene_symbol and gene_id and gene_id != "None":
                properties["label"] = gene_symbol
                properties["id"] = gene_id
                properties["uri"] = "https://www.ncbi.nlm.nih.gov/gene/%s"%gene_id
            else:
                properties["uri"] = "https://uswest.ensembl.org/Homo_sapiens/Gene/Summary?g=%s"%symbol
            return {"type": cls, "properties": properties}
        elif cls == "Drug":
            # The shared reference record is returned (and updated) in place
            gd = drugs[group_dict["id"]]
            if not str(gd["id"]).startswith("LSM"):
                gd["id"] = str(gd["id"])
                # NOTE(review): the URI uses the id taken from the IRI, not gd["id"] -- confirm
                gd["uri"] = "https://pubchem.ncbi.nlm.nih.gov/compound/%s"%str(group_dict["id"])
            return {"type": cls, "properties": gd}
        elif cls == "BirthDefect":
            birth_defect_id = group_dict["id"]
            if birth_defect_id in birth_defects:
                return {"type": cls, "properties": {
                    "id": birth_defect_id,
                    "label": birth_defects[birth_defect_id],
                    # OBO PURLs are built from the term id with '_' as separator
                    # (previously the label was interpolated after a stray '=')
                    "uri": "https://purl.obolibrary.org/obo/%s"%birth_defect_id.replace(":", "_"),
                }}
            if birth_defect_id in orpha:
                return {"type": cls, "properties": {**orpha[birth_defect_id]}}
            # Unknown term: ask OLS; `params` takes care of URL-encoding the id
            res = requests.get("https://www.ebi.ac.uk/ols/api/select", params={"q": birth_defect_id})
            if not res.ok:
                print("ERROR %s"%birth_defect_id)
                print(res.text)
                continue
            docs = res.json().get("response", {}).get("docs", [])
            if not docs:
                print("ERROR %s"%birth_defect_id)
                print("no OLS match")
                continue
            print(birth_defect_id)
            best_hit = docs[0]
            properties = {
                "id": best_hit["obo_id"],
                "label": best_hit["label"],
                "uri": best_hit["iri"],
            }
            # Cache under the queried id so the lookup above finds it next time
            orpha[birth_defect_id] = properties
            return {"type": cls, "properties": {**properties}}
        else:
            print(group_dict)
    return None


In [10]:
edges = []
nodes = {}
hpo_ids = set()   # HPO terms that have a KF frequency
no_freq = set()   # HPO terms without a KF frequency
for assoc, resource in associations.items():
    rdfgraph = rdflib.Graph()
    rdfgraph.parse(assoc, format='ttl')
    # Pass 1: register every RDF node that maps to a known entity, keyed by its id
    for rdf_node in rdfgraph.all_nodes():
        entity = iri_to_node(rdf_node)
        if not entity:
            continue
        node_id = str(entity["properties"]["id"])
        if node_id in nodes:
            continue
        if entity["type"] in ("Drug", "Gene"):
            nodes[node_id] = {**entity}
        elif entity["type"] == "BirthDefect" and node_id.startswith("HP:"):
            # Previously the startswith() result was wrapped in str(), which is always
            # truthy, so every birth defect was treated as an HPO term
            nodes[node_id] = {**entity}
            if node_id in HPO_Freq.index:
                nodes[node_id]["KF_freq"] = int(HPO_Freq.at[node_id, 'KF Frequency'])
                hpo_ids.add(node_id)
            else:
                no_freq.add(node_id)
        else:
            nodes[node_id] = {
                **entity,
                "uri": str(rdf_node)
            }
    # Pass 2: one edge per triple whose subject and object are both registered nodes
    for subj, pred, obj in rdfgraph.triples((None, None, None)):
        subj_iri = iri_to_node(subj)
        obj_iri = iri_to_node(obj)
        if not (subj_iri and obj_iri):
            continue
        source_id = str(subj_iri["properties"]["id"])
        target_id = str(obj_iri["properties"]["id"])
        # Membership test; indexing `nodes` here would raise KeyError for an unknown id
        if source_id not in nodes or target_id not in nodes:
            continue
        # The relation name is looked up by the last path segment of the predicate;
        # an unknown predicate falls back to that raw segment instead of aborting the run
        relation_key = str(pred).split("/")[-1]
        relation_label = relationships.get(relation_key, relation_key)
        source_label = subj_iri["properties"]["label"]
        target_label = obj_iri["properties"]["label"]
        edges.append({
            "type": "Relation",
            "source": source_id,
            "relation": "%s"%resource,
            "target": target_id,
            "properties": {
                "id": "%s_%s_%s"%(source_label, relation_label, target_label),
                "source_label": source_label,
                "target_label": target_label,
                "directed": True,
                "resource": resource
            }
        })


Orphanet_3378
Orphanet_3380
Orphanet_870


In [11]:
# Untyped (v1) export of the birth-defect association graph
serialization_v1 = dict(version="1", nodes=nodes, edges=edges)
with open("results/reprotox_serialization.v1.json", "w") as out_file:
    json.dump(serialization_v1, out_file, indent=2)

In [12]:
def isNumber(value):
    ''' Classify `value` as a typed number.

    Returns `{"@type": "int", "@value": <int>}` when `int(value)` succeeds,
    otherwise `{"@type": "number", "@value": <float>}` when `float(value)` succeeds,
    otherwise False.

    Only conversion failures are caught; a bare `except` would also swallow
    KeyboardInterrupt and SystemExit.
    '''
    conversion_errors = (TypeError, ValueError, OverflowError)
    try:
        return {"@type": "int", "@value": int(value)}
    except conversion_errors:
        pass
    try:
        return {"@type": "number", "@value": float(value)}
    except conversion_errors:
        return False


In [13]:
isinstance('', str)

True

In [14]:
def typer(value):
    ''' Recursively wrap `value` as `{"@type": ..., "@value": ...}`.

    - booleans -> "boolean" (previously they matched no branch and became None,
      so flags such as "directed" were serialized as null)
    - anything whose string form parses as a number -> "int" / "number" (see `isNumber`)
    - str -> "string"; list -> "array" of typed items; dict -> "object" of typed values
    - inside a dict the keys "id", "target" and "source" are always emitted as
      "string", even when the value looks numeric
    - any other value (e.g. None) -> None

    NOTE(review): the "boolean" tag name is an assumption -- confirm with the consumer of the v2 format.
    '''
    # bool has to be tested first: str(True) is not numeric and bool is not str/list/dict
    if isinstance(value, bool):
        return {"@type": "boolean", "@value": value}
    numb = isNumber(str(value))
    if numb:
        return numb
    if isinstance(value, str):
        return {"@type": "string", "@value": value}
    if isinstance(value, list):
        return {"@type": "array", "@value": [typer(item) for item in value]}
    if isinstance(value, dict):
        type_dict = {}
        for k, v in value.items():
            if k in ("id", "target", "source"):
                # Identifier fields are declared as strings, so make the value one too
                as_text = v if isinstance(v, str) or v is None else str(v)
                type_dict[k] = {"@type": "string", "@value": as_text}
            else:
                type_dict[k] = typer(v)
        return {"@type": "object", "@value": type_dict}
    return None
    

In [15]:
# Typed (v2) representation of every node, same keys as `nodes`
nodes_v2 = {node_id: typer(node) for node_id, node in nodes.items()}

In [16]:
# Typed (v2) representation of every edge, in the original order
edges_v2 = [typer(edge_record) for edge_record in edges]

In [17]:
# Typed (v2) export of the birth-defect association graph
serialization_v2 = dict(version="2", nodes=nodes_v2, edges=edges_v2)
with open("results/reprotox_serialization.v2.json", "w") as out_file:
    json.dump(serialization_v2, out_file, indent=2)

In [18]:
def get_id(value):
    ''' Split `value` into `(prefix, remainder)` using the namespaces of the
    module-level `rdfgraph`.

    The first namespace whose text occurs in `value` wins; every occurrence of it
    is removed to form the remainder. When none occurs, `("Literal", value)` is returned.
    '''
    for prefix, namespace in rdfgraph.namespaces():
        remainder = value.replace(namespace, "")
        if remainder != value:
            return prefix, remainder
    return "Literal", value


## SigCom LINCS
### drug2gene

In [19]:
# SigCom LINCS drug -> gene network (v1 node/edge serialization)
with open("data/SigComLINCS_drug_2_gene_25.v1.json") as drug2gene_file:
    sigcom_lincs = json.load(drug2gene_file)

In [20]:
# Labels of all Gene nodes in the loaded network
gene_names = {
    node["properties"]["label"]
    for node in sigcom_lincs["nodes"].values()
    if node["type"] == "Gene"
}
len(gene_names)

4420

In [21]:
# Look up the SigCom LINCS entity records whose meta.symbol is one of the gene labels
symbol_filter = {"meta.symbol": {"inq": list(gene_names)}}
payload = {"filter": {"where": symbol_filter}}
res = requests.post(
    "https://maayanlab.cloud/sigcom-lincs/metadata-api/entities/find",
    json=payload,
)
len(res.json())

4411

In [22]:
# Manual renames for labels that the metadata API does not return under that name
gene_mapping_name = {
    'LINC00341': 'SYNE3',
    'ATP5MPL': 'ATP5MJ'
}
# symbol -> gene id; seeded with the ids of the manually renamed genes
gene_mapping = {
    'SYNE3': '161176',
    'ATP5MJ': '9556'
}
for entity in res.json():
    meta = entity["meta"]
    # Ids are kept as strings, matching the seeded entries above and the
    # string node keys built from them later
    gene_mapping[meta["symbol"]] = str(meta["geneid"])

In [23]:
# Labels the API did not return are resolved through the NCBI lookup instead.
# Unresolved labels are skipped; previously they inserted a `None: 'None'` entry.
# NOTE(review): the entry is keyed by the resolved official symbol, not by the
# original label -- confirm this is what later lookups by label expect.
for unmapped_label in gene_names - gene_mapping.keys():
    gene_symbol, gene_id = get_info(unmapped_label)
    if gene_symbol and gene_id and gene_id != "None":
        gene_mapping[gene_symbol] = str(gene_id)

In [24]:
nodes = {}
node_map = {}      # original node key / new node id -> normalized original id
ensembl_map = {}   # original gene node id -> gene id taken from gene_mapping
for k, v in sigcom_lincs["nodes"].items():
    t = v["type"]
    # Insert the ':' after "CID" only where it is missing, and work on a copy:
    # the loaded JSON is left untouched, so re-running this cell cannot produce "CID::"
    source_id = re.sub(r"CID(?!:)", "CID:", str(v["properties"]["id"]))
    new_v = {key.replace(".", "_"): val for key, val in v["properties"].items()}
    new_v["id"] = source_id
    label = new_v["label"]
    if t == "Gene":
        gene_label = gene_mapping_name.get(label, label)
        gene_id = gene_mapping.get(gene_label)
        if not (gene_label and gene_id):
            # Genes without a known id are reported and left out of the graph
            print(label)
            continue
        gene_id = str(gene_id)
        ensembl_map[source_id] = gene_id
        new_v["id"] = gene_id
        new_v["label"] = gene_label
        new_v["uri"] = "https://www.ncbi.nlm.nih.gov/gene/%s"%gene_id
        label = gene_label
        if label in gene_scores.index:
            for key, val in gene_scores.loc[label].items():
                new_v[key.replace(".", "_")] = val
    elif t == "Drug":
        new_v["uri"] = "https://pubchem.ncbi.nlm.nih.gov/compound/%s"%source_id.replace("CID:", "")
        if label in drug_scores.index:
            new_v["placenta_score"] = drug_scores.at[label, "Similarity_Score"]
            new_v["placenta_rank"] = drug_scores.at[label, "Rank"]
    nodes[new_v["id"]] = {
        "type": t,
        "properties": new_v
    }
    node_map[k] = source_id
    node_map[str(new_v["id"])] = source_id

In [25]:
edges = []
not_in = set()   # endpoints of edges that had to be skipped
in_set = set()
for raw_edge in sigcom_lincs["edges"]:
    source = re.sub(r"CID(?!:)", "CID:", str(raw_edge["source"]))
    # Genes that were not registered as nodes have no entry in ensembl_map
    target = ensembl_map.get(str(raw_edge["target"]))
    if source not in nodes:
        not_in.add(source)
        continue
    if target not in nodes:
        not_in.add(str(raw_edge["target"]))
        continue
    # Work on a copy so the loaded JSON stays intact and the cell can be re-run
    # (the original popped "weight" from the source record)
    edge = dict(raw_edge)
    edge["source"] = source
    edge["target"] = target
    relation = edge["relation"]
    source_label = nodes[source]["properties"]["label"]
    target_label = nodes[target]["properties"]["label"]
    edge["properties"] = {
        "id": "%s %s %s"%(source_label, relation, target_label),
        "source_label": source_label,
        "target_label": target_label,
        "resource": "SigCom LINCS",
        "mean_CD_coefficient": edge.pop("weight"),
        "directed": True,
    }
    edge["relation"] = "SigCom LINCS Drug-to-Gene (%s)"%relation
    edges.append(edge)

In [26]:
# Untyped (v1) export of the SigCom LINCS drug -> gene graph
serialization_v1 = dict(version="1", nodes=nodes, edges=edges)
with open("results/sigcom_lincs_serialization.v1.json", "w") as out_file:
    json.dump(serialization_v1, out_file, indent=2)

In [27]:
# Typed (v2) export of the SigCom LINCS drug -> gene graph
nodes_v2 = {node_id: typer(node) for node_id, node in nodes.items()}
edges_v2 = [typer(edge_record) for edge_record in edges]

serialization_v2 = dict(version="2", nodes=nodes_v2, edges=edges_v2)
with open("results/sigcom_lincs_serialization.v2.json", "w") as out_file:
    json.dump(serialization_v2, out_file, indent=2)

### Gene 2 Drug (Should I do this?)

In [93]:
# SigCom LINCS gene -> drug network; replaces the previously loaded `sigcom_lincs`
with open("data/full_weights_SigComLINCS_gene_2_drug07262022.v1.json") as gene2drug_file:
    sigcom_lincs = json.load(gene2drug_file)

In [94]:
nodes = {}
node_map = {}   # original node key / node id -> normalized node id
for k, v in sigcom_lincs["nodes"].items():
    t = v["type"]
    # Insert the ':' after "CID" only where it is missing, and work on a copy:
    # the loaded JSON is left untouched, so re-running this cell cannot produce "CID::"
    node_id = re.sub(r"CID(?!:)", "CID:", str(v["properties"]["id"]))
    new_v = {key.replace(".", "_"): val for key, val in v["properties"].items()}
    new_v["id"] = node_id
    label = new_v["label"]
    if t == "Gene":
        new_v["uri"] = "https://www.ncbi.nlm.nih.gov/gene/%s"%node_id
        if label in gene_scores.index:
            for key, val in gene_scores.loc[label].items():
                new_v[key.replace(".", "_")] = val
    elif t == "Drug":
        new_v["uri"] = "https://pubchem.ncbi.nlm.nih.gov/compound/%s"%node_id.replace("CID:", "")
        if label in drug_scores.index:
            new_v["placenta_score"] = drug_scores.at[label, "Similarity_Score"]
            new_v["placenta_rank"] = drug_scores.at[label, "Rank"]
    nodes[node_id] = {
        "type": t,
        "properties": new_v
    }
    node_map[k] = node_id
    node_map[node_id] = node_id

In [95]:
# Build the edge list for the loaded network.
# NOTE(review): every edge dict of sigcom_lincs["edges"] is modified in place
# (source/target/relation rewritten, "weight" popped, "properties" added), so this
# cell cannot be run twice on the same loaded data. The later cell that iterates
# sigcom_lincs["edges"] again reads i["properties"], which appears to rely on this
# mutation -- confirm before changing either cell.
edges = []
not_in = set()
in_set = set()
for i in sigcom_lincs["edges"]:
    # Every "CID" occurrence in the source id gets a ':' appended
    source = str(i["source"].replace("CID", "CID:"))
    target = str(i["target"])
    # if source not in node_map: not_in.add(source)
    # else: in_set.add(source)
    # if target not in node_map: not_in.add(target)
    # else: in_set.add(target)
    i["source"] = source
    i["target"] = target
    # Raises KeyError when either endpoint is not a key of `nodes`
    source_label = nodes[source]["properties"]["label"]
    target_label = nodes[target]["properties"]["label"]
    i["properties"] = {
        "id": "%s %s %s"%(source_label, i["relation"], target_label),
        "source_label": source_label,
        "target_label": target_label,
        "resource": "SigCom LINCS",
        "mean_CD_coefficient": i.pop("weight"),
        "directed": True,
    }
    # The bare relation is used in the id above; the stored relation gets the resource prefix
    i["relation"] = "SigCom LINCS (%s)"%i["relation"]
    edges.append(i)

In [96]:
edges[0]

{'source': 'CID:6738',
 'relation': 'SigCom LINCS (upregulates)',
 'target': '55266',
 'properties': {'id': 'hydroxytoluic-acid upregulates TMEM19',
  'source_label': 'hydroxytoluic-acid',
  'target_label': 'TMEM19',
  'resource': 'SigCom LINCS',
  'mean_CD_coefficient': 0.00712173,
  'directed': True}}

In [97]:
# Query the metadata API for the symbols collected in `not_in`
missing_filter = {"meta.symbol": {"inq": list(not_in)}}
payload = {"filter": {"where": missing_filter}}

res = requests.post(
    "https://maayanlab.cloud/sigcom-lincs/metadata-api/entities/find",
    json=payload,
)
res.ok

True

In [98]:
len(not_in), len(res.json())

(0, 0)

In [99]:
not_in - set([i["meta"]["symbol"] for i in res.json()])

set()

In [100]:
# Register a Gene node for every entity returned by the API, preferring the
# Ensembl id and using the NCBI gene id when no Ensembl id is present
n = set()
for entity in res.json():
    meta = entity["meta"]
    id_field = "ensemblid" if "ensemblid" in meta else "geneid"
    gene_id = str(meta[id_field])
    label = meta["symbol"]
    nodes[gene_id] = {
        "type": "Gene",
        "properties": {"id": gene_id, "label": label},
    }
    node_map[label] = gene_id
    node_map[gene_id] = gene_id

In [101]:
sigcom_lincs["edges"][0]

{'source': 'CID:6738',
 'relation': 'SigCom LINCS (upregulates)',
 'target': '55266',
 'properties': {'id': 'hydroxytoluic-acid upregulates TMEM19',
  'source_label': 'hydroxytoluic-acid',
  'target_label': 'TMEM19',
  'resource': 'SigCom LINCS',
  'mean_CD_coefficient': 0.00712173,
  'directed': True}}

In [102]:
# Second pass over the edges: keep those whose endpoints are both nodes and count,
# per id, how often a missing endpoint was seen (source is checked before target)
edges = []
not_in = {}
in_set = set()
for edge_record in sigcom_lincs["edges"]:
    source = str(edge_record["source"])
    target = str(edge_record["target"])

    if source not in nodes:
        not_in[source] = not_in.get(source, 0) + 1
        continue
    if target not in nodes:
        not_in[target] = not_in.get(target, 0) + 1
        continue

    edge_record["source"] = node_map.get(source, source)
    edge_record["target"] = node_map.get(target, target)
    edge_properties = edge_record["properties"]
    cd = edge_properties.pop("Mean CD-coefficient", None)
    if cd:
        # Only edges that still carry the raw coefficient get relabelled
        edge_properties["mean_CD_coefficient"] = cd
        source_label = nodes[edge_record["source"]]["properties"]["label"]
        target_label = nodes[edge_record["target"]]["properties"]["label"]
        edge_properties["source_label"] = source_label
        edge_properties["target_label"] = target_label
        edge_properties["id"] = "%s_%s_%s"%(source_label, edge_record["relation"], target_label)
        edge_record["relation"] = "SigCom LINCS (%s)"%edge_record["relation"]
    edges.append(edge_record)

In [103]:
edges[1]

{'source': 'CID:12124',
 'relation': 'SigCom LINCS (downregulates)',
 'target': '1786',
 'properties': {'id': 'metacetamol downregulates DNMT1',
  'source_label': 'metacetamol',
  'target_label': 'DNMT1',
  'resource': 'SigCom LINCS',
  'mean_CD_coefficient': -0.02283431,
  'directed': True}}

In [104]:
len(edges), len(sigcom_lincs["edges"])

(60677, 60677)

In [105]:
# Untyped (v1) export of the gene -> drug graph.
# NOTE(review): this is the same path the drug2gene section writes to, so running
# both sections leaves only the later result on disk -- confirm that is intended.
serialization_v1 = dict(version="1", nodes=nodes, edges=edges)
with open("results/sigcom_lincs_serialization.v1.json", "w") as out_file:
    json.dump(serialization_v1, out_file, indent=2)

In [106]:
# Typed (v2) representation of every node, same keys as `nodes`
nodes_v2 = {node_id: typer(node) for node_id, node in nodes.items()}


In [107]:
# Typed (v2) representation of every edge, in the original order
edges_v2 = [typer(edge_record) for edge_record in edges]

In [108]:
# Typed (v2) export of the gene -> drug graph.
# NOTE(review): this is the same path the drug2gene section writes to, so running
# both sections leaves only the later result on disk -- confirm that is intended.
nodes_v2 = {node_id: typer(node) for node_id, node in nodes.items()}
edges_v2 = [typer(edge_record) for edge_record in edges]

serialization_v2 = dict(version="2", nodes=nodes_v2, edges=edges_v2)
with open("results/sigcom_lincs_serialization.v2.json", "w") as out_file:
    json.dump(serialization_v2, out_file, indent=2)

### Drug2Drug

In [28]:
# Drug-to-drug cosine similarity network (v1 node/edge serialization)
with open("data/full_SigComLINCS_lm_drug_cosine_sim.v1.json") as drug2drug_file:
    drug2drug = json.load(drug2drug_file)

In [29]:
nodes = {}
for k, v in drug2drug["nodes"].items():
    # Copy the record and insert the ':' after "CID" only where it is missing:
    # the loaded JSON is left untouched, so re-running this cell cannot produce "CID::"
    properties = dict(v["properties"])
    properties["id"] = re.sub(r"CID(?!:)", "CID:", properties["id"])
    label = properties["label"]
    properties["uri"] = "https://pubchem.ncbi.nlm.nih.gov/compound/%s"%properties["id"].replace("CID:", "")
    if label in drug_scores.index:
        properties["placenta_score"] = drug_scores.at[label, "Similarity_Score"]
        properties["placenta_rank"] = drug_scores.at[label, "Rank"]
    nodes[properties["id"]] = {
        "type": v["type"],
        "properties": properties
    }
    

In [30]:
properties

{'id': 'CID:395754',
 'label': 'FTI-276',
 'lsm': 'LSM-4727',
 'uri': 'https://pubchem.ncbi.nlm.nih.gov/compound/395754',
 'placenta_score': 0.111334816,
 'placenta_rank': 33473.0}

In [31]:
def _similarity_edge(raw_edge):
    ''' Build one similarity edge from a raw drug2drug edge, using the labels in `nodes`. '''
    source_id = raw_edge["source"].replace("CID", "CID:")
    target_id = raw_edge["target"].replace("CID", "CID:")
    source_name = nodes[source_id]["properties"]["label"]
    target_name = nodes[target_id]["properties"]["label"]
    return {
        "source": source_id,
        "relation": "LINCS Drugs Cosine Similarity",
        "target": target_id,
        "properties": {
            "id": "%s-%s similarity"%(source_name, target_name),
            "source_label": source_name,
            "target_label": target_name,
            "cosine_similarity": raw_edge["weight"]
        }
    }

edges = [_similarity_edge(raw_edge) for raw_edge in drug2drug["edges"]]


In [32]:
edges[0]

{'source': 'CID:10109823',
 'relation': 'LINCS Drugs Cosine Similarity',
 'target': 'CID:11338033',
 'properties': {'id': 'AS-601245-AT-7519 similarity',
  'source_label': 'AS-601245',
  'target_label': 'AT-7519',
  'cosine_similarity': 0.5566698312759399}}

In [33]:
# Untyped (v1) export of the drug similarity graph
serialization_v1 = dict(version="1", nodes=nodes, edges=edges)
with open("results/sigcom_lincs_drug_similarity.v1.json", "w") as out_file:
    json.dump(serialization_v1, out_file, indent=2)

In [34]:
# Typed (v2) export of the drug similarity graph
nodes_v2 = {node_id: typer(node) for node_id, node in nodes.items()}
edges_v2 = [typer(edge_record) for edge_record in edges]

serialization_v2 = dict(version="2", nodes=nodes_v2, edges=edges_v2)
with open("results/sigcom_lincs_drug_similarity.v2.json", "w") as out_file:
    json.dump(serialization_v2, out_file, indent=2)

## Drug Target

In [35]:
# Tab-separated drug/target activity table, one row per activity record
IDG_TARGET_PATH = "data/idg_target.tsv"
drug_target = pd.read_csv(IDG_TARGET_PATH, sep="\t")
drug_target.head()

Unnamed: 0,DRUG_SMILES,DRUG_NAME,DRUG_ID,DRUG_PUBCHEM_CID,TARGET_NAME,TARGET_CLASS,TARGET_GENE,TARGET_ACCESSION,TARGET_SWISSPROT,TARGET_ORGANISM,TARGET_TDL,ACT_TYPE,ACT_RELATION,ACT_VALUE,ACT_COMMENT,ACT_SOURCE
0,CCN(CC)CCNC(=O)C1=C(C)NC(\C=C2/C(=O)NC3=C2C=C(...,sunitinib,2544,5329102,AP2-associated protein kinase 1,Kinase,AAK1,Q2M2I8,AAK1_HUMAN,Homo sapiens,Tchem,Kd,=,7.96,Binding constant for AAK1 kinase domain,CHEMBL
1,CN(C)C[C@@H]1CCN2C=C(C3=CC=CC=C23)C2=C(C(=O)NC...,ruboxistaurin,3533,153999,AP2-associated protein kinase 1,Kinase,AAK1,Q2M2I8,AAK1_HUMAN,Homo sapiens,Tchem,Kd,=,6.05,Binding constant for AAK1 kinase domain,CHEMBL
2,N#CC[C@H](C1CCCC1)N1C=C(C=N1)C1=C2C=CNC2=NC=N1,ruxolitinib,4190,25126798,AP2-associated protein kinase 1,Kinase,AAK1,Q2M2I8,AAK1_HUMAN,Homo sapiens,Tchem,Kd,=,6.92,Binding constant for AAK1 kinase domain,CHEMBL
3,COC(=O)C1=CC2=C(C=C1)\C(=C(\NC1=CC=C(C=C1)N(C)...,nintedanib,4903,135423438,AP2-associated protein kinase 1,Kinase,AAK1,Q2M2I8,AAK1_HUMAN,Homo sapiens,Tchem,Kd,=,7.2,Binding constant for AAK1 kinase domain,CHEMBL
4,CCS(=O)(=O)N1CC(CC#N)(C1)N1C=C(C=N1)C1=NC=NC2=...,baricitinib,5202,44205240,AP2-associated protein kinase 1,Kinase,AAK1,Q2M2I8,AAK1_HUMAN,Homo sapiens,Tchem,Kd,=,7.77,Binding affinity determined in a cell-free bio...,IUPHAR


In [36]:
# Start the drug-target graph from empty containers
nodes, edges = {}, []

In [37]:
# One Drug node per PubChem CID, one Gene node per resolved target gene, and one
# edge per activity row. The first row seen for a drug or gene defines its node.
for drug_id in set(drug_target["DRUG_ID"]):
    sub_df = drug_target[drug_target["DRUG_ID"] == drug_id]
    for i in sub_df.index:
        pubchem_cid = sub_df.at[i, "DRUG_PUBCHEM_CID"]
        drug_pubchem_id = "CID:%s"%pubchem_cid
        drug_name = sub_df.at[i, "DRUG_NAME"]
        if drug_pubchem_id not in nodes:
            properties = {
                "id": drug_pubchem_id,
                "label": drug_name,
                "smiles": sub_df.at[i, "DRUG_SMILES"],
                "IDG_ID": str(sub_df.at[i, "DRUG_ID"]),
                "uri": "https://pubchem.ncbi.nlm.nih.gov/compound/%s"%str(pubchem_cid)
            }
            if drug_name in drug_scores.index:
                properties["placenta_score"] = float(drug_scores.at[drug_name, "Similarity_Score"])
                properties["placenta_rank"] = int(drug_scores.at[drug_name, "Rank"])
            nodes[drug_pubchem_id] = {
                "type": "Drug",
                "properties": properties
            }

        target_gene = sub_df.at[i, "TARGET_GENE"]
        gene_label, gene_id = get_info(target_gene)
        # A missing id may arrive as the string 'None'; treat it as unresolved so
        # no node or edge is created with the id 'None'
        if not gene_label or gene_id in (None, "None"):
            print(target_gene)
            continue
        if gene_id not in nodes:
            properties = {
                "id": gene_id,
                "label": gene_label,
                "target_name": str(sub_df.at[i, "TARGET_NAME"]),
                "target_class": str(sub_df.at[i, "TARGET_CLASS"]),
                "accession": str(sub_df.at[i, "TARGET_ACCESSION"]),
                "swissprot": str(sub_df.at[i, "TARGET_SWISSPROT"]),
                "organism": str(sub_df.at[i, "TARGET_ORGANISM"]),
                "TDL": str(sub_df.at[i, "TARGET_TDL"])
            }
            if gene_label in gene_scores.index:
                for k, v in gene_scores.loc[gene_label].items():
                    properties[k.replace(".", "_")] = float(v)
            nodes[gene_id] = {
                "type": "Gene",
                "properties": properties
            }
        edges.append({
            "source": drug_pubchem_id,
            "relation": "IDG (Drug Target)",
            "target": gene_id,
            "properties": {
                "id": "%s targets %s"%(drug_name, gene_label),
                "source_label": drug_name,
                "target_label": gene_label,
                "act_type": str(sub_df.at[i, "ACT_TYPE"]),
                "act_relation": str(sub_df.at[i, "ACT_RELATION"]),
                "act_value": str(sub_df.at[i, "ACT_VALUE"]),
                "act_comment": str(sub_df.at[i, "ACT_COMMENT"]),
                "act_source": str(sub_df.at[i, "ACT_SOURCE"]),
            }
        })

MT-CO2
MT-CO2


In [38]:
# Number of drug-target edges built above; rows whose gene could not be
# resolved by get_info are printed instead of being added as edges.
len(edges)

7324

In [39]:
# (rows, columns) of the drug_target table, for comparison with the edge count.
drug_target.shape

(7326, 16)

In [40]:
# Version-1 export of the drug-target graph: nodes and edges are written
# exactly as accumulated, as indented JSON.
serialization_v1 = {"version": "1", "nodes": nodes, "edges": edges}
with open("results/idg_drug_targets.v1.json", "w") as out_file:
    json.dump(serialization_v1, out_file, indent=2)

In [41]:
# Version-2 export of the drug-target graph: every node and every edge is
# passed through typer() (defined elsewhere in this notebook) before writing.
nodes_v2 = {node_id: typer(node) for node_id, node in nodes.items()}
edges_v2 = [typer(edge_record) for edge_record in edges]

serialization_v2 = {"version": "2", "nodes": nodes_v2, "edges": edges_v2}
with open("results/idg_drug_targets.v2.json", "w") as out_file:
    json.dump(serialization_v2, out_file, indent=2)

### HPO

In [42]:
# Preview the first rows of the HPO frequency table used to build the HPO graph.
HPO_Freq.head()

Unnamed: 0_level_0,HPO description,KF Frequency
HPO ID,Unnamed: 1_level_1,Unnamed: 2_level_1
HP:0410030,Cleft lip,888
HP:0000776,Congenital diaphragmatic hernia,770
HP:0000175,Cleft palate,704
HP:0010438,Abnormal ventricular septum morphology,629
HP:0001710,Conotruncal defect,545


In [72]:
# Build the HPO graph: one "BirthDefect" node per HPO term in HPO_Freq, one
# "Gene" node per gene the HPO API returns for that term, and one "HPO" edge
# per (term, gene) pair.
nodes = {}
edges = []
for i in HPO_Freq.index:
    if i not in nodes:
        label = HPO_Freq.at[i, "HPO description"]
        KF_Freq = HPO_Freq.at[i, "KF Frequency"]
        # The lookup can come back as a Series (presumably when an HPO ID is
        # repeated in the index); keep the first entry, selected explicitly by
        # position rather than by the label 0.
        if isinstance(label, pd.Series):
            label = label.iloc[0]
        if isinstance(KF_Freq, pd.Series):
            KF_Freq = KF_Freq.iloc[0]
        KF_Freq = int(KF_Freq)
        nodes[i] = {
            "type": "BirthDefect",
            "properties": {
                "id": i,
                "label": label,
                "KF_Freq": KF_Freq,
                "uri": "https://purl.obolibrary.org/obo/%s"%i.replace(":", "_")
            }
        }
        # Short pause between consecutive API requests.
        time.sleep(0.1)
        # NOTE(review): verify=False disables TLS certificate verification for
        # this request - confirm it is still needed.
        # NOTE(review): offset=1 is kept as-is - confirm against the API that it
        # does not skip the first gene record.
        res = requests.get("https://hpo.jax.org/api/hpo/term/%s/genes?max=-1&offset=1"%i, verify=False)
        if not res.ok:
            # Report the term whose gene lookup failed; its node is kept, without edges.
            print(i)
        else:
            for gene_info in res.json()["genes"]:
                entrezGeneId = str(gene_info["entrezGeneId"])
                entrezGeneSymbol = gene_info["entrezGeneSymbol"]
                if entrezGeneId not in nodes:
                    properties = {
                        "id": entrezGeneId,
                        "label": entrezGeneSymbol,
                        "uri": "https://www.ncbi.nlm.nih.gov/gene/%s"%entrezGeneId
                    }
                    # Gene symbols are row labels of gene_scores, so membership
                    # must be tested on the index (`in gene_scores` would test
                    # the column labels instead).
                    if entrezGeneSymbol in gene_scores.index:
                        # .items() yields (score name, value) pairs; values are
                        # cast to float, matching the drug-target graph cell.
                        for k, v in gene_scores.loc[entrezGeneSymbol].items():
                            properties[k.replace(".", "_")] = float(v)

                    nodes[entrezGeneId] = {
                        "type": "Gene",
                        "properties": properties
                    }
                edge = {
                    "source": i,
                    "relation": "HPO",
                    "target": entrezGeneId,
                    "properties": {
                        "id": "%s (%s-%s)"%(i, label, entrezGeneSymbol),
                        "source_label": label,
                        "target_label": entrezGeneSymbol,
                        "resource": "HPO"
                    }
                }
                edges.append(edge)



In [73]:
# Version-1 export of the HPO graph: nodes and edges are written exactly as
# accumulated, as indented JSON.
serialization_v1 = {"version": "1", "nodes": nodes, "edges": edges}
with open("results/hpo.v1.json", "w") as out_file:
    json.dump(serialization_v1, out_file, indent=2)

In [74]:
# Version-2 export of the HPO graph: every node and every edge is passed
# through typer() (defined elsewhere in this notebook) before writing.
nodes_v2 = {node_id: typer(node) for node_id, node in nodes.items()}
edges_v2 = [typer(edge_record) for edge_record in edges]

serialization_v2 = {"version": "2", "nodes": nodes_v2, "edges": edges_v2}
with open("results/hpo.v2.json", "w") as out_file:
    json.dump(serialization_v2, out_file, indent=2)