In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import requests
from flatten_json import flatten

# Creating Nodes Files From the Biomarker Data

First, create a dataframe from the edgelist and create a new column with the extracted biomarker ID

In [3]:
def get_id(s):
    index = s.find(":")
    return s[index+1:]

In [4]:
def get_new_id(s):
    index = s.find("-")
    return s[:index]

In [5]:
edges = pd.read_csv("OWLNETS_edgelist.txt", sep='\t')
edges['subject_id'] = edges['subject'].apply(get_id)
display(edges)

Unnamed: 0,subject,predicate,object,subject_id
0,BIOMARKER:AN3902-1,indicated_by_presence_of,DBSNP:rs7785013,AN3902-1
1,BIOMARKER:AN3902-1,determined_using_sample_from,UBERON:0000178,AN3902-1
2,BIOMARKER:AN3902-1,has_BEST_classification,OBCI:0000008,AN3902-1
3,BIOMARKER:AN3902-1,indicates_risk_of_developing,DOID:3905,AN3902-1
4,BIOMARKER:AN3903-1,indicated_by_presence_of,DBSNP:rs73159014,AN3903-1
...,...,...,...,...
546644,BIOMARKER:AM9674-1,has_BEST_classification,OBCI:0000008,AM9674-1
546645,BIOMARKER:AM9674-1,indicates_risk_of_developing,DOID:1324,AM9674-1
546646,BIOMARKER:AM9675-1,indicated_by_presence_of,DBSNP:rs1338696489,AM9675-1
546647,BIOMARKER:AM9675-1,has_BEST_classification,OBCI:0000008,AM9675-1


We'll define a function to find our target entity labels using the Biomarker (subject) id. The get_label_api function will return the output from the "biomarker/detail" request.

In [6]:
def get_response_api(biomarker_id):
    response = requests.get(f'https://hivelab.biochemistry.gwu.edu/biomarker/api/biomarker/detail/{biomarker_id}', timeout=30)
    response = response.text[1:]
    return response

In [177]:
def get_label_biomarker(biomarker_id):
    ind = response.find("{") + 3
    response = response[ind:]
    ind = response.find(":") + 3
    response = response[ind:]
    backind = response.find(",")
    response = response[:backind-1]
    return response

Get a list of the different target node types. The fields we have to access in the response depend on these different types. We will also need to create a [node_type].nodes.csv file for each one.

In [7]:
def get_prefix(s):
    index = s.find(":")
    return s[:index]

In [8]:
def get_type(source):
    if source == "DBSNP":
        type = "Variant"
    elif source == "OBCI":
        type = "Role"
    elif source == "UBERON":
        type = "Anatomy"
    elif source == "DOID":
        type = "Condition"
    elif source == "PUBCHEM":
        type = "Compound"
    else: type = "Gene"
    return type

In [22]:
edges['object_type'] = edges['object'].apply(get_prefix)
edges['type'] = edges['object_type'].apply(get_type)
display(edges)
print((edges.object_type).unique())
print((edges.predicate).unique())

Unnamed: 0,subject,predicate,object,subject_id,object_type,type
0,BIOMARKER:AN3902-1,indicated_by_presence_of,DBSNP:rs7785013,AN3902-1,DBSNP,Variant
1,BIOMARKER:AN3902-1,determined_using_sample_from,UBERON:0000178,AN3902-1,UBERON,Anatomy
2,BIOMARKER:AN3902-1,has_BEST_classification,OBCI:0000008,AN3902-1,OBCI,Role
3,BIOMARKER:AN3902-1,indicates_risk_of_developing,DOID:3905,AN3902-1,DOID,Condition
4,BIOMARKER:AN3903-1,indicated_by_presence_of,DBSNP:rs73159014,AN3903-1,DBSNP,Variant
...,...,...,...,...,...,...
546644,BIOMARKER:AM9674-1,has_BEST_classification,OBCI:0000008,AM9674-1,OBCI,Role
546645,BIOMARKER:AM9674-1,indicates_risk_of_developing,DOID:1324,AM9674-1,DOID,Condition
546646,BIOMARKER:AM9675-1,indicated_by_presence_of,DBSNP:rs1338696489,AM9675-1,DBSNP,Variant
546647,BIOMARKER:AM9675-1,has_BEST_classification,OBCI:0000008,AM9675-1,OBCI,Role


['DBSNP' 'UBERON' 'OBCI' 'DOID' 'PUBCHEM' 'NCBI']
['indicated_by_presence_of' 'determined_using_sample_from'
 'has_BEST_classification' ' indicates_risk_of_developing'
 'indicated_by_above_normal_level_of' 'diagnostic_for'
 'indicated_by_below_normal_level_of' 'prognostic_for']


We can create separate dataframes for each target node type to make things easier. In the end we will also make separate files for each edge type as well.

## Extracting Node labels

We'll create a master dataframe that has each target entity node's "object_type", referring to it's source, and it's entity "type", which is the type of biological entity it is.

In [39]:
node_type = "PUBCHEM" # change this value to the source of the entity type you want to get the labels for
nodes = edges[edges["object_type"] == node_type]
nodes = nodes.drop_duplicates(subset=['object'])
nodes["object_id"] = nodes["object"].apply(get_id)
display(nodes)

Unnamed: 0,subject,predicate,object,subject_id,object_type,type,object_id
17864,BIOMARKER:AN6165-1,indicated_by_above_normal_level_of,PUBCHEM:1662,AN6165-1,PUBCHEM,Compound,1662
17866,BIOMARKER:AN6165-1,indicated_by_above_normal_level_of,PUBCHEM:69362,AN6165-1,PUBCHEM,Compound,69362
17867,BIOMARKER:AN6165-1,indicated_by_above_normal_level_of,PUBCHEM:1551553,AN6165-1,PUBCHEM,Compound,1551553
17868,BIOMARKER:AN6165-1,indicated_by_above_normal_level_of,PUBCHEM:12284,AN6165-1,PUBCHEM,Compound,12284
17869,BIOMARKER:AN6165-1,indicated_by_above_normal_level_of,PUBCHEM:71464474,AN6165-1,PUBCHEM,Compound,71464474
17871,BIOMARKER:AN6165-1,indicated_by_above_normal_level_of,PUBCHEM:71296139,AN6165-1,PUBCHEM,Compound,71296139
17875,BIOMARKER:AN6166-1,indicated_by_above_normal_level_of,PUBCHEM:169485,AN6166-1,PUBCHEM,Compound,169485
17880,BIOMARKER:AN6167-1,indicated_by_above_normal_level_of,PUBCHEM:160471,AN6167-1,PUBCHEM,Compound,160471
17882,BIOMARKER:AN6167-1,indicated_by_above_normal_level_of,PUBCHEM:71464477,AN6167-1,PUBCHEM,Compound,71464477
17885,BIOMARKER:AN6167-1,indicated_by_above_normal_level_of,PUBCHEM:6441567,AN6167-1,PUBCHEM,Compound,6441567


Now we will go through the nodes dataframe and get the labels of each node using the biomarker id. Below are all of the different label-finding methods that differ depending on the  entity type.

In [41]:
def get_label_biomarker(biomarker_id):
    response = get_response_api(biomarker_id)
    ind = response.find("{") + 3
    response = response[ind:]
    ind = response.find(":") + 3
    response = response[ind:]
    backind = response.find(",")
    response = response[:backind-1]
    return response

In [66]:
def get_label_uberon(biomarker_id):
    response = get_response_api(biomarker_id)
    ind = response.find("specimen")
    response = response[ind:]
    ind = response.find("name")
    response = response[ind+len("name")+4:]
    ind = response.find(",")
    response = response[:ind-1]
    return response

In [120]:
def get_label_doid(biomarker_id, doid_id):
    response = get_response_api(biomarker_id)
    ind = response.find(doid_id)
    response = response[ind:]
    ind = response.find("id")
    response = response[ind+len("id"):]
    ind = response.find("name")
    response = response[ind+len("name")+4:]
    ind = response.find("description")
    response = response[:ind-4]
    return response

In [92]:
def get_label_dbsnp(biomarker_id):
    response = get_response_api(biomarker_id)
    ind = response.find("recommended_name")
    response = response[ind+len("recommended_name") + 4:]
    backind = response.find(",")
    response = response[:backind-1]
    print(biomarker_id, response)
    return response

In [94]:
def get_label_obci(biomarker_id):
    response = get_response_api(biomarker_id)
    ind = response.find("best_biomarker_role")
    response = response[ind+len("best_biomarker_role") + 4:]
    ind = response.find("role")
    response = response[ind+len("role") + 4:]
    backind = response.find("}")
    response = response[:backind-1]
    print(response)
    return response

In [35]:
def get_label_pubchem(biomarker_id, pubchem_id):
    response = get_response_api(biomarker_id)
    endInd = response.find(pubchem_id)
    response = response[:endInd]
    ind = response.rfind("name")
    response = response[ind:]
    response = response[len("name") + 4:]
    ind = response.find(",")
    response = response[:ind-1]
    return response

In [121]:
nodes['label'] = nodes.apply(lambda row: get_label_doid(row['subject_id'], row['object']), axis=1)
display(nodes)

Unnamed: 0,object,subject_id,label
3,DOID:3905,AN3902-1,lung carcinoma
7,DOID:3192,AN3903-1,neurilemmoma
11,DOID:3910,AN3904-1,lung adenocarcinoma
15,DOID:8552,AN3905-1,chronic myeloid leukemia
18,DOID:162,AN3906-1,cancer
...,...,...,...
337692,DOID:4927,AH2481-1,Klatskin's tumor
353736,DOID:0060061,AH6520-1,primary cutaneous T-cell non-Hodgkin lymphoma
400203,DOID:0080916,AD6612-1,erythroleukemia
407208,DOID:0080794,AD8463-1,childhood acute megakaryoblastic leukemia


We want to always name nodes files like so: "node_type.nodes.csv"

If you want to retrieve the labels for PubChem or DOID nodes, run this:

In [40]:

nodes['label'] = nodes.apply(lambda row: get_label_pubchem(row['subject_id'], row['object_id']), axis=1)
nodes = nodes.get(['object', 'label', 'type'])
nodes.to_csv("Compound.nodes.csv", index=False, sep=',')
display(nodes)

Unnamed: 0,object,label,type
17864,PUBCHEM:1662,3-hydroxy-3-methylglutaric acid,Compound
17866,PUBCHEM:69362,3-hydroxy-isovaleric acid,Compound
17867,PUBCHEM:1551553,3-methylglutaconic acid,Compound
17868,PUBCHEM:12284,3-methylglutaric acid,Compound
17869,PUBCHEM:71464474,C5-OH acylcarnitine,Compound
17871,PUBCHEM:71296139,C6DC acylcarnitine,Compound
17875,PUBCHEM:169485,3-Methylcrotonylglycine,Compound
17880,PUBCHEM:160471,2-methyl-3-hydroxybutyric acid,Compound
17882,PUBCHEM:71464477,3-hydroxybutyrylcarnitine,Compound
17885,PUBCHEM:6441567,Tiglylglycine,Compound


If you want to get biomarker labels, run this:

In [None]:
nodes['label'] = nodes['subject_id'].apply(get_label_biomarker)
nodes = nodes.get(['subject', 'label', 'type'])
# collapses the nodes getting rid of the suffix
nodes['subject'] = nodes['subject'].apply(get_new_id)
nodes = nodes.drop_duplicates()
nodes.to_csv("Biomarker.nodes.csv", index=False, sep=',')
display(nodes)

Otherwise, run this:

In [None]:
nodes['label'] = nodes['subject_id'].apply(get_label_dbsnp)
nodes = nodes.get(['object', 'label', 'type'])
nodes.to_csv("Variant.nodes.csv", index=False, sep=',')
display(nodes)

## Creating Edge Files

All of the edge files have a "subject" of a node from BIOMARKER, a relation, and a "object" of a node from another resource. We will separate edges by the resource of the object.

In [158]:
edges = pd.read_csv("OWLNETS_edgelist.txt", sep='\t')
edges['resource'] = edges['object'].apply(get_prefix)
edges['type'] = edges['resource'].apply(get_type)
display(edges)

Unnamed: 0,subject,predicate,object,resource,type
0,BIOMARKER:AN3902-1,indicated_by_presence_of,DBSNP:rs7785013,DBSNP,Variant
1,BIOMARKER:AN3902-1,determined_using_sample_from,UBERON:0000178,UBERON,Anatomy
2,BIOMARKER:AN3902-1,has_BEST_classification,OBCI:0000008,OBCI,Role
3,BIOMARKER:AN3902-1,indicates_risk_of_developing,DOID:3905,DOID,Condition
4,BIOMARKER:AN3903-1,indicated_by_presence_of,DBSNP:rs73159014,DBSNP,Variant
...,...,...,...,...,...
546644,BIOMARKER:AM9674-1,has_BEST_classification,OBCI:0000008,OBCI,Role
546645,BIOMARKER:AM9674-1,indicates_risk_of_developing,DOID:1324,DOID,Condition
546646,BIOMARKER:AM9675-1,indicated_by_presence_of,DBSNP:rs1338696489,DBSNP,Variant
546647,BIOMARKER:AM9675-1,has_BEST_classification,OBCI:0000008,OBCI,Role


We will have edge files that look like "Biomarker.{relation}_{object node type}.{object node type}.edges.csv"

In [22]:
list_types = ((edges.type).unique())
print(list_types)

['Variant' 'Anatomy' 'Role' 'Condition' 'Compound' 'Gene']


First separate edges by resource name.

In [156]:
dbsnp = edges[edges["resource"] == "DBSNP"]
uberon = edges[edges["resource"] == "UBERON"]
obci = edges[edges["resource"] == "OBCI"]
doid = edges[edges["resource"] == "DOID"]
pubchem = edges[edges["resource"] == "PUBCHEM"]
print(len(dbsnp) + len(uberon) + len(obci) + len(doid) + len(pubchem))

546648


(Optional) Creating a table of edges per node type to summarize data

In [47]:
table = pd.DataFrame(columns = ["node type", "edges"])
table = pd.concat([pd.DataFrame([["Variant", len(dbsnp.index)]], columns=table.columns), table], ignore_index=True)
table = pd.concat([pd.DataFrame([["Anatomy", len(uberon.index)]], columns=table.columns), table], ignore_index=True)
table = pd.concat([pd.DataFrame([["Role", len(obci.index)]], columns=table.columns), table], ignore_index=True)
table = pd.concat([pd.DataFrame([["Condition", len(doid.index)]], columns=table.columns), table], ignore_index=True)
table = pd.concat([pd.DataFrame([["Compound", len(pubchem.index)]], columns=table.columns), table], ignore_index=True)
table.to_csv("edge_counts.csv", index=False, sep=",")

For each target entity type, we want to create files containing the edges between biomarkers and those entities. Some target entity types have multiple different relations with biomarkers (ex. prognostic for, diagnostic for), so we have to create multiple dataframes for each individual relation type.

In [157]:
doid = doid.get(["subject", "predicate", "object"])
doid['subject'] = doid['subject'].apply(get_new_id)

# gets rid of the space in front of "indicates", can delete this line
doid1 = doid[doid['predicate'] == ' indicates_risk_of_developing']
doid1['predicate'] ='indicates_risk_of_developing'
doid2 = doid[doid['predicate'] == 'prognostic_for']
doid3 = doid[doid['predicate'] == 'diagnostic_for']

display(doid1)
display(doid2)
display(doid3)

# Name the edge files in the format "Biomarker.relation.target_type.edges.csv".
doid1.to_csv("all_csv/Condition/Biomarker.indicates_risk_of_developing.Condition.edges.csv", index=False, sep=',')
doid2.to_csv("all_csv/Condition/Biomarker.prognostic_for.Condition.edges.csv", index=False, sep=',')
doid3.to_csv("all_csv/Condition/Biomarker.diagnostic_for.Condition.edges.csv", index=False, sep=',')

Unnamed: 0,subject,predicate,object
3,BIOMARKER:AN3902,indicates_risk_of_developing,DOID:3905
7,BIOMARKER:AN3903,indicates_risk_of_developing,DOID:3192
11,BIOMARKER:AN3904,indicates_risk_of_developing,DOID:3910
15,BIOMARKER:AN3905,indicates_risk_of_developing,DOID:8552
18,BIOMARKER:AN3906,indicates_risk_of_developing,DOID:162
...,...,...,...
546636,BIOMARKER:AM9671,indicates_risk_of_developing,DOID:999
546639,BIOMARKER:AM9672,indicates_risk_of_developing,DOID:999
546642,BIOMARKER:AM9673,indicates_risk_of_developing,DOID:3672
546645,BIOMARKER:AM9674,indicates_risk_of_developing,DOID:1324


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doid1['predicate'] ='indicates_risk_of_developing'


Unnamed: 0,subject,predicate,object
3,BIOMARKER:AN3902,indicates_risk_of_developing,DOID:3905
7,BIOMARKER:AN3903,indicates_risk_of_developing,DOID:3192
11,BIOMARKER:AN3904,indicates_risk_of_developing,DOID:3910
15,BIOMARKER:AN3905,indicates_risk_of_developing,DOID:8552
18,BIOMARKER:AN3906,indicates_risk_of_developing,DOID:162
...,...,...,...
546636,BIOMARKER:AM9671,indicates_risk_of_developing,DOID:999
546639,BIOMARKER:AM9672,indicates_risk_of_developing,DOID:999
546642,BIOMARKER:AM9673,indicates_risk_of_developing,DOID:3672
546645,BIOMARKER:AM9674,indicates_risk_of_developing,DOID:1324


Unnamed: 0,subject,predicate,object
17991,BIOMARKER:AN4559,prognostic_for,DOID:1612
17999,BIOMARKER:AN4561,prognostic_for,DOID:1612
18007,BIOMARKER:AN4565,prognostic_for,DOID:2394
18117,BIOMARKER:AN4610,prognostic_for,DOID:0080650
18151,BIOMARKER:AN4624,prognostic_for,DOID:0080650
...,...,...,...
22897,BIOMARKER:AN6151,prognostic_for,DOID:9119
22900,BIOMARKER:AN6152,prognostic_for,DOID:9119
22906,BIOMARKER:AN6154,prognostic_for,DOID:3908
22926,BIOMARKER:AN6163,prognostic_for,DOID:0081252


Unnamed: 0,subject,predicate,object
17879,BIOMARKER:AN6166,diagnostic_for,DOID:0050710
17899,BIOMARKER:AN6169,diagnostic_for,DOID:14753
17918,BIOMARKER:AN6172,diagnostic_for,DOID:14701
17923,BIOMARKER:AN6173,diagnostic_for,DOID:0080155
17930,BIOMARKER:AN6174,diagnostic_for,DOID:14755
...,...,...,...
22918,BIOMARKER:AN6159,diagnostic_for,DOID:0081252
22920,BIOMARKER:AN6160,diagnostic_for,DOID:0081252
22922,BIOMARKER:AN6161,diagnostic_for,DOID:0081252
22924,BIOMARKER:AN6162,diagnostic_for,DOID:0081252
