# 04-filter_biolink
 - Filter specific domain and ranges for: CAUSES, LOCATION_OF, TREATS, PREDISPOSES, PREVENTS
 - rename 'converts_to' edge to 'derives_into'
 - rename 'isa' edge to 'subclass of'
 - rename 'disrupts' edge to 'affects'
 - rename 'associated_with' edge to 'related_to'
 - rename 'STIMULATES' edge to 'positively_regulates'
 - rename 'INHIBITS' edge to 'negatively_regulates'
 - associated_with/related_to edges with domain: gene, range: disease; rename to gene_associated_with_condition\

#### Note: this filtering step may be mildly confused by edges with nodes which have multiple Biolink category types(?)
 

In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
from semmeddb_biolink_environment import *

In [2]:
edges = pd.read_csv(EDGES_FILTERED_TSV, sep='\t')
nodes = pd.read_csv(NODES_FILTERED_TSV, sep='\t')
node_category = dict(zip(nodes.ID, nodes.blm_category))

In [3]:
nodes.blm_category.value_counts()

MolecularEntity                    68031
OrganismTaxon                      49722
Disease                            35788
NucleicAcidEntity                  26978
Polypeptide                        23979
Procedure                          15488
DiseaseOrPhenotypicFeature         12626
GrossAnatomicalStructure           10880
Drug                                6185
PhysiologicalProcess                5692
Device                              4941
MolecularActivity                   4541
AnatomicalEntity                    3554
Activity                            2952
PhenotypicFeature                   2805
InformationContentEntity            2681
CellularComponent                   2535
Phenomenon                          1596
Cell                                1555
Cohort                              1461
SmallMolecule                       1392
Behavior                            1229
Agent                               1163
Food                                 833
PopulationOfIndi

In [4]:
edges.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID,NEG
0,C1412045,AFFECTS,C0005935,29798367,False
1,C1412045,AFFECTS,C0020291,6298464,False
2,C1412045,AFFECTS,C0028754,19789049,False
3,C1412045,AFFECTS,C0036421,31505074;31505074,False
4,C1412045,AFFECTS,C0597304,1409557,False


In [5]:
# generate domain and range from the biolink node category (may be multiple?)
edges['bl_domain'] = edges.SUBJECT_CUI.apply(node_category.get)
edges['bl_pred'] = edges.PREDICATE.apply(lambda p: p.lower())
edges['bl_range'] = edges.OBJECT_CUI.apply(node_category.get)
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
edges.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID,NEG,bl_domain,bl_pred,bl_range,bl_type
0,C1412045,AFFECTS,C0005935,29798367,False,NucleicAcidEntity,affects,PhysiologicalProcess,NucleicAcidEntity.affects.PhysiologicalProcess
1,C1412045,AFFECTS,C0020291,6298464,False,NucleicAcidEntity,affects,Phenomenon,NucleicAcidEntity.affects.Phenomenon
2,C1412045,AFFECTS,C0028754,19789049,False,NucleicAcidEntity,affects,Disease,NucleicAcidEntity.affects.Disease
3,C1412045,AFFECTS,C0036421,31505074;31505074,False,NucleicAcidEntity,affects,Disease,NucleicAcidEntity.affects.Disease
4,C1412045,AFFECTS,C0597304,1409557,False,NucleicAcidEntity,affects,MolecularActivity,NucleicAcidEntity.affects.MolecularActivity


In [6]:
# drop edges unmapped to a bl_type
edges.dropna(subset=['bl_type'], inplace=True)
edges.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID,NEG,bl_domain,bl_pred,bl_range,bl_type
0,C1412045,AFFECTS,C0005935,29798367,False,NucleicAcidEntity,affects,PhysiologicalProcess,NucleicAcidEntity.affects.PhysiologicalProcess
1,C1412045,AFFECTS,C0020291,6298464,False,NucleicAcidEntity,affects,Phenomenon,NucleicAcidEntity.affects.Phenomenon
2,C1412045,AFFECTS,C0028754,19789049,False,NucleicAcidEntity,affects,Disease,NucleicAcidEntity.affects.Disease
3,C1412045,AFFECTS,C0036421,31505074;31505074,False,NucleicAcidEntity,affects,Disease,NucleicAcidEntity.affects.Disease
4,C1412045,AFFECTS,C0597304,1409557,False,NucleicAcidEntity,affects,MolecularActivity,NucleicAcidEntity.affects.MolecularActivity


In [7]:
edges.rename(columns={'PREDICATE': 'SEMMED_PRED'}, inplace=True)

In [8]:
edges.bl_pred.value_counts()

location_of         3308116
interacts_with      2536607
affects             2355724
coexists_with       2113139
treats              1517832
stimulates          1437115
part_of             1371158
inhibits            1219056
associated_with     1146716
causes              1066883
disrupts             658150
produces             540309
predisposes          378655
isa                  372949
prevents             250853
measures             237404
precedes             169184
converts_to           53282
manifestation_of      45009
measurement_of         1166
Name: bl_pred, dtype: int64

In [9]:
len(set(edges.bl_pred))

20

In [10]:
# get the most common domain and ranges for each pred
pred_type_count = edges.groupby("bl_pred").bl_type.value_counts()

In [11]:
pred_type_count['prevents'][:30]

bl_type
MolecularEntity.prevents.Disease                         83665
Procedure.prevents.Disease                               49202
NucleicAcidEntity.prevents.Disease                       24531
Polypeptide.prevents.Disease                             24280
Drug.prevents.Disease                                    15220
Activity.prevents.Disease                                 8623
Device.prevents.Disease                                   7848
MolecularEntity.prevents.PhenotypicFeature                6902
Procedure.prevents.PhenotypicFeature                      6360
Activity.prevents.DiseaseOrPhenotypicFeature              3900
Procedure.prevents.DiseaseOrPhenotypicFeature             2293
MolecularEntity.prevents.DiseaseOrPhenotypicFeature       2170
Activity.prevents.PhenotypicFeature                       1425
Drug.prevents.PhenotypicFeature                           1410
SmallMolecule.prevents.Disease                            1268
Device.prevents.PhenotypicFeature              

In [12]:
# working off https://docs.google.com/spreadsheets/d/1zXitcR1QjHyh6WocukgshSR7IoAVg7MJQG-HNh96Jec/edit#gid=579577728
# TODO: Not sure if this needs to be reviewed relative to Biolink 2.0
allowed_domain_range = {
    'causes': (None, {'BiologicalProcessOrActivity', 'DiseaseOrPhenotypicFeature'}),
    'location_of': ({'GrossAnatomicalStructure', 'AnatomicalEntity', 'CellularComponent', 'Cell'}, None),
    'treats': (None, {'DiseaseOrPhenotypicFeature'}),
    'predisposes': (None, {'DiseaseOrPhenotypicFeature'}),
    'prevents': (None, {'DiseaseOrPhenotypicFeature'}),
}

In [13]:
def is_allowed_edge(domain, pred, rnge):
    allowed_domain, allowed_range = allowed_domain_range[pred] if pred in allowed_domain_range else (None, None)
    return ((domain in allowed_domain if allowed_domain else True) and 
              (rnge in allowed_range if allowed_range else True))

In [14]:
d = {x: is_allowed_edge(*x.split(".")) for x in set(edges.bl_type)}
allowed_edges = {k for k,v in d.items() if v}

In [15]:
idx = edges.bl_type.isin(allowed_edges)
print(idx.value_counts())
edges = edges[idx]

True     16981175
False     3798132
Name: bl_type, dtype: int64


In [16]:
print(len(edges))

16981175


In [17]:
edges.head()

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_domain,bl_pred,bl_range,bl_type
0,C1412045,AFFECTS,C0005935,29798367,False,NucleicAcidEntity,affects,PhysiologicalProcess,NucleicAcidEntity.affects.PhysiologicalProcess
1,C1412045,AFFECTS,C0020291,6298464,False,NucleicAcidEntity,affects,Phenomenon,NucleicAcidEntity.affects.Phenomenon
2,C1412045,AFFECTS,C0028754,19789049,False,NucleicAcidEntity,affects,Disease,NucleicAcidEntity.affects.Disease
3,C1412045,AFFECTS,C0036421,31505074;31505074,False,NucleicAcidEntity,affects,Disease,NucleicAcidEntity.affects.Disease
4,C1412045,AFFECTS,C0597304,1409557,False,NucleicAcidEntity,affects,MolecularActivity,NucleicAcidEntity.affects.MolecularActivity


In [18]:
# rename 'converts_to' edge to 'derives_into'
edges.loc[lambda df: df['bl_pred'] == "converts_to", "bl_pred"] = "derives_into"

In [19]:
# rename 'isa' edge to 'subclass of'
edges.loc[lambda df: df['bl_pred'] == "isa", "bl_pred"] = "subclass_of"

In [20]:
# rename 'disrupts' edge to 'affects'
edges.loc[lambda df: df['bl_pred'] == "disrupts", "bl_pred"] = "affects"

In [21]:
# rename 'associated_with' edge to 'related_to'
edges.loc[lambda df: df['bl_pred'] == "associated_with", "bl_pred"] = "related_to"

In [22]:
# rename 'STIMULATES' edge to 'entity_positively_regulates_entity'
edges.loc[lambda df: df['bl_pred'] == "stimulates", "bl_pred"] = "entity_positively_regulates_entity"

In [23]:
# rename 'INHIBITS' edge to 'entity_negatively_regulates_entity'
edges.loc[lambda df: df['bl_pred'] == "inhibits", "bl_pred"] = "entity_negatively_regulates_entity"

In [24]:
# associated_with/related_to edges with domain: gene, range: disease; rename to gene_associated_with_condition, 
# and leave the rest alone
edges.loc[lambda df:(df['bl_pred'] == "related_to") & (df['bl_domain'] == "Gene") & 
      (df['bl_range'] == "DiseaseOrPhenotypicFeature"), "bl_pred"] = 'gene_associated_with_condition'

In [25]:
edges.bl_pred.value_counts()

affects                               3013874
interacts_with                        2536607
location_of                           2442878
coexists_with                         2113139
entity_positively_regulates_entity    1437115
part_of                               1371158
entity_negatively_regulates_entity    1219056
related_to                            1146716
produces                               540309
subclass_of                            372949
measures                               237404
precedes                               169184
treats                                 160019
causes                                  87507
derives_into                            53282
manifestation_of                        45009
predisposes                             23859
prevents                                 9944
measurement_of                           1166
Name: bl_pred, dtype: int64

In [26]:
edges[(edges.bl_domain == "Gene") & (edges.bl_range == "DiseaseOrPhenotypicFeature")].bl_pred.value_counts()

Series([], Name: bl_pred, dtype: int64)

In [27]:
print(len(nodes))
nodes = nodes[nodes.ID.isin(set(list(edges['SUBJECT_CUI']) + list(edges['OBJECT_CUI'])))]
print(len(nodes))

291339
268561


In [28]:
## summary
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
print(len(edges))
print(len(nodes))
print(len(set(edges.bl_type)))
print(len(set(edges.bl_pred)))
print(len(set(nodes.blm_category)))

16981175
268561
4328
19
37


In [29]:
del edges['bl_type']
del edges['bl_domain']
del edges['bl_range']

In [30]:
edges.head()

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_pred
0,C1412045,AFFECTS,C0005935,29798367,False,affects
1,C1412045,AFFECTS,C0020291,6298464,False,affects
2,C1412045,AFFECTS,C0028754,19789049,False,affects
3,C1412045,AFFECTS,C0036421,31505074;31505074,False,affects
4,C1412045,AFFECTS,C0597304,1409557,False,affects


In [31]:
edges.to_csv(EDGES_BIOLINK_TSV, sep='\t', index=None)
nodes.to_csv(NODES_BIOLINK_TSV, sep='\t', index=None)