# 04-filter_biolink
 - Keep only edges with an allowed domain and range for: CAUSES, LOCATION_OF, TREATS, PREDISPOSES, PREVENTS
 - rename 'converts_to' edge to 'derives_into'
 - rename 'isa' edge to 'subclass_of'
 - rename 'disrupts' edge to 'affects'
 - rename 'associated_with' edge to 'related_to'
 - rename 'stimulates' edge to 'positively_regulates'
 - rename 'inhibits' edge to 'negatively_regulates'
 - 'associated_with'/'related_to' edges with domain Gene and range DiseaseOrPhenotypicFeature: rename to 'gene_associated_with_condition'

#### Note: each node is looked up under a single Biolink category here, so this filtering step may mishandle edges whose nodes belong to more than one Biolink category (not yet verified).
 

In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
from semmed_biolink_environment import *

In [2]:
# Load the node and edge tables written by the previous filtering step.
# NOTE(review): the edge table carries an 'Unnamed: 0' column (visible in
# edges.head()), presumably a row index saved by the upstream step - confirm
# whether it should be dropped before it is written out again.
nodes = pd.read_csv(NODES_FILTERED_TSV, sep='\t')
edges = pd.read_csv(EDGES_FILTERED_TSV, sep='\t')
# Lookup table: node ID -> Biolink category (one category per ID).
node_category = dict(zip(nodes['ID'], nodes['blm_category']))

In [3]:
# Number of nodes per Biolink category.
nodes['blm_category'].value_counts()

ChemicalSubstance              58713
DiseaseOrPhenotypicFeature     36971
Protein                        28092
Gene                           23825
GrossAnatomicalStructure        9086
BiologicalProcessOrActivity     7684
AnatomicalEntity                2757
CellularComponent               1732
Cell                            1255
ActivityAndBehavior              882
PhenotypicFeature                466
GenomicEntity                    173
Name: blm_category, dtype: int64

In [4]:
# Preview the first five edges as loaded.
edges.head(5)

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID,NEG
0,C1412045,AFFECTS,C0005935,29798367,False
1,C1412045,AFFECTS,C0028754,19789049,False
2,C1412045,AFFECTS,C0036421,31505074;31505074,False
3,C1412045,AFFECTS,C0597304,1409557,False
4,C1412045,AFFECTS,C0599816,7617239,False


In [5]:
# Annotate every edge with the Biolink category of its subject (domain) and
# object (range) node. node_category holds a single category per node ID, so a
# node with several possible categories is represented by just one of them.
# Series.map with a dict is a vectorised lookup; IDs absent from the dict
# become NaN.
edges['bl_domain'] = edges['SUBJECT_CUI'].map(node_category)
# Vectorised lower-casing; a missing predicate stays NaN instead of raising.
edges['bl_pred'] = edges['PREDICATE'].str.lower()
edges['bl_range'] = edges['OBJECT_CUI'].map(node_category)
# Edge signature "<domain>.<predicate>.<range>", used for the domain/range checks.
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
edges.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID,NEG,bl_domain,bl_pred,bl_range,bl_type
0,C1412045,AFFECTS,C0005935,29798367,False,Gene,affects,BiologicalProcessOrActivity,Gene.affects.BiologicalProcessOrActivity
1,C1412045,AFFECTS,C0028754,19789049,False,Gene,affects,DiseaseOrPhenotypicFeature,Gene.affects.DiseaseOrPhenotypicFeature
2,C1412045,AFFECTS,C0036421,31505074;31505074,False,Gene,affects,DiseaseOrPhenotypicFeature,Gene.affects.DiseaseOrPhenotypicFeature
3,C1412045,AFFECTS,C0597304,1409557,False,Gene,affects,BiologicalProcessOrActivity,Gene.affects.BiologicalProcessOrActivity
4,C1412045,AFFECTS,C0599816,7617239,False,Gene,affects,BiologicalProcessOrActivity,Gene.affects.BiologicalProcessOrActivity


In [6]:
# Keep the original SemMedDB predicate under its own column name.
edges = edges.rename(columns={'PREDICATE': 'SEMMED_PRED'})

In [7]:
# Edge count per lower-cased predicate.
edges['bl_pred'].value_counts()

interacts_with      2229571
location_of         2164390
coexists_with       1842464
affects             1644009
stimulates          1312678
inhibits            1112598
associated_with      962351
causes               793698
part_of              760353
disrupts             542272
treats               503437
produces             483339
predisposes          279729
isa                  245915
prevents             150134
precedes              55586
converts_to           49203
manifestation_of      38284
measures               1066
measurement_of           44
prep                      3
Name: bl_pred, dtype: int64

In [8]:
# Number of distinct predicates.
edges['bl_pred'].nunique(dropna=False)

21

In [9]:
# Per predicate, count how often each "<domain>.<predicate>.<range>" signature
# occurs, so the most common domain/range pairs can be inspected.
pred_type_count = edges.groupby("bl_pred")["bl_type"].value_counts()

In [10]:
# First 30 signature counts for the 'prevents' predicate.
pred_type_count.loc['prevents'].head(30)

bl_type
ChemicalSubstance.prevents.DiseaseOrPhenotypicFeature              89504
Protein.prevents.DiseaseOrPhenotypicFeature                        35038
Gene.prevents.DiseaseOrPhenotypicFeature                           21067
ActivityAndBehavior.prevents.DiseaseOrPhenotypicFeature             2923
ChemicalSubstance.prevents.BiologicalProcessOrActivity               342
DiseaseOrPhenotypicFeature.prevents.DiseaseOrPhenotypicFeature       306
ChemicalSubstance.prevents.ActivityAndBehavior                       262
Protein.prevents.BiologicalProcessOrActivity                         178
Gene.prevents.BiologicalProcessOrActivity                            121
BiologicalProcessOrActivity.prevents.DiseaseOrPhenotypicFeature      113
Protein.prevents.ActivityAndBehavior                                  75
Gene.prevents.ActivityAndBehavior                                     70
PhenotypicFeature.prevents.DiseaseOrPhenotypicFeature                 63
GenomicEntity.prevents.DiseaseOrPhenotypicF

In [11]:
# working off https://docs.google.com/spreadsheets/d/1zXitcR1QjHyh6WocukgshSR7IoAVg7MJQG-HNh96Jec/edit#gid=579577728
# predicate -> (allowed subject categories, allowed object categories).
# None on either side leaves that side unconstrained.
allowed_domain_range = dict(
    causes=(None, {'BiologicalProcessOrActivity', 'DiseaseOrPhenotypicFeature'}),
    location_of=({'GrossAnatomicalStructure', 'AnatomicalEntity', 'CellularComponent', 'Cell'}, None),
    treats=(None, {'DiseaseOrPhenotypicFeature'}),
    predisposes=(None, {'DiseaseOrPhenotypicFeature'}),
    prevents=(None, {'DiseaseOrPhenotypicFeature'}),
)

In [12]:
def is_allowed_edge(domain, pred, rnge, constraints=None):
    """Return True if an edge's domain and range are permitted for its predicate.

    Parameters
    ----------
    domain : str
        Category of the edge's subject node.
    pred : str
        Predicate name, spelled as the keys of the rule table.
    rnge : str
        Category of the edge's object node.
    constraints : dict, optional
        Mapping ``pred -> (allowed_domains, allowed_ranges)``. When omitted,
        the notebook-level ``allowed_domain_range`` table is used, which is
        the original behaviour.

    Returns
    -------
    bool
        False if the predicate restricts the domain (or range) and the given
        value is not in the allowed collection; True otherwise. Predicates
        absent from the table are never restricted.
    """
    if constraints is None:
        constraints = allowed_domain_range
    allowed_domain, allowed_range = constraints.get(pred, (None, None))
    # A falsy entry (None or an empty collection) means "no restriction".
    if allowed_domain and domain not in allowed_domain:
        return False
    if allowed_range and rnge not in allowed_range:
        return False
    return True

In [13]:
# Run the domain/range check once per distinct edge signature instead of once
# per edge, then keep the signatures that pass.
# NOTE(review): a NaN signature (an edge whose node has no category) would fail
# at .split - presumably this cannot occur in the filtered inputs; verify.
d = {
    signature: is_allowed_edge(*signature.split("."))
    for signature in edges['bl_type'].unique()
}
allowed_edges = {signature for signature, allowed in d.items() if allowed}

In [14]:
# Keep only the edges whose signature passed the domain/range check, and report
# how many edges are kept (True) and dropped (False).
idx = edges.bl_type.isin(allowed_edges)
print(idx.value_counts())
# Take an explicit copy: the filtered frame is assigned into with .loc in the
# cells below, which on a bare boolean-mask selection can raise pandas'
# SettingWithCopyWarning.
edges = edges[idx].copy()

True     15055795
False      115329
Name: bl_type, dtype: int64


In [15]:
# Number of edges remaining after the domain/range filter.
print(edges.shape[0])

15055795


In [16]:
# Preview the first five edges after filtering.
edges.head(5)

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_domain,bl_pred,bl_range,bl_type
0,C1412045,AFFECTS,C0005935,29798367,False,Gene,affects,BiologicalProcessOrActivity,Gene.affects.BiologicalProcessOrActivity
1,C1412045,AFFECTS,C0028754,19789049,False,Gene,affects,DiseaseOrPhenotypicFeature,Gene.affects.DiseaseOrPhenotypicFeature
2,C1412045,AFFECTS,C0036421,31505074;31505074,False,Gene,affects,DiseaseOrPhenotypicFeature,Gene.affects.DiseaseOrPhenotypicFeature
3,C1412045,AFFECTS,C0597304,1409557,False,Gene,affects,BiologicalProcessOrActivity,Gene.affects.BiologicalProcessOrActivity
4,C1412045,AFFECTS,C0599816,7617239,False,Gene,affects,BiologicalProcessOrActivity,Gene.affects.BiologicalProcessOrActivity


In [17]:
# Map the 'converts_to' predicate onto 'derives_into'.
edges.loc[edges['bl_pred'] == "converts_to", "bl_pred"] = "derives_into"

In [18]:
# Map the 'isa' predicate onto 'subclass_of'.
edges.loc[edges['bl_pred'] == "isa", "bl_pred"] = "subclass_of"

In [19]:
# Fold the 'disrupts' predicate into 'affects'.
edges.loc[edges['bl_pred'] == "disrupts", "bl_pred"] = "affects"

In [20]:
# Map the 'associated_with' predicate onto 'related_to'.
edges.loc[edges['bl_pred'] == "associated_with", "bl_pred"] = "related_to"

In [21]:
# Map the 'stimulates' predicate (SemMedDB STIMULATES) onto 'positively_regulates'.
edges.loc[edges['bl_pred'] == "stimulates", "bl_pred"] = "positively_regulates"

In [22]:
# Map the 'inhibits' predicate (SemMedDB INHIBITS) onto 'negatively_regulates'.
edges.loc[edges['bl_pred'] == "inhibits", "bl_pred"] = "negatively_regulates"

In [23]:
# 'related_to' edges (formerly 'associated_with') that run from a Gene to a
# DiseaseOrPhenotypicFeature become 'gene_associated_with_condition'; every
# other 'related_to' edge is left unchanged.
edges.loc[
    edges['bl_pred'].eq("related_to")
    & edges['bl_domain'].eq("Gene")
    & edges['bl_range'].eq("DiseaseOrPhenotypicFeature"),
    "bl_pred",
] = 'gene_associated_with_condition'

In [24]:
# Edge count per predicate after the renames.
edges['bl_pred'].value_counts()

interacts_with                    2229571
affects                           2186281
location_of                       2062312
coexists_with                     1842464
positively_regulates              1312678
negatively_regulates              1112598
causes                             787261
part_of                            760353
related_to                         598317
treats                             500524
produces                           483339
gene_associated_with_condition     364034
predisposes                        276915
subclass_of                        245915
prevents                           149047
precedes                            55586
derives_into                        49203
manifestation_of                    38284
measures                             1066
measurement_of                         44
prep                                    3
Name: bl_pred, dtype: int64

In [25]:
# Predicate counts restricted to Gene -> DiseaseOrPhenotypicFeature edges.
edges.loc[
    (edges['bl_domain'] == "Gene") & (edges['bl_range'] == "DiseaseOrPhenotypicFeature"),
    'bl_pred',
].value_counts()

gene_associated_with_condition    364034
affects                           150084
causes                            101421
predisposes                        65249
prevents                           21067
part_of                            19966
treats                             13955
negatively_regulates                 532
produces                              36
positively_regulates                  30
subclass_of                           16
coexists_with                         16
interacts_with                        15
precedes                               7
Name: bl_pred, dtype: int64

In [26]:
# Drop nodes that are no longer the subject or object of any remaining edge,
# printing the node count before and after.
print(len(nodes))
# Build the set of referenced IDs directly from the two columns; this avoids
# materialising and concatenating two full-length Python lists first.
nodes = nodes[nodes.ID.isin(set(edges['SUBJECT_CUI']).union(edges['OBJECT_CUI']))]
print(len(nodes))

171636
171431


In [27]:
## summary
# Rebuild the signature column so it reflects the renamed predicates, then
# print, one per line: edge count, node count, distinct signatures, distinct
# predicates, distinct node categories.
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
print(
    len(edges),
    len(nodes),
    len(set(edges['bl_type'])),
    len(set(edges['bl_pred'])),
    len(set(nodes['blm_category'])),
    sep="\n",
)

15055795
171431
989
21
12


In [28]:
# Remove the helper columns that were only needed for filtering and renaming.
edges = edges.drop(columns=['bl_type', 'bl_domain', 'bl_range'])

In [29]:
# Preview the first five edges in their final column layout.
edges.head(5)

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_pred
0,C1412045,AFFECTS,C0005935,29798367,False,affects
1,C1412045,AFFECTS,C0028754,19789049,False,affects
2,C1412045,AFFECTS,C0036421,31505074;31505074,False,affects
3,C1412045,AFFECTS,C0597304,1409557,False,affects
4,C1412045,AFFECTS,C0599816,7617239,False,affects


In [30]:
# Write the Biolink-normalised edge and node tables as tab-separated files,
# without the pandas row index.
edges.to_csv(EDGES_BIOLINK_TSV, sep='\t', index=False)
nodes.to_csv(NODES_BIOLINK_TSV, sep='\t', index=False)