# 04-filter_biolink
 - Filter specific domain and ranges for: CAUSES, LOCATION_OF, TREATS, PREDISPOSES, PREVENTS
 - rename 'converts_to' edge to 'derives_into'
 - rename 'isa' edge to 'subclass of'
 - rename 'disrupts' edge to 'affects'
 - rename 'associated_with' edge to 'related_to'
 - rename 'STIMULATES' edge to 'positively_regulates'
 - rename 'INHIBITS' edge to 'negatively_regulates'
 - associated_with/related_to edges with domain: gene, range: disease; rename to gene_associated_with_condition\

#### Note: this filtering step may be mildly confused by edges with nodes which have multiple Biolink category types(?)
 

In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import re
from collections import defaultdict, Counter
from tqdm import tqdm

In [2]:
DATA  = 'data/'
EDGES_FILTERED_CSV = DATA+"edges_filtered.csv"
NODES_FILTERED_CSV = DATA+"nodes_filtered.csv"
EDGES_BIOLINK_CSV = DATA+"edges_biolink.csv"
NODES_BIOLINK_CSV = DATA+"nodes_biolink.csv"

In [3]:
edges = pd.read_csv(EDGES_FILTERED_CSV)
nodes = pd.read_csv(NODES_FILTERED_CSV)
node_category = dict(zip(nodes.ID, nodes.blm_category))

In [4]:
nodes.blm_category.value_counts()

ChemicalSubstance              57100
Protein                        27347
DiseaseOrPhenotypicFeature     26643
Gene                           23741
GrossAnatomicalStructure        8211
BiologicalProcessOrActivity     7279
AnatomicalEntity                2275
CellComponent                   1673
Cell                            1193
ActivityAndBehavior              845
PhenotypicFeature                281
GenomicEntity                    171
Name: blm_category, dtype: int64

In [5]:
edges.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID,NEG
0,C1412045,AFFECTS,C0005935,29798367,False
1,C1412045,AFFECTS,C0028754,19789049,False
2,C1412045,AFFECTS,C0036421,31505074;31505074,False
3,C1412045,AFFECTS,C0597304,1409557,False
4,C1412045,AFFECTS,C0599816,7617239,False


In [6]:
# generate domain and range from the biolink node category (may be multiple?)
edges['bl_domain'] = edges.SUBJECT_CUI.apply(node_category.get)
edges['bl_pred'] = edges.PREDICATE
edges['bl_range'] = edges.OBJECT_CUI.apply(node_category.get)
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
edges.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID,NEG,bl_domain,bl_pred,bl_range,bl_type
0,C1412045,AFFECTS,C0005935,29798367,False,Gene,AFFECTS,BiologicalProcessOrActivity,Gene.AFFECTS.BiologicalProcessOrActivity
1,C1412045,AFFECTS,C0028754,19789049,False,Gene,AFFECTS,DiseaseOrPhenotypicFeature,Gene.AFFECTS.DiseaseOrPhenotypicFeature
2,C1412045,AFFECTS,C0036421,31505074;31505074,False,Gene,AFFECTS,DiseaseOrPhenotypicFeature,Gene.AFFECTS.DiseaseOrPhenotypicFeature
3,C1412045,AFFECTS,C0597304,1409557,False,Gene,AFFECTS,BiologicalProcessOrActivity,Gene.AFFECTS.BiologicalProcessOrActivity
4,C1412045,AFFECTS,C0599816,7617239,False,Gene,AFFECTS,BiologicalProcessOrActivity,Gene.AFFECTS.BiologicalProcessOrActivity


In [7]:
edges.rename(columns={'PREDICATE': 'SEMMED_PRED'}, inplace=True)

In [8]:
edges.bl_pred.value_counts()

INTERACTS_WITH      2220070
LOCATION_OF         2105454
COEXISTS_WITH       1780772
AFFECTS             1614278
STIMULATES          1307942
INHIBITS            1106486
ASSOCIATED_WITH      939793
CAUSES               759205
PART_OF              751794
DISRUPTS             534850
TREATS               484268
PRODUCES             480627
PREDISPOSES          269523
ISA                  237589
PREVENTS             145866
PRECEDES              52602
CONVERTS_TO           48751
MANIFESTATION_OF      36672
MEASURES               1048
MEASUREMENT_OF           38
PREP                      3
Name: bl_pred, dtype: int64

In [9]:
len(set(edges.bl_pred))

21

In [10]:
# get the most common domain and ranges for each pred
pred_type_count = edges.groupby("bl_pred").bl_type.value_counts()

In [11]:
pred_type_count['PREVENTS'][:30]

bl_type
ChemicalSubstance.PREVENTS.DiseaseOrPhenotypicFeature              86698
Protein.PREVENTS.DiseaseOrPhenotypicFeature                        34009
Gene.PREVENTS.DiseaseOrPhenotypicFeature                           20702
ActivityAndBehavior.PREVENTS.DiseaseOrPhenotypicFeature             2870
ChemicalSubstance.PREVENTS.BiologicalProcessOrActivity               339
DiseaseOrPhenotypicFeature.PREVENTS.DiseaseOrPhenotypicFeature       302
ChemicalSubstance.PREVENTS.ActivityAndBehavior                       262
Protein.PREVENTS.BiologicalProcessOrActivity                         178
Gene.PREVENTS.BiologicalProcessOrActivity                            121
BiologicalProcessOrActivity.PREVENTS.DiseaseOrPhenotypicFeature      109
Protein.PREVENTS.ActivityAndBehavior                                  75
Gene.PREVENTS.ActivityAndBehavior                                     70
PhenotypicFeature.PREVENTS.DiseaseOrPhenotypicFeature                 60
GenomicEntity.PREVENTS.DiseaseOrPhenotypicF

In [12]:
# working off https://docs.google.com/spreadsheets/d/1zXitcR1QjHyh6WocukgshSR7IoAVg7MJQG-HNh96Jec/edit#gid=579577728
allowed_domain_range = {
    'CAUSES': (None, {'biological_process_or_activity', 'disease_or_phenotypic_feature'}),
    'LOCATION_OF': ({'gross_anatomical_structure', 'anatomical_entity', 'cell_component', 'cell'}, None),
    'TREATS': (None, {'disease_or_phenotypic_feature'}),
    'PREDISPOSES': (None, {'disease_or_phenotypic_feature'}),
    'PREVENTS': (None, {'disease_or_phenotypic_feature'}),
}

In [13]:
def is_allowed_edge(domain, pred, rnge):
    allowed_domain, allowed_range = allowed_domain_range[pred] if pred in allowed_domain_range else (None, None)
    return ((domain in allowed_domain if allowed_domain else True) and 
              (rnge in allowed_range if allowed_range else True))

In [14]:
d = {x: is_allowed_edge(*x.split(".")) for x in set(edges.bl_type)}
allowed_edges = {k for k,v in d.items() if v}

In [15]:
idx = edges.bl_type.isin(allowed_edges)
print(idx.value_counts())
edges = edges[idx]

True     11113315
False     3764316
Name: bl_type, dtype: int64


In [16]:
print(len(edges))

11113315


In [17]:
edges.head()

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_domain,bl_pred,bl_range,bl_type
0,C1412045,AFFECTS,C0005935,29798367,False,Gene,AFFECTS,BiologicalProcessOrActivity,Gene.AFFECTS.BiologicalProcessOrActivity
1,C1412045,AFFECTS,C0028754,19789049,False,Gene,AFFECTS,DiseaseOrPhenotypicFeature,Gene.AFFECTS.DiseaseOrPhenotypicFeature
2,C1412045,AFFECTS,C0036421,31505074;31505074,False,Gene,AFFECTS,DiseaseOrPhenotypicFeature,Gene.AFFECTS.DiseaseOrPhenotypicFeature
3,C1412045,AFFECTS,C0597304,1409557,False,Gene,AFFECTS,BiologicalProcessOrActivity,Gene.AFFECTS.BiologicalProcessOrActivity
4,C1412045,AFFECTS,C0599816,7617239,False,Gene,AFFECTS,BiologicalProcessOrActivity,Gene.AFFECTS.BiologicalProcessOrActivity


In [18]:
# rename 'converts_to' edge to 'derives_into'
edges.loc[lambda df: df['bl_pred'] == "CONVERTS_TO", "bl_pred"] = "DERIVES_INTO"

In [19]:
# rename 'isa' edge to 'subclass of'
edges.loc[lambda df: df['bl_pred'] == "ISA", "bl_pred"] = "SUBCLASS_OF"

In [20]:
# rename 'disrupts' edge to 'affects'
edges.loc[lambda df: df['bl_pred'] == "DISRUPTS", "bl_pred"] = "AFFECTS"

In [21]:
# rename 'associated_with' edge to 'related_to'
edges.loc[lambda df: df['bl_pred'] == "ASSOCIATED_WITH", "bl_pred"] = "RELATED_TO"

In [22]:
# rename 'STIMULATES' edge to 'positively_regulates'
edges.loc[lambda df: df['bl_pred'] == "STIMULATES", "bl_pred"] = "positively_regulates".upper()

In [23]:
# rename 'INHIBITS' edge to 'negatively_regulates'
edges.loc[lambda df: df['bl_pred'] == "INHIBITS", "bl_pred"] = "negatively_regulates".upper()

In [24]:
# associated_with/related_to edges with domain: gene, range: disease; rename to gene_associated_with_condition, 
# and leave the rest alone
edges.loc[lambda df:(df['bl_pred'] == "RELATED_TO") & (df['bl_domain'] == "gene") & 
      (df['bl_range'] == "disease_or_phenotypic_feature"), "bl_pred"] = 'gene_associated_with_condition'.upper()

In [25]:
edges.bl_pred.value_counts()

INTERACTS_WITH          2220070
AFFECTS                 2149128
COEXISTS_WITH           1780772
POSITIVELY_REGULATES    1307942
NEGATIVELY_REGULATES    1106486
RELATED_TO               939793
PART_OF                  751794
PRODUCES                 480627
SUBCLASS_OF              237589
PRECEDES                  52602
DERIVES_INTO              48751
MANIFESTATION_OF          36672
MEASURES                   1048
MEASUREMENT_OF               38
PREP                          3
Name: bl_pred, dtype: int64

In [26]:
edges[(edges.bl_domain == "gene") & (edges.bl_range == "disease_or_phenotypic_feature")].bl_pred.value_counts()

Series([], Name: bl_pred, dtype: int64)

In [27]:
print(len(nodes))
nodes = nodes[nodes.ID.isin(set(list(edges['SUBJECT_CUI']) + list(edges['OBJECT_CUI'])))]
print(len(nodes))

156759
145058


In [28]:
## summary
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
print(len(edges))
print(len(nodes))
print(len(set(edges.bl_type)))
print(len(set(edges.bl_pred)))
print(len(set(nodes.blm_category)))

11113315
145058
877
15
12


In [29]:
del edges['bl_type']
del edges['bl_domain']
del edges['bl_range']

In [30]:
edges.head()

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_pred
0,C1412045,AFFECTS,C0005935,29798367,False,AFFECTS
1,C1412045,AFFECTS,C0028754,19789049,False,AFFECTS
2,C1412045,AFFECTS,C0036421,31505074;31505074,False,AFFECTS
3,C1412045,AFFECTS,C0597304,1409557,False,AFFECTS
4,C1412045,AFFECTS,C0599816,7617239,False,AFFECTS


In [31]:
edges.to_csv(EDGES_BIOLINK_CSV, index=None)
nodes.to_csv(NODES_BIOLINK_CSV, index=None)