# 03-filter_nodes_edges.ipynb
- Remove edges whose subject or object node has no UMLS type or label
- Remove edges with any of the following predicates: ['compared_with', 'higher_than', 'lower_than', 'different_from', 'different_than', 
'same_as','OCCURS_IN', 'PROCESS_OF', 'DIAGNOSES', 'METHOD_OF', 'USES','AUGMENTS', 'ADMINISTERED_TO', 'COMPLICATES']
- Remove nodes that are left with no edges after the steps above

In [1]:
import os
import pickle
%pylab
%matplotlib inline
import pandas as pd
import seaborn as sns
import re
from collections import defaultdict
from tqdm import tqdm
from semmed_biolink_environment import *

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the edge table from its TSV file; the first column becomes the index.
edges = pd.read_csv(EDGES4_TSV, index_col=0, sep='\t')
# Preview the first five rows.
edges.head(5)

  mask |= (ar1 == a)


Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID,NEG
588,C1412045,ADMINISTERED_TO,C0007634,24096582,False
589,C1412045,ADMINISTERED_TO,C0030705,22577025,False
590,C1412045,AFFECTS,C0005935,29798367,False
591,C1412045,AFFECTS,C0020291,6298464,False
592,C1412045,AFFECTS,C0028754,19789049,False


In [3]:
# Load the node table from its TSV file; the first column becomes the index.
nodes = pd.read_csv(NODES_BLM_TSV, index_col=0, sep='\t')
# Preview the first five rows.
nodes.head(5)

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
0,C1516083,Attachment Plaque,T026,Cell Component,CellularComponent
1,C0656503,"4-nonyl-3,5-diethoxycarbonyl-1,4-dihydro-2,6-d...",T121|T109,Organic Chemical|Pharmacologic Substance,ChemicalSubstance
3,C0234143,Neurological muscle weakness,T047,Disease or Syndrome,DiseaseOrPhenotypicFeature
5,C0861091,Methanol increased,T033,Finding,DiseaseOrPhenotypicFeature
6,C0074127,scarlet red,T121|T109|T130,Organic Chemical|Pharmacologic Substance|Indic...,ChemicalSubstance


In [4]:
# Report the starting row counts: nodes first, then edges, one per line.
print(len(nodes), len(edges), sep='\n')

185493
23932681


In [5]:
# keep only edges whose subject AND object CUIs both appear in the node table
print(len(edges))  # edge count before filtering
edges = edges.loc[
    edges['SUBJECT_CUI'].isin(nodes['ID'])
    & edges['OBJECT_CUI'].isin(nodes['ID'])
]
print(len(edges))  # edge count after filtering

23932681
16529669


In [6]:
# Count edges per predicate (value_counts sorts most frequent first) and display.
vc = edges['PREDICATE'].value_counts()
vc

INTERACTS_WITH      2229571
LOCATION_OF         2164390
COEXISTS_WITH       1842464
AFFECTS             1644009
STIMULATES          1312678
INHIBITS            1112598
ASSOCIATED_WITH      962351
CAUSES               793698
PART_OF              760353
AUGMENTS             557385
DISRUPTS             542272
TREATS               503437
PRODUCES             483339
compared_with        370958
PREDISPOSES          279729
ISA                  245915
PREVENTS             150134
PROCESS_OF           142358
higher_than          112972
ADMINISTERED_TO       66361
PRECEDES              55586
CONVERTS_TO           49203
COMPLICATES           39838
MANIFESTATION_OF      38284
same_as               24684
lower_than            21574
USES                  15932
DIAGNOSES              5117
OCCURS_IN              1222
MEASURES               1066
METHOD_OF               144
MEASUREMENT_OF           44
PREP                      3
Name: PREDICATE, dtype: int64

In [7]:
def drop_edges(df, drop_edges):
    """Remove, in place, every row of ``df`` whose PREDICATE is in ``drop_edges``.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with a ``PREDICATE`` column. It is modified in place.
    drop_edges : str or iterable of str
        Predicate name(s) to remove. A single string is treated as a
        one-element list. Names that do not occur in ``df`` are ignored.

    Returns
    -------
    None
        The result is the mutation of ``df``; nothing is returned.

    Notes
    -----
    Rows are removed by index label (``DataFrame.drop``), so if ``df`` has
    duplicate index labels, any row sharing a label with a matching row is
    removed as well.
    """
    # NOTE: the parameter name shadows this function inside the body; it is
    # kept unchanged so existing keyword calls continue to work.
    predicates = [drop_edges] if isinstance(drop_edges, str) else list(drop_edges)

    # A direct boolean mask avoids query()'s string parsing and @-lookup.
    matches = df['PREDICATE'].isin(predicates)
    df.drop(df.index[matches], inplace=True)

In [8]:
# drop every edge whose predicate is on the exclusion list below,
# printing the edge count before and after
print(len(edges))
remove_edges = [
    'compared_with',
    'higher_than',
    'lower_than',
    'different_from',
    'different_than',
    'same_as',
    'OCCURS_IN',
    'PROCESS_OF',
    'DIAGNOSES',
    'METHOD_OF',
    'USES',
    'AUGMENTS',
    'ADMINISTERED_TO',
    'COMPLICATES',
]
drop_edges(edges, remove_edges)  # mutates `edges` in place
print(len(edges))

16529669
15171124


In [9]:
# remove nodes with no edges
print(len(nodes))  # node count before pruning
# Collect every CUI that still appears as a subject or an object. The set is
# built directly from the two columns, so no full-length Python lists (or
# their concatenation) are materialised before de-duplication.
nodes = nodes[nodes.ID.isin(set(edges.SUBJECT_CUI).union(edges.OBJECT_CUI))]
print(len(nodes))  # node count after pruning

185493
171636


In [10]:
# Recount edges per predicate after filtering (most frequent first) and display.
vc = edges['PREDICATE'].value_counts()
vc

INTERACTS_WITH      2229571
LOCATION_OF         2164390
COEXISTS_WITH       1842464
AFFECTS             1644009
STIMULATES          1312678
INHIBITS            1112598
ASSOCIATED_WITH      962351
CAUSES               793698
PART_OF              760353
DISRUPTS             542272
TREATS               503437
PRODUCES             483339
PREDISPOSES          279729
ISA                  245915
PREVENTS             150134
PRECEDES              55586
CONVERTS_TO           49203
MANIFESTATION_OF      38284
MEASURES               1066
MEASUREMENT_OF           44
PREP                      3
Name: PREDICATE, dtype: int64

In [11]:
# Write the filtered edge and node tables as TSV, omitting the index column.
edges.to_csv(EDGES_FILTERED_TSV, index=False, sep='\t')
nodes.to_csv(NODES_FILTERED_TSV, index=False, sep='\t')