01-initial_data_clean.ipynb

first run download_convert.sh to download sql file and convert to csv
- Expand predicates with OR operations into individual predicates
- Convert cuis that are entrez ids into cuis
- Change neg props to the same prop with a negative flag
- Make a separate nodes table

In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
from numpy import NAN
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import defaultdict
import itertools
import numpy as np

In [2]:
DATA  = 'data/'
UMLS  = DATA+"2020AA-full/2020AA/META/"
SEMMEDDB_PREDICATION_CSV = DATA+"semmedVER42_2020_R_PREDICATION.csv"
EDGES1_CSV = DATA+"edges1.csv"
EDGES2_CSV = DATA+"edges2.csv"
MRSAT_ARCHIVE = UMLS+"MRSAT.RRF.gz"
MRCONSO_ENG_ARCHIVE = UMLS+"MRCONSO_ENG.RRF.gz"
EDGES3_CSV = DATA+"edges3.csv"
EDGES4_CSV = DATA+"edges4.csv"
NODES1_CSV = DATA+"nodes1.csv"

In [3]:
# read in csv, group rows together with the same subj -> pred -> obj and keep count of number of pmids
# done in chunks
cols = ['SUBJECT_CUI','PREDICATE','OBJECT_CUI','PMID']
gb_cols = ['SUBJECT_CUI','PREDICATE','OBJECT_CUI']
sem_df = pd.DataFrame(columns=cols)
df_iter = pd.read_csv(SEMMEDDB_PREDICATION_CSV, dtype=str, usecols=cols, chunksize=10000000)
for chunk in tqdm(df_iter, total=10):
    chunk.PMID = chunk.PMID.astype("str")
    c = chunk.groupby(gb_cols).PMID.agg(";".join).reset_index()
    sem_df = sem_df.append(c)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [4]:
# group everything together again
print(len(sem_df))
sem_df = sem_df.groupby(gb_cols).PMID.agg(";".join).reset_index()
print(len(sem_df))

35850848
20925800


In [5]:
print(sem_df.PMID.str.count(";").value_counts()[:5])

0    14039141
1     2904485
2     1168860
3      636552
4      396795
Name: PMID, dtype: int64


In [6]:
sem_df.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID
0,1,ADMINISTERED_TO,C0007634,24096582
1,1,AFFECTS,C0005935,29798367
2,1,AFFECTS,C0020291,6298464
3,1,AFFECTS,C0028754,19789049
4,1,AFFECTS,C0036421,31505074;31505074


In [7]:
sem_df.to_csv(EDGES1_CSV)

## In semmedDB some subjects and objects of extracted statements contained the pipe character | as an indicator of multiple concepts in the sentence. Here, we normalize to singular concepts.

In [8]:
# separate out lines with pipes from the rest
multi_start = sem_df['SUBJECT_CUI'].str.contains('|', regex=False)
multi_end = sem_df['OBJECT_CUI'].str.contains('|', regex=False)
pipe_lines = sem_df[multi_start | multi_end].copy()
sem_df = sem_df[~multi_start & ~multi_end]
print('Rows with multiple subjects or objects {:,}'.format(len(pipe_lines)))
print('Rows with only 1 subject AND only 1 object {:,}'.format(len(sem_df)))

Rows with multiple subjects or objects 2,817,756
Rows with only 1 subject AND only 1 object 18,108,044


In [9]:
pipe_lines.SUBJECT_CUI = pipe_lines.SUBJECT_CUI.str.split('|')
pipe_lines.OBJECT_CUI = pipe_lines.OBJECT_CUI.str.split('|')
pipe_lines.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID
26,[1],COEXISTS_WITH,"[C0003241, 4099]",24639825;24639825;24639825
39,[1],INHIBITS,"[C0034802, 1956]",22865653
42,[1],INTERACTS_WITH,"[5295, 5296, 8874, 9139, 23368, 54776]",20502503
45,[1],INTERACTS_WITH,"[C0812252, 1398]",20502503
47,[1],INTERACTS_WITH,"[C1335280, 5781]",20502503


In [10]:
# do the combinations
lines = []
for row in tqdm(pipe_lines.itertuples(), total=len(pipe_lines)):
    #print(row)
    a = [row.SUBJECT_CUI, row.OBJECT_CUI]
    c = list(itertools.product(*a))
    lines.extend([{'SUBJECT_CUI':x[0], 'PREDICATE':row.PREDICATE, 'OBJECT_CUI':x[1], 'PMID': row.PMID} for x in c])
expanded_df = pd.DataFrame(lines)

HBox(children=(FloatProgress(value=0.0, max=2817756.0), HTML(value='')))




In [11]:
print(len(expanded_df))
expanded_df.head()

6859161


Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID
0,1,COEXISTS_WITH,C0003241,24639825;24639825;24639825
1,1,COEXISTS_WITH,4099,24639825;24639825;24639825
2,1,INHIBITS,C0034802,22865653
3,1,INHIBITS,1956,22865653
4,1,INTERACTS_WITH,5295,20502503


In [12]:
# append the expanded rows to the original df
print(len(sem_df))
sem_df = sem_df.append(expanded_df, sort=True)
print(len(sem_df))

18108044
24967205


In [13]:
# Group rows together again and aggregate pmids
sem_df = sem_df.groupby(gb_cols).PMID.agg(";".join).reset_index()
print(len(sem_df))

23971617


In [14]:
# Not sure why but some SUBJECT_CUI's and OBJECT_CUI's are empty (or with "NaN") at this point
sem_df.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID
0,,ADMINISTERED_TO,C0007600,14657188;16621368
1,,ADMINISTERED_TO,C0007634,31497221
2,,ADMINISTERED_TO,C0018496,18848895
3,,ADMINISTERED_TO,C1512505,9617575
4,,AFFECTS,C0001272,18096820;19204724


In [15]:
# But we will just ignore them
# replace empty string fields with NaN
sem_df.replace("", NAN, inplace=True)
# Then drop the rows with NaN
sem_df.dropna(subset=['SUBJECT_CUI','OBJECT_CUI'], inplace=True)

In [16]:
# Dataset purged empty concept entries
print(len(sem_df))
sem_df.head()

23970499


Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID
588,1,ADMINISTERED_TO,C0007634,24096582
589,1,ADMINISTERED_TO,C0030705,22577025
590,1,AFFECTS,C0005935,29798367
591,1,AFFECTS,C0020291,6298464
592,1,AFFECTS,C0028754,19789049


In [17]:
# Cache a copy of the current dataframe
sem_df.to_csv(EDGES2_CSV)

### There are many CUIs that are not actually CUIs. Mike has determined they are (usually/always?) Entrez IDs

In [18]:
# are going to get them out of this MRSAT file (prepared from UMLS Metathesaurus using their Metamorphosys tool)
# We have downloaded the UMLS, generated RRF files, and compressed them (for convenient stream access)
# Here, we access MRSAT.RRF.gz (other UMLS files used elsewhere). See top of Notebook for file path setting
names = list("abcdefghijklmn")
iter_csv = pd.read_csv(MRSAT_ARCHIVE, delimiter="|", names=names, index_col=None, chunksize=1000000)
chunks = []
umls_entrez = dict()
for chunk in tqdm(iter_csv, total=67668372/1000000):
    chunk.fillna(method='ffill', inplace=True)
    chunk = chunk[chunk.i == "ENTREZGENE_ID"]
    d = dict(zip(chunk.a, chunk.k))
    umls_entrez.update(d)

HBox(children=(FloatProgress(value=0.0, max=67.668372), HTML(value='')))

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)





In [20]:
entrez_umls = {v:k for k,v in umls_entrez.items()}
entrez_umls['9987']

'C1415639'

In [21]:
sem_df.SUBJECT_CUI = sem_df.SUBJECT_CUI.map(lambda x:entrez_umls[x] if x in entrez_umls else x)
sem_df.OBJECT_CUI = sem_df.OBJECT_CUI.map(lambda x:entrez_umls[x] if x in entrez_umls else x)

In [22]:
noncdf = sem_df[~sem_df.SUBJECT_CUI.str.startswith("C")]
print(len(noncdf))

18249


In [23]:
# dump everything that doesn't starts with a "C"
print(len(sem_df))
sem_df = sem_df[sem_df.SUBJECT_CUI.str.startswith("C")]
sem_df = sem_df[sem_df.OBJECT_CUI.str.startswith("C")]
print(len(sem_df))

23970499
23932681


In [24]:
sem_df.to_csv(EDGES3_CSV)

## Some SemMedDb predicates are negations - Biolink tags these as negations of the unnegated predicated, so we tag them as such

In [26]:
# change the neg to the same prop without neg and add a neg flag field
idx = sem_df["PREDICATE"].str.startswith("NEG_")
sem_df['NEG'] = False
sem_df.loc[idx, 'NEG'] = True
sem_df.loc[idx, 'PREDICATE'] = sem_df[idx].PREDICATE.str.replace("NEG_", "")
sem_df[sem_df.NEG].head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID,NEG
656,C1412045,AFFECTS,C0031845,19737390,True
657,C1412045,ASSOCIATED_WITH,C0206754,28792692,True
658,C1412045,CAUSES,C1457887,18943647,True
659,C1412045,COEXISTS_WITH,C0376261,24009956,True
1117,C1705514,AFFECTS,C0001038,10030449,True


In [27]:
# Cache updated edges with new neg flag field
sem_df.to_csv(EDGES4_CSV)

In [28]:
### Make a nodes table
conso = pd.read_csv(MRCONSO_ENG_ARCHIVE, delimiter="|", index_col=None, names = list("abcdefghijklmnopqrs"))
conso = conso[(conso['c'] == "P") & (conso['e'] == "PF")]
conso.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s
0,C0000005,ENG,P,L0000005,PF,S0007492,Y,A26634265,,M0019694,D012711,MSH,PEP,D012711,(131)I-Macroaggregated Albumin,0,N,256.0,
2,C0000039,ENG,P,L0000039,PF,S17175117,N,A28315139,9194921.0,1926948,,RXNORM,IN,1926948,"1,2-dipalmitoylphosphatidylcholine",0,N,256.0,
3,C0000039,ENG,P,L0000039,PF,S17175117,Y,A28572604,,,,MTH,PN,NOCODE,"1,2-dipalmitoylphosphatidylcholine",0,N,256.0,
20,C0000052,ENG,P,L0000052,PF,S0007584,N,A0016535,,M0023173,D015061,MSH,MH,D015061,"1,4-alpha-Glucan Branching Enzyme",0,N,256.0,
21,C0000052,ENG,P,L0000052,PF,S0007584,Y,A0016536,,,,MTH,PN,NOCODE,"1,4-alpha-Glucan Branching Enzyme",0,N,256.0,


In [29]:
node_label = dict(zip(conso.a, conso.o))
print(len(node_label))

3376510


In [30]:
nodes = set(sem_df.SUBJECT_CUI) | set(sem_df.OBJECT_CUI)
print(len(nodes))

279816


In [31]:
nodes = pd.DataFrame({"ID":x, "LABEL": node_label.get(x)} for x in nodes)
nodes = nodes.dropna()
print(len(nodes))
nodes.head()

235430


Unnamed: 0,ID,LABEL
0,C3842672,Day 7
1,C1002758,Brachypodium pinnatum
6,C0020684,Hypoxanthine
7,C0853225,INR Increased
8,C1513022,Mature Centriole


In [32]:
nodes.to_csv(NODES1_CSV)