# SemMedDB Import

In [2]:
import pandas as pd

In [2]:
# Read just the predication fields used downstream. The file has no header row,
# so the wanted columns are picked by position and named explicitly.
predication_path = '/home/tim/Documents/GrApH_AI/Data/SemMedDB/semmedVER43_2021_R_PREDICATION.23871.csv'
columns = ['SENTENCE_ID', 'PMID', 'PREDICATE', 'SUBJECT_CUI', 'OBJECT_CUI']
predication_select = pd.read_csv(
    predication_path,
    header=None,
    usecols=[1, 2, 3, 4, 8],
    names=columns,
)
predication_select.head()

Unnamed: 0,SENTENCE_ID,PMID,PREDICATE,SUBJECT_CUI,OBJECT_CUI
0,16,16530475,PROCESS_OF,C0003725,C0999630
1,17,16530475,ISA,C0039258,C0446169
2,17,16530475,ISA,C0318627,C0206590
3,17,16530475,ISA,C0446169,C0003725
4,18,16530475,PROCESS_OF,C0012634,C0020114


In [3]:
# Keep only the rows whose predicate is CAUSES. The .copy() detaches the subset
# so later column edits on `causal` do not touch predication_select.
is_causes = predication_select['PREDICATE'] == 'CAUSES'
causal = predication_select.loc[is_causes].copy()
causal

Unnamed: 0,SENTENCE_ID,PMID,PREDICATE,SUBJECT_CUI,OBJECT_CUI
5,18,16530475,CAUSES,C0042776,C0012634
46,103,16530483,CAUSES,C1504598,C0004368
53,112,16530485,CAUSES,C0812258|1869|7332,C0162638
55,116,16530485,CAUSES,C0812258|1869|7332,C0162638
161,384,16530601,CAUSES,C0403425,C1261469
...,...,...,...,...,...
111845480,367045376,33909265,CAUSES,C5203670,C0011065
111845666,367045836,33909316,CAUSES,C3166216,C1262477
111845766,367046022,33909339,CAUSES,C0007222,C0011065
111845834,367046134,33909350,CAUSES,C0042210,C0040034


In [98]:
# Summarise the CAUSES subset: row count, per-column dtypes and memory footprint.
causal.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3074874 entries, 5 to 111845852
Data columns (total 5 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   SENTENCE_ID  int64 
 1   PMID         int64 
 2   PREDICATE    object
 3   SUBJECT_CUI  object
 4   OBJECT_CUI   object
dtypes: int64(2), object(3)
memory usage: 140.8+ MB


In [4]:
# Some items in the SUBJECT_CUI column have extra numbers after the CUI,
# separated by '|' (e.g. 'C0812258|1869|7332'). Keep only the part before the
# first '|'.
#
# This uses the vectorised .str accessor rather than Series.apply with a list
# of functions: apply([...]) returns a one-column DataFrame, not a Series, and
# a per-element lambda would raise on missing values. A single-character
# pattern is split literally, so '|' is not treated as regex alternation.
causal['SUBJECT_CUI'] = causal['SUBJECT_CUI'].str.split('|', n=1).str[0]
causal

Unnamed: 0,SENTENCE_ID,PMID,PREDICATE,SUBJECT_CUI,OBJECT_CUI
5,18,16530475,CAUSES,C0042776,C0012634
46,103,16530483,CAUSES,C1504598,C0004368
53,112,16530485,CAUSES,C0812258,C0162638
55,116,16530485,CAUSES,C0812258,C0162638
161,384,16530601,CAUSES,C0403425,C1261469
...,...,...,...,...,...
111845480,367045376,33909265,CAUSES,C5203670,C0011065
111845666,367045836,33909316,CAUSES,C3166216,C1262477
111845766,367046022,33909339,CAUSES,C0007222,C0011065
111845834,367046134,33909350,CAUSES,C0042210,C0040034


In [100]:
# Tally how many SUBJECT_CUI values there are of each string length, to reveal
# entries that are not the usual 8 characters.
subject_cui_lengths = causal['SUBJECT_CUI'].str.len()
subject_cui_lengths.value_counts()

8    2990694
4      45515
5      20558
3      12252
6       4707
2        585
9        527
0         30
1          6
Name: SUBJECT_CUI, dtype: int64

In [5]:
# Drop every row whose SUBJECT_CUI or OBJECT_CUI is not exactly 8 characters long.
subject_ok = causal['SUBJECT_CUI'].str.len().eq(8)
object_ok = causal['OBJECT_CUI'].str.len().eq(8)
mask = subject_ok & object_ok
causal = causal.loc[mask]
causal

Unnamed: 0,SENTENCE_ID,PMID,PREDICATE,SUBJECT_CUI,OBJECT_CUI
5,18,16530475,CAUSES,C0042776,C0012634
46,103,16530483,CAUSES,C1504598,C0004368
53,112,16530485,CAUSES,C0812258,C0162638
55,116,16530485,CAUSES,C0812258,C0162638
161,384,16530601,CAUSES,C0403425,C1261469
...,...,...,...,...,...
111845480,367045376,33909265,CAUSES,C5203670,C0011065
111845666,367045836,33909316,CAUSES,C3166216,C1262477
111845766,367046022,33909339,CAUSES,C0007222,C0011065
111845834,367046134,33909350,CAUSES,C0042210,C0040034


In [6]:
# Persist the cleaned CAUSES rows (without the DataFrame index) so they can be
# reloaded later.
causal_predicates_path = '/home/tim/Documents/GrApH_AI/Data/SemMedDB/causal_predicates.csv'
causal.to_csv(causal_predicates_path, index=False)

Due to memory limits, it was necessary to restart the notebook's kernel before attempting to load the SENTENCE table.

In [2]:
# Load the fields needed to locate, within its source article, the sentence
# that states each causal relationship.
#
# The column positions below follow the actual data file, whose column order
# differs slightly from the documentation.
sentence_path = '/home/tim/Documents/GrApH_AI/Data/SemMedDB/semmedVER43_2021_R_SENTENCE.23871.csv'
columns = ['SENTENCE_ID', 'TYPE', 'SENT_START_INDEX', 'SENT_END_INDEX']
sentence = pd.read_csv(
    sentence_path,
    header=None,
    usecols=[0, 2, 4, 6],
    names=columns,
)
sentence.head()

Unnamed: 0,SENTENCE_ID,TYPE,SENT_START_INDEX,SENT_END_INDEX
0,6,ti,21,119
1,7,ab,125,302
2,8,ab,302,385
3,9,ab,385,578
4,10,ab,578,757


In [3]:
# Summarise the loaded sentence table: row count, per-column dtypes and memory footprint.
sentence.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219049752 entries, 0 to 219049751
Data columns (total 4 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   SENTENCE_ID       int64 
 1   TYPE              object
 2   SENT_START_INDEX  int64 
 3   SENT_END_INDEX    object
dtypes: int64(2), object(2)
memory usage: 6.5+ GB


In [3]:
# Write the trimmed sentence table to disk (without the DataFrame index) so it
# can be reloaded later.
sentence_locations_path = '/home/tim/Documents/GrApH_AI/Data/SemMedDB/sentence_locations.csv'
sentence.to_csv(sentence_locations_path, index=False)

In [3]:
# Reload the cleaned CAUSES predications saved earlier in this notebook.
causal_predicates_path = '/home/tim/Documents/GrApH_AI/Data/SemMedDB/causal_predicates.csv'
causal_predicates = pd.read_csv(causal_predicates_path)
causal_predicates.head()

Unnamed: 0,SENTENCE_ID,PMID,PREDICATE,SUBJECT_CUI,OBJECT_CUI
0,18,16530475,CAUSES,C0042776,C0012634
1,103,16530483,CAUSES,C1504598,C0004368
2,112,16530485,CAUSES,C0812258,C0162638
3,116,16530485,CAUSES,C0812258,C0162638
4,384,16530601,CAUSES,C0403425,C1261469


In [4]:
# Reload the sentence location table saved earlier in this notebook.
sentence_locations_path = '/home/tim/Documents/GrApH_AI/Data/SemMedDB/sentence_locations.csv'
sentence_locations = pd.read_csv(sentence_locations_path)
sentence_locations.head()

Unnamed: 0,SENTENCE_ID,TYPE,SENT_START_INDEX,SENT_END_INDEX
0,6,ti,21,119
1,7,ab,125,302
2,8,ab,302,385
3,9,ab,385,578
4,10,ab,578,757


In [None]:
# Attach each causal predication's sentence location by joining on SENTENCE_ID.
# Inner join: predications with no matching sentence row are dropped.
causal_predicates = causal_predicates.merge(
    sentence_locations,
    how='inner',
    on='SENTENCE_ID',
)
causal_predicates.head()