In [1]:
import pandas as pd
from glob import glob
from IPython.display import display, Markdown
from tqdm import tqdm
pd.__version__

'2.0.2'

# Unified Biomedical Knowledge Graph (UBKG)
## Nodes
### Concept

In [2]:
concepts = pd.read_csv("neo4j/import/CUIs.csv")
concepts = pd.DataFrame(index=concepts["CUI:ID"].unique())
concepts.index.name = "id"
concepts.head()

C0000097
C0000359
C0000610
C0000739
C0000873


### Semantics

In [3]:
semantics = pd.read_csv("neo4j/import/TUIs.csv", index_col=0)
semantics.head()

Unnamed: 0_level_0,name,STN,DEF
TUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T012,Bird,A1.1.3.1.1.2,A vertebrate having a constant body temperatur...
T014,Reptile,A1.1.3.1.1.5,A cold-blooded vertebrate having an external c...
T015,Mammal,A1.1.3.1.1.4,A vertebrate having a constant body temperatur...
T021,Fully Formed Anatomical Structure,A1.2.3,An anatomical structure in a fully formed orga...
T022,Body System,A2.1.4.1,A complex of anatomical structures that perfor...


### Definition

In [4]:
definition = pd.read_csv("neo4j/import/DEFs.csv", index_col=0)
definition.head()

Unnamed: 0_level_0,SAB,DEF
ATUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1
AT38138756,MSH,A tool for the study of liver damage which cau...
AT43116097,MSH,A hepatic carcinogen whose mechanism of activa...
AT38141939,MSH,Simple amine found in the brain. It may be mod...
AT235449311,MSH,A physiologically active metabolite of VITAMIN...
AT38137186,MSH,A material used in the manufacture of azo dyes...


### Term

In [5]:
terms = pd.read_csv("neo4j/import/SUIs.csv", index_col=0)
terms.head()

Unnamed: 0_level_0,name
SUI:ID,Unnamed: 1_level_1
S0009776,"Acid, 2-Aminohexanedioic"
S7249234,BR CAMP
S11872577,cramps abdominal
S14680596,Retained tissue after pregnancy loss
S3417882,Missed miscarriage


### Code

In [6]:
codes = pd.read_csv("neo4j/import/CODEs.csv", index_col=0)
codes.head()

  codes = pd.read_csv("neo4j/import/CODEs.csv", index_col=0)


Unnamed: 0_level_0,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
CodeID:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTHSPL J7A92W69L7,MTHSPL,J7A92W69L7,,,,
NCI C76777,NCI,C76777,,,,
ATC N07XX07,ATC,N07XX07,,,,
GS 1946,GS,1946,,,,
NOC 040413,NOC,040413,,,,


## Edges
### Concept-Term

In [7]:
concept_term = pd.read_csv("neo4j/import/CUI-SUIs.csv")
concept_term.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000163,S0007806
1,C0000427,S0008336
2,C0000665,S11922611
3,C0000992,S1577859
4,C0001021,S0797872


### Concept-Semantics

In [8]:
concept_semantics = pd.read_csv("neo4j/import/CUI-TUIs.csv")
concept_semantics.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000132,T126
1,C0000246,T116
2,C0000895,T060
3,C0000908,T037
4,C0000931,T067


### Concept-Code

In [9]:
concept_code = pd.read_csv("neo4j/import/CUI-CODEs.csv")
concept_code.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000294,ATC V03AF01
1,C0000481,CHV 0000000513
2,C0000661,MSH D015124
3,C0000665,VANDF 4020847
4,C0000737,LNC LA15468-4


### Concept-Concept

In [10]:
# concept_concept = pd.read_csv("neo4j/import/CUI-CUIs.csv", dtype_backend="pyarrow")
# concept_concept.head()

### Semantic-Semantic
Prolly skip this

In [11]:
semantics_semantics = pd.read_csv("neo4j/import/TUIrel.csv")
semantics_semantics.head()

Unnamed: 0,:END_ID,:START_ID
0,T204,T002
1,T001,T004
2,T071,T004
3,T072,T010
4,T204,T010


### Concept-Definition

In [12]:
concept_definition = pd.read_csv("neo4j/import/DEFrel.csv")
concept_definition.head()

Unnamed: 0,:END_ID,:START_ID
0,AT38152019,C0000039
1,AT69817678,C0000039
2,AT264439104,C0000039
3,AT254753550,C0000039
4,AT267611046,C0000039


### Code-Term

In [13]:
code_term = pd.read_csv("neo4j/import/CODE-SUIs.csv")
code_term.head()

Unnamed: 0,:END_ID,:START_ID,:TYPE,CUI
0,S1424701,RXNORM 74,IN,C0000473
1,S18541041,SNOMEDCT_US 80994002,FN,C0000477
2,S11730064,SNOMEDCT_US 226367006,SY,C0000545
3,S0288461,CSP 2005-4146,PT,C0000735
4,S1957040,MDR 10048885,LLT,C0000735


## Merge Concept and Terms

In [14]:
concept_term.columns = ["CUI:ID", "SUI:ID"]
concept_term.head()

Unnamed: 0,CUI:ID,SUI:ID
0,C0000163,S0007806
1,C0000427,S0008336
2,C0000665,S11922611
3,C0000992,S1577859
4,C0001021,S0797872


In [15]:
concept_term = pd.merge(concept_term, terms, on="SUI:ID", how='left')
concept_term = concept_term.groupby('CUI:ID').first()
concept_term.head()

Unnamed: 0_level_0,SUI:ID,name
CUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C0000005,S0007492,(131)I-Macroaggregated Albumin
C0000039,S17175117,"1,2-dipalmitoylphosphatidylcholine"
C0000052,S0007584,"1,4-alpha-Glucan Branching Enzyme"
C0000074,S0007615,1-Alkyl-2-Acylphosphatidates
C0000084,S0007627,1-Carboxyglutamic Acid


In [16]:
concept_term.columns = ["SUI:ID", "label"]
concept_term = concept_term[["label"]]
concept_term.head()

Unnamed: 0_level_0,label
CUI:ID,Unnamed: 1_level_1
C0000005,(131)I-Macroaggregated Albumin
C0000039,"1,2-dipalmitoylphosphatidylcholine"
C0000052,"1,4-alpha-Glucan Branching Enzyme"
C0000074,1-Alkyl-2-Acylphosphatidates
C0000084,1-Carboxyglutamic Acid


In [17]:
concepts.loc[concept_term.index, 'label'] = concept_term.loc[concept_term.index, 'label']
concepts.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine"
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase"
C0000610,6-Aminonicotinamide
C0000739,Abdominal Muscles
C0000873,Academic Problem


In [18]:
concepts.label = concepts.label.fillna("-")

## Merge Concept and Type

In [19]:
semantics.head()

Unnamed: 0_level_0,name,STN,DEF
TUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T012,Bird,A1.1.3.1.1.2,A vertebrate having a constant body temperatur...
T014,Reptile,A1.1.3.1.1.5,A cold-blooded vertebrate having an external c...
T015,Mammal,A1.1.3.1.1.4,A vertebrate having a constant body temperatur...
T021,Fully Formed Anatomical Structure,A1.2.3,An anatomical structure in a fully formed orga...
T022,Body System,A2.1.4.1,A complex of anatomical structures that perfor...


In [20]:
concept_semantics

Unnamed: 0,:START_ID,:END_ID
0,C0000132,T126
1,C0000246,T116
2,C0000895,T060
3,C0000908,T037
4,C0000931,T067
...,...,...
3595717,C5779300,T061
3595718,C5779311,T061
3595719,C5779361,T184
3595720,C5779410,T061


In [21]:
with open('out/semantics_ranked.tsv') as o:
    ranked_type = [i.strip() for i in o.read().strip().split("\n")]

In [22]:
concept_semantics.columns = ["id", "TUI:ID"]
concept_semantics["type"] = [semantics.at[i, 'name'] for i in concept_semantics['TUI:ID']]
concept_semantics.head()

Unnamed: 0,id,TUI:ID,type
0,C0000132,T126,Enzyme
1,C0000246,T116,"Amino Acid, Peptide, or Protein"
2,C0000895,T060,Diagnostic Procedure
3,C0000908,T037,Injury or Poisoning
4,C0000931,T067,Phenomenon or Process


In [23]:
def fetch_type(v):
    cat = ""
    rank = len(ranked_type)
    for i in v:
        r = ranked_type.index(i)
        if r < rank:
            cat = i
            rank = r
    return cat

In [24]:
cs = concept_semantics.groupby('id')['type'].apply(lambda x: "; ".join(set(x)))
cs.head()

id
C0000005    Indicator, Reagent, or Diagnostic Aid; Pharmac...
C0000039            Organic Chemical; Pharmacologic Substance
C0000052              Enzyme; Amino Acid, Peptide, or Protein
C0000074                                     Organic Chemical
C0000084    Biologically Active Substance; Amino Acid, Pep...
Name: type, dtype: object

In [25]:
cs_ranked = concept_semantics.groupby('id')['type'].apply(fetch_type)
cs_ranked.head()

id
C0000005    Amino Acid, Peptide, or Protein
C0000039                   Organic Chemical
C0000052                             Enzyme
C0000074                   Organic Chemical
C0000084    Amino Acid, Peptide, or Protein
Name: type, dtype: object

In [26]:
common = list(set(concepts.index).intersection(cs.index))
cs[common].head()

id
C3222979              Clinical Drug
C0182275             Medical Device
C2033221         Neoplastic Process
C2981012    Pharmacologic Substance
C4713416       Health Care Activity
Name: type, dtype: object

In [27]:
concept_semantics
concepts.loc[common, 'type'] = cs_ranked[common]
concepts.loc[common, 'type_combined'] = cs[common]
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Hazardous or Poisonous Substance; Organic Chem...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Enzyme; Amino Acid, Peptide, or Protein"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Vitamin; Pharmacologic Substance
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [28]:
concepts.groupby("type_combined").first().to_csv('out/semantics.tsv', sep="\t")

In [29]:
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Hazardous or Poisonous Substance; Organic Chem...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Enzyme; Amino Acid, Peptide, or Protein"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Vitamin; Pharmacologic Substance
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [30]:
len(concepts.type.unique())

128

In [31]:
with open('out/semantics_list.tsv', 'w') as o:
    o.write("\n".join([str(i) for i in concept_semantics.type.unique()]))

## Merge Concept and Code

In [32]:
codes.head()

Unnamed: 0_level_0,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
CodeID:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTHSPL J7A92W69L7,MTHSPL,J7A92W69L7,,,,
NCI C76777,NCI,C76777,,,,
ATC N07XX07,ATC,N07XX07,,,,
GS 1946,GS,1946,,,,
NOC 040413,NOC,040413,,,,


In [33]:
concept_code.columns = ["id", "CodeID:ID"]
concept_code.head()

Unnamed: 0,id,CodeID:ID
0,C0000294,ATC V03AF01
1,C0000481,CHV 0000000513
2,C0000661,MSH D015124
3,C0000665,VANDF 4020847
4,C0000737,LNC LA15468-4


In [34]:
concept_code = pd.merge(concept_code, codes, on="CodeID:ID", how='left')
concept_code.head()

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
0,C0000294,ATC V03AF01,ATC,V03AF01,,,,
1,C0000481,CHV 0000000513,CHV,0000000513,,,,
2,C0000661,MSH D015124,MSH,D015124,,,,
3,C0000665,VANDF 4020847,VANDF,4020847,,,,
4,C0000737,LNC LA15468-4,LNC,LA15468-4,,,,


In [39]:
concept_code[concept_code.id == 'C0000097']

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
623690,C0000097,LCH_NW sh86002892,LCH_NW,sh86002892,,,,
1108107,C0000097,CSP 2511-0411,CSP,2511-0411,,,,
1246274,C0000097,PSY 31213,PSY,31213,,,,
2179442,C0000097,MSH D015632,MSH,D015632,,,,
3529094,C0000097,CHV 0000000501,CHV,0000000501,,,,
3736237,C0000097,PSY 32433,PSY,32433,,,,
4116966,C0000097,SNOMEDCT_US 285407008,SNOMEDCT_US,285407008,,,,


In [35]:
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Hazardous or Poisonous Substance; Organic Chem...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Enzyme; Amino Acid, Peptide, or Protein"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Vitamin; Pharmacologic Substance
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [41]:
type = "Organic Chemical"
cc.head()
# sab = 'CSP'



Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
0,C0000294,ATC V03AF01,ATC,V03AF01,,,,
1,C0000481,CHV 0000000513,CHV,0000000513,,,,
3,C0000665,VANDF 4020847,VANDF,4020847,,,,
11,C0000948,DRUGBANK DB13262,DRUGBANK,DB13262,,,,
12,C0000956,DRUGBANK DB01418,DRUGBANK,DB01418,,,,


In [49]:
for type in tqdm(concepts.type.unique()):
    con = concepts[concepts.type==type].copy()
    cc = concept_code[concept_code.id.isin(con.index)]
    for sab in tqdm(cc.SAB.unique()):
        c = cc[cc.SAB == sab]
        c = c.groupby('id').first()
        common = list(set(con.index).intersection(c.index))
        con.loc[common, sab] = c.loc[common, "CodeID:ID"]
        if c.loc[common, "value:float"].isna().sum() != len(common):
            con.loc[common, "%s value"%sab] = c.loc[common, "value:float"]
        if c.loc[common, "lowerbound:float"].isna().sum() != len(common):
            con.loc[common, "%s lowerbound"%sab] = c.loc[common, "lowerbound:float"]
        if c.loc[common, "upperbound:float"].isna().sum() != len(common):
            con.loc[common, "%s upperbound"%sab] = c.loc[common, "upperbound:float"]
        if c.loc[common, "unit"].isna().sum() != len(common):
            con.loc[common, "%s unit"%sab] = c.loc[common, "unit"]
        con.to_csv("out/serialization/%s.nodes.csv"%(type))


100%|██████████| 45/45 [00:36<00:00,  1.25it/s]
100%|██████████| 35/35 [00:03<00:00,  8.86it/s]t]
100%|██████████| 46/46 [00:05<00:00,  7.78it/s]  
100%|██████████| 45/45 [00:15<00:00,  2.91it/s]
100%|██████████| 66/66 [01:08<00:00,  1.04s/it]
100%|██████████| 22/22 [00:00<00:00, 44.08it/s]t]
100%|██████████| 41/41 [00:00<00:00, 51.89it/s]  
100%|██████████| 33/33 [00:00<00:00, 66.25it/s]
100%|██████████| 47/47 [00:01<00:00, 26.69it/s]
100%|██████████| 48/48 [00:21<00:00,  2.22it/s]
100%|██████████| 33/33 [00:00<00:00, 52.52it/s]]
100%|██████████| 56/56 [01:12<00:00,  1.29s/it]]
100%|██████████| 37/37 [00:00<00:00, 37.26it/s]]
100%|██████████| 50/50 [00:20<00:00,  2.48it/s]]
100%|██████████| 23/23 [00:02<00:00, 10.45it/s]]
100%|██████████| 16/16 [00:00<00:00, 55.22it/s]]
100%|██████████| 26/26 [00:00<00:00, 149.55it/s]
100%|██████████| 28/28 [00:00<00:00, 31.66it/s]]
100%|██████████| 29/29 [00:03<00:00,  9.50it/s]]
100%|██████████| 26/26 [00:00<00:00, 148.83it/s]
100%|██████████| 15/15

In [48]:
c.loc[common, "CodeID:ID"]

id
C0614007       MSH C031120
C0055443       MSH C004616
C4079815    MSH C000602708
C1873299       MSH C517986
C1098299       MSH C437945
                 ...      
C3252423       MSH C568289
C0244833       MSH C046833
C0665039       MSH C107809
C0971017       MSH C413580
C0951645       MSH C007276
Name: CodeID:ID, Length: 195821, dtype: object

In [167]:
for sab in tqdm(concept_code.SAB.unique()):
    c = concept_code[concept_code.SAB == sab]
    c = c.groupby('id').first()
    # get ids that are in concept
    common = list(set(concepts.index).intersection(c.index))
    
    concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
    if c.loc[common, "value:float"].isna().sum() != len(common):
        concepts.loc[common, "%s value"%sab] = c.loc[common, "value:float"]
    if c.loc[common, "lowerbound:float"].isna().sum() != len(common):
        concepts.loc[common, "%s lowerbound"%sab] = c.loc[common, "lowerbound:float"]
    if c.loc[common, "upperbound:float"].isna().sum() != len(common):
        concepts.loc[common, "%s upperbound"%sab] = c.loc[common, "upperbound:float"]
    if c.loc[common, "unit"].isna().sum() != len(common):
        concepts.loc[common, "%s unit"%sab] = c.loc[common, "unit"]

  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c.loc[common, "CodeID:ID"]
  concepts.loc[common, sab] = c

In [168]:
concepts.shape

(11424664, 222)

In [50]:
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Hazardous or Poisonous Substance; Organic Chem...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Enzyme; Amino Acid, Peptide, or Protein"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Vitamin; Pharmacologic Substance
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [58]:
import csv
import os

In [62]:
row_headers = ["source", "relation", "target", "source_label", "target_label", "evidence"]
with open("neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in tqdm(csv_reader):
        if not headers:
            headers = row
        else:
            source = row[0]
            source_label = concepts.at[source, 'label']
            source_type = concepts.at[source, 'type']
            target = row[1]
            target_label = concepts.at[target, 'label']
            target_type = concepts.at[target, 'type']
            relation = row[2]
            sab = row[3]
            evidence = ''
            if len(row) > 4:
                evidence = row[4]
            filename = 'out/serialization/edges/%s.%s.%s.edges.tsv'%(source_type, relation, target_type)
            write_header = False
            operation = "a"
            if not os.path.isfile(filename):
                write_header = True
                operation = "w"
            with open(filename, operation) as w:
                csv_writer = csv.writer(o)
                if write_header:
                    csv_writer.writerow(row_headers)
                csv_writer.writerow([source, relation, target, source_label, target_label, evidence])

1it [00:00, 976.56it/s]


UnsupportedOperation: not writable