# 06b-kgx-load
 - reformat for Biolink-compliant Neo4j import using KGX

#### Make the file headers correct for KGX CSV loading
See the KGX data-preparation guide: https://github.com/NCATS-Tangerine/kgx/blob/RichardBruskiewich-patch-1/data-preparation.md

In [8]:
import pandas as pd

In [9]:
# Directory that holds every CSV this notebook reads or writes.
DATA = "data/"

# Inputs read by the cells below.
NODES_XREF_CSV = f"{DATA}nodes_xref.csv"
EDGES_BIOLINK_CSV = f"{DATA}edges_biolink.csv"

# Outputs written once the tables have been reformatted for KGX.
NODES_KGX_CSV = f"{DATA}nodes_kgx.csv"
EDGES_KGX_CSV = f"{DATA}edges_kgx.csv"

## Nodes

In [10]:
# Load the node table from NODES_XREF_CSV and preview its first five rows.
# Biolink node properties: https://biolink.github.io/biolink-model/docs/node_property
nodes = pd.read_csv(filepath_or_buffer=NODES_XREF_CSV)
nodes.head(n=5)

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category,xrefs
0,C1419689,RPL9 gene,T028,Gene or Genome,Gene,MTH:NOCODE;HGNC:10369;OMIM:603686
1,C0089147,butyl alcohol,T109,Organic Chemical,ChemicalSubstance,MTHSPL:8PJ61P6TS3;MTH:NOCODE;VANDF:4025696;MES...
2,C0661927,peptitergent PD1,T121|T116,"Pharmacologic Substance|Amino Acid, Peptide, o...",Protein,MESH:C083798
3,C0380168,glutathione S-conjugate export pump,T123|T116,"Amino Acid, Peptide, or Protein|Biologically A...",Protein,MTH:NOCODE;MESH:C086860
4,C0141404,SAGM solution,T104|T122,Biomedical or Dental Material|Chemical Viewed ...,ChemicalSubstance,MESH:C037839


In [11]:
# Tally how many nodes carry each blm_category value.
nodes["blm_category"].value_counts()

ChemicalSubstance              52455
Protein                        25945
DiseaseOrPhenotypicFeature     24498
Gene                           23116
BiologicalProcessOrActivity     7052
GrossAnatomicalStructure        6885
CellComponent                   1582
AnatomicalEntity                1210
Cell                            1114
ActivityAndBehavior              809
PhenotypicFeature                223
GenomicEntity                    169
Name: blm_category, dtype: int64

In [12]:
# Rewrite the raw values: prefix ids with "UMLS:" and categories with
# "biolink:", and separate multiple xrefs with "|" instead of ";".
# NOTE: not idempotent -- re-running this cell without reloading `nodes`
# prepends the prefixes a second time.
nodes["ID"] = "UMLS:" + nodes["ID"]
nodes["blm_category"] = "biolink:" + nodes["blm_category"]
nodes["xrefs"] = nodes["xrefs"].str.replace(";", "|")

In [13]:
# Relabel the node columns; columns not listed here keep their names.
nodes = nodes.rename(
    columns={
        'ID': 'id',
        'LABEL': 'name',
        'blm_category': 'category',
    }
)

In [14]:
# Preview the first five rows after the value fixes and column renames.
nodes.head(n=5)

Unnamed: 0,id,name,umls_type,umls_type_label,category,xrefs
0,UMLS:C1419689,RPL9 gene,T028,Gene or Genome,biolink:Gene,MTH:NOCODE|HGNC:10369|OMIM:603686
1,UMLS:C0089147,butyl alcohol,T109,Organic Chemical,biolink:ChemicalSubstance,MTHSPL:8PJ61P6TS3|MTH:NOCODE|VANDF:4025696|MES...
2,UMLS:C0661927,peptitergent PD1,T121|T116,"Pharmacologic Substance|Amino Acid, Peptide, o...",biolink:Protein,MESH:C083798
3,UMLS:C0380168,glutathione S-conjugate export pump,T123|T116,"Amino Acid, Peptide, or Protein|Biologically A...",biolink:Protein,MTH:NOCODE|MESH:C086860
4,UMLS:C0141404,SAGM solution,T104|T122,Biomedical or Dental Material|Chemical Viewed ...,biolink:ChemicalSubstance,MESH:C037839


In [15]:
# Write the reformatted node table to NODES_KGX_CSV; index=False keeps the
# DataFrame's row index out of the file.
nodes.to_csv(path_or_buf=NODES_KGX_CSV, index=False)

## Edges

In [16]:
# Load the edge table from EDGES_BIOLINK_CSV.
# Biolink association slots: https://biolink.github.io/biolink-model/docs/association_slot
edges = pd.read_csv(filepath_or_buffer=EDGES_BIOLINK_CSV)

In [17]:
# Preview the first ten edges as loaded, before any reformatting.
edges.head(n=10)

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_pred
0,C1412045,AFFECTS,C0005935,29798367,False,AFFECTS
1,C1412045,AFFECTS,C0028754,19789049,False,AFFECTS
2,C1412045,AFFECTS,C0036421,31505074;31505074,False,AFFECTS
3,C1412045,AFFECTS,C0597304,1409557,False,AFFECTS
4,C1412045,AFFECTS,C0599816,7617239,False,AFFECTS
5,C1412045,ASSOCIATED_WITH,C0001807,8503828;8240219,False,RELATED_TO
6,C1412045,ASSOCIATED_WITH,C0007222,26915323,False,RELATED_TO
7,C1412045,ASSOCIATED_WITH,C0009324,22465933,False,RELATED_TO
8,C1412045,ASSOCIATED_WITH,C0010346,1955175;22465933,False,RELATED_TO
9,C1412045,ASSOCIATED_WITH,C0012634,19737390,False,RELATED_TO


In [18]:
# Fix the CURIEs and add the 'provided_by' source.
# NOTE: not idempotent -- re-running this cell without reloading `edges`
# prepends the prefixes a second time.
edges.SUBJECT_CUI = "UMLS:" + edges.SUBJECT_CUI
edges.OBJECT_CUI = "UMLS:" + edges.OBJECT_CUI
edges.SEMMED_PRED = "semmed:" + edges.SEMMED_PRED
# Split the ';'-separated PMIDs, drop duplicates with a set, prefix each one
# with "PMID:" and join them with '|'. The set is sorted (lexicographically,
# as strings) before joining: iterating a set of str directly follows
# per-process hash randomisation, so without the sort the order written to the
# file could change from one run to the next for identical input.
edges.PMID = edges.PMID.map(
    lambda pmids: "|".join(sorted({"PMID:" + pmid for pmid in pmids.split(';')}))
)
edges.bl_pred = "biolink:" + edges.bl_pred.str.lower()
# Same source URL for every edge.
edges['provided_by'] = "https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/"

In [19]:
# Relabel the edge columns; columns not listed here (e.g. 'provided_by')
# keep their names.
edges = edges.rename(
    columns={
        'SUBJECT_CUI': 'subject',
        'bl_pred': 'edge_label',
        'OBJECT_CUI': 'object',
        'SEMMED_PRED': 'relation',
        'PMID': 'publications',
        'NEG': 'negated',
    }
)

In [20]:
# Preview the first ten edges after the value fixes and column renames.
edges.head(n=10)

Unnamed: 0,subject,relation,object,publications,negated,edge_label,provided_by
0,UMLS:C1412045,semmed:AFFECTS,UMLS:C0005935,PMID:29798367,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
1,UMLS:C1412045,semmed:AFFECTS,UMLS:C0028754,PMID:19789049,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
2,UMLS:C1412045,semmed:AFFECTS,UMLS:C0036421,PMID:31505074,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
3,UMLS:C1412045,semmed:AFFECTS,UMLS:C0597304,PMID:1409557,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
4,UMLS:C1412045,semmed:AFFECTS,UMLS:C0599816,PMID:7617239,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
5,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0001807,PMID:8240219|PMID:8503828,False,biolink:related_to,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
6,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0007222,PMID:26915323,False,biolink:related_to,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
7,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0009324,PMID:22465933,False,biolink:related_to,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
8,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0010346,PMID:1955175|PMID:22465933,False,biolink:related_to,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
9,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0012634,PMID:19737390,False,biolink:related_to,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/


In [21]:
# Write the reformatted edge table to EDGES_KGX_CSV; index=False keeps the
# DataFrame's row index out of the file.
edges.to_csv(path_or_buf=EDGES_KGX_CSV, index=False)

## At this point, the data should be ready for loading into Neo4j by KGX tools