# 06-kgx-load
 - Reformat the node and edge CSV files for Biolink-compliant import into Neo4j using KGX

#### make the file headers correct for KGX CSV loading
#### (See https://github.com/NCATS-Tangerine/kgx/blob/RichardBruskiewich-patch-1/data-preparation.md)

In [1]:
import pandas as pd

In [2]:
# Directory prefix for the CSV files this notebook reads.
DATA = 'data/'

# Inputs, read from the DATA directory.
NODES_XREF_CSV, EDGES_BIOLINK_CSV = (
    DATA + "nodes_xref.csv",
    DATA + "edges_biolink.csv",
)

# Outputs, written relative to the current working directory (no DATA prefix).
NODES_KGX_CSV, EDGES_KGX_CSV = "nodes_kgx.csv", "edges_kgx.csv"

## Nodes

In [3]:
# Reference for the node property names used below:
# https://biolink.github.io/biolink-model/docs/node_property
nodes = pd.read_csv(filepath_or_buffer=NODES_XREF_CSV)

# Preview the first five rows of the freshly loaded node table.
nodes.head(5)

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category,xrefs
0,C3842672,Day 7,T033,Finding,disease_or_phenotypic_feature,LNC:LA16590-4
1,C0020684,Hypoxanthine,T123|T109,Biologically Active Substance|Organic Chemical,chemical_substance,LNC:LP15651-0;NCI:C29105;CHV:0000006506;MESH:D...
2,C0853225,INR Increased,T033,Finding,disease_or_phenotypic_feature,OMIM:MTHU055758;NCI_CTCAE:E12175;NCI:C78366;CH...
3,C1513022,Mature Centriole,T026,Cell Component,cell_component,NCI:C33060
4,C0267183,Hourglass stricture or stenosis of stomach,T047,Disease or Syndrome,disease_or_phenotypic_feature,MTHICD9:537.6;ICD9CM:537.6;DOID:12234


In [4]:
# Tally how many nodes carry each blm_category value.
nodes["blm_category"].value_counts()

chemical_substance                57087
protein                           27274
disease_or_phenotypic_feature     26634
gene                              23704
gross_anatomical_structure         8211
biological_process_or_activity     7260
anatomical_entity                  2275
cell_component                     1673
cell                               1193
activity_and_behavior               845
phenotypic_feature                  252
genomic_entity                      170
Name: blm_category, dtype: int64

In [5]:
# Prefix the raw identifiers so they become CURIEs ("UMLS:<id>").
nodes["ID"] = "UMLS:" + nodes["ID"]
# Namespace the category values with the "biolink:" prefix.
nodes["blm_category"] = "biolink:" + nodes["blm_category"]
# Switch the xref list delimiter from ';' to '|' (literal replacement, not a regex).
nodes["xrefs"] = nodes["xrefs"].str.replace(";", "|", regex=False)

In [6]:
# Map the source column names onto the header names written to the KGX node file.
# NOTE(review): cross-references are exported under `systematic_synonym`;
# the Biolink model also defines an `xref` slot -- confirm which one is intended.
node_column_names = {
    'ID': 'id',
    'LABEL': 'name',
    'blm_category': 'category',
    'xrefs': 'systematic_synonym',
}
nodes = nodes.rename(columns=node_column_names)

In [7]:
# Inspect the first five rows after prefixing and renaming.
nodes.head(5)

Unnamed: 0,id,name,umls_type,umls_type_label,category,systematic_synonym
0,UMLS:C3842672,Day 7,T033,Finding,disease_or_phenotypic_feature,LNC:LA16590-4
1,UMLS:C0020684,Hypoxanthine,T123|T109,Biologically Active Substance|Organic Chemical,chemical_substance,LNC:LP15651-0|NCI:C29105|CHV:0000006506|MESH:D...
2,UMLS:C0853225,INR Increased,T033,Finding,disease_or_phenotypic_feature,OMIM:MTHU055758|NCI_CTCAE:E12175|NCI:C78366|CH...
3,UMLS:C1513022,Mature Centriole,T026,Cell Component,cell_component,NCI:C33060
4,UMLS:C0267183,Hourglass stricture or stenosis of stomach,T047,Disease or Syndrome,disease_or_phenotypic_feature,MTHICD9:537.6|ICD9CM:537.6|DOID:12234


In [8]:
# Write the node table to NODES_KGX_CSV; index=False leaves the DataFrame index out of the file.
nodes.to_csv(path_or_buf=NODES_KGX_CSV, index=False)

## Edges

In [9]:
# Reference for the association slot names used below:
# https://biolink.github.io/biolink-model/docs/association_slot
edges = pd.read_csv(filepath_or_buffer=EDGES_BIOLINK_CSV)

In [10]:
# Preview the first ten edge rows as loaded, before any reformatting.
edges.head(n=10)

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_pred
0,C1412045,AFFECTS,C0005935,29798367,False,AFFECTS
1,C1412045,AFFECTS,C0028754,19789049,False,AFFECTS
2,C1412045,AFFECTS,C0036421,31505074;31505074,False,AFFECTS
3,C1412045,AFFECTS,C0597304,1409557,False,AFFECTS
4,C1412045,AFFECTS,C0599816,7617239,False,AFFECTS
5,C1412045,ASSOCIATED_WITH,C0001807,8503828;8240219,False,RELATED_TO
6,C1412045,ASSOCIATED_WITH,C0007222,26915323,False,GENE_ASSOCIATED_WITH_CONDITION
7,C1412045,ASSOCIATED_WITH,C0009324,22465933,False,GENE_ASSOCIATED_WITH_CONDITION
8,C1412045,ASSOCIATED_WITH,C0010346,1955175;22465933,False,GENE_ASSOCIATED_WITH_CONDITION
9,C1412045,ASSOCIATED_WITH,C0012634,19737390,False,GENE_ASSOCIATED_WITH_CONDITION


In [11]:
# fix the CURIES and add 'provided_by' source
def _pmid_curies(raw):
    """Convert a ';'-separated PMID string into '|'-joined PMID CURIEs.

    Repeated PMIDs are collapsed to a single entry. De-duplication goes
    through ``dict.fromkeys`` rather than a ``set`` so the first-seen order
    is kept; a set's iteration order for strings can change between
    interpreter runs, which would make the exported file non-reproducible.
    """
    return "|".join(dict.fromkeys("PMID:" + pmid for pmid in raw.split(";")))


# Subject/object identifiers become "UMLS:<id>" CURIEs.
edges.SUBJECT_CUI = "UMLS:" + edges.SUBJECT_CUI
edges.OBJECT_CUI = "UMLS:" + edges.OBJECT_CUI
# The original predicate is kept, namespaced with "semmed:".
edges.SEMMED_PRED = "semmed:" + edges.SEMMED_PRED
# One pass over the column: split, prefix, de-duplicate (order-preserving), join.
edges.PMID = edges.PMID.map(_pmid_curies)
# The mapped predicate is lower-cased and namespaced with "biolink:".
edges.bl_pred = "biolink:" + edges.bl_pred.str.lower()
# Every edge is attributed to the same source URL.
edges['provided_by'] = "https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/"

In [12]:
# Map the source column names onto the header names written to the KGX edge file.
edge_column_names = {
    'SUBJECT_CUI': 'subject',
    'bl_pred': 'edge_label',
    'OBJECT_CUI': 'object',
    'SEMMED_PRED': 'relation',
    'PMID': 'publications',
    'NEG': 'negated',
}
edges = edges.rename(columns=edge_column_names)

In [13]:
# Preview the first ten edge rows after prefixing and renaming.
edges.head(n=10)

Unnamed: 0,subject,relation,object,publications,negated,edge_label,provided_by
0,UMLS:C1412045,semmed:AFFECTS,UMLS:C0005935,PMID:29798367,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
1,UMLS:C1412045,semmed:AFFECTS,UMLS:C0028754,PMID:19789049,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
2,UMLS:C1412045,semmed:AFFECTS,UMLS:C0036421,PMID:31505074,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
3,UMLS:C1412045,semmed:AFFECTS,UMLS:C0597304,PMID:1409557,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
4,UMLS:C1412045,semmed:AFFECTS,UMLS:C0599816,PMID:7617239,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
5,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0001807,PMID:8503828|PMID:8240219,False,biolink:related_to,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
6,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0007222,PMID:26915323,False,biolink:gene_associated_with_condition,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
7,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0009324,PMID:22465933,False,biolink:gene_associated_with_condition,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
8,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0010346,PMID:1955175|PMID:22465933,False,biolink:gene_associated_with_condition,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
9,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0012634,PMID:19737390,False,biolink:gene_associated_with_condition,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/


In [14]:
# Write the edge table to EDGES_KGX_CSV; index=False leaves the DataFrame index out of the file.
edges.to_csv(path_or_buf=EDGES_KGX_CSV, index=False)

## At this point, the data should be ready for loading into Neo4j by KGX tools