# 06b-kgx-load
 - reformat for Biolink-compliant Neo4j import using KGX

#### make the file headers correct for KGX TSV loading
#### (See https://github.com/NCATS-Tangerine/kgx/blob/RichardBruskiewich-patch-1/data-preparation.md)

In [1]:
import pandas as pd
from semmed_biolink_environment import *

## Nodes

In [2]:
# Read the tab-separated node table from NODES_XREF_TSV and preview the first rows.
# Biolink node property reference:
# https://biolink.github.io/biolink-model/docs/node_property
nodes = pd.read_csv(NODES_XREF_TSV, delimiter='\t')
nodes.head(n=5)

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category,xrefs
0,C0270951,Ocular muscular dystrophy,T047,Disease or Syndrome,DiseaseOrPhenotypicFeature,ICPC2ICD10ENG:MTHU051186;ICPC2ICD10ENG:MTHU053...
1,C1706324,EREG wt Allele,T028,Gene or Genome,Gene,MTH:NOCODE;NCI:C51422
2,C0075561,sulfochloranthine,T109|T121,Organic Chemical|Pharmacologic Substance,ChemicalSubstance,CAS:55945-60-3;MESH:C022876
3,C3470895,LINC00607 gene,T028,Gene or Genome,Gene,MTH:NOCODE;HGNC:43944
4,C0486433,Extended-spectrum beta lactamase,T126|T116,"Amino Acid, Peptide, or Protein|Enzyme",Protein,LNC:LP17264-0;CHV:0000037238;SNOMEDCT_US:10276...


In [3]:
# Tally how many nodes carry each blm_category value (most frequent first).
nodes["blm_category"].value_counts()

ChemicalSubstance              58700
DiseaseOrPhenotypicFeature     36958
Protein                        28019
Gene                           23788
GrossAnatomicalStructure        9086
BiologicalProcessOrActivity     7665
AnatomicalEntity                2757
CellularComponent               1732
Cell                            1255
ActivityAndBehavior              881
PhenotypicFeature                418
GenomicEntity                    172
Name: blm_category, dtype: int64

In [4]:
# Prefix the identifiers with "UMLS:" and the categories with "biolink:".
# NOTE: this cell is not idempotent - re-running it without reloading `nodes`
# prepends the prefixes a second time.
nodes["ID"] = "UMLS:" + nodes["ID"]
nodes["blm_category"] = "biolink:" + nodes["blm_category"]
# Switch the xref separator from ";" to "|". regex=False states the literal
# match explicitly, so the result does not depend on the default value of
# `regex` in the installed pandas version.
nodes["xrefs"] = nodes["xrefs"].str.replace(";", "|", regex=False)

In [5]:
# Rename three columns (ID -> id, LABEL -> name, blm_category -> category);
# every column not listed keeps its current name.
nodes = nodes.rename(
    columns={
        'ID': 'id',
        'LABEL': 'name',
        'blm_category': 'category',
    }
)

In [6]:
# Preview the first rows after prefixing and renaming.
nodes.head(n=5)

Unnamed: 0,id,name,umls_type,umls_type_label,category,xrefs
0,UMLS:C0270951,Ocular muscular dystrophy,T047,Disease or Syndrome,biolink:DiseaseOrPhenotypicFeature,ICPC2ICD10ENG:MTHU051186|ICPC2ICD10ENG:MTHU053...
1,UMLS:C1706324,EREG wt Allele,T028,Gene or Genome,biolink:Gene,MTH:NOCODE|NCI:C51422
2,UMLS:C0075561,sulfochloranthine,T109|T121,Organic Chemical|Pharmacologic Substance,biolink:ChemicalSubstance,CAS:55945-60-3|MESH:C022876
3,UMLS:C3470895,LINC00607 gene,T028,Gene or Genome,biolink:Gene,MTH:NOCODE|HGNC:43944
4,UMLS:C0486433,Extended-spectrum beta lactamase,T126|T116,"Amino Acid, Peptide, or Protein|Enzyme",biolink:Protein,LNC:LP17264-0|CHV:0000037238|SNOMEDCT_US:10276...


In [7]:
# Write the node table to NODES_KGX_TSV as tab-separated text without the
# DataFrame index. The stray "Unnamed: 0" column (presumably a row index written
# by an upstream step and read back as data - confirm) is dropped first so it is
# not exported as a node property; errors="ignore" makes the drop a no-op when
# the column is absent.
nodes.drop(columns=["Unnamed: 0"], errors="ignore").to_csv(NODES_KGX_TSV, sep='\t', index=False)

## Edges

In [8]:
# Read the tab-separated edge table from EDGES_BIOLINK_TSV.
# Biolink association slot reference:
# https://biolink.github.io/biolink-model/docs/association_slot
edges = pd.read_csv(EDGES_BIOLINK_TSV, delimiter='\t')

In [9]:
# Preview the first ten edges as loaded, before any rewriting.
edges.head(n=10)

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_pred
0,C1412045,AFFECTS,C0005935,29798367,False,affects
1,C1412045,AFFECTS,C0028754,19789049,False,affects
2,C1412045,AFFECTS,C0036421,31505074;31505074,False,affects
3,C1412045,AFFECTS,C0597304,1409557,False,affects
4,C1412045,AFFECTS,C0599816,7617239,False,affects
5,C1412045,ASSOCIATED_WITH,C0001807,8503828;8240219,False,related_to
6,C1412045,ASSOCIATED_WITH,C0007222,26915323,False,gene_associated_with_condition
7,C1412045,ASSOCIATED_WITH,C0009324,22465933,False,gene_associated_with_condition
8,C1412045,ASSOCIATED_WITH,C0010346,1955175;22465933,False,gene_associated_with_condition
9,C1412045,ASSOCIATED_WITH,C0012634,19737390,False,gene_associated_with_condition


In [10]:
# Fix the CURIEs and add the 'provided_by' source.
# NOTE: this cell is not idempotent - re-running it without reloading `edges`
# prepends the prefixes a second time.
edges["SUBJECT_CUI"] = "UMLS:" + edges["SUBJECT_CUI"]
edges["OBJECT_CUI"] = "UMLS:" + edges["OBJECT_CUI"]
edges["SEMMED_PRED"] = "semmed:" + edges["SEMMED_PRED"]
# PMID holds ";"-separated ids that may repeat. De-duplicate them via a set,
# prefix each with "PMID:" and join with "|". The set is sorted before joining
# because iterating a set of strings directly yields an order that depends on
# per-process hash randomisation, which made the output vary between kernels.
edges["PMID"] = edges["PMID"].map(
    lambda pmids: "|".join(sorted({"PMID:" + pmid for pmid in pmids.split(";")}))
)
edges["bl_pred"] = "biolink:" + edges["bl_pred"]
edges["provided_by"] = "https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/"

In [11]:
# Relabel the data columns; any column not listed keeps its current name.
edges = edges.rename(
    columns={
        'SUBJECT_CUI': 'subject',
        'bl_pred': 'predicate',
        'OBJECT_CUI': 'object',
        'SEMMED_PRED': 'relation',
        'PMID': 'publications',
        'NEG': 'negated',
    }
)

In [12]:
# Preview the first ten edges after prefixing and renaming.
edges.head(n=10)

Unnamed: 0,subject,relation,object,publications,negated,predicate,provided_by
0,UMLS:C1412045,semmed:AFFECTS,UMLS:C0005935,PMID:29798367,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
1,UMLS:C1412045,semmed:AFFECTS,UMLS:C0028754,PMID:19789049,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
2,UMLS:C1412045,semmed:AFFECTS,UMLS:C0036421,PMID:31505074,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
3,UMLS:C1412045,semmed:AFFECTS,UMLS:C0597304,PMID:1409557,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
4,UMLS:C1412045,semmed:AFFECTS,UMLS:C0599816,PMID:7617239,False,biolink:affects,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
5,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0001807,PMID:8240219|PMID:8503828,False,biolink:related_to,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
6,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0007222,PMID:26915323,False,biolink:gene_associated_with_condition,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
7,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0009324,PMID:22465933,False,biolink:gene_associated_with_condition,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
8,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0010346,PMID:1955175|PMID:22465933,False,biolink:gene_associated_with_condition,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/
9,UMLS:C1412045,semmed:ASSOCIATED_WITH,UMLS:C0012634,PMID:19737390,False,biolink:gene_associated_with_condition,https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/


In [13]:
# Write the edge table to EDGES_KGX_TSV as tab-separated text without the
# DataFrame index. The stray "Unnamed: 0" column (presumably a row index written
# by an upstream step and read back as data - confirm) is dropped first so it is
# not exported as an edge property; errors="ignore" makes the drop a no-op when
# the column is absent.
edges.drop(columns=["Unnamed: 0"], errors="ignore").to_csv(EDGES_KGX_TSV, index=False, sep='\t')

## At this point, the SemMedDB node and edge tab-separated (TSV) data files should be ready for loading into Neo4j with the KGX tools.