# 06b-kgx-load
 - reformat for Biolink-compliant Neo4j import using KGX

#### Make the file headers correct for KGX TSV loading

See [KGX TSV Data Format](https://github.com/biolink/kgx/blob/master/specification/kgx-format.md#kgx-format-as-tsv)

Note that the `provided_by` node/slot property is deprecated in Biolink 2.0 (and therefore in KGX), so below we substitute one of the new properties, `original_knowledge_source`, with an anticipated **InfoRes** identifier value that includes the SemMedDb version: `infores:sri-semmeddb.4.3`

In [1]:
import pandas as pd
from semmeddb_biolink_environment import *

## Nodes

In [2]:
# Load the cross-referenced node table (tab-delimited) and preview it.
# Node property reference: https://biolink.github.io/biolink-model/docs/node_property
nodes = pd.read_csv(NODES_XREF_TSV, delimiter='\t')
nodes.head(5)

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category,xrefs
0,C0007952,Personality Character,T041,Mental Process,Behavior,MESH:D002605;PSY:08470;LCH_NW:sh85022615;MTH:U...
1,C3574797,Pbunavirus,T005,Virus,OrganismTaxon,NCBI:1198980
2,C0948102,Salivary gland adenoma,T191,Neoplastic Process,Disease,CHV:0000053218;MDR:10051636
3,C0210064,quinotolast,T121|T109,Pharmacologic Substance|Organic Chemical,MolecularEntity,INCHIKEY:ZUPLNRDTYQWUHP-UHFFFAOYSA-N;MESH:C074...
4,C1416967,MAFF gene,T028,Gene or Genome,NucleicAcidEntity,OMIM:604877;HGNC:6780;MTH:NOCODE


In [3]:
# How many nodes fall into each Biolink category
nodes['blm_category'].value_counts()

MolecularEntity                    64203
OrganismTaxon                      43810
Disease                            32871
NucleicAcidEntity                  26630
Polypeptide                        23054
Procedure                          13371
DiseaseOrPhenotypicFeature         12402
GrossAnatomicalStructure           10880
PhysiologicalProcess                5632
MolecularActivity                   4512
Drug                                4276
Device                              3843
AnatomicalEntity                    3532
PhenotypicFeature                   2561
CellularComponent                   2519
Activity                            2165
InformationContentEntity            2023
Cell                                1527
Phenomenon                          1380
SmallMolecule                       1325
Behavior                            1213
Cohort                              1200
Food                                 773
PopulationOfIndividualOrganisms      617
PhysicalEntity  

In [4]:
# Prefix the raw identifiers and categories so they become CURIEs, and switch
# the xrefs delimiter from ';' to '|'.
# NOTE: this cell modifies `nodes` in place and is not idempotent -- re-running
# it without reloading `nodes` would prepend the prefixes a second time.
nodes['ID'] = "UMLS:" + nodes['ID']
nodes['blm_category'] = "biolink:" + nodes['blm_category']
# regex=False pins a literal replacement: the default of `regex` differs
# between pandas versions, and older versions warn on single-character patterns.
nodes['xrefs'] = nodes['xrefs'].str.replace(";", "|", regex=False)

In [5]:
# Relabel the node columns with the lower-case names used in the KGX node file.
# NOTE(review): the Biolink slot for cross-references appears to be named `xref`;
# confirm whether the `xrefs` column should be renamed as well.
nodes = nodes.rename(
    columns={
        'ID': 'id',
        'LABEL': 'name',
        'blm_category': 'category',
    }
)

In [6]:
# Preview the relabelled node table
nodes.head(5)

Unnamed: 0,id,name,umls_type,umls_type_label,category,xrefs
0,UMLS:C0007952,Personality Character,T041,Mental Process,biolink:Behavior,MESH:D002605|PSY:08470|LCH_NW:sh85022615|MTH:U...
1,UMLS:C3574797,Pbunavirus,T005,Virus,biolink:OrganismTaxon,NCBI:1198980
2,UMLS:C0948102,Salivary gland adenoma,T191,Neoplastic Process,biolink:Disease,CHV:0000053218|MDR:10051636
3,UMLS:C0210064,quinotolast,T121|T109,Pharmacologic Substance|Organic Chemical,biolink:MolecularEntity,INCHIKEY:ZUPLNRDTYQWUHP-UHFFFAOYSA-N|MESH:C074...
4,UMLS:C1416967,MAFF gene,T028,Gene or Genome,biolink:NucleicAcidEntity,OMIM:604877|HGNC:6780|MTH:NOCODE


In [7]:
# Write the node table as tab-separated text, omitting the DataFrame index
nodes.to_csv(
    NODES_KGX_TSV,
    index=False,
    sep='\t',
)

## EDGES

In [8]:
# Load the Biolink-mapped edge table (tab-delimited).
# Association slot reference: https://biolink.github.io/biolink-model/docs/association_slot
edges = pd.read_csv(EDGES_BIOLINK_TSV, delimiter='\t')

In [9]:
# Preview the first ten raw edges
edges.head(n=10)

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_pred
0,C1412045,AFFECTS,C0005935,29798367,False,affects
1,C1412045,AFFECTS,C0020291,6298464,False,affects
2,C1412045,AFFECTS,C0028754,19789049,False,affects
3,C1412045,AFFECTS,C0036421,31505074;31505074,False,affects
4,C1412045,AFFECTS,C0597304,1409557,False,affects
5,C1412045,AFFECTS,C0599816,7617239,False,affects
6,C1412045,ASSOCIATED_WITH,C0001807,8503828;8240219,False,related_to
7,C1412045,ASSOCIATED_WITH,C0007222,26915323,False,related_to
8,C1412045,ASSOCIATED_WITH,C0009324,22465933,False,related_to
9,C1412045,ASSOCIATED_WITH,C0010346,1955175;22465933,False,related_to


In [10]:
def _to_pmid_curies(raw_pmids):
    """Convert a ';'-separated PMID string into '|'-separated PMID CURIEs.

    Duplicate PMIDs are dropped while keeping first-seen order, so the result
    is identical on every run (de-duplicating through a set gives an
    arbitrary, run-dependent order). Empty tokens, e.g. from a trailing ';',
    are skipped. Missing values (NaN/None) are returned unchanged.
    """
    if pd.isna(raw_pmids):
        return raw_pmids
    # dict.fromkeys de-duplicates while preserving insertion order
    unique_pmids = dict.fromkeys(token for token in raw_pmids.split(';') if token)
    return "|".join("PMID:" + pmid for pmid in unique_pmids)


# Fix the CURIEs and tag every edge with its knowledge source.
# NOTE: this cell modifies `edges` in place and is not idempotent -- re-running
# it without reloading `edges` would prepend the prefixes a second time.
edges['SUBJECT_CUI'] = "UMLS:" + edges['SUBJECT_CUI']
edges['OBJECT_CUI'] = "UMLS:" + edges['OBJECT_CUI']
edges['SEMMED_PRED'] = "SEMMEDDB:" + edges['SEMMED_PRED']
edges['PMID'] = edges['PMID'].map(_to_pmid_curies)
edges['bl_pred'] = "biolink:" + edges['bl_pred']
# Biolink Model 2.0 - compliant InfoRes tagging of knowledge sources
edges['original_knowledge_source'] = "infores:sri-semmeddb.4.3"   # formerly set to "https://ii.nlm.nih.gov/SemRep_SemMedDB_SKR/"?

In [11]:
# Relabel the edge columns with the names used in the KGX edge file
edges = edges.rename(
    columns={
        'SUBJECT_CUI': 'subject',
        'bl_pred': 'predicate',
        'OBJECT_CUI': 'object',
        'SEMMED_PRED': 'relation',
        'PMID': 'publications',
        'NEG': 'negated',
    }
)

In [12]:
# Preview the first ten relabelled edges
edges.head(n=10)

Unnamed: 0,subject,relation,object,publications,negated,predicate,original_knowledge_source
0,UMLS:C1412045,SEMMEDDB:AFFECTS,UMLS:C0005935,PMID:29798367,False,biolink:affects,infores:sri-semmeddb.4.3
1,UMLS:C1412045,SEMMEDDB:AFFECTS,UMLS:C0020291,PMID:6298464,False,biolink:affects,infores:sri-semmeddb.4.3
2,UMLS:C1412045,SEMMEDDB:AFFECTS,UMLS:C0028754,PMID:19789049,False,biolink:affects,infores:sri-semmeddb.4.3
3,UMLS:C1412045,SEMMEDDB:AFFECTS,UMLS:C0036421,PMID:31505074,False,biolink:affects,infores:sri-semmeddb.4.3
4,UMLS:C1412045,SEMMEDDB:AFFECTS,UMLS:C0597304,PMID:1409557,False,biolink:affects,infores:sri-semmeddb.4.3
5,UMLS:C1412045,SEMMEDDB:AFFECTS,UMLS:C0599816,PMID:7617239,False,biolink:affects,infores:sri-semmeddb.4.3
6,UMLS:C1412045,SEMMEDDB:ASSOCIATED_WITH,UMLS:C0001807,PMID:8503828|PMID:8240219,False,biolink:related_to,infores:sri-semmeddb.4.3
7,UMLS:C1412045,SEMMEDDB:ASSOCIATED_WITH,UMLS:C0007222,PMID:26915323,False,biolink:related_to,infores:sri-semmeddb.4.3
8,UMLS:C1412045,SEMMEDDB:ASSOCIATED_WITH,UMLS:C0009324,PMID:22465933,False,biolink:related_to,infores:sri-semmeddb.4.3
9,UMLS:C1412045,SEMMEDDB:ASSOCIATED_WITH,UMLS:C0010346,PMID:22465933|PMID:1955175,False,biolink:related_to,infores:sri-semmeddb.4.3


In [13]:
# Write the edge table as tab-separated text, omitting the DataFrame index
edges.to_csv(
    EDGES_KGX_TSV,
    sep='\t',
    index=False,
)

## At this point, the SemMedDb node and edge tab-separated (TSV) data files should be ready for loading by KGX tools.