# 06-neo4j-classical-load
 - Reformat the node and edge CSV files for Neo4j import.

In [1]:
# Make the CSV file headers match the Neo4j import header format:
# https://neo4j.com/docs/operations-manual/current/tools/import/file-header-format/

In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import shelve
import re
from collections import defaultdict, Counter
from tqdm import tqdm
import requests
from pyquery import PyQuery as pq

In [2]:
# Load the node table produced upstream and preview its first rows.
# Biolink reference for node types:
# https://biolink.github.io/biolink-model/docs/NamedThing.html
nodes = pd.read_csv(filepath_or_buffer="nodes_xref.csv")
nodes.head(n=5)

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_type,xrefs
0,C0061133,gastrin releasing peptide (14-27),T116,"Amino Acid, Peptide, or Protein",protein,MESH:C041922
1,C1523610,"regulation of tube length, open tracheal system",T042,Organ or Tissue Function,biological_process_or_activity,GO:GO:0035159
2,C0312636,Antibody to hepatitis E virus,T116|T129,"Amino Acid, Peptide, or Protein|Immunologic Fa...",biological_entity,LNC:MTHU004056;SNMI:F-C2A90;MTH:NOCODE;SNOMEDC...
3,C0539817,cytochrome p30,T116|T126,"Amino Acid, Peptide, or Protein|Enzyme",protein,MESH:C106367
4,C0406240,Photosensitive atopic dermatitis,T047,Disease or Syndrome,disease_or_phenotypic_feature,RCD:X505U;SNOMEDCT_US:238548006;SNOMEDCT_US:23...


In [3]:
# Number of nodes per Biolink category, most frequent first.
# NOTE(review): the saved outputs in this notebook show this column as
# `blm_type` — confirm that nodes_xref.csv now provides `blm_category`.
nodes["blm_category"].value_counts()

chemical_substance                58812
disease_or_phenotypic_feature     36248
gene                              20695
biological_entity                 14905
protein                           12644
gross_anatomical_structure         8472
biological_process_or_activity     6887
anatomical_entity                  2750
cell_component                     1644
cell                               1099
activity_and_behavior               935
phenotypic_feature                  393
genomic_entity                      174
Name: blm_type, dtype: int64

In [4]:
# Turn the raw CUIs into "UMLS:"-prefixed identifiers. Values that already carry
# the prefix are kept unchanged, so re-running this cell cannot produce
# "UMLS:UMLS:..." IDs (missing IDs stay missing).
already_prefixed = nodes["ID"].str.startswith("UMLS:", na=True)
nodes["ID"] = nodes["ID"].where(already_prefixed, "UMLS:" + nodes["ID"])

# Copy the category and ID into explicitly typed property columns; the source
# columns themselves are renamed to the :LABEL / :ID header fields further down.
# NOTE(review): the saved outputs in this notebook show the category column as
# `blm_type` — confirm that nodes_xref.csv now provides `blm_category`.
nodes["category:STRING"] = nodes["blm_category"]
nodes["id:STRING"] = nodes["ID"]

# Multi-valued fields arrive "|"-separated and are re-joined with ";".
# "|" is a regex metacharacter (alternation), so the replacement is requested
# as a literal one explicitly instead of relying on the pandas-version-dependent
# default of `regex`.
nodes["umls_type_label"] = nodes["umls_type_label"].str.replace("|", ";", regex=False)
nodes["umls_type"] = nodes["umls_type"].str.replace("|", ";", regex=False)

In [5]:
# Rename the node columns (in place) to Neo4j import header fields:
# ":ID" / ":LABEL" are the node id and label fields, "name:STRING" is a typed
# property, and the ";"-joined columns are declared as string arrays (STRING[]).
nodes.rename(
    mapper={
        "ID": ":ID",
        "LABEL": "name:STRING",
        "blm_category": ":LABEL",
        "umls_type_label": "umls_type_label:STRING[]",
        "umls_type": "umls_type:STRING[]",
        "xrefs": "xrefs:STRING[]",
    },
    axis="columns",
    inplace=True,
)

In [6]:
# Preview the node table after the header rename.
nodes.head(n=5)

Unnamed: 0,:ID,name:STRING,umls_type:STRING[],umls_type_label:STRING[],:LABEL,xrefs:STRING[],category:STRING,id:STRING
0,UMLS:C0061133,gastrin releasing peptide (14-27),T116,"Amino Acid, Peptide, or Protein",protein,MESH:C041922,protein,UMLS:C0061133
1,UMLS:C1523610,"regulation of tube length, open tracheal system",T042,Organ or Tissue Function,biological_process_or_activity,GO:GO:0035159,biological_process_or_activity,UMLS:C1523610
2,UMLS:C0312636,Antibody to hepatitis E virus,T116;T129,"Amino Acid, Peptide, or Protein;Immunologic Fa...",biological_entity,LNC:MTHU004056;SNMI:F-C2A90;MTH:NOCODE;SNOMEDC...,biological_entity,UMLS:C0312636
3,UMLS:C0539817,cytochrome p30,T116;T126,"Amino Acid, Peptide, or Protein;Enzyme",protein,MESH:C106367,protein,UMLS:C0539817
4,UMLS:C0406240,Photosensitive atopic dermatitis,T047,Disease or Syndrome,disease_or_phenotypic_feature,RCD:X505U;SNOMEDCT_US:238548006;SNOMEDCT_US:23...,disease_or_phenotypic_feature,UMLS:C0406240


In [7]:
# Write the node file for Neo4j import.
# "Unnamed: 0" is the old pandas index that read_csv picked up from the input
# file. It is not a Neo4j field, and its header contains ":", which the import
# header format uses to separate a property name from its type, so it is dropped
# before export. errors="ignore" keeps this working if the column is absent.
nodes.drop(columns=["Unnamed: 0"], errors="ignore").to_csv("nodes_neo4j.csv", index=False)

In [9]:
###### EDGES

In [10]:
# Load the edge table produced upstream.
# Biolink reference for associations:
# https://biolink.github.io/biolink-model/docs/Association.html
#
# PMID holds either a single PubMed ID or a ";"-separated list of them. Pinning
# it to str stops read_csv from inferring integers for parts of the column; the
# later `.str.count(";")` would return NaN for any non-string value.
edges = pd.read_csv('edges_biolink.csv', dtype={'PMID': str})

In [11]:
# Build the relationship endpoints by putting the same "UMLS:" prefix used for
# the node IDs in front of the subject and object CUIs.
edges["START_ID"] = "UMLS:" + edges["SUBJECT_CUI"]
edges["END_ID"] = "UMLS:" + edges["OBJECT_CUI"]

In [12]:
# Number of supporting PMIDs per edge: one more than the number of ";"
# separators in the PMID string.
edges["n_pmids"] = edges["PMID"].str.count(";").add(1)

In [13]:
# Preview the edge table with the new endpoint and PMID-count columns.
edges.head(n=5)

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_pred,START_ID,END_ID,n_pmids
0,C1412045,AFFECTS,C0023946,20801151,False,AFFECTS,UMLS:C1412045,UMLS:C0023946,1
1,C1412045,AFFECTS,C0028754,19789049,False,AFFECTS,UMLS:C1412045,UMLS:C0028754,1
2,C1412045,AFFECTS,C0597304,1409557,False,AFFECTS,UMLS:C1412045,UMLS:C0597304,1
3,C1412045,AFFECTS,C0599816,7617239,False,AFFECTS,UMLS:C1412045,UMLS:C0599816,1
4,C1412045,ASSOCIATED_WITH,C0001807,8503828;8240219,False,RELATED_TO,UMLS:C1412045,UMLS:C0001807,2


In [14]:
# The lower-cased Biolink predicate becomes the relationship type.
edges["bl_pred"] = edges["bl_pred"].str.lower()

# Rename (in place) to Neo4j import header fields (":START_ID", ":END_ID",
# ":TYPE") and to lower-case property names for the remaining columns.
edges.rename(
    mapper={
        "START_ID": ":START_ID",
        "END_ID": ":END_ID",
        "bl_pred": ":TYPE",
        "NEG": "negated",
        "PMID": "pmids",
    },
    axis="columns",
    inplace=True,
)

In [15]:
# Attach provenance columns to every edge. The statement order is kept as-is
# because it determines the column order of the exported CSV.
# Constant source tag, identical for all rows.
edges['is_defined_by'] = "semmeddb"
# "semmeddb:"-prefixed relationship type; the .str.lower() is a no-op when
# ":TYPE" was already lower-cased by the earlier rename cell.
edges['relation'] = "semmeddb:" + edges[":TYPE"].str.lower()
# Constant provider tag, identical for all rows.
edges['provided_by'] = "semmeddb_sulab"

In [16]:
# The raw CUI columns are now redundant with :START_ID / :END_ID; remove them
# from the frame in place.
edges.drop(columns=["SUBJECT_CUI", "OBJECT_CUI"], inplace=True)

In [17]:
# Preview the final edge table before export.
edges.head(n=5)

Unnamed: 0,SEMMED_PRED,pmids,negated,:TYPE,:START_ID,:END_ID,n_pmids,is_defined_by,relation,provided_by
0,AFFECTS,20801151,False,affects,UMLS:C1412045,UMLS:C0023946,1,semmeddb,semmeddb:affects,semmeddb_sulab
1,AFFECTS,19789049,False,affects,UMLS:C1412045,UMLS:C0028754,1,semmeddb,semmeddb:affects,semmeddb_sulab
2,AFFECTS,1409557,False,affects,UMLS:C1412045,UMLS:C0597304,1,semmeddb,semmeddb:affects,semmeddb_sulab
3,AFFECTS,7617239,False,affects,UMLS:C1412045,UMLS:C0599816,1,semmeddb,semmeddb:affects,semmeddb_sulab
4,ASSOCIATED_WITH,8503828;8240219,False,related_to,UMLS:C1412045,UMLS:C0001807,2,semmeddb,semmeddb:related_to,semmeddb_sulab


In [18]:
# Write the relationship file for Neo4j import.
# "Unnamed: 0" is the old pandas index that read_csv picked up from the input
# file. It is not a Neo4j field, and its header contains ":", which the import
# header format uses to separate a property name from its type, so it is dropped
# before export. errors="ignore" keeps this working if the column is absent.
edges.drop(columns=["Unnamed: 0"], errors="ignore").to_csv("edges_neo4j.csv", index=False)