In [2]:
import pandas as pd
from glob import glob
from IPython.display import display, Markdown
from tqdm import tqdm
import csv
pd.__version__

'2.0.2'

# Unified Biomedical Knowledge Graph (UBKG)
## Nodes
### Concept

In [2]:
concepts = pd.read_csv("neo4j/import/CUIs.csv")
concepts = pd.DataFrame(index=concepts["CUI:ID"].unique())
concepts.index.name = "id"
concepts.head()

C0000097
C0000359
C0000610
C0000739
C0000873


### Semantics

In [3]:
semantics = pd.read_csv("neo4j/import/TUIs.csv", index_col=0)
semantics.head()

Unnamed: 0_level_0,name,STN,DEF
TUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T012,Bird,A1.1.3.1.1.2,A vertebrate having a constant body temperatur...
T014,Reptile,A1.1.3.1.1.5,A cold-blooded vertebrate having an external c...
T015,Mammal,A1.1.3.1.1.4,A vertebrate having a constant body temperatur...
T021,Fully Formed Anatomical Structure,A1.2.3,An anatomical structure in a fully formed orga...
T022,Body System,A2.1.4.1,A complex of anatomical structures that perfor...


### Definition

In [4]:
definition = pd.read_csv("neo4j/import/DEFs.csv", index_col=0)
definition.head()

Unnamed: 0_level_0,SAB,DEF
ATUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1
AT38138756,MSH,A tool for the study of liver damage which cau...
AT43116097,MSH,A hepatic carcinogen whose mechanism of activa...
AT38141939,MSH,Simple amine found in the brain. It may be mod...
AT235449311,MSH,A physiologically active metabolite of VITAMIN...
AT38137186,MSH,A material used in the manufacture of azo dyes...


### Term

In [5]:
terms = pd.read_csv("neo4j/import/SUIs.csv", index_col=0)
terms.head()

Unnamed: 0_level_0,name
SUI:ID,Unnamed: 1_level_1
S0009776,"Acid, 2-Aminohexanedioic"
S7249234,BR CAMP
S11872577,cramps abdominal
S14680596,Retained tissue after pregnancy loss
S3417882,Missed miscarriage


### Code

In [6]:
codes = pd.read_csv("neo4j/import/CODEs.csv", index_col=0)
codes.head()

  codes = pd.read_csv("neo4j/import/CODEs.csv", index_col=0)


Unnamed: 0_level_0,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
CodeID:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTHSPL J7A92W69L7,MTHSPL,J7A92W69L7,,,,
NCI C76777,NCI,C76777,,,,
ATC N07XX07,ATC,N07XX07,,,,
GS 1946,GS,1946,,,,
NOC 040413,NOC,040413,,,,


## Edges
### Concept-Term

In [7]:
concept_term = pd.read_csv("neo4j/import/CUI-SUIs.csv")
concept_term.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000163,S0007806
1,C0000427,S0008336
2,C0000665,S11922611
3,C0000992,S1577859
4,C0001021,S0797872


### Concept-Semantics

In [8]:
concept_semantics = pd.read_csv("neo4j/import/CUI-TUIs.csv")
concept_semantics.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000132,T126
1,C0000246,T116
2,C0000895,T060
3,C0000908,T037
4,C0000931,T067


### Concept-Code

In [9]:
concept_code = pd.read_csv("neo4j/import/CUI-CODEs.csv")
concept_code.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000294,ATC V03AF01
1,C0000481,CHV 0000000513
2,C0000661,MSH D015124
3,C0000665,VANDF 4020847
4,C0000737,LNC LA15468-4


### Concept-Concept

In [10]:
# concept_concept = pd.read_csv("neo4j/import/CUI-CUIs.csv", dtype_backend="pyarrow")
# concept_concept.head()

### Semantic-Semantic
Prolly skip this

In [11]:
semantics_semantics = pd.read_csv("neo4j/import/TUIrel.csv")
semantics_semantics.head()

Unnamed: 0,:END_ID,:START_ID
0,T204,T002
1,T001,T004
2,T071,T004
3,T072,T010
4,T204,T010


### Concept-Definition

In [12]:
concept_definition = pd.read_csv("neo4j/import/DEFrel.csv")
concept_definition.head()

Unnamed: 0,:END_ID,:START_ID
0,AT38152019,C0000039
1,AT69817678,C0000039
2,AT264439104,C0000039
3,AT254753550,C0000039
4,AT267611046,C0000039


### Code-Term

In [13]:
code_term = pd.read_csv("neo4j/import/CODE-SUIs.csv")
code_term.head()

Unnamed: 0,:END_ID,:START_ID,:TYPE,CUI
0,S1424701,RXNORM 74,IN,C0000473
1,S18541041,SNOMEDCT_US 80994002,FN,C0000477
2,S11730064,SNOMEDCT_US 226367006,SY,C0000545
3,S0288461,CSP 2005-4146,PT,C0000735
4,S1957040,MDR 10048885,LLT,C0000735


## Merge Concept and Terms

In [14]:
concept_term.columns = ["CUI:ID", "SUI:ID"]
concept_term.shape

(4508555, 2)

In [15]:
concept_term = pd.merge(concept_term, terms, on="SUI:ID", how='outer')
concept_term = concept_term.groupby('CUI:ID').first()
concept_term.head()

Unnamed: 0_level_0,SUI:ID,name
CUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C0000005,S0007492,(131)I-Macroaggregated Albumin
C0000039,S17175117,"1,2-dipalmitoylphosphatidylcholine"
C0000052,S0007584,"1,4-alpha-Glucan Branching Enzyme"
C0000074,S0007615,1-Alkyl-2-Acylphosphatidates
C0000084,S0007627,1-Carboxyglutamic Acid


In [16]:
concept_term.shape

(4508537, 2)

In [17]:
concept_term.columns = ["SUI:ID", "label"]
concept_term = concept_term[["label"]]
concept_term.head()

Unnamed: 0_level_0,label
CUI:ID,Unnamed: 1_level_1
C0000005,(131)I-Macroaggregated Albumin
C0000039,"1,2-dipalmitoylphosphatidylcholine"
C0000052,"1,4-alpha-Glucan Branching Enzyme"
C0000074,1-Alkyl-2-Acylphosphatidates
C0000084,1-Carboxyglutamic Acid


In [18]:
concept_term.shape

(4508537, 1)

In [19]:
concepts.loc[concept_term.index, 'label'] = concept_term.loc[concept_term.index, 'label']
concepts.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine"
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase"
C0000610,6-Aminonicotinamide
C0000739,Abdominal Muscles
C0000873,Academic Problem


In [20]:
concepts.label = concepts.label.fillna("-")
concepts.shape

(11424664, 1)

## Merge Concept and Type

In [21]:
semantics.head()

Unnamed: 0_level_0,name,STN,DEF
TUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T012,Bird,A1.1.3.1.1.2,A vertebrate having a constant body temperatur...
T014,Reptile,A1.1.3.1.1.5,A cold-blooded vertebrate having an external c...
T015,Mammal,A1.1.3.1.1.4,A vertebrate having a constant body temperatur...
T021,Fully Formed Anatomical Structure,A1.2.3,An anatomical structure in a fully formed orga...
T022,Body System,A2.1.4.1,A complex of anatomical structures that perfor...


In [22]:
concept_semantics.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000132,T126
1,C0000246,T116
2,C0000895,T060
3,C0000908,T037
4,C0000931,T067


In [23]:
no_type = set(concepts.index) - set(concept_semantics[':START_ID'])
len(no_type)

8161231

In [24]:
unique_relation = set()
unique_sab = set()
examples = set()
with open("neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in tqdm(csv_reader):
        if not headers:
            headers = row
        else:
            source = row[0]
            target = row[1]
            relation = row[2]
            sab = row[3]
            if source in no_type or target in no_type:
                if relation not in unique_relation or sab not in unique_sab:
                    examples.add((source, relation, sab, target))
                    unique_relation.add(relation)
                    unique_sab.add(sab)

66724477it [01:10, 952924.38it/s] 


KeyboardInterrupt: 

In [28]:
with open("no_types.txt", "w") as o:
    o.write("\n".join([",".join(i) for i in examples]))

In [29]:
with open("SABs_no_semantic.txt", "w") as o:
    o.write("\n".join(unique_sab))

In [25]:
with open('out/semantics_ranked.tsv') as o:
    ranked_type = [i.strip() for i in o.read().strip().split("\n")]

In [26]:
concept_semantics.columns = ["id", "TUI:ID"]
concept_semantics["type"] = [semantics.at[i, 'name'] for i in concept_semantics['TUI:ID']]
concept_semantics.head()

Unnamed: 0,id,TUI:ID,type
0,C0000132,T126,Enzyme
1,C0000246,T116,"Amino Acid, Peptide, or Protein"
2,C0000895,T060,Diagnostic Procedure
3,C0000908,T037,Injury or Poisoning
4,C0000931,T067,Phenomenon or Process


In [27]:
def fetch_type(v):
    cat = ""
    rank = len(ranked_type)
    for i in v:
        r = ranked_type.index(i)
        if r < rank:
            cat = i
            rank = r
    return cat

In [28]:
cs = concept_semantics.groupby('id')['type'].apply(lambda x: "; ".join(set(x)))
cs.head()

id
C0000005    Indicator, Reagent, or Diagnostic Aid; Amino A...
C0000039            Organic Chemical; Pharmacologic Substance
C0000052              Amino Acid, Peptide, or Protein; Enzyme
C0000074                                     Organic Chemical
C0000084    Amino Acid, Peptide, or Protein; Biologically ...
Name: type, dtype: object

In [29]:
cs_ranked = concept_semantics.groupby('id')['type'].apply(fetch_type)
cs_ranked.head()

id
C0000005    Amino Acid, Peptide, or Protein
C0000039                   Organic Chemical
C0000052                             Enzyme
C0000074                   Organic Chemical
C0000084    Amino Acid, Peptide, or Protein
Name: type, dtype: object

In [30]:
common = list(set(concepts.index).intersection(cs.index))
cs[common].head()

id
C2686925                  Therapeutic or Preventive Procedure
C4324202                                  Embryonic Structure
C5187125                                              Finding
C5772188                                     Population Group
C1429239    Amino Acid, Peptide, or Protein; Biologically ...
Name: type, dtype: object

In [31]:
concept_semantics
concepts.loc[common, 'type'] = cs_ranked[common]
concepts.loc[common, 'type_combined'] = cs[common]
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Enzyme; Amino Acid, Peptide, or Protein"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Vitamin; Pharmacologic Substance
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [32]:
concepts.groupby("type_combined").first().to_csv('out/semantics.tsv', sep="\t")

In [33]:
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Enzyme; Amino Acid, Peptide, or Protein"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Vitamin; Pharmacologic Substance
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [34]:
concepts.shape

(11424664, 3)

In [35]:
with open('out/semantics_list.tsv', 'w') as o:
    o.write("\n".join([str(i) for i in concept_semantics.type.unique()]))

## Merge Concept and Code

In [36]:
codes.head()

Unnamed: 0_level_0,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
CodeID:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTHSPL J7A92W69L7,MTHSPL,J7A92W69L7,,,,
NCI C76777,NCI,C76777,,,,
ATC N07XX07,ATC,N07XX07,,,,
GS 1946,GS,1946,,,,
NOC 040413,NOC,040413,,,,


In [37]:
concept_code.columns = ["id", "CodeID:ID"]
concept_code.head()

Unnamed: 0,id,CodeID:ID
0,C0000294,ATC V03AF01
1,C0000481,CHV 0000000513
2,C0000661,MSH D015124
3,C0000665,VANDF 4020847
4,C0000737,LNC LA15468-4


In [38]:
concept_code = pd.merge(concept_code, codes, on="CodeID:ID", how='left')
concept_code.head()

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
0,C0000294,ATC V03AF01,ATC,V03AF01,,,,
1,C0000481,CHV 0000000513,CHV,0000000513,,,,
2,C0000661,MSH D015124,MSH,D015124,,,,
3,C0000665,VANDF 4020847,VANDF,4020847,,,,
4,C0000737,LNC LA15468-4,LNC,LA15468-4,,,,


In [39]:
print("\n".join(codes.SAB.unique()))

MTHSPL
NCI
ATC
GS
NOC
SNOMEDCT_US
CCS
MDR
CHV
LNC
PDQ
MEDCIN
NDDF
MSH
CSP
LCH_NW
DRUGBANK
MMSL
ICPC2ICD10ENG
ICD10CM
ICPC2P
VANDF
GO
RXNORM
PSY
NANDA-I
FMA
OMIM
ICD10AMAE
ICD10
MTHICD9
CPM
SNOMEDCT_VET
ORPHANET
MEDLINEPLUS
HPO
USP
NCBI
ICD9CM
HCPT
UWDA
USPMG
ICD10AM
HL7V2.5
ICD10AE
MTH
ICF
HL7V3.0
NEU
AOT
ICPC2EENG
OMS
ICNP
SPN
CPT
NIC
CCC
UMD
CDCREC
ALT
MMX
DSM-5
HCPCS
CDT
HCDT
HGNC
SRC
ICF-CY
MED-RT
MVX
NUCCHCPT
ICD10PCS
CVX
PNDS
CCSR_ICD10CM
CCSR_ICD10PCS
SOP
MTHICPC2ICD10AE
MTHICPC2EAE
MTHCMSFRF
NDC
UBERON
CL
ENVO
PATO
CHEBI
PR
NBO
PO
OBI
MGI
SO
PCO
CARO
DDANAT
MOD
DOID
ASCTB-TEMP
UO
APOLLO
OGMS
OMIABIS
OPL
IDO
CLO
VO
CHMO
COB
EDAM
HSAPDV
SBO
MI
MP
MPATH
OBSOLETECLASS
UNIPROTKB
MONDO
CHR
FOODON
ECTO
MF
MFOMD
MAXO
EXO
OBA
UPHENO
EFO
FBBT
BTO
ZFA
JAPAN
EO
MA
BAO
TO
HANCESTRO
UNITED
FBDV
SOUTH
OMIT
CHINA
IRAN
ZEA
WBLS
INDIA
REPUBLIC
PHILIPPINES
AZ
PGO
GENCODE
ENSEMBL
ENTREZ
REFSEQ
MEDGEN
HCOP
MSIGDB
CHLO
4DND
4DNF
4DNL
4DNQ
FALDO
UNIPROT
CORE
22-RDF-SYNTAX-NS
GLYCAN
FOAF
GCO
SIO
CODAO

In [40]:
concept_code[concept_code.id == 'C0000097']

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
623690,C0000097,LCH_NW sh86002892,LCH_NW,sh86002892,,,,
1108107,C0000097,CSP 2511-0411,CSP,2511-0411,,,,
1246274,C0000097,PSY 31213,PSY,31213,,,,
2179442,C0000097,MSH D015632,MSH,D015632,,,,
3529094,C0000097,CHV 0000000501,CHV,0000000501,,,,
3736237,C0000097,PSY 32433,PSY,32433,,,,
4116966,C0000097,SNOMEDCT_US 285407008,SNOMEDCT_US,285407008,,,,


In [41]:
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Enzyme; Amino Acid, Peptide, or Protein"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Vitamin; Pharmacologic Substance
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [42]:
with open("unique_SABS_of_Concepr.txt", "w") as o:
    sabs = [str(i) for i in concept_code[concept_code.id.isin(concepts[concepts.type.isna()].index)].SAB.unique()]
    o.write("\n".join(sabs))

In [43]:
type_mapper = {}
with open("unique_SABS_of_Concept.txt") as o:
    for line in o:
        r = line.strip().split(":")
        if len(r) == 2:
            type_mapper[r[0]] = r[1]
        else:
            type_mapper[r[0]] = r[0]

In [44]:
for i,row in tqdm(concept_code[concept_code.id.isin(concepts[concepts.type.isna()].index)].iterrows()):
    sab = row["SAB"]
    ind = row["id"]
    if type(sab) == str:
        tp = type_mapper[sab]
        if tp:
            concepts.at[ind, "type"] = tp
            concepts.at[ind, "type_combined"] = tp


11606580it [05:24, 35748.66it/s]


In [45]:
for type in tqdm(concepts.type.unique()):
    con = concepts[concepts.type==type].copy()
    cc = concept_code[concept_code.id.isin(con.index)]
    for sab in tqdm(cc.SAB.unique()):
        c = cc[cc.SAB == sab]
        c = c.groupby('id').first()
        common = list(set(con.index).intersection(c.index))
        con.loc[common, sab] = c.loc[common, "CodeID:ID"]
        if c.loc[common, "value:float"].isna().sum() != len(common):
            con.loc[common, "%s value"%sab] = c.loc[common, "value:float"]
        if c.loc[common, "lowerbound:float"].isna().sum() != len(common):
            con.loc[common, "%s lowerbound"%sab] = c.loc[common, "lowerbound:float"]
        if c.loc[common, "upperbound:float"].isna().sum() != len(common):
            con.loc[common, "%s upperbound"%sab] = c.loc[common, "upperbound:float"]
        if c.loc[common, "unit"].isna().sum() != len(common):
            con.loc[common, "%s unit"%sab] = c.loc[common, "unit"]
        con.to_csv("out/serialization/nodes/%s.nodes.csv"%(type))


100%|██████████| 45/45 [00:34<00:00,  1.32it/s]
100%|██████████| 35/35 [00:03<00:00,  9.96it/s]t]
100%|██████████| 46/46 [00:05<00:00,  8.67it/s]t]
100%|██████████| 45/45 [00:14<00:00,  3.17it/s]  
100%|██████████| 66/66 [01:06<00:00,  1.01s/it]t]
100%|██████████| 22/22 [00:00<00:00, 49.05it/s]t]
100%|██████████| 41/41 [00:00<00:00, 55.56it/s]t]
100%|██████████| 33/33 [00:00<00:00, 65.21it/s]t]
100%|██████████| 47/47 [00:01<00:00, 26.54it/s]  
100%|██████████| 48/48 [00:21<00:00,  2.24it/s]
100%|██████████| 33/33 [00:00<00:00, 60.97it/s]]
100%|██████████| 56/56 [01:09<00:00,  1.25s/it]]
100%|██████████| 37/37 [00:01<00:00, 35.59it/s]it]
100%|██████████| 50/50 [00:20<00:00,  2.49it/s]it]
100%|██████████| 23/23 [00:02<00:00, 11.47it/s]it]
100%|██████████| 16/16 [00:00<00:00, 61.86it/s]it]
100%|██████████| 26/26 [00:00<00:00, 162.01it/s]  
100%|██████████| 28/28 [00:01<00:00, 18.53it/s]]
100%|██████████| 29/29 [00:03<00:00,  9.54it/s]]
100%|██████████| 26/26 [00:00<00:00, 141.09it/s]
100%

In [46]:
import os

In [49]:
row_headers = ["source", "relation", "target", "source_label", "target_label", "SAB", "evidence"]
with open("neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in tqdm(csv_reader):
        if not headers:
            headers = row
        else:
            source = row[0]
            target = row[1]
            if source in concepts.index and target in concepts.index:
                source_label = concepts.at[source, 'label']
                source_type = concepts.at[source, 'type']
                
                target_label = concepts.at[target, 'label']
                target_type = concepts.at[target, 'type']
                relation = row[2]
                sab = row[3]
                evidence = ''
                if len(row) > 4:
                    evidence = row[4]
                filename = 'out/serialization/edges/%s.%s.%s.edges.csv'%(source_type, relation, target_type)
                write_header = False
                operation = "a"
                if not os.path.isfile(filename):
                    if relation == "interacts_with":
                        print(source_type, relation, target_type)
                    write_header = True
                    operation = "w"
                with open(filename, operation) as w:
                    csv_writer = csv.writer(w)
                    if write_header:
                        csv_writer.writerow(row_headers)
                    csv_writer.writerow([source, relation, target, source_label, target_label, sab, evidence])

25434953it [38:03, 13421.61it/s]

EXO interacts_with EXO


127920727it [2:00:51, 17640.15it/s]


In [None]:
with open("relationship_SABs.txt", "w") as o:
    o.write("\n".join(sabs))

In [95]:
relationships = set()
with open("neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in tqdm(csv_reader):
        if not headers:
            headers = row
        else:
            source = row[0]
            target = row[1]
            relation = row[2]
            sab = row[3]
            relationships.add(relation)

127920727it [02:08, 997041.95it/s] 


In [97]:
with open("relationships.txt", "w") as o:
    o.write("\n".join(relationships))

In [171]:
# with open("edge_filenames", "w") as o:
#     o.write("\n".join(edge_filenames))

In [172]:
# with open("node_filenames", "w") as o:
#     o.write("\n".join(node_filenames))

In [8]:
with open("relationship_SABs_keep.txt") as o:
    sabs_to_keep = set(o.read().strip().split("\n"))

In [6]:
import re
import os
edge_pattern = "(?P<directory>.+)/(?P<source_type>.+)\.(?P<relation>.+)\.(?P<target_type>.+)\.(?P<entity>.+)\.csv"


In [81]:
node_base = "out/serialization/nodes/%s.nodes.csv"
new_node_base = "out/filtered/nodes/%s.nodes.csv"
new_edge_base = "out/filtered/edges/%s.%s.%s.edges.csv"
sab_relations = {}
processed = set()

In [101]:
def glygen(s):
    return s.replace("GLYGEN.RESIDUE", "GLYGEN_RESIDUE").replace("GLYCAN.MOTIF", "GLYCAN_MOTIF").replace('GLYCOSYLTRANSFERASE.REACTION', 'GLYCOSYLTRANSFERASE_REACTION').replace("GLYGEN.SRC", "GLYGEN_SRC").replace('GLYGEN.GLYCOSYLATION', 'GLYGEN_GLYCOSYLATION')

def glygen_reverse(s):
    return s.replace("GLYGEN_RESIDUE", "GLYGEN.RESIDUE").replace("GLYCAN_MOTIF", "GLYCAN.MOTIF").replace('GLYCOSYLTRANSFERASE_REACTION', 'GLYCOSYLTRANSFERASE.REACTION').replace("GLYGEN_SRC", "GLYGEN.SRC").replace('GLYGEN_GLYCOSYLATION', 'GLYGEN.GLYCOSYLATION')


In [102]:
for filename in tqdm(glob("out/serialization/edges/*.csv")):
    if filename not in processed:
        match = re.match(edge_pattern, glygen(filename)).groupdict()
        entity = match["entity"]
        source_type = glygen_reverse(match["source_type"])
        relation = match["relation"].replace("_", " ")
        target_type = glygen_reverse(match["target_type"])
        if "inverse" not in relation:
            edge_df = pd.read_csv(filename, low_memory=False)
            # filter for SAB
            sabs = sabs_to_keep.intersection(edge_df.SAB.unique())
            for sab in sabs:
                if sab not in sab_relations:
                    sab_relations[sab] = set()
                sab_relations[sab].add(relation)
            if len(sabs) > 0:
                edge_df = edge_df[edge_df.SAB.isin(sabs)]
                source_df = pd.read_csv(node_base%source_type, index_col=0, low_memory=False)
                if os.path.isfile(new_node_base%(source_type)):
                    new_source_df = pd.read_csv(new_node_base%(source_type), index_col=0, low_memory=False)
                    pd.concat([new_source_df, source_df]).dropna(axis=1).to_csv(new_node_base%(source_type))
                else:
                    source_df.dropna(axis=1).to_csv(new_node_base%(source_type))
                
                target_df = pd.read_csv(node_base%target_type, index_col=0, low_memory=False)
                if os.path.isfile(new_node_base%(target_type)):
                    new_target_df = pd.read_csv(new_node_base%(target_type), index_col=0, low_memory=False)
                    pd.concat([new_target_df, target_df]).dropna(axis=1).to_csv(new_node_base%(target_type))
                else:
                    target_df.dropna(axis=1).to_csv(new_node_base%(target_type))
                edge_df.to_csv(new_edge_base%(source_type, relation, target_type), index=False)
        processed.add(filename.replace("GLYGEN_RESIDUE", "GLYGEN.RESIDUE"))
                

 78%|███████▊  | 58962/75964 [08:49<02:32, 111.33it/s]


KeyboardInterrupt: 

In [100]:
match = re.match(edge_pattern, glygen(filename)).groupdict()
entity = match["entity"]
source_type = glygen_reverse(match["source_type"]).replace("_", " ")
relation = match["relation"].replace("_", " ")
target_type = glygen_reverse(match["target_type"]).replace("_", " ")
filename

'out/serialization/edges/UNIPROTKB.inverse_has_enzyme_protein.GLYGEN.GLYCOSYLATION.edges.csv'

In [92]:
'out/serialization/edges/Drug.indication.SNOMEDCT_US.edges.csv'


{'directory': 'out/serialization/edges',
 'source_type': 'Drug',
 'relation': 'indication',
 'target_type': 'SNOMEDCT_US',
 'entity': 'edges'}

In [3]:
node_base = "out/serialization/nodes/%s.nodes.csv"
new_node_base = "out/filtered/nodes/%s.nodes.csv"
new_edge_base = "out/filtered/edges/%s.%s.%s.edges.csv"
sab_relations = {}
processed = set()

In [12]:
def glygen(s):
    return s.replace("GLYGEN.RESIDUE", "GLYGEN_RESIDUE").replace("GLYCAN.MOTIF", "GLYCAN_MOTIF").replace('GLYCOSYLTRANSFERASE.REACTION', 'GLYCOSYLTRANSFERASE_REACTION').replace("GLYGEN.SRC", "GLYGEN_SRC").replace('GLYGEN.GLYCOSYLATION', 'GLYGEN_GLYCOSYLATION').replace("GLYGEN.GLYCOSEQUENCE", "GLYGEN_GLYCOSEQUENCE")

def glygen_reverse(s):
    return s.replace("GLYGEN_RESIDUE", "GLYGEN.RESIDUE").replace("GLYCAN_MOTIF", "GLYCAN.MOTIF").replace('GLYCOSYLTRANSFERASE_REACTION', 'GLYCOSYLTRANSFERASE.REACTION').replace("GLYGEN_SRC", "GLYGEN.SRC").replace('GLYGEN_GLYCOSYLATION', 'GLYGEN.GLYCOSYLATION').replace("GLYGEN_GLYCOSEQUENCE", "GLYGEN.GLYCOSEQUENCE")


In [13]:
for filename in tqdm(glob("out/serialization/edges/*.csv")):
    if filename not in processed:
        match = re.match(edge_pattern, glygen(filename)).groupdict()
        entity = match["entity"]
        source_type = match["source_type"]
        relation = match["relation"].replace("_", " ")
        target_type = match["target_type"]
        if "inverse" not in relation:
            edge_df = pd.read_csv(filename, low_memory=False)
            # filter for SAB
            sabs = sabs_to_keep.intersection(edge_df.SAB.unique())
            for sab in sabs:
                if sab not in sab_relations:
                    sab_relations[sab] = set()
                sab_relations[sab].add(relation)
            if len(sabs) > 0:
                edge_df = edge_df[edge_df.SAB.isin(sabs)]
                source_df = pd.read_csv(node_base%glygen_reverse(source_type), index_col=0, low_memory=False)
                if not os.path.isfile(new_node_base%(source_type)):
                    # new_source_df = pd.read_csv(new_node_base%(source_type), index_col=0, low_memory=False)
                    # pd.concat([new_source_df, source_df]).dropna(axis=1).to_csv(new_node_base%(source_type))
                    source_df.to_csv(new_node_base%(source_type))
                # else:
                #     source_df.dropna(axis=1).to_csv(new_node_base%(source_type))
                
                target_df = pd.read_csv(node_base%glygen_reverse(target_type), index_col=0, low_memory=False)
                if not os.path.isfile(new_node_base%(target_type)):
                    # new_target_df = pd.read_csv(new_node_base%(target_type), index_col=0, low_memory=False)
                    # pd.concat([new_target_df, target_df]).dropna(axis=1).to_csv(new_node_base%(target_type))
                    target_df.to_csv(new_node_base%(target_type))
                # else:
                #     target_df.dropna(axis=1).to_csv(new_node_base%(target_type))
                edge_df.to_csv(new_edge_base%(source_type, relation, target_type), index=False)
        processed.add(filename)
                

100%|██████████| 75964/75964 [01:14<00:00, 1018.49it/s] 


In [11]:
match = re.match(edge_pattern, glygen(filename)).groupdict()
entity = match["entity"]
source_type = glygen_reverse(match["source_type"]).replace("_", " ")
relation = match["relation"].replace("_", " ")
target_type = glygen_reverse(match["target_type"]).replace("_", " ")
filename

'out/serialization/edges/GLYTOUCAN.has_glycosequence.GLYGEN.GLYCOSEQUENCE.edges.csv'

In [14]:
a = "WURCS=2.0/4,37,36/[AUd21122h_5*NCC/3=O][uxxxxm][uxxxxh_2*NCC/3=O][uxxxxh]/1-1-2-2-2-2-2-2-3-3-3-3-3-3-3-3-3-3-3-3-3-3-4-4-4-4-4-4-4-4-4-4-4-4-4-4-4/a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?"

In [15]:
len(a)

8211

In [18]:
source_df = pd.read_csv(node_base%glygen_reverse("Gene or Genome"), index_col=0, low_memory=False)


In [19]:
source_df.head()

Unnamed: 0_level_0,label,type,type_combined,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,...,HPO,EFO,CHEBI,ENSEMBL,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0079559,HTLV-I tax Genes,Gene or Genome,Gene or Genome,NCI C17366,,MSH D016355,,,,,...,,,,,,,,,,
C0314607,Human structural gene,Gene or Genome,Gene or Genome,,,,,SNOMEDCT_US 49046007,,,...,,,,,,,,,,
C0440471,Genetic Materials,Gene or Genome,Gene or Genome,,,MSH D005796,,SNOMEDCT_US 256926007,,,...,,,,,,,,,,
C0440482,A*1102,Gene or Genome,Gene or Genome,,,,,SNOMEDCT_US 256943005,,,...,,,,,,,,,,
C0440498,A*6602,Gene or Genome,Gene or Genome,,,,,SNOMEDCT_US 256962008,,,...,,,,,,,,,,
