In [1]:
import pandas as pd
from glob import glob
from IPython.display import display, Markdown
from tqdm import tqdm
import csv
pd.__version__

'2.0.2'

# Unified Biomedical Knowledge Graph (UBKG)
## Nodes
### Concept

In [2]:
concepts = pd.read_csv("neo4j/import/CUIs.csv")
concepts = pd.DataFrame(index=concepts["CUI:ID"].unique())
concepts.index.name = "id"
concepts.head()

C0000097
C0000359
C0000610
C0000739
C0000873


### Semantics

In [3]:
semantics = pd.read_csv("neo4j/import/TUIs.csv", index_col=0)
semantics.head()

Unnamed: 0_level_0,name,STN,DEF
TUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T012,Bird,A1.1.3.1.1.2,A vertebrate having a constant body temperatur...
T014,Reptile,A1.1.3.1.1.5,A cold-blooded vertebrate having an external c...
T015,Mammal,A1.1.3.1.1.4,A vertebrate having a constant body temperatur...
T021,Fully Formed Anatomical Structure,A1.2.3,An anatomical structure in a fully formed orga...
T022,Body System,A2.1.4.1,A complex of anatomical structures that perfor...


### Definition

In [4]:
definition = pd.read_csv("neo4j/import/DEFs.csv", index_col=0)
definition.head()

Unnamed: 0_level_0,SAB,DEF
ATUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1
AT38138756,MSH,A tool for the study of liver damage which cau...
AT43116097,MSH,A hepatic carcinogen whose mechanism of activa...
AT38141939,MSH,Simple amine found in the brain. It may be mod...
AT235449311,MSH,A physiologically active metabolite of VITAMIN...
AT38137186,MSH,A material used in the manufacture of azo dyes...


### Term

In [5]:
terms = pd.read_csv("neo4j/import/SUIs.csv", index_col=0)
terms.head()

Unnamed: 0_level_0,name
SUI:ID,Unnamed: 1_level_1
S0009776,"Acid, 2-Aminohexanedioic"
S7249234,BR CAMP
S11872577,cramps abdominal
S14680596,Retained tissue after pregnancy loss
S3417882,Missed miscarriage


### Code

In [6]:
codes = pd.read_csv("neo4j/import/CODEs.csv", index_col=0)
codes.head()

  codes = pd.read_csv("neo4j/import/CODEs.csv", index_col=0)


Unnamed: 0_level_0,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
CodeID:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTHSPL J7A92W69L7,MTHSPL,J7A92W69L7,,,,
NCI C76777,NCI,C76777,,,,
ATC N07XX07,ATC,N07XX07,,,,
GS 1946,GS,1946,,,,
NOC 040413,NOC,040413,,,,


## Edges
### Concept-Term

In [7]:
concept_term = pd.read_csv("neo4j/import/CUI-SUIs.csv")
concept_term.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000163,S0007806
1,C0000427,S0008336
2,C0000665,S11922611
3,C0000992,S1577859
4,C0001021,S0797872


### Concept-Semantics

In [8]:
concept_semantics = pd.read_csv("neo4j/import/CUI-TUIs.csv")
concept_semantics.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000132,T126
1,C0000246,T116
2,C0000895,T060
3,C0000908,T037
4,C0000931,T067


### Concept-Code

In [9]:
concept_code = pd.read_csv("neo4j/import/CUI-CODEs.csv")
concept_code.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000294,ATC V03AF01
1,C0000481,CHV 0000000513
2,C0000661,MSH D015124
3,C0000665,VANDF 4020847
4,C0000737,LNC LA15468-4


### Concept-Concept

In [10]:
# concept_concept = pd.read_csv("neo4j/import/CUI-CUIs.csv", dtype_backend="pyarrow")
# concept_concept.head()

### Semantic-Semantic
Prolly skip this

In [11]:
semantics_semantics = pd.read_csv("neo4j/import/TUIrel.csv")
semantics_semantics.head()

Unnamed: 0,:END_ID,:START_ID
0,T204,T002
1,T001,T004
2,T071,T004
3,T072,T010
4,T204,T010


### Concept-Definition

In [12]:
concept_definition = pd.read_csv("neo4j/import/DEFrel.csv")
concept_definition.head()

Unnamed: 0,:END_ID,:START_ID
0,AT38152019,C0000039
1,AT69817678,C0000039
2,AT264439104,C0000039
3,AT254753550,C0000039
4,AT267611046,C0000039


### Code-Term

In [13]:
code_term = pd.read_csv("neo4j/import/CODE-SUIs.csv")
code_term.head()

Unnamed: 0,:END_ID,:START_ID,:TYPE,CUI
0,S1424701,RXNORM 74,IN,C0000473
1,S18541041,SNOMEDCT_US 80994002,FN,C0000477
2,S11730064,SNOMEDCT_US 226367006,SY,C0000545
3,S0288461,CSP 2005-4146,PT,C0000735
4,S1957040,MDR 10048885,LLT,C0000735


## Merge Concept and Terms

In [14]:
concept_term.columns = ["CUI:ID", "SUI:ID"]
concept_term.shape

(4508555, 2)

In [15]:
concept_term = pd.merge(concept_term, terms, on="SUI:ID", how='outer')
concept_term = concept_term.groupby('CUI:ID').first()
concept_term.head()

Unnamed: 0_level_0,SUI:ID,name
CUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C0000005,S0007492,(131)I-Macroaggregated Albumin
C0000039,S17175117,"1,2-dipalmitoylphosphatidylcholine"
C0000052,S0007584,"1,4-alpha-Glucan Branching Enzyme"
C0000074,S0007615,1-Alkyl-2-Acylphosphatidates
C0000084,S0007627,1-Carboxyglutamic Acid


In [16]:
concept_term.shape

(4508537, 2)

In [17]:
concept_term.columns = ["SUI:ID", "label"]
concept_term = concept_term[["label"]]
concept_term.head()

Unnamed: 0_level_0,label
CUI:ID,Unnamed: 1_level_1
C0000005,(131)I-Macroaggregated Albumin
C0000039,"1,2-dipalmitoylphosphatidylcholine"
C0000052,"1,4-alpha-Glucan Branching Enzyme"
C0000074,1-Alkyl-2-Acylphosphatidates
C0000084,1-Carboxyglutamic Acid


In [18]:
concept_term.shape

(4508537, 1)

In [19]:
concepts.loc[concept_term.index, 'label'] = concept_term.loc[concept_term.index, 'label']
concepts.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine"
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase"
C0000610,6-Aminonicotinamide
C0000739,Abdominal Muscles
C0000873,Academic Problem


In [20]:
concepts.label = concepts.label.fillna("-")
concepts.shape

(11424664, 1)

## Merge Concept and Type

In [21]:
semantics.head()

Unnamed: 0_level_0,name,STN,DEF
TUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T012,Bird,A1.1.3.1.1.2,A vertebrate having a constant body temperatur...
T014,Reptile,A1.1.3.1.1.5,A cold-blooded vertebrate having an external c...
T015,Mammal,A1.1.3.1.1.4,A vertebrate having a constant body temperatur...
T021,Fully Formed Anatomical Structure,A1.2.3,An anatomical structure in a fully formed orga...
T022,Body System,A2.1.4.1,A complex of anatomical structures that perfor...


In [22]:
concept_semantics.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000132,T126
1,C0000246,T116
2,C0000895,T060
3,C0000908,T037
4,C0000931,T067


In [23]:
no_type = set(concepts.index) - set(concept_semantics[':START_ID'])
len(no_type)

8161231

In [24]:
# unique_relation = set()
# unique_sab = set()
# examples = set()
# with open("neo4j/import/CUI-CUIs.csv") as o:
#     csv_reader = csv.reader(o)
#     headers = None
#     for row in tqdm(csv_reader):
#         if not headers:
#             headers = row
#         else:
#             source = row[0]
#             target = row[1]
#             relation = row[2]
#             sab = row[3]
#             if source in no_type or target in no_type:
#                 if relation not in unique_relation or sab not in unique_sab:
#                     examples.add((source, relation, sab, target))
#                     unique_relation.add(relation)
#                     unique_sab.add(sab)

In [25]:
# with open("no_types.txt", "w") as o:
#     o.write("\n".join([",".join(i) for i in examples]))

In [26]:
# with open("SABs_no_semantic.txt", "w") as o:
#     o.write("\n".join(unique_sab))

In [27]:
with open('out/semantics_ranked.tsv') as o:
    ranked_type = [i.strip() for i in o.read().strip().split("\n")]

In [28]:
concept_semantics.columns = ["id", "TUI:ID"]
concept_semantics["type"] = [semantics.at[i, 'name'] for i in concept_semantics['TUI:ID']]
concept_semantics.head()

Unnamed: 0,id,TUI:ID,type
0,C0000132,T126,Enzyme
1,C0000246,T116,"Amino Acid, Peptide, or Protein"
2,C0000895,T060,Diagnostic Procedure
3,C0000908,T037,Injury or Poisoning
4,C0000931,T067,Phenomenon or Process


In [29]:
def fetch_type(v):
    cat = ""
    rank = len(ranked_type)
    for i in v:
        r = ranked_type.index(i)
        if r < rank:
            cat = i
            rank = r
    return cat

In [30]:
cs = concept_semantics.groupby('id')['type'].apply(lambda x: "; ".join(set(x)))
cs.head()

id
C0000005    Pharmacologic Substance; Amino Acid, Peptide, ...
C0000039            Pharmacologic Substance; Organic Chemical
C0000052              Enzyme; Amino Acid, Peptide, or Protein
C0000074                                     Organic Chemical
C0000084    Biologically Active Substance; Amino Acid, Pep...
Name: type, dtype: object

In [31]:
cs_ranked = concept_semantics.groupby('id')['type'].apply(fetch_type)
cs_ranked.head()

id
C0000005    Amino Acid, Peptide, or Protein
C0000039                   Organic Chemical
C0000052                             Enzyme
C0000074                   Organic Chemical
C0000084    Amino Acid, Peptide, or Protein
Name: type, dtype: object

In [32]:
common = list(set(concepts.index).intersection(cs.index))
cs[common].head()

id
C2872811     Injury or Poisoning
C5697998    Laboratory Procedure
C4297234      Clinical Attribute
C1263700      Neoplastic Process
C2840805     Injury or Poisoning
Name: type, dtype: object

In [33]:
concept_semantics
concepts.loc[common, 'type'] = cs_ranked[common]
concepts.loc[common, 'type_combined'] = cs[common]
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Enzyme; Amino Acid, Peptide, or Protein"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Pharmacologic Substance; Vitamin
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [34]:
concepts.groupby("type_combined").first().to_csv('out/semantics.tsv', sep="\t")

In [35]:
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Enzyme; Amino Acid, Peptide, or Protein"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Pharmacologic Substance; Vitamin
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [36]:
concepts.shape

(11424664, 3)

In [37]:
with open('out/semantics_list.tsv', 'w') as o:
    o.write("\n".join([str(i) for i in concept_semantics.type.unique()]))

## Merge Concept and Code

In [38]:
codes.head()

Unnamed: 0_level_0,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
CodeID:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTHSPL J7A92W69L7,MTHSPL,J7A92W69L7,,,,
NCI C76777,NCI,C76777,,,,
ATC N07XX07,ATC,N07XX07,,,,
GS 1946,GS,1946,,,,
NOC 040413,NOC,040413,,,,


In [39]:
concept_code.columns = ["id", "CodeID:ID"]
concept_code.head()

Unnamed: 0,id,CodeID:ID
0,C0000294,ATC V03AF01
1,C0000481,CHV 0000000513
2,C0000661,MSH D015124
3,C0000665,VANDF 4020847
4,C0000737,LNC LA15468-4


In [40]:
concept_code = pd.merge(concept_code, codes, on="CodeID:ID", how='left')
concept_code.head()

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
0,C0000294,ATC V03AF01,ATC,V03AF01,,,,
1,C0000481,CHV 0000000513,CHV,0000000513,,,,
2,C0000661,MSH D015124,MSH,D015124,,,,
3,C0000665,VANDF 4020847,VANDF,4020847,,,,
4,C0000737,LNC LA15468-4,LNC,LA15468-4,,,,


In [41]:
concept_code[concept_code.id == 'C0000097']

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
623690,C0000097,LCH_NW sh86002892,LCH_NW,sh86002892,,,,
1108107,C0000097,CSP 2511-0411,CSP,2511-0411,,,,
1246274,C0000097,PSY 31213,PSY,31213,,,,
2179442,C0000097,MSH D015632,MSH,D015632,,,,
3529094,C0000097,CHV 0000000501,CHV,0000000501,,,,
3736237,C0000097,PSY 32433,PSY,32433,,,,
4116966,C0000097,SNOMEDCT_US 285407008,SNOMEDCT_US,285407008,,,,


In [42]:
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Enzyme; Amino Acid, Peptide, or Protein"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Pharmacologic Substance; Vitamin
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [42]:
with open("unique_SABS_of_Concepr.txt", "w") as o:
    sabs = [str(i) for i in concept_code[concept_code.id.isin(concepts[concepts.type.isna()].index)].SAB.unique()]
    o.write("\n".join(sabs))

In [43]:
type_mapper = {}
with open("unique_SABS_of_Concept.txt") as o:
    for line in o:
        r = line.strip().split(":")
        if len(r) == 2:
            type_mapper[r[0]] = r[1]
        else:
            type_mapper[r[0]] = r[0]

In [44]:
for i,row in tqdm(concept_code[concept_code.id.isin(concepts[concepts.type.isna()].index)].iterrows()):
    sab = row["SAB"]
    ind = row["id"]
    if type(sab) == str:
        tp = type_mapper[sab]
        if tp:
            concepts.at[ind, "type"] = tp
            concepts.at[ind, "type_combined"] = tp


11606580it [05:24, 35770.17it/s]


In [88]:
for type in tqdm(concepts.type.unique()):
    con = concepts[concepts.type==type].copy()
    cc = concept_code[concept_code.id.isin(con.index)]
    for sab in cc.SAB.unique():
        c = cc[cc.SAB == sab]
        c = c.groupby('id').first()
        common = list(set(con.index).intersection(c.index))
        con.loc[common, sab] = c.loc[common, "CodeID:ID"]
        if c.loc[common, "value:float"].isna().sum() != len(common):
            con.loc[common, "%s value"%sab] = c.loc[common, "value:float"]
        if c.loc[common, "lowerbound:float"].isna().sum() != len(common):
            con.loc[common, "%s lowerbound"%sab] = c.loc[common, "lowerbound:float"]
        if c.loc[common, "upperbound:float"].isna().sum() != len(common):
            con.loc[common, "%s upperbound"%sab] = c.loc[common, "upperbound:float"]
        if c.loc[common, "unit"].isna().sum() != len(common):
            con.loc[common, "%s unit"%sab] = c.loc[common, "unit"]
        if "-" in list(con["label"]):
            tmp = con[con.label == "-"]
            ind = set(tmp.index).intersection(con.index)
            ind2 = set(tmp.index).intersection(c.index)
            if len(ind.intersection(ind2)) > 0:
                l = list(ind.intersection(ind2))
                con.loc[l, "label"] = c.loc[l, 'CodeID:ID']
        con.to_csv("out/serialization/nodes/%s.nodes.csv"%(type))


100%|██████████| 262/262 [13:45<00:00,  3.15s/it] 


In [89]:
import os

In [96]:
row_headers = ["source", "relation", "target", "source_label", "target_label", "SAB", "evidence"]
with open("neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in tqdm(csv_reader):
        if not headers:
            headers = row
        else:
            source = row[0]
            target = row[1]
            if source in concepts.index and target in concepts.index:
                source_label = concepts.at[source, 'label']
                source_type = concepts.at[source, 'type']
                
                target_label = concepts.at[target, 'label']
                target_type = concepts.at[target, 'type']
                relation = row[2]
                sab = row[3]
                evidence = ''
                if len(row) > 4:
                    evidence = row[4]
                filename = 'out/serialization/edges/%s.%s.%s.edges.csv'%(source_type, relation, target_type)
                write_header = False
                operation = "a"
                if not os.path.isfile(filename):
                    write_header = True
                    operation = "w"
                # source_list = set()
                # target_list = set()
                with open(filename, operation) as w:
                    csv_writer = csv.writer(w)
                    if write_header:
                        csv_writer.writerow(row_headers)
                    csv_writer.writerow([source, relation, target, source_label, target_label, sab, evidence])
                #     source_list.add(source)
                #     target_list.add(target)

                # # take note of nodes that are used for source and target
                # source_ids = "out/serialization/ids/%s.txt"%source_type
                # if not os.path.isfile(source_ids):
                #     with open(source_ids, 'w') as o:
                #         o.write("\n".join(source_list))
                # else:
                #     with open(source_ids) as o:
                #         source_list = source_list.union(o.read().strip().split("\n"))
                #     with open(source_ids, 'w') as o:
                #         o.write("\n".join(source_list))
                # target_ids = "out/serialization/ids/%s.txt"%target_type
                # if not os.path.isfile(target_ids):
                #     with open(target_ids, 'w') as o:
                #         o.write("\n".join(target_list))
                # else:
                #     with open(target_ids) as o:
                #         target_list = target_list.union(o.read().strip().split("\n"))
                #     with open(target_ids, 'w') as o:
                #         o.write("\n".join(target_list))

127920727it [2:37:49, 13508.20it/s]


In [None]:
with open("relationship_SABs.txt", "w") as o:
    o.write("\n".join(sabs))

In [95]:
relationships = set()
with open("neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in tqdm(csv_reader):
        if not headers:
            headers = row
        else:
            source = row[0]
            target = row[1]
            relation = row[2]
            sab = row[3]
            relationships.add(relation)

127920727it [02:08, 997041.95it/s] 


In [97]:
with open("relationships.txt", "w") as o:
    o.write("\n".join(relationships))

In [171]:
# with open("edge_filenames", "w") as o:
#     o.write("\n".join(edge_filenames))

In [172]:
# with open("node_filenames", "w") as o:
#     o.write("\n".join(node_filenames))

In [97]:
with open("relationship_SABs_keep.txt") as o:
    sabs_to_keep = set(o.read().strip().split("\n"))

In [98]:
import re
import os
edge_pattern = "(?P<directory>.+)/(?P<source_type>.+)\.(?P<relation>.+)\.(?P<target_type>.+)\.(?P<entity>.+)\.csv"


In [103]:
node_base = "out/serialization/nodes/%s.nodes.csv"
new_node_base = "out/dd/nodes/%s.nodes.csv"
new_edge_base = "out/dd/edges/%s.%s.%s.edges.csv"
ids_base = "out/dd/ids/%s.txt"
node_ids = {}
sab_relations = {}
processed = set()

In [104]:
def glygen(s):
    return s.replace("GLYGEN.RESIDUE", "GLYGEN_RESIDUE").replace("GLYCAN.MOTIF", "GLYCAN_MOTIF").replace('GLYCOSYLTRANSFERASE.REACTION', 'GLYCOSYLTRANSFERASE_REACTION').replace("GLYGEN.SRC", "GLYGEN_SRC").replace('GLYGEN.GLYCOSYLATION', 'GLYGEN_GLYCOSYLATION')

def glygen_reverse(s):
    return s.replace("GLYGEN_RESIDUE", "GLYGEN.RESIDUE").replace("GLYCAN_MOTIF", "GLYCAN.MOTIF").replace('GLYCOSYLTRANSFERASE_REACTION', 'GLYCOSYLTRANSFERASE.REACTION').replace("GLYGEN_SRC", "GLYGEN.SRC").replace('GLYGEN_GLYCOSYLATION', 'GLYGEN.GLYCOSYLATION')


In [107]:
for filename in tqdm(glob("out/serialization/edges/*.csv")):
    if filename not in processed:
        match = re.match(edge_pattern, glygen(filename)).groupdict()
        entity = match["entity"]
        source_type = glygen_reverse(match["source_type"])
        relation = match["relation"].replace("_", " ")
        target_type = glygen_reverse(match["target_type"])
        if "inverse" not in relation:
            edge_df = pd.read_csv(filename, low_memory=False)
            # filter for SAB
            sabs = sabs_to_keep.intersection(edge_df.SAB.unique())
            for sab in sabs:
                if sab not in sab_relations:
                    sab_relations[sab] = set()
                sab_relations[sab].add(relation)
            if len(sabs) > 0:
                edge_df = edge_df[edge_df.SAB.isin(sabs)]
                if not os.path.isfile(ids_base%source_type):
                    with open(ids_base%source_type, 'w') as o:
                        o.write("\n".join(edge_df.source))
                else:
                    with open(ids_base%source_type) as o:
                        ids = set(o.read().strip().split("\n"))
                    with open(ids_base%source_type, 'w') as o:
                        ids = ids.union(edge_df.source)
                        o.write("\n".join(ids))
                if not os.path.isfile(ids_base%target_type):
                    with open(ids_base%target_type, 'w') as o:
                        o.write("\n".join(edge_df.target))
                else:
                    with open(ids_base%target_type) as o:
                        ids = set(o.read().strip().split("\n"))
                    with open(ids_base%target_type, 'w') as o:
                        ids = ids.union(edge_df.target)
                        o.write("\n".join(ids))
                # source_df = pd.read_csv(node_base%source_type, index_col=0, low_memory=False)
                # if os.path.isfile(new_node_base%(source_type)):
                #     new_source_df = pd.read_csv(new_node_base%(source_type), index_col=0, low_memory=False)
                #     pd.concat([new_source_df, source_df]).dropna(axis=1).to_csv(new_node_base%(source_type))
                # else:
                #     source_df.dropna(axis=1).to_csv(new_node_base%(source_type))
                
                # target_df = pd.read_csv(node_base%target_type, index_col=0, low_memory=False)
                # if os.path.isfile(new_node_base%(target_type)):
                #     new_target_df = pd.read_csv(new_node_base%(target_type), index_col=0, low_memory=False)
                #     pd.concat([new_target_df, target_df]).dropna(axis=1).to_csv(new_node_base%(target_type))
                # else:
                #     target_df.dropna(axis=1).to_csv(new_node_base%(target_type))
                edge_df.to_csv(new_edge_base%(source_type, relation, target_type), index=False)
        processed.add(filename.replace("GLYGEN_RESIDUE", "GLYGEN.RESIDUE"))
                

100%|██████████| 75964/75964 [04:37<00:00, 273.97it/s] 


In [109]:
count = 0
for filename in tqdm(glob("out/dd/ids/*.txt")):
    count+=1
count

100%|██████████| 87/87 [00:00<00:00, 1158426.82it/s]


87

In [114]:
id_pattern = "(?P<directory>.+)/(?P<type>.+)\.txt"
for filename in tqdm(glob("out/dd/ids/*.txt")):
    match = re.match(id_pattern, filename).groupdict()
    node_type = match["type"]
    node_df = pd.read_csv(node_base%node_type, index_col=0, low_memory=False)
    with open(filename) as o:
        ids = list(set(o.read().strip().split("\n")))
    node_df.loc[ids].dropna(axis=1, how="all").to_csv(new_node_base%node_type)



100%|██████████| 86/86 [00:56<00:00,  1.51it/s]


In [3]:
node_base = "out/serialization/nodes/%s.nodes.csv"
new_node_base = "out/filtered/nodes/%s.nodes.csv"
new_edge_base = "out/filtered/edges/%s.%s.%s.edges.csv"
sab_relations = {}
processed = set()

In [12]:
def glygen(s):
    return s.replace("GLYGEN.RESIDUE", "GLYGEN_RESIDUE").replace("GLYCAN.MOTIF", "GLYCAN_MOTIF").replace('GLYCOSYLTRANSFERASE.REACTION', 'GLYCOSYLTRANSFERASE_REACTION').replace("GLYGEN.SRC", "GLYGEN_SRC").replace('GLYGEN.GLYCOSYLATION', 'GLYGEN_GLYCOSYLATION').replace("GLYGEN.GLYCOSEQUENCE", "GLYGEN_GLYCOSEQUENCE")

def glygen_reverse(s):
    return s.replace("GLYGEN_RESIDUE", "GLYGEN.RESIDUE").replace("GLYCAN_MOTIF", "GLYCAN.MOTIF").replace('GLYCOSYLTRANSFERASE_REACTION', 'GLYCOSYLTRANSFERASE.REACTION').replace("GLYGEN_SRC", "GLYGEN.SRC").replace('GLYGEN_GLYCOSYLATION', 'GLYGEN.GLYCOSYLATION').replace("GLYGEN_GLYCOSEQUENCE", "GLYGEN.GLYCOSEQUENCE")


In [13]:
for filename in tqdm(glob("out/serialization/edges/*.csv")):
    if filename not in processed:
        match = re.match(edge_pattern, glygen(filename)).groupdict()
        entity = match["entity"]
        source_type = match["source_type"]
        relation = match["relation"].replace("_", " ")
        target_type = match["target_type"]
        if "inverse" not in relation:
            edge_df = pd.read_csv(filename, low_memory=False)
            # filter for SAB
            sabs = sabs_to_keep.intersection(edge_df.SAB.unique())
            for sab in sabs:
                if sab not in sab_relations:
                    sab_relations[sab] = set()
                sab_relations[sab].add(relation)
            if len(sabs) > 0:
                edge_df = edge_df[edge_df.SAB.isin(sabs)]
                source_df = pd.read_csv(node_base%glygen_reverse(source_type), index_col=0, low_memory=False)
                if not os.path.isfile(new_node_base%(source_type)):
                    # new_source_df = pd.read_csv(new_node_base%(source_type), index_col=0, low_memory=False)
                    # pd.concat([new_source_df, source_df]).dropna(axis=1).to_csv(new_node_base%(source_type))
                    source_df.to_csv(new_node_base%(source_type))
                # else:
                #     source_df.dropna(axis=1).to_csv(new_node_base%(source_type))
                
                target_df = pd.read_csv(node_base%glygen_reverse(target_type), index_col=0, low_memory=False)
                if not os.path.isfile(new_node_base%(target_type)):
                    # new_target_df = pd.read_csv(new_node_base%(target_type), index_col=0, low_memory=False)
                    # pd.concat([new_target_df, target_df]).dropna(axis=1).to_csv(new_node_base%(target_type))
                    target_df.to_csv(new_node_base%(target_type))
                # else:
                #     target_df.dropna(axis=1).to_csv(new_node_base%(target_type))
                edge_df.to_csv(new_edge_base%(source_type, relation, target_type), index=False)
        processed.add(filename)
                

100%|██████████| 75964/75964 [01:14<00:00, 1018.49it/s] 


In [11]:
match = re.match(edge_pattern, glygen(filename)).groupdict()
entity = match["entity"]
source_type = glygen_reverse(match["source_type"]).replace("_", " ")
relation = match["relation"].replace("_", " ")
target_type = glygen_reverse(match["target_type"]).replace("_", " ")
filename

'out/serialization/edges/GLYTOUCAN.has_glycosequence.GLYGEN.GLYCOSEQUENCE.edges.csv'

In [14]:
a = "WURCS=2.0/4,37,36/[AUd21122h_5*NCC/3=O][uxxxxm][uxxxxh_2*NCC/3=O][uxxxxh]/1-1-2-2-2-2-2-2-3-3-3-3-3-3-3-3-3-3-3-3-3-3-4-4-4-4-4-4-4-4-4-4-4-4-4-4-4/a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?_a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?}-{a?|b?|c?|d?|e?|f?|g?|h?|i?|j?|k?|l?|m?|n?|o?|p?|q?|r?|s?|t?|u?|v?|w?|x?|y?|z?|A?|B?|C?|D?|E?|F?|G?|H?|I?|J?|K?"

In [15]:
len(a)

8211

In [18]:
source_df = pd.read_csv(node_base%glygen_reverse("Gene or Genome"), index_col=0, low_memory=False)


In [19]:
source_df.head()

Unnamed: 0_level_0,label,type,type_combined,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,...,HPO,EFO,CHEBI,ENSEMBL,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0079559,HTLV-I tax Genes,Gene or Genome,Gene or Genome,NCI C17366,,MSH D016355,,,,,...,,,,,,,,,,
C0314607,Human structural gene,Gene or Genome,Gene or Genome,,,,,SNOMEDCT_US 49046007,,,...,,,,,,,,,,
C0440471,Genetic Materials,Gene or Genome,Gene or Genome,,,MSH D005796,,SNOMEDCT_US 256926007,,,...,,,,,,,,,,
C0440482,A*1102,Gene or Genome,Gene or Genome,,,,,SNOMEDCT_US 256943005,,,...,,,,,,,,,,
C0440498,A*6602,Gene or Genome,Gene or Genome,,,,,SNOMEDCT_US 256962008,,,...,,,,,,,,,,


In [29]:
for i in glob('out/filtered/edges/*Thyroid*'):
    print(i, pd.read_csv(i).shape)

out/filtered/edges/Gene or Genome.coexpression Thyroid.Gene or Genome.edges.csv (31591, 7)
out/filtered/edges/HGNC.coexpression Thyroid.Gene or Genome.edges.csv (2, 7)


In [11]:
rows = []
with open("neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in csv_reader:
        if not headers:
            headers = row
        else:
            if row[3] == "LINCS":
                rows.append(row + [""])
                if len(rows) == 5:
                    break

In [13]:
headers + [""]

[':START_ID', ':END_ID', ':TYPE', 'SAB', 'evidence_class:string', '']

In [15]:
pd.DataFrame(rows, columns=headers+ [""])

Unnamed: 0,:START_ID,:END_ID,:TYPE,SAB,evidence_class:string,Unnamed: 6
0,C1412480,UFVCQ0hFTSA5ODMwMTkx,positively_regulated_by,LINCS,0.0037570596,
1,C1412234,UFVCQ0hFTSAzMzQwMDc=,negatively_regulated_by,LINCS,-0.010075697,
2,C1416933,UFVCQ0hFTSA2NDgxMjM2,negatively_regulated_by,LINCS,-0.0066971174,
3,C1416717,UFVCQ0hFTSA3NzE5MTA=,negatively_regulated_by,LINCS,-0.02080909,
4,C1423844,UFVCQ0hFTSAxMjI3MTg=,positively_regulated_by,LINCS,0.015338226,


In [7]:
rows

[['C0027934', 'C0000097', 'RB', 'MTH'],
 ['C0039795', 'C0000232', 'AQ', 'MSH'],
 ['C1522005', 'C0000246', 'AQ', 'MSH'],
 ['C1168993', 'C0000294', 'has_ingredient', 'MMSL'],
 ['C4763809', 'C0000294', 'subset_includes_concept', 'NCI']]

In [25]:
pd.read_csv("out/filtered/nodes/Drug.nodes.csv", index_col=0).tail()

  pd.read_csv("out/filtered/nodes/Drug.nodes.csv", index_col=0).tail()


Unnamed: 0_level_0,label,type,type_combined,CHEBI,PUBCHEM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
UFVCQ0hFTSA3MDIwMjE1,Val-Val-Val,Drug,Drug,,PUBCHEM 7020215
UFVCQ0hFTSAxODY0OA==,xi-2-Ethylheptanoic acid,Drug,Drug,,PUBCHEM 18648
UFVCQ0hFTSAxMDI2Nzc=,"xi-2,3-Dihydro-3-methylfuran",Drug,Drug,,PUBCHEM 102677
UFVCQ0hFTSAxMTUyMA==,xi-3-Heptanol,Drug,Drug,,PUBCHEM 11520
Q0hFQkkgMzMyNDM=,natural product,Drug,Drug,CHEBI 33243,


In [32]:
import re
node_pattern = "(?P<directory>.+)/(?P<label>.+)\.(?P<entity>.+)\.csv"

In [38]:
nodes_list = []
for filename in glob('out/filtered/nodes/*.csv'):
    match = re.match(node_pattern, filename).groupdict()
    entity = match["entity"]
    label = match["label"]
    df = pd.read_csv(filename)
    nodes_list.append({
        "node": label,
        "count": len(df.index)
    })

  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)


In [39]:
pd.DataFrame.from_records(nodes_list).to_csv("nodes_count.csv", index=False)

In [42]:
edge_count = {}
edge_pattern = "(?P<directory>.+)/(?P<source_type>.+)\.(?P<relation>.+)\.(?P<target_type>.+)\.(?P<entity>.+)\.csv"

for filename in glob('out/filtered/edges/*.csv'):
    match = re.match(edge_pattern, filename).groupdict()
    entity = match["entity"]
    relation = match["relation"].replace("_", " ")
    with open(filename) as o:
        csv_reader = csv.reader(o, delimiter=",")
        count = 0
        for i in csv_reader:
            count += 1
    if relation not in edge_count:
        edge_count[relation] = {
            "relation": relation,
            "count": 0
        }
    edge_count[relation]["count"] += count

In [44]:
pd.DataFrame.from_records(list(edge_count.values())).to_csv("edges_count.csv", index=False)

In [4]:
with open('SABS_DCC') as o:
    dccs = {}
    for line in o:
        dcc,sabs = line.strip().split(":")
        for sab in sabs.split(","):
            dccs[sab] = dcc

In [5]:
dccs

{'GLYCORDF': 'Glygen',
 'GLYCANS': 'Glygen',
 'GLYCOCOO': 'Glygen',
 'LINCS': 'LINCS',
 'MW': 'MW',
 'GTEXEXP': 'GTEX',
 'GTEXEQTL': 'GTEX',
 'GTEXCOEXP': 'GTEX',
 'IDGP': 'IDG',
 'IDGD': 'IDG',
 '4DN': '4DN'}

In [10]:
dcc_counter = {}
with open("neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in tqdm(csv_reader):
        if not headers:
            headers = row
        else:
            source = row[0]
            target = row[1]
            relation = row[2]
            sab = row[3]
            if sab in dccs:
                dcc = dccs[sab]
                if dcc not in dcc_counter:
                    dcc_counter[dcc] = {
                        "nodes": set(),
                        "edges": 0
                    }
                dcc_counter[dcc]["nodes"].add(source)
                dcc_counter[dcc]["nodes"].add(target)
                dcc_counter[dcc]["edges"] += 1

127920727it [02:30, 848771.43it/s]


In [11]:
for k in dcc_counter:
    dcc_counter[k]["nodes"] = len(dcc_counter[k]["nodes"])

In [12]:
dcc_counter

{'4DN': {'nodes': 766621, 'edges': 5265898},
 'Glygen': {'nodes': 182490, 'edges': 929548},
 'GTEX': {'nodes': 2862268, 'edges': 65966674},
 'IDG': {'nodes': 329119, 'edges': 873144},
 'LINCS': {'nodes': 8942, 'edges': 492588},
 'MW': {'nodes': 2465, 'edges': 33144}}

In [15]:
pd.DataFrame.from_records(dcc_counter).T.to_csv("dcc_counts.csv")

In [16]:
df = pd.read_csv('out/filtered/nodes/GTEXEXP.nodes.csv')
df.head()

Unnamed: 0,id,label,type,type_combined,GTEXEXP
0,R1RFWEVYUCBFTlNHMDAwMDAyMjM5NzItNS1BZGlwb3NlLV...,-,GTEXEXP,GTEXEXP,GTEXEXP ENSG00000223972-5-Adipose-Subcutaneous
1,R1RFWEVYUCBFTlNHMDAwMDAyMjM5NzItNS1BZHJlbmFsLU...,-,GTEXEXP,GTEXEXP,GTEXEXP ENSG00000223972-5-Adrenal-Gland
2,R1RFWEVYUCBFTlNHMDAwMDAyMjM5NzItNS1BcnRlcnktQW...,-,GTEXEXP,GTEXEXP,GTEXEXP ENSG00000223972-5-Artery-Aorta
3,R1RFWEVYUCBFTlNHMDAwMDAyMjM5NzItNS1BcnRlcnktQ2...,-,GTEXEXP,GTEXEXP,GTEXEXP ENSG00000223972-5-Artery-Coronary
4,R1RFWEVYUCBFTlNHMDAwMDAyMjM5NzItNS1BcnRlcnktVG...,-,GTEXEXP,GTEXEXP,GTEXEXP ENSG00000223972-5-Artery-Tibial


In [18]:
df.shape

(1573920, 5)

In [21]:
import re

In [48]:
tissues = {}
genes = {}
g = set()
gtex_pattern = "GTEXEXP (?P<Gene>ENSG[0-9]+)-[A-Za-z0-9 ]+-(?P<Tissue>.+)"
for i in df.GTEXEXP:
    match = re.match(gtex_pattern, i).groupdict()
    if "Gland" in i:
        g.add(i)
    if match["Tissue"] not in tissues:
        tissues[match["Tissue"]] = 0
    tissues[match["Tissue"]] += 1
    if match["Gene"] not in genes:
        genes[match["Gene"]] = 0
    genes[match["Gene"]] += 1

In [49]:
i = list(g)[0]
match = re.match(gtex_pattern, i).groupdict()
match

{'Gene': 'ENSG00000178921', 'Tissue': 'Adrenal-Gland'}

In [50]:
list(genes.items())[0:5]

[('ENSG00000223972', 45),
 ('ENSG00000227232', 45),
 ('ENSG00000278267', 45),
 ('ENSG00000243485', 45),
 ('ENSG00000237613', 45)]

In [58]:
len(tissues), len(genes), len(tissues)*len(genes)

(45, 34940, 1572300)

In [54]:
tissues

{'Adipose-Subcutaneous': 34976,
 'Adrenal-Gland': 34976,
 'Artery-Aorta': 34976,
 'Artery-Coronary': 34976,
 'Artery-Tibial': 34976,
 'Bladder': 34976,
 'Brain-Amygdala': 34976,
 'Brain-Cerebellar-Hemisphere': 34976,
 'Brain-Cerebellum': 34976,
 'Brain-Cortex': 34976,
 'Brain-Hippocampus': 34976,
 'Brain-Hypothalamus': 34976,
 'Brain-Substantia-nigra': 34976,
 'Breast-Mammary-Tissue': 34976,
 'Cells-Cultured-fibroblasts': 34976,
 'Cells-EBV-transformed-lymphocytes': 34976,
 'Cervix-Ectocervix': 34976,
 'Cervix-Endocervix': 34976,
 'Colon-Sigmoid': 34976,
 'Colon-Transverse': 34976,
 'Esophagus-Gastroesophageal-Junction': 34976,
 'Esophagus-Mucosa': 34976,
 'Esophagus-Muscularis': 34976,
 'Fallopian-Tube': 34976,
 'Heart-Atrial-Appendage': 34976,
 'Heart-Left-Ventricle': 34976,
 'Kidney-Cortex': 34976,
 'Kidney-Medulla': 34976,
 'Liver': 34976,
 'Lung': 34976,
 'Minor-Salivary-Gland': 34976,
 'Muscle-Skeletal': 34976,
 'Nerve-Tibial': 34976,
 'Ovary': 34976,
 'Pancreas': 34976,
 'Pituit

In [56]:
for i in glob("out/filtered/edges/*.csv"):
    if "GTEXEXP" in i:
        print(i)

out/filtered/edges/GTEXEXP.expressed in.Tissue.edges.csv
out/filtered/edges/Gene or Genome.expresses.GTEXEXP.edges.csv
out/filtered/edges/GTEXEXP.has expression.EXPBINS.edges.csv
out/filtered/edges/GTEXEXP.expressed in.Body Location or Region.edges.csv
out/filtered/edges/Tissue.expresses.GTEXEXP.edges.csv
out/filtered/edges/Body Part, Organ, or Organ Component.expresses.GTEXEXP.edges.csv
out/filtered/edges/GTEXEXP.expressed in.Body Part, Organ, or Organ Component.edges.csv
out/filtered/edges/Body Location or Region.expresses.GTEXEXP.edges.csv
out/filtered/edges/GTEXEXP.expressed in.Gene or Genome.edges.csv


In [59]:
df = pd.read_csv("out/filtered/edges/Gene or Genome.expresses.GTEXEXP.edges.csv")
df.shape

(1573785, 7)

In [61]:
df.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,C2239334,expresses,R1RFWEVYUCBFTlNHMDAwMDAyMjM5NzItNS1BZGlwb3NlLV...,DDX11L1 gene,-,GTEXEXP,
1,C2239334,expresses,R1RFWEVYUCBFTlNHMDAwMDAyMjM5NzItNS1BZHJlbmFsLU...,DDX11L1 gene,-,GTEXEXP,
2,C2239334,expresses,R1RFWEVYUCBFTlNHMDAwMDAyMjM5NzItNS1BcnRlcnktQW...,DDX11L1 gene,-,GTEXEXP,
3,C2239334,expresses,R1RFWEVYUCBFTlNHMDAwMDAyMjM5NzItNS1BcnRlcnktQ2...,DDX11L1 gene,-,GTEXEXP,
4,C2239334,expresses,R1RFWEVYUCBFTlNHMDAwMDAyMjM5NzItNS1BcnRlcnktVG...,DDX11L1 gene,-,GTEXEXP,
