In [1]:
import pandas as pd
from glob import glob
from IPython.display import display, Markdown
from tqdm import tqdm
import csv
pd.__version__

'2.0.3'

In [2]:
base_input = "dd_data/20230802/"

In [3]:
hgnc_info = pd.read_csv('dd_data/HGNC_genes.txt', sep="\t")
hgnc_mapper = {}
for i, row in hgnc_info.iterrows():
    uid = row["HGNC ID"]
    enz = row["Enzyme (EC) ID"]
    unip = row["UniProt accession"]
    if uid not in hgnc_mapper:
        hgnc_mapper[uid] = {}
        if type(unip) == str:
            hgnc_mapper[uid]["UNIPROTKB"] = unip
        if type(enz) == str:
            hgnc_mapper[uid]["ec_id"] = enz
                


## Load Data

In [4]:
concepts = pd.read_csv(base_input + "neo4j/import/CUIs.csv")
concepts = pd.DataFrame(index=concepts["CUI:ID"].unique())
concepts.index.name = "id"
concepts.head()

C0000097
C0000359
C0000610
C0000739
C0000873


In [5]:
semantics = pd.read_csv(base_input + "neo4j/import/TUIs.csv", index_col=0)
semantics.head()

Unnamed: 0_level_0,name,STN,DEF
TUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T012,Bird,A1.1.3.1.1.2,A vertebrate having a constant body temperatur...
T014,Reptile,A1.1.3.1.1.5,A cold-blooded vertebrate having an external c...
T015,Mammal,A1.1.3.1.1.4,A vertebrate having a constant body temperatur...
T021,Fully Formed Anatomical Structure,A1.2.3,An anatomical structure in a fully formed orga...
T022,Body System,A2.1.4.1,A complex of anatomical structures that perfor...


In [6]:
terms = pd.read_csv(base_input + "neo4j/import/SUIs.csv", index_col=0)
terms.head()

Unnamed: 0_level_0,name
SUI:ID,Unnamed: 1_level_1
S0009776,"Acid, 2-Aminohexanedioic"
S7249234,BR CAMP
S11872577,cramps abdominal
S14680596,Retained tissue after pregnancy loss
S3417882,Missed miscarriage


In [7]:
codes = pd.read_csv(base_input + "neo4j/import/CODEs.csv", index_col=0)
codes.head()

  codes = pd.read_csv(base_input + "neo4j/import/CODEs.csv", index_col=0)


Unnamed: 0_level_0,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
CodeID:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTHSPL:J7A92W69L7,MTHSPL,J7A92W69L7,,,,
NCI:C76777,NCI,C76777,,,,
ATC:N07XX07,ATC,N07XX07,,,,
GS:1946,GS,1946,,,,
NOC:040413,NOC,040413,,,,


In [8]:
concept_term = pd.read_csv(base_input + "neo4j/import/CUI-SUIs.csv")
concept_term.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000163,S0007806
1,C0000427,S0008336
2,C0000665,S11922611
3,C0000992,S1577859
4,C0001021,S0797872


In [9]:
concept_semantics = pd.read_csv(base_input + "neo4j/import/CUI-TUIs.csv")
concept_semantics.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000132,T126
1,C0000246,T116
2,C0000895,T060
3,C0000908,T037
4,C0000931,T067


In [10]:
concept_code = pd.read_csv(base_input + "neo4j/import/CUI-CODEs.csv")
concept_code.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000294,ATC:V03AF01
1,C0000481,CHV:0000000513
2,C0000661,MSH:D015124
3,C0000665,VANDF:4020847
4,C0000737,LNC:LA15468-4


In [11]:
semantics_semantics = pd.read_csv(base_input + "neo4j/import/TUIrel.csv")
semantics_semantics.head()

Unnamed: 0,:END_ID,:START_ID
0,T204,T002
1,T001,T004
2,T071,T004
3,T072,T010
4,T204,T010


In [12]:
code_term = pd.read_csv(base_input + "neo4j/import/CODE-SUIs.csv")
code_term.head()

Unnamed: 0,:END_ID,:START_ID,:TYPE,CUI
0,S1424701,RXNORM:74,IN,C0000473
1,S18541041,SNOMEDCT_US:80994002,FN,C0000477
2,S11730064,SNOMEDCT_US:226367006,SY,C0000545
3,S0288461,CSP:2005-4146,PT,C0000735
4,S1957040,MDR:10048885,LLT,C0000735


## Merge Concept and Terms

In [13]:
concept_term.columns = ["CUI:ID", "SUI:ID"]
concept_term.shape

(7923747, 2)

In [14]:
concept_term = pd.merge(concept_term, terms, on="SUI:ID", how='outer')
concept_term = concept_term.groupby('CUI:ID').first()
concept_term.head()

Unnamed: 0_level_0,SUI:ID,name
CUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1
4DND:4DNES1JP4KZ1 CUI,aW4gc2l0dSBIaS1DIG9uIEhDVDExNiBjZWxscyAoY29udG...,in situ Hi-C on HCT116 cells (containing AID-t...
4DND:4DNES21D8SP8 CUI,TWljcm8tQyBvbiBIMS1FU0MgY2VsbHMuSDEtRVND,Micro-C on H1-ESC cells.H1-ESC
4DND:4DNES2M5JIGV CUI,aW4gc2l0dSBIaS1DIG9uICBIaS1FU0MgY2VsbHMuSDEtRVND,in situ Hi-C on Hi-ESC cells.H1-ESC
4DND:4DNES2R6PUEK CUI,aW4gc2l0dSBIaS1DIG9uIEhGRmM2IGNlbGxzLkhGRmM2,in situ Hi-C on HFFc6 cells.HFFc6
4DND:4DNES3QAGOZZ CUI,aW4gc2l0dSBIaS1DIG9uIEhDVDExNiBjZWxscyAoY29udG...,in situ Hi-C on HCT116 cells (containing AID-t...


In [15]:
concept_term.shape

(7923727, 2)

In [16]:
concept_term.columns = ["SUI:ID", "label"]
concept_term = concept_term[["label"]]
concept_term.head()

Unnamed: 0_level_0,label
CUI:ID,Unnamed: 1_level_1
4DND:4DNES1JP4KZ1 CUI,in situ Hi-C on HCT116 cells (containing AID-t...
4DND:4DNES21D8SP8 CUI,Micro-C on H1-ESC cells.H1-ESC
4DND:4DNES2M5JIGV CUI,in situ Hi-C on Hi-ESC cells.H1-ESC
4DND:4DNES2R6PUEK CUI,in situ Hi-C on HFFc6 cells.HFFc6
4DND:4DNES3QAGOZZ CUI,in situ Hi-C on HCT116 cells (containing AID-t...


In [17]:
concept_term.shape

(7923727, 1)

In [18]:
concepts.loc[concept_term.index, 'label'] = concept_term.loc[concept_term.index, 'label']
concepts.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine"
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase"
C0000610,6-Aminonicotinamide
C0000739,Abdominal Muscles
C0000873,Academic Problem


## Semantics

In [19]:
semantics.head()

Unnamed: 0_level_0,name,STN,DEF
TUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T012,Bird,A1.1.3.1.1.2,A vertebrate having a constant body temperatur...
T014,Reptile,A1.1.3.1.1.5,A cold-blooded vertebrate having an external c...
T015,Mammal,A1.1.3.1.1.4,A vertebrate having a constant body temperatur...
T021,Fully Formed Anatomical Structure,A1.2.3,An anatomical structure in a fully formed orga...
T022,Body System,A2.1.4.1,A complex of anatomical structures that perfor...


In [20]:
concept_semantics.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000132,T126
1,C0000246,T116
2,C0000895,T060
3,C0000908,T037
4,C0000931,T067


In [21]:
no_type = set(concepts.index) - set(concept_semantics[':START_ID'])
len(no_type)

12264238

In [22]:
with open('out/0623/semantics_ranked.tsv') as o:
    ranked_type = [i.strip() for i in o.read().strip().split("\n")]

In [23]:
concept_semantics.columns = ["id", "TUI:ID"]
concept_semantics["type"] = [semantics.at[i, 'name'] for i in concept_semantics['TUI:ID']]
concept_semantics.head()

Unnamed: 0,id,TUI:ID,type
0,C0000132,T126,Enzyme
1,C0000246,T116,"Amino Acid, Peptide, or Protein"
2,C0000895,T060,Diagnostic Procedure
3,C0000908,T037,Injury or Poisoning
4,C0000931,T067,Phenomenon or Process


In [24]:
def fetch_type(v):
    cat = ""
    rank = len(ranked_type)
    for i in v:
        r = ranked_type.index(i)
        if r < rank:
            cat = i
            rank = r
    return cat

In [25]:
cs = concept_semantics.groupby('id')['type'].apply(lambda x: "; ".join(set(x)))
cs.head()

id
C0000005    Amino Acid, Peptide, or Protein; Indicator, Re...
C0000039            Organic Chemical; Pharmacologic Substance
C0000052              Amino Acid, Peptide, or Protein; Enzyme
C0000074                                     Organic Chemical
C0000084    Amino Acid, Peptide, or Protein; Biologically ...
Name: type, dtype: object

In [26]:
cs_ranked = concept_semantics.groupby('id')['type'].apply(fetch_type)
cs_ranked.head()

id
C0000005    Amino Acid, Peptide, or Protein
C0000039                   Organic Chemical
C0000052                             Enzyme
C0000074                   Organic Chemical
C0000084    Amino Acid, Peptide, or Protein
Name: type, dtype: object

In [27]:
common = list(set(concepts.index).intersection(cs.index))
cs[common].head()

id
C4596166                                       Fungus
C2131805                                      Finding
C1550825                             Population Group
C3884354    Organic Chemical; Pharmacologic Substance
C0857614                                      Finding
Name: type, dtype: object

In [28]:
concept_semantics
concepts.loc[common, 'type'] = cs_ranked[common]
concepts.loc[common, 'type_combined'] = cs[common]
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Amino Acid, Peptide, or Protein; Enzyme"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Pharmacologic Substance; Vitamin
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [29]:
out_prefix = "out/0915/"

In [30]:
concepts.groupby("type_combined").first().to_csv(out_prefix + 'semantics.tsv', sep="\t")

In [31]:
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Amino Acid, Peptide, or Protein; Enzyme"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Pharmacologic Substance; Vitamin
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [32]:
concepts.shape

(15527671, 3)

In [33]:
with open(out_prefix + 'semantics_list.tsv', 'w') as o:
    o.write("\n".join([str(i) for i in concept_semantics.type.unique()]))

In [34]:
codes.head()

Unnamed: 0_level_0,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
CodeID:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTHSPL:J7A92W69L7,MTHSPL,J7A92W69L7,,,,
NCI:C76777,NCI,C76777,,,,
ATC:N07XX07,ATC,N07XX07,,,,
GS:1946,GS,1946,,,,
NOC:040413,NOC,040413,,,,


In [35]:
concept_code.columns = ["id", "CodeID:ID"]
concept_code.head()

Unnamed: 0,id,CodeID:ID
0,C0000294,ATC:V03AF01
1,C0000481,CHV:0000000513
2,C0000661,MSH:D015124
3,C0000665,VANDF:4020847
4,C0000737,LNC:LA15468-4


In [36]:
concept_code = pd.merge(concept_code, codes, on="CodeID:ID", how='left')
concept_code.head()

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
0,C0000294,ATC:V03AF01,ATC,V03AF01,,,,
1,C0000481,CHV:0000000513,CHV,0000000513,,,,
2,C0000661,MSH:D015124,MSH,D015124,,,,
3,C0000665,VANDF:4020847,VANDF,4020847,,,,
4,C0000737,LNC:LA15468-4,LNC,LA15468-4,,,,


In [37]:
concept_code[concept_code.id == 'C0000097']

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
623690,C0000097,LCH_NW:sh86002892,LCH_NW,sh86002892,,,,
1108107,C0000097,CSP:2511-0411,CSP,2511-0411,,,,
1246274,C0000097,PSY:31213,PSY,31213,,,,
2179442,C0000097,MSH:D015632,MSH,D015632,,,,
3529094,C0000097,CHV:0000000501,CHV,0000000501,,,,
3736237,C0000097,PSY:32433,PSY,32433,,,,
4116966,C0000097,SNOMEDCT_US:285407008,SNOMEDCT_US,285407008,,,,


In [38]:
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Amino Acid, Peptide, or Protein; Enzyme"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Pharmacologic Substance; Vitamin
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [39]:
type_mapper = {}
with open("output/unique_SABS_of_Concept_Mapper.txt") as o:
    for line in o:
        r = line.strip().split(":")
        if len(r) == 2:
            type_mapper[r[0]] = r[1]
        elif 'MSIGDB' in r[0]:
            type_mapper[r[0]] = 'MSIGDB'
        else:
            type_mapper[r[0]] = r[0]

In [40]:
for i,row in tqdm(concept_code[concept_code.id.isin(concepts[concepts.type.isna()].index)].iterrows()):
    sab = row["SAB"]
    ind = row["id"]
    if type(sab) == str:
        if 'MSIGDB' in sab:
            sab = 'MSIGDB'
            concept_code.at[i, 'SAB'] = 'MSIGDB'
        if sab == 'MSIGDB':
            tp = 'MSIGDB'
        else:
            tp = type_mapper[sab]
        if tp:
            concepts.at[ind, "type"] = tp
            concepts.at[ind, "type_combined"] = tp


15721067it [07:40, 34108.59it/s]


In [41]:
for i, row in concepts.iterrows():
    concepts.at[i, "type"] = row["type"].replace(".", " ")
    concepts.at[i, "type_combined"] = row["type_combined"].replace(".", " ")


In [42]:
concepts[concepts.type == "UNIPROTKB"].head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UNIPROTKB:A0A087X1C5 CUI,CP2D7_HUMAN,UNIPROTKB,UNIPROTKB
UNIPROTKB:A0A0B4J2F0 CUI,PIOS1_HUMAN,UNIPROTKB,UNIPROTKB
UNIPROTKB:A0A0B4J2F2 CUI,SIK1B_HUMAN,UNIPROTKB,UNIPROTKB
UNIPROTKB:A0A0C5B5G6 CUI,MOTSC_HUMAN,UNIPROTKB,UNIPROTKB
UNIPROTKB:A0A0K2S4Q6 CUI,CD3CH_HUMAN,UNIPROTKB,UNIPROTKB


In [None]:
for tp in tqdm(concepts.type.unique()):
    con = concepts[concepts.type==tp].copy()
    cc = concept_code[concept_code.id.isin(con.index)]
    for sab in cc.SAB.unique():
        c = cc[cc.SAB == sab]
        c = c.groupby('id').first()
        common = list(set(con.index).intersection(c.index))
        con.loc[common, sab] = c.loc[common, "CodeID:ID"]
        if c.loc[common, "value:float"].isna().sum() != len(common):
            con.loc[common, "%s value"%sab] = c.loc[common, "value:float"]
        if c.loc[common, "lowerbound:float"].isna().sum() != len(common):
            con.loc[common, "%s lowerbound"%sab] = c.loc[common, "lowerbound:float"]
        if c.loc[common, "upperbound:float"].isna().sum() != len(common):
            con.loc[common, "%s upperbound"%sab] = c.loc[common, "upperbound:float"]
        if c.loc[common, "unit"].isna().sum() != len(common):
            con.loc[common, "%s unit"%sab] = c.loc[common, "unit"]
        if "-" in list(con["label"]):
            tmp = con[con.label == "-"]
            ind = set(tmp.index).intersection(con.index)
            ind2 = set(tmp.index).intersection(c.index)
            if len(ind.intersection(ind2)) > 0:
                l = list(ind.intersection(ind2))
                con.loc[l, "label"] = c.loc[l, 'CodeID:ID']
        con.to_csv("out/0915/serialization/nodes/%s.nodes.csv"%(tp))


In [49]:
gene_or_genome_df = pd.read_csv("out/0915/serialization/nodes/Gene or Genome.nodes.csv", index_col=0)
uniprot = pd.read_csv("out/0915/serialization/nodes/UNIPROTKB.nodes.csv", index_col=0)
gene_df = pd.read_csv("out/0915/serialization/nodes/Gene.nodes.csv", index_col=0)


  gene_df = pd.read_csv("out/0915/filtered/nodes/Gene.nodes.csv", index_col=0)


In [50]:
uniprot.head()

Unnamed: 0_level_0,label,type,UNIPROTKB
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UNIPROTKB:Q92993 CUI,KAT5_HUMAN,UNIPROTKB,UNIPROTKB:Q92993
UNIPROTKB:Q9H093 CUI,NUAK2_HUMAN,UNIPROTKB,UNIPROTKB:Q9H093
UNIPROTKB:Q15418 CUI,KS6A1_HUMAN,UNIPROTKB,UNIPROTKB:Q15418
UNIPROTKB:P59540 CUI,T2R46_HUMAN,UNIPROTKB,UNIPROTKB:P59540
UNIPROTKB:P29122 CUI,PCSK6_HUMAN,UNIPROTKB,UNIPROTKB:P29122


In [51]:
uniprot.shape, gene_df.shape, gene_or_genome_df.shape

((2546, 3), (39920, 28), (1, 5))

In [52]:
gene_or_genome_df

Unnamed: 0_level_0,label,type,NCI,MTH,PUBCHEM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C2828054,FANCD2 wt Allele,Gene or Genome,NCI:C86550,MTH:NOCODE,PUBCHEM:643975


In [None]:
uniprot_id_mapper = pd.read_csv('output/idmapping_2023_09_18.tsv', sep="\t", index_col=0)
uniprot_id_mapper.head()

In [None]:
new_gene_or_genome = gene_or_genome_df[gene_or_genome_df.HGNC.isna()]

In [None]:
rows = {}
hgnc_mapper = {}
for i, row in gene_or_genome_df[~gene_or_genome_df.HGNC.isna()].iterrows():
    hgnc = row["HGNC"]
    hgnc_mapper[hgnc] = i
    row["type"] = "Gene"
    row["type_combined"] = row["type_combined"].replace("Gene or Genome", "Gene")
    rows[i] = row
len(rows)

In [None]:
for i, row in gene_df.iterrows():
    hgnc = row["HGNC"]
    if hgnc not in hgnc_mapper:
        hgnc_mapper[hgnc] = i
        row["type"] = "Gene"
        rows[i] = row
len(rows)

In [None]:
gene_df.head()

In [None]:
uniprot.head()

In [None]:
uniprot_kb_mapper = {}
uniprot_list = []
with open("uniprot_ids_0917.txt", "w") as o:
    for i, row in uniprot.iterrows():
        kb = row["UNIPROTKB"].replace("UNIPROTKB:", "")
        o.write("%s\n"%kb)
    # hgnc = uniprot_id_mapper.at[kb, 'To']
    # hgnc = row["HGNC"]
    # if hgnc not in hgnc_mapper:
    #     hgnc_mapper[hgnc] = i
    #     row["type"] = "Gene"
    #     rows[i] = row

In [None]:
uniprot_mapper = {}
for k, v in uniprot_id_mapper.iterrows():
    uniprot_mapper[k] = v["To"]

In [None]:
no_hgnc = set()
for i, row in uniprot.iterrows():
    kb = row["UNIPROTKB"].replace("UNIPROTKB:", "")
    if kb not in uniprot_mapper:
        no_hgnc.add(kb)
        rows[i] = row
    else:
        hgnc = uniprot_mapper[kb]
        if hgnc in hgnc_mapper:
            cui = hgnc_mapper[hgnc]
            rows[cui]["UNIPROTKB"] = kb
        else:
            row["HGNC"] = hgnc
            row["type"] = "Gene"
            row["type_combined"] = "Gene"
            rows[i] = row

In [None]:
len(rows)

In [None]:
new_gene_df = pd.DataFrame.from_dict(rows, orient="index")

In [None]:
new_gene_df.head()

In [None]:
new_gene_df.type_combined = "Gene"
new_gene_df.type = "Gene"
new_gene_df.type_combined.unique(), new_gene_df.type.unique()

In [None]:
concepts.head()

In [None]:
for i in new_gene_df.index:
    concepts.at[i, "type"] = "Gene"
    concepts.at[i, "type_combined"] = "Gene"

In [None]:
new_gene_df.to_csv("out/0915/serialization/nodes/Gene.nodes.csv")

In [None]:
new_gene_or_genome.to_csv("out/0915/serialization/nodes/Gene or Genome.nodes.csv")

In [None]:
import os

In [None]:
row_headers = ["source", "relation", "target", "source_label", "target_label", "SAB", "evidence"]
with open(base_input + "neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in tqdm(csv_reader):
        if not headers:
            headers = row
        else:
            source = row[0]
            if source in uniprot_mapper:
                source = uniprot_mapper[source]
            target = row[1]
            if target in uniprot_mapper:
                target = uniprot_mapper[target]
            if source in concepts.index and target in concepts.index:
                source_label = concepts.at[source, 'label']
                source_type = concepts.at[source, 'type']
                
                target_label = concepts.at[target, 'label']
                target_type = concepts.at[target, 'type']
                relation = row[2]
                sab = row[3]
                evidence = ''
                if len(row) > 4:
                    evidence = row[4]
                filename = 'out/0915/serialization/edges/%s.%s.%s.edges.csv'%(source_type, relation, target_type)
                write_header = False
                operation = "a"
                if not os.path.isfile(filename):
                    write_header = True
                    operation = "w"
                # source_list = set()
                # target_list = set()
                with open(filename, operation) as w:
                    csv_writer = csv.writer(w)
                    if write_header:
                        csv_writer.writerow(row_headers)
                    csv_writer.writerow([source, relation, target, source_label, target_label, sab, evidence])
                #     source_list.add(source)
                #     target_list.add(target)

                # # take note of nodes that are used for source and target
                # source_ids = "out/serialization/ids/%s.txt"%source_type
                # if not os.path.isfile(source_ids):
                #     with open(source_ids, 'w') as o:
                #         o.write("\n".join(source_list))
                # else:
                #     with open(source_ids) as o:
                #         source_list = source_list.union(o.read().strip().split("\n"))
                #     with open(source_ids, 'w') as o:
                #         o.write("\n".join(source_list))
                # target_ids = "out/serialization/ids/%s.txt"%target_type
                # if not os.path.isfile(target_ids):
                #     with open(target_ids, 'w') as o:
                #         o.write("\n".join(target_list))
                # else:
                #     with open(target_ids) as o:
                #         target_list = target_list.union(o.read().strip().split("\n"))
                #     with open(target_ids, 'w') as o:
                #         o.write("\n".join(target_list))

In [None]:
for filename in glob("out/0915/serialization/nodes/*.csv"):
    df = pd.read_csv(filename, index_col=0, low_memory=False)
    orig_columns = df.columns
    if "type_combined" in df.columns:
        dtype = df.type.unique()[0]
        combined = set()
        for i in df.type_combined:
            combined = combined.union(i.split("; "))
        # remove og type
        combined = combined - {dtype}
        columns = [i for i in df.columns if not i == "type_combined"] + list(combined)
        if len(combined) > 0:
            print(filename)
            for i in combined:
                df[i] = False
            for i, row in df.iterrows():
                type_combined = row["type_combined"].split("; ")
                for t in type_combined:
                    col = "is_%s"%t
                    df.at[i, col] = True
        df = df[columns]
        df.to_csv(filename)

In [None]:
with open("output/august_dcc_sabs.txt") as o:
    sabs_to_keep = set(o.read().strip().split("\n"))

In [None]:
import re
import os
edge_pattern = "(?P<directory>.+)/(?P<source_type>.+)\.(?P<relation>.+)\.(?P<target_type>.+)\.(?P<entity>.+)\.csv"


In [None]:
node_base = "out/0915/serialization/nodes/%s.nodes.csv"
new_node_base = "out/0915/filtered/nodes/%s.nodes.csv"
new_edge_base = "out/0915/filtered/edges/%s.%s.%s.edges.csv"
ids_base = "out/0915/filtered/ids/%s.txt"
node_ids = {}
sab_relations = {}
processed = set()

In [None]:
def glygen(s):
    return s.replace("GLYGEN.RESIDUE", "GLYGEN_RESIDUE").replace("GLYCAN.MOTIF", "GLYCAN_MOTIF").replace('GLYCOSYLTRANSFERASE.REACTION', 'GLYCOSYLTRANSFERASE_REACTION').replace("GLYGEN.SRC", "GLYGEN_SRC").replace('GLYGEN.GLYCOSYLATION', 'GLYGEN_GLYCOSYLATION')

def glygen_reverse(s):
    return s.replace("GLYGEN_RESIDUE", "GLYGEN.RESIDUE").replace("GLYCAN_MOTIF", "GLYCAN.MOTIF").replace('GLYCOSYLTRANSFERASE_REACTION', 'GLYCOSYLTRANSFERASE.REACTION').replace("GLYGEN_SRC", "GLYGEN.SRC").replace('GLYGEN_GLYCOSYLATION', 'GLYGEN.GLYCOSYLATION')


In [None]:
for filename in tqdm(glob("out/0915/serialization/edges/*.csv")):
    if filename not in processed:
        match = re.match(edge_pattern, glygen(filename)).groupdict()
        entity = match["entity"]
        source_type = glygen_reverse(match["source_type"])
        relation = match["relation"].replace("_", " ")
        target_type = glygen_reverse(match["target_type"])
        if "inverse" not in relation:
            edge_df = pd.read_csv(filename, low_memory=False)
            # filter for SAB
            sabs = sabs_to_keep.intersection(edge_df.SAB.unique())
            for sab in sabs:
                if sab not in sab_relations:
                    sab_relations[sab] = set()
                sab_relations[sab].add(relation)
            if len(sabs) > 0:
                edge_df = edge_df[edge_df.SAB.isin(sabs)]
                if not os.path.isfile(ids_base%source_type):
                    with open(ids_base%source_type, 'w') as o:
                        o.write("\n".join(edge_df.source))
                else:
                    with open(ids_base%source_type) as o:
                        ids = set(o.read().strip().split("\n"))
                    with open(ids_base%source_type, 'w') as o:
                        ids = ids.union(edge_df.source)
                        o.write("\n".join(ids))
                if not os.path.isfile(ids_base%target_type):
                    with open(ids_base%target_type, 'w') as o:
                        o.write("\n".join(edge_df.target))
                else:
                    with open(ids_base%target_type) as o:
                        ids = set(o.read().strip().split("\n"))
                    with open(ids_base%target_type, 'w') as o:
                        ids = ids.union(edge_df.target)
                        o.write("\n".join(ids))
                # source_df = pd.read_csv(node_base%source_type, index_col=0, low_memory=False)
                # if os.path.isfile(new_node_base%(source_type)):
                #     new_source_df = pd.read_csv(new_node_base%(source_type), index_col=0, low_memory=False)
                #     pd.concat([new_source_df, source_df]).dropna(axis=1).to_csv(new_node_base%(source_type))
                # else:
                #     source_df.dropna(axis=1).to_csv(new_node_base%(source_type))
                
                # target_df = pd.read_csv(node_base%target_type, index_col=0, low_memory=False)
                # if os.path.isfile(new_node_base%(target_type)):
                #     new_target_df = pd.read_csv(new_node_base%(target_type), index_col=0, low_memory=False)
                #     pd.concat([new_target_df, target_df]).dropna(axis=1).to_csv(new_node_base%(target_type))
                # else:
                #     target_df.dropna(axis=1).to_csv(new_node_base%(target_type))
                edge_df.to_csv(new_edge_base%(source_type, relation, target_type), index=False)
        processed.add(filename.replace("GLYGEN_RESIDUE", "GLYGEN.RESIDUE"))
                

In [None]:
count = 0
for filename in tqdm(glob("out/0915/filtered/ids/*.txt")):
    count+=1
count

In [None]:
id_pattern = "(?P<directory>.+)/(?P<type>.+)\.txt"
for filename in tqdm(glob("out/0915/filtered/ids/*.txt")):
    if not "inverse" in filename and not "isa_" in filename:
        match = re.match(id_pattern, filename).groupdict()
        node_type = match["type"]
        node_df = pd.read_csv(node_base%node_type, index_col=0, low_memory=False)
        with open(filename) as o:
            ids = list(set(o.read().strip().split("\n")).intersection(node_df.index))
        node_df.loc[ids].dropna(axis=1, how="all").to_csv(new_node_base%node_type)



In [None]:
hgnc = pd.read_csv("out/0915/filtered/nodes/Gene.nodes.csv", low_memory=False)

In [None]:
hgnc.head()

In [None]:
for i in glob('out/0915/filtered/edges/*'):
    if "UNIPROT" in i:
        print(i)

In [None]:
concepts.type.unique()

In [53]:
gene_or_genome_df = pd.read_csv("out/0915/filtered/nodes/Gene or Genome.nodes.csv", index_col=0)
uniprot = pd.read_csv("out/0915/filtered/nodes/UNIPROTKB.nodes.csv", index_col=0)
gene_df = pd.read_csv("out/0915/filtered/nodes/Gene.nodes.csv", index_col=0)

  gene_df = pd.read_csv("out/0915/filtered/nodes/Gene.nodes.csv", index_col=0)


In [54]:
gene_or_genome_df.head()

Unnamed: 0_level_0,label,type,NCI,MTH,PUBCHEM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C2828054,FANCD2 wt Allele,Gene or Genome,NCI:C86550,MTH:NOCODE,PUBCHEM:643975


In [55]:
uniprot.head()

Unnamed: 0_level_0,label,type,UNIPROTKB
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UNIPROTKB:Q92993 CUI,KAT5_HUMAN,UNIPROTKB,UNIPROTKB:Q92993
UNIPROTKB:Q9H093 CUI,NUAK2_HUMAN,UNIPROTKB,UNIPROTKB:Q9H093
UNIPROTKB:Q15418 CUI,KS6A1_HUMAN,UNIPROTKB,UNIPROTKB:Q15418
UNIPROTKB:P59540 CUI,T2R46_HUMAN,UNIPROTKB,UNIPROTKB:P59540
UNIPROTKB:P29122 CUI,PCSK6_HUMAN,UNIPROTKB,UNIPROTKB:P29122


In [56]:
gene_df.head()

Unnamed: 0,label,type,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,OMIM,...,CHEBI,ENSEMBL,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM,UNIPROTKB
C1824908,DENND1C gene,Gene,,,,,,HGNC:26225,MTH:NOCODE,OMIM:613634,...,,ENSEMBL:ENSG00000205744,10.0,6467207.0,6482557.0,ENTREZ:79958,6467207.0,6476246.0,,Q8IV53
C3541602,RN7SL41P gene,Gene,,,,,,HGNC:46057,MTH:NOCODE,,...,,ENSEMBL:ENSG00000241550,3.0,80204606.0,80204902.0,,,,,
C1425512,MIA2 gene,Gene,NCI:C155865,,,,,HGNC:18432,MTH:NOCODE,OMIM:602132,...,,ENSEMBL:ENSG00000150527,18.0,39230231.0,39388513.0,ENTREZ:4253,39230231.0,39240588.0,,Q96PC5
C1826915,RPS26P2 gene,Gene,,,,,,HGNC:23662,MTH:NOCODE,,...,,ENSEMBL:ENSG00000233278,1.0,30831878.0,30832225.0,,,,,
C1366544,ROR2 gene,Gene,NCI:C24746,,,,,HGNC:10257,MTH:NOCODE,OMIM:602337,...,,ENSEMBL:ENSG00000169071,15.0,91563091.0,91950228.0,ENTREZ:4920,91563091.0,91948880.0,,Q01974


In [58]:
for i, row in gene_or_genome_df.iterrows():
    gene_df.loc[i] = row

In [59]:
for i, row in uniprot.iterrows():
    gene_df.loc[i] = row

In [60]:
gene_df.type = "Gene"

In [61]:
gene_df.to_csv("out/0915/filtered/nodes/Gene.nodes.csv")

In [64]:
import os

In [65]:
os.remove("out/0915/filtered/nodes/UNIPROTKB.nodes.csv")
os.remove("out/0915/filtered/nodes/Gene or Genome.nodes.csv")

In [67]:
for i in glob('out/0915/filtered/edges/*'):
    if "UNIPROTKB" in i:
        os.rename(i, i.replace("UNIPROTKB", "Gene"))
    if "Gene or Genome" in i:
        os.rename(i, i.replace("Gene or Genome", "Gene"))

In [69]:
gene_df.head()

Unnamed: 0,label,type,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,OMIM,...,CHEBI,ENSEMBL,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM,UNIPROTKB
C1824908,DENND1C gene,Gene,,,,,,HGNC:26225,MTH:NOCODE,OMIM:613634,...,,ENSEMBL:ENSG00000205744,10.0,6467207.0,6482557.0,ENTREZ:79958,6467207.0,6476246.0,,Q8IV53
C3541602,RN7SL41P gene,Gene,,,,,,HGNC:46057,MTH:NOCODE,,...,,ENSEMBL:ENSG00000241550,3.0,80204606.0,80204902.0,,,,,
C1425512,MIA2 gene,Gene,NCI:C155865,,,,,HGNC:18432,MTH:NOCODE,OMIM:602132,...,,ENSEMBL:ENSG00000150527,18.0,39230231.0,39388513.0,ENTREZ:4253,39230231.0,39240588.0,,Q96PC5
C1826915,RPS26P2 gene,Gene,,,,,,HGNC:23662,MTH:NOCODE,,...,,ENSEMBL:ENSG00000233278,1.0,30831878.0,30832225.0,,,,,
C1366544,ROR2 gene,Gene,NCI:C24746,,,,,HGNC:10257,MTH:NOCODE,OMIM:602337,...,,ENSEMBL:ENSG00000169071,15.0,91563091.0,91950228.0,ENTREZ:4920,91563091.0,91948880.0,,Q01974


In [70]:
hgnc_genes = pd.read_csv("dd_data/HGNC_genes.txt", sep="\t")
hgnc_genes.head()

Unnamed: 0,HGNC ID,Status,Approved symbol,Approved name,Enzyme (EC) ID,UniProt accession
0,HGNC:5,Approved,A1BG,alpha-1-B glycoprotein,,P04217
1,HGNC:37133,Approved,A1BG-AS1,A1BG antisense RNA 1,,
2,HGNC:24086,Approved,A1CF,APOBEC1 complementation factor,,Q9NQ94
3,HGNC:7,Approved,A2M,alpha-2-macroglobulin,,P01023
4,HGNC:27057,Approved,A2M-AS1,A2M antisense RNA 1,,


In [71]:
hgnc_mapper = {}
for i, row in hgnc_genes.iterrows():
    hgnc_id = row["HGNC ID"]
    enz_id = row["Enzyme (EC) ID"]
    if hgnc_id not in hgnc_mapper:
        hgnc_mapper[hgnc_id] = {
            "EC ID": enz_id,
            "is_Enzyme": type(enz_id) == str
        }
    

In [74]:
for i, row in gene_df.iterrows():
    hgnc_id = row["HGNC"]
    if hgnc_id in hgnc_mapper:
        gene_df.at[i, "EC ID"] = hgnc_mapper[hgnc_id]["EC ID"]
        gene_df.at[i, "is_Enzyme"] = hgnc_mapper[hgnc_id]["is_Enzyme"]

In [78]:
gene_df[gene_df.is_Enzyme == True]

Unnamed: 0,label,type,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,OMIM,...,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM,UNIPROTKB,EC ID,is_Enzyme
C1413862,CYP24A1 gene,Gene,NCI:C104146,,,,,HGNC:2602,MTH:NOCODE,OMIM:126065,...,10.0,54153446.0,54173986.0,ENTREZ:1591,54153446.0,54173986.0,,Q07973,1.14.15.16,True
C1418489,PFKFB3 gene,Gene,NCI:C102808,,,,,HGNC:8874,MTH:NOCODE,OMIM:605319,...,21.0,6144934.0,6254644.0,ENTREZ:5209,6144934.0,6235532.0,,Q16875,2.7.1.105,True
C1415713,HR gene,Gene,,,,,,HGNC:5172,MTH:NOCODE,OMIM:602302,...,16.0,22114419.0,22133384.0,ENTREZ:55806,22114419.0,22131010.0,,O43593,1.14.11.-,True
C1537989,MT-ND1 gene,Gene,,,,,,HGNC:7455,MTH:NOCODE,OMIM:516000,...,2.0,3307.0,4262.0,ENTREZ:4535,3307.0,4262.0,,P03886,1.6.5.3,True
C1425508,AGO3 gene,Gene,NCI:C84491,,,,,HGNC:18421,MTH:NOCODE,OMIM:607355,...,20.0,35930718.0,36072500.0,ENTREZ:192669,35930718.0,35983409.0,,Q9H9G7,3.1.26.n2,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C1333339,EPHA8 gene,Gene,NCI:C24365,,,,,HGNC:3391,MTH:NOCODE,OMIM:176945,...,12.0,22563489.0,22603595.0,ENTREZ:2046,22563489.0,22603595.0,,P29322,2.7.10.1,True
C1823183,AMDHD2 gene,Gene,,,,,,HGNC:24262,MTH:NOCODE,,...,16.0,2520357.0,2531422.0,ENTREZ:51005,2520357.0,2530491.0,,Q9Y303,3.5.1.25,True
C1418446,PDXK gene,Gene,,,,,,HGNC:8819,MTH:NOCODE,OMIM:179020,...,19.0,43719094.0,43762307.0,ENTREZ:8566,43719094.0,43738861.0,,O00764,2.7.1.35,True
C1413656,COX7A2 gene,Gene,,,,,,HGNC:2288,MTH:NOCODE,OMIM:123996,...,13.0,75237675.0,75250323.0,ENTREZ:1347,75237675.0,75250323.0,,P14406,1.9.3.1,True


In [80]:
for filename in glob('out/0915/filtered/edges/*'):
    df = pd.read_csv(filename)
    if "CMAP" in df.SAB.unique():
        print(filename)
        os.remove(filename)


out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Hormone.edges.csv
out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Amino Acid, Peptide, or Protein.edges.csv
out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Organic Chemical.edges.csv
out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Drug.edges.csv
out/0915/filtered/edges/Gene.positively correlated with chemical or drug.Drug.edges.csv
out/0915/filtered/edges/Gene.positively correlated with chemical or drug.Organic Chemical.edges.csv
out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Pharmacologic Substance.edges.csv
out/0915/filtered/edges/Gene.positively correlated with chemical or drug.Nucleic Acid, Nucleoside, or Nucleotide.edges.csv
out/0915/filtered/edges/Gene.positively correlated with chemical or drug.Pharmacologic Substance.edges.csv
out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Nucleic 

In [82]:
dcc_mapper = {}
with open('output/sabs_dcc_mapper.txt') as o:
    for line in o:
        r = line.strip().split(":")
        if len(r) == 2:
            dcc_mapper[r[0]] =r[1]
        else:
            dcc_mapper[r[0]] =r[0]

In [84]:
for filename in glob('out/0915/filtered/edges/*'):
    df = pd.read_csv(filename)
    if len(df.SAB.unique()) > 1:
        print(filename, df.SAB.unique())
        df["DCC"] = dcc_mapper[df.SAB.unique()[0]]
    else:
        df["DCC"] = dcc_mapper[df.SAB.unique()[0]]
    df.to_csv(filename)

out/0915/filtered/edges/GLYCAN.isa.UNIPROT.edges.csv ['GLYCORDF' 'GLYCOCOO']
out/0915/filtered/edges/GLYCAN.isa.GLYCAN.edges.csv ['GLYCORDF' 'GLYCOCOO']


In [86]:
gene_df.index.name = "id"
gene_df

Unnamed: 0_level_0,label,type,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,OMIM,...,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM,UNIPROTKB,EC ID,is_Enzyme
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1824908,DENND1C gene,Gene,,,,,,HGNC:26225,MTH:NOCODE,OMIM:613634,...,10.0,6467207.0,6482557.0,ENTREZ:79958,6467207.0,6476246.0,,Q8IV53,,False
C3541602,RN7SL41P gene,Gene,,,,,,HGNC:46057,MTH:NOCODE,,...,3.0,80204606.0,80204902.0,,,,,,,False
C1425512,MIA2 gene,Gene,NCI:C155865,,,,,HGNC:18432,MTH:NOCODE,OMIM:602132,...,18.0,39230231.0,39388513.0,ENTREZ:4253,39230231.0,39240588.0,,Q96PC5,,False
C1826915,RPS26P2 gene,Gene,,,,,,HGNC:23662,MTH:NOCODE,,...,1.0,30831878.0,30832225.0,,,,,,,False
C1366544,ROR2 gene,Gene,NCI:C24746,,,,,HGNC:10257,MTH:NOCODE,OMIM:602337,...,15.0,91563091.0,91950228.0,ENTREZ:4920,91563091.0,91948880.0,,Q01974,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UNIPROTKB:Q9NYY3 CUI,PLK2_HUMAN,Gene,,,,,,,,,...,,,,,,,,UNIPROTKB:Q9NYY3,,
UNIPROTKB:P07148 CUI,FABPL_HUMAN,Gene,,,,,,,,,...,,,,,,,,UNIPROTKB:P07148,,
UNIPROTKB:P29317 CUI,EPHA2_HUMAN,Gene,,,,,,,,,...,,,,,,,,UNIPROTKB:P29317,,
UNIPROTKB:O14804 CUI,TAAR5_HUMAN,Gene,,,,,,,,,...,,,,,,,,UNIPROTKB:O14804,,


In [87]:
gene_df.to_csv("out/0915/filtered/nodes/Gene.nodes.csv")

In [92]:
filenames = []
for filename in glob('out/0915/filtered/nodes/*'):
    df = pd.read_csv(filename, index_col=0, low_memory=False)
    if "label" not in df.columns:
        filenames.append(filename)
        df['label'] = df.index
        df.to_csv(filename)
        print(filename)

out/0915/filtered/nodes/GLYCOSYLTRANSFERASE REACTION.nodes.csv
out/0915/filtered/nodes/EXPBINS.nodes.csv
out/0915/filtered/nodes/GLYGEN SRC.nodes.csv
out/0915/filtered/nodes/GLYGEN GLYCOSYLATION.nodes.csv
out/0915/filtered/nodes/CHLO.nodes.csv
out/0915/filtered/nodes/ENCODE CCRE CTCF.nodes.csv
out/0915/filtered/nodes/4DNQ.nodes.csv
out/0915/filtered/nodes/4DNL.nodes.csv
out/0915/filtered/nodes/MEDGEN.nodes.csv
out/0915/filtered/nodes/KFVARBIN.nodes.csv
out/0915/filtered/nodes/GTEXEXP.nodes.csv
out/0915/filtered/nodes/PVALUEBINS.nodes.csv
out/0915/filtered/nodes/KFPT.nodes.csv
out/0915/filtered/nodes/GTEXEQTL.nodes.csv
out/0915/filtered/nodes/MOTORPAC.nodes.csv
out/0915/filtered/nodes/ENCODE CCRE H3K4ME3.nodes.csv
out/0915/filtered/nodes/ENCODE CCRE H3K27AC.nodes.csv
out/0915/filtered/nodes/ENCODE CCRE.nodes.csv


In [None]:
df