In [93]:
import pandas as pd
from glob import glob
from IPython.display import display, Markdown
from tqdm import tqdm
import csv
pd.__version__

'2.0.3'

In [94]:
base_input = "dd_data/20230802/"

In [3]:
hgnc_info = pd.read_csv('dd_data/HGNC_genes.txt', sep="\t")
hgnc_mapper = {}
for i, row in hgnc_info.iterrows():
    uid = row["HGNC ID"]
    enz = row["Enzyme (EC) ID"]
    unip = row["UniProt accession"]
    if uid not in hgnc_mapper:
        hgnc_mapper[uid] = {}
        if type(unip) == str:
            hgnc_mapper[uid]["UNIPROTKB"] = unip
        if type(enz) == str:
            hgnc_mapper[uid]["ec_id"] = enz
                


## Load Data

In [4]:
concepts = pd.read_csv(base_input + "neo4j/import/CUIs.csv")
concepts = pd.DataFrame(index=concepts["CUI:ID"].unique())
concepts.index.name = "id"
concepts.head()

C0000097
C0000359
C0000610
C0000739
C0000873


In [5]:
semantics = pd.read_csv(base_input + "neo4j/import/TUIs.csv", index_col=0)
semantics.head()

Unnamed: 0_level_0,name,STN,DEF
TUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T012,Bird,A1.1.3.1.1.2,A vertebrate having a constant body temperatur...
T014,Reptile,A1.1.3.1.1.5,A cold-blooded vertebrate having an external c...
T015,Mammal,A1.1.3.1.1.4,A vertebrate having a constant body temperatur...
T021,Fully Formed Anatomical Structure,A1.2.3,An anatomical structure in a fully formed orga...
T022,Body System,A2.1.4.1,A complex of anatomical structures that perfor...


In [6]:
terms = pd.read_csv(base_input + "neo4j/import/SUIs.csv", index_col=0)
terms.head()

Unnamed: 0_level_0,name
SUI:ID,Unnamed: 1_level_1
S0009776,"Acid, 2-Aminohexanedioic"
S7249234,BR CAMP
S11872577,cramps abdominal
S14680596,Retained tissue after pregnancy loss
S3417882,Missed miscarriage


In [7]:
codes = pd.read_csv(base_input + "neo4j/import/CODEs.csv", index_col=0)
codes.head()

  codes = pd.read_csv(base_input + "neo4j/import/CODEs.csv", index_col=0)


Unnamed: 0_level_0,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
CodeID:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTHSPL:J7A92W69L7,MTHSPL,J7A92W69L7,,,,
NCI:C76777,NCI,C76777,,,,
ATC:N07XX07,ATC,N07XX07,,,,
GS:1946,GS,1946,,,,
NOC:040413,NOC,040413,,,,


In [8]:
concept_term = pd.read_csv(base_input + "neo4j/import/CUI-SUIs.csv")
concept_term.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000163,S0007806
1,C0000427,S0008336
2,C0000665,S11922611
3,C0000992,S1577859
4,C0001021,S0797872


In [9]:
concept_semantics = pd.read_csv(base_input + "neo4j/import/CUI-TUIs.csv")
concept_semantics.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000132,T126
1,C0000246,T116
2,C0000895,T060
3,C0000908,T037
4,C0000931,T067


In [10]:
concept_code = pd.read_csv(base_input + "neo4j/import/CUI-CODEs.csv")
concept_code.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000294,ATC:V03AF01
1,C0000481,CHV:0000000513
2,C0000661,MSH:D015124
3,C0000665,VANDF:4020847
4,C0000737,LNC:LA15468-4


In [11]:
semantics_semantics = pd.read_csv(base_input + "neo4j/import/TUIrel.csv")
semantics_semantics.head()

Unnamed: 0,:END_ID,:START_ID
0,T204,T002
1,T001,T004
2,T071,T004
3,T072,T010
4,T204,T010


In [12]:
code_term = pd.read_csv(base_input + "neo4j/import/CODE-SUIs.csv")
code_term.head()

Unnamed: 0,:END_ID,:START_ID,:TYPE,CUI
0,S1424701,RXNORM:74,IN,C0000473
1,S18541041,SNOMEDCT_US:80994002,FN,C0000477
2,S11730064,SNOMEDCT_US:226367006,SY,C0000545
3,S0288461,CSP:2005-4146,PT,C0000735
4,S1957040,MDR:10048885,LLT,C0000735


## Merge Concept and Terms

In [13]:
concept_term.columns = ["CUI:ID", "SUI:ID"]
concept_term.shape

(7923747, 2)

In [14]:
concept_term = pd.merge(concept_term, terms, on="SUI:ID", how='outer')
concept_term = concept_term.groupby('CUI:ID').first()
concept_term.head()

Unnamed: 0_level_0,SUI:ID,name
CUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1
4DND:4DNES1JP4KZ1 CUI,aW4gc2l0dSBIaS1DIG9uIEhDVDExNiBjZWxscyAoY29udG...,in situ Hi-C on HCT116 cells (containing AID-t...
4DND:4DNES21D8SP8 CUI,TWljcm8tQyBvbiBIMS1FU0MgY2VsbHMuSDEtRVND,Micro-C on H1-ESC cells.H1-ESC
4DND:4DNES2M5JIGV CUI,aW4gc2l0dSBIaS1DIG9uICBIaS1FU0MgY2VsbHMuSDEtRVND,in situ Hi-C on Hi-ESC cells.H1-ESC
4DND:4DNES2R6PUEK CUI,aW4gc2l0dSBIaS1DIG9uIEhGRmM2IGNlbGxzLkhGRmM2,in situ Hi-C on HFFc6 cells.HFFc6
4DND:4DNES3QAGOZZ CUI,aW4gc2l0dSBIaS1DIG9uIEhDVDExNiBjZWxscyAoY29udG...,in situ Hi-C on HCT116 cells (containing AID-t...


In [15]:
concept_term.shape

(7923727, 2)

In [16]:
concept_term.columns = ["SUI:ID", "label"]
concept_term = concept_term[["label"]]
concept_term.head()

Unnamed: 0_level_0,label
CUI:ID,Unnamed: 1_level_1
4DND:4DNES1JP4KZ1 CUI,in situ Hi-C on HCT116 cells (containing AID-t...
4DND:4DNES21D8SP8 CUI,Micro-C on H1-ESC cells.H1-ESC
4DND:4DNES2M5JIGV CUI,in situ Hi-C on Hi-ESC cells.H1-ESC
4DND:4DNES2R6PUEK CUI,in situ Hi-C on HFFc6 cells.HFFc6
4DND:4DNES3QAGOZZ CUI,in situ Hi-C on HCT116 cells (containing AID-t...


In [17]:
concept_term.shape

(7923727, 1)

In [18]:
concepts.loc[concept_term.index, 'label'] = concept_term.loc[concept_term.index, 'label']
concepts.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine"
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase"
C0000610,6-Aminonicotinamide
C0000739,Abdominal Muscles
C0000873,Academic Problem


## Semantics

In [19]:
semantics.head()

Unnamed: 0_level_0,name,STN,DEF
TUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T012,Bird,A1.1.3.1.1.2,A vertebrate having a constant body temperatur...
T014,Reptile,A1.1.3.1.1.5,A cold-blooded vertebrate having an external c...
T015,Mammal,A1.1.3.1.1.4,A vertebrate having a constant body temperatur...
T021,Fully Formed Anatomical Structure,A1.2.3,An anatomical structure in a fully formed orga...
T022,Body System,A2.1.4.1,A complex of anatomical structures that perfor...


In [20]:
concept_semantics.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000132,T126
1,C0000246,T116
2,C0000895,T060
3,C0000908,T037
4,C0000931,T067


In [21]:
no_type = set(concepts.index) - set(concept_semantics[':START_ID'])
len(no_type)

12264238

In [22]:
with open('out/0623/semantics_ranked.tsv') as o:
    ranked_type = [i.strip() for i in o.read().strip().split("\n")]

In [23]:
concept_semantics.columns = ["id", "TUI:ID"]
concept_semantics["type"] = [semantics.at[i, 'name'] for i in concept_semantics['TUI:ID']]
concept_semantics.head()

Unnamed: 0,id,TUI:ID,type
0,C0000132,T126,Enzyme
1,C0000246,T116,"Amino Acid, Peptide, or Protein"
2,C0000895,T060,Diagnostic Procedure
3,C0000908,T037,Injury or Poisoning
4,C0000931,T067,Phenomenon or Process


In [24]:
def fetch_type(v):
    cat = ""
    rank = len(ranked_type)
    for i in v:
        r = ranked_type.index(i)
        if r < rank:
            cat = i
            rank = r
    return cat

In [25]:
cs = concept_semantics.groupby('id')['type'].apply(lambda x: "; ".join(set(x)))
cs.head()

id
C0000005    Amino Acid, Peptide, or Protein; Indicator, Re...
C0000039            Organic Chemical; Pharmacologic Substance
C0000052              Amino Acid, Peptide, or Protein; Enzyme
C0000074                                     Organic Chemical
C0000084    Amino Acid, Peptide, or Protein; Biologically ...
Name: type, dtype: object

In [26]:
cs_ranked = concept_semantics.groupby('id')['type'].apply(fetch_type)
cs_ranked.head()

id
C0000005    Amino Acid, Peptide, or Protein
C0000039                   Organic Chemical
C0000052                             Enzyme
C0000074                   Organic Chemical
C0000084    Amino Acid, Peptide, or Protein
Name: type, dtype: object

In [27]:
common = list(set(concepts.index).intersection(cs.index))
cs[common].head()

id
C4596166                                       Fungus
C2131805                                      Finding
C1550825                             Population Group
C3884354    Organic Chemical; Pharmacologic Substance
C0857614                                      Finding
Name: type, dtype: object

In [28]:
concept_semantics
concepts.loc[common, 'type'] = cs_ranked[common]
concepts.loc[common, 'type_combined'] = cs[common]
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Amino Acid, Peptide, or Protein; Enzyme"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Pharmacologic Substance; Vitamin
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [29]:
out_prefix = "out/0915/"

In [30]:
concepts.groupby("type_combined").first().to_csv(out_prefix + 'semantics.tsv', sep="\t")

In [31]:
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Amino Acid, Peptide, or Protein; Enzyme"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Pharmacologic Substance; Vitamin
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [32]:
concepts.shape

(15527671, 3)

In [33]:
with open(out_prefix + 'semantics_list.tsv', 'w') as o:
    o.write("\n".join([str(i) for i in concept_semantics.type.unique()]))

In [34]:
codes.head()

Unnamed: 0_level_0,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
CodeID:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTHSPL:J7A92W69L7,MTHSPL,J7A92W69L7,,,,
NCI:C76777,NCI,C76777,,,,
ATC:N07XX07,ATC,N07XX07,,,,
GS:1946,GS,1946,,,,
NOC:040413,NOC,040413,,,,


In [35]:
concept_code.columns = ["id", "CodeID:ID"]
concept_code.head()

Unnamed: 0,id,CodeID:ID
0,C0000294,ATC:V03AF01
1,C0000481,CHV:0000000513
2,C0000661,MSH:D015124
3,C0000665,VANDF:4020847
4,C0000737,LNC:LA15468-4


In [36]:
concept_code = pd.merge(concept_code, codes, on="CodeID:ID", how='left')
concept_code.head()

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
0,C0000294,ATC:V03AF01,ATC,V03AF01,,,,
1,C0000481,CHV:0000000513,CHV,0000000513,,,,
2,C0000661,MSH:D015124,MSH,D015124,,,,
3,C0000665,VANDF:4020847,VANDF,4020847,,,,
4,C0000737,LNC:LA15468-4,LNC,LA15468-4,,,,


In [37]:
concept_code[concept_code.id == 'C0000097']

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
623690,C0000097,LCH_NW:sh86002892,LCH_NW,sh86002892,,,,
1108107,C0000097,CSP:2511-0411,CSP,2511-0411,,,,
1246274,C0000097,PSY:31213,PSY,31213,,,,
2179442,C0000097,MSH:D015632,MSH,D015632,,,,
3529094,C0000097,CHV:0000000501,CHV,0000000501,,,,
3736237,C0000097,PSY:32433,PSY,32433,,,,
4116966,C0000097,SNOMEDCT_US:285407008,SNOMEDCT_US,285407008,,,,


In [38]:
concepts.head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Organic Chemical,Organic Chemical; Hazardous or Poisonous Subst...
C0000359,"3',5'-Cyclic-Nucleotide Phosphodiesterase",Enzyme,"Amino Acid, Peptide, or Protein; Enzyme"
C0000610,6-Aminonicotinamide,Pharmacologic Substance,Pharmacologic Substance; Vitamin
C0000739,Abdominal Muscles,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Academic Problem,Finding,Finding


In [39]:
type_mapper = {}
with open("output/unique_SABS_of_Concept_Mapper.txt") as o:
    for line in o:
        r = line.strip().split(":")
        if len(r) == 2:
            type_mapper[r[0]] = r[1]
        elif 'MSIGDB' in r[0]:
            type_mapper[r[0]] = 'MSIGDB'
        else:
            type_mapper[r[0]] = r[0]

In [40]:
for i,row in tqdm(concept_code[concept_code.id.isin(concepts[concepts.type.isna()].index)].iterrows()):
    sab = row["SAB"]
    ind = row["id"]
    if type(sab) == str:
        if 'MSIGDB' in sab:
            sab = 'MSIGDB'
            concept_code.at[i, 'SAB'] = 'MSIGDB'
        if sab == 'MSIGDB':
            tp = 'MSIGDB'
        else:
            tp = type_mapper[sab]
        if tp:
            concepts.at[ind, "type"] = tp
            concepts.at[ind, "type_combined"] = tp


15721067it [07:40, 34108.59it/s]


In [41]:
for i, row in concepts.iterrows():
    concepts.at[i, "type"] = row["type"].replace(".", " ")
    concepts.at[i, "type_combined"] = row["type_combined"].replace(".", " ")


In [42]:
concepts[concepts.type == "UNIPROTKB"].head()

Unnamed: 0_level_0,label,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UNIPROTKB:A0A087X1C5 CUI,CP2D7_HUMAN,UNIPROTKB,UNIPROTKB
UNIPROTKB:A0A0B4J2F0 CUI,PIOS1_HUMAN,UNIPROTKB,UNIPROTKB
UNIPROTKB:A0A0B4J2F2 CUI,SIK1B_HUMAN,UNIPROTKB,UNIPROTKB
UNIPROTKB:A0A0C5B5G6 CUI,MOTSC_HUMAN,UNIPROTKB,UNIPROTKB
UNIPROTKB:A0A0K2S4Q6 CUI,CD3CH_HUMAN,UNIPROTKB,UNIPROTKB


In [None]:
for tp in tqdm(concepts.type.unique()):
    con = concepts[concepts.type==tp].copy()
    cc = concept_code[concept_code.id.isin(con.index)]
    for sab in cc.SAB.unique():
        c = cc[cc.SAB == sab]
        c = c.groupby('id').first()
        common = list(set(con.index).intersection(c.index))
        con.loc[common, sab] = c.loc[common, "CodeID:ID"]
        if c.loc[common, "value:float"].isna().sum() != len(common):
            con.loc[common, "%s value"%sab] = c.loc[common, "value:float"]
        if c.loc[common, "lowerbound:float"].isna().sum() != len(common):
            con.loc[common, "%s lowerbound"%sab] = c.loc[common, "lowerbound:float"]
        if c.loc[common, "upperbound:float"].isna().sum() != len(common):
            con.loc[common, "%s upperbound"%sab] = c.loc[common, "upperbound:float"]
        if c.loc[common, "unit"].isna().sum() != len(common):
            con.loc[common, "%s unit"%sab] = c.loc[common, "unit"]
        if "-" in list(con["label"]):
            tmp = con[con.label == "-"]
            ind = set(tmp.index).intersection(con.index)
            ind2 = set(tmp.index).intersection(c.index)
            if len(ind.intersection(ind2)) > 0:
                l = list(ind.intersection(ind2))
                con.loc[l, "label"] = c.loc[l, 'CodeID:ID']
        con.to_csv("out/0915/serialization/nodes/%s.nodes.csv"%(tp))


In [49]:
gene_or_genome_df = pd.read_csv("out/0915/serialization/nodes/Gene or Genome.nodes.csv", index_col=0)
uniprot = pd.read_csv("out/0915/serialization/nodes/UNIPROTKB.nodes.csv", index_col=0)
gene_df = pd.read_csv("out/0915/serialization/nodes/Gene.nodes.csv", index_col=0)


  gene_df = pd.read_csv("out/0915/filtered/nodes/Gene.nodes.csv", index_col=0)


In [50]:
uniprot.head()

Unnamed: 0_level_0,label,type,UNIPROTKB
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UNIPROTKB:Q92993 CUI,KAT5_HUMAN,UNIPROTKB,UNIPROTKB:Q92993
UNIPROTKB:Q9H093 CUI,NUAK2_HUMAN,UNIPROTKB,UNIPROTKB:Q9H093
UNIPROTKB:Q15418 CUI,KS6A1_HUMAN,UNIPROTKB,UNIPROTKB:Q15418
UNIPROTKB:P59540 CUI,T2R46_HUMAN,UNIPROTKB,UNIPROTKB:P59540
UNIPROTKB:P29122 CUI,PCSK6_HUMAN,UNIPROTKB,UNIPROTKB:P29122


In [51]:
uniprot.shape, gene_df.shape, gene_or_genome_df.shape

((2546, 3), (39920, 28), (1, 5))

In [52]:
gene_or_genome_df

Unnamed: 0_level_0,label,type,NCI,MTH,PUBCHEM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C2828054,FANCD2 wt Allele,Gene or Genome,NCI:C86550,MTH:NOCODE,PUBCHEM:643975


In [None]:
uniprot_id_mapper = pd.read_csv('output/idmapping_2023_09_18.tsv', sep="\t", index_col=0)
uniprot_id_mapper.head()

In [None]:
new_gene_or_genome = gene_or_genome_df[gene_or_genome_df.HGNC.isna()]

In [None]:
rows = {}
hgnc_mapper = {}
for i, row in gene_or_genome_df[~gene_or_genome_df.HGNC.isna()].iterrows():
    hgnc = row["HGNC"]
    hgnc_mapper[hgnc] = i
    row["type"] = "Gene"
    row["type_combined"] = row["type_combined"].replace("Gene or Genome", "Gene")
    rows[i] = row
len(rows)

In [None]:
for i, row in gene_df.iterrows():
    hgnc = row["HGNC"]
    if hgnc not in hgnc_mapper:
        hgnc_mapper[hgnc] = i
        row["type"] = "Gene"
        rows[i] = row
len(rows)

In [None]:
gene_df.head()

In [None]:
uniprot.head()

In [None]:
uniprot_kb_mapper = {}
uniprot_list = []
with open("uniprot_ids_0917.txt", "w") as o:
    for i, row in uniprot.iterrows():
        kb = row["UNIPROTKB"].replace("UNIPROTKB:", "")
        o.write("%s\n"%kb)
    # hgnc = uniprot_id_mapper.at[kb, 'To']
    # hgnc = row["HGNC"]
    # if hgnc not in hgnc_mapper:
    #     hgnc_mapper[hgnc] = i
    #     row["type"] = "Gene"
    #     rows[i] = row

In [None]:
uniprot_mapper = {}
for k, v in uniprot_id_mapper.iterrows():
    uniprot_mapper[k] = v["To"]

In [None]:
no_hgnc = set()
for i, row in uniprot.iterrows():
    kb = row["UNIPROTKB"].replace("UNIPROTKB:", "")
    if kb not in uniprot_mapper:
        no_hgnc.add(kb)
        rows[i] = row
    else:
        hgnc = uniprot_mapper[kb]
        if hgnc in hgnc_mapper:
            cui = hgnc_mapper[hgnc]
            rows[cui]["UNIPROTKB"] = kb
        else:
            row["HGNC"] = hgnc
            row["type"] = "Gene"
            row["type_combined"] = "Gene"
            rows[i] = row

In [None]:
len(rows)

In [None]:
new_gene_df = pd.DataFrame.from_dict(rows, orient="index")

In [None]:
new_gene_df.head()

In [None]:
new_gene_df.type_combined = "Gene"
new_gene_df.type = "Gene"
new_gene_df.type_combined.unique(), new_gene_df.type.unique()

In [None]:
concepts.head()

In [None]:
for i in new_gene_df.index:
    concepts.at[i, "type"] = "Gene"
    concepts.at[i, "type_combined"] = "Gene"

In [None]:
new_gene_df.to_csv("out/0915/serialization/nodes/Gene.nodes.csv")

In [None]:
new_gene_or_genome.to_csv("out/0915/serialization/nodes/Gene or Genome.nodes.csv")

In [None]:
import os

In [None]:
row_headers = ["source", "relation", "target", "source_label", "target_label", "SAB", "evidence"]
with open(base_input + "neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in tqdm(csv_reader):
        if not headers:
            headers = row
        else:
            source = row[0]
            if source in uniprot_mapper:
                source = uniprot_mapper[source]
            target = row[1]
            if target in uniprot_mapper:
                target = uniprot_mapper[target]
            if source in concepts.index and target in concepts.index:
                source_label = concepts.at[source, 'label']
                source_type = concepts.at[source, 'type']
                
                target_label = concepts.at[target, 'label']
                target_type = concepts.at[target, 'type']
                relation = row[2]
                sab = row[3]
                evidence = ''
                if len(row) > 4:
                    evidence = row[4]
                filename = 'out/0915/serialization/edges/%s.%s.%s.edges.csv'%(source_type, relation, target_type)
                write_header = False
                operation = "a"
                if not os.path.isfile(filename):
                    write_header = True
                    operation = "w"
                # source_list = set()
                # target_list = set()
                with open(filename, operation) as w:
                    csv_writer = csv.writer(w)
                    if write_header:
                        csv_writer.writerow(row_headers)
                    csv_writer.writerow([source, relation, target, source_label, target_label, sab, evidence])
                #     source_list.add(source)
                #     target_list.add(target)

                # # take note of nodes that are used for source and target
                # source_ids = "out/serialization/ids/%s.txt"%source_type
                # if not os.path.isfile(source_ids):
                #     with open(source_ids, 'w') as o:
                #         o.write("\n".join(source_list))
                # else:
                #     with open(source_ids) as o:
                #         source_list = source_list.union(o.read().strip().split("\n"))
                #     with open(source_ids, 'w') as o:
                #         o.write("\n".join(source_list))
                # target_ids = "out/serialization/ids/%s.txt"%target_type
                # if not os.path.isfile(target_ids):
                #     with open(target_ids, 'w') as o:
                #         o.write("\n".join(target_list))
                # else:
                #     with open(target_ids) as o:
                #         target_list = target_list.union(o.read().strip().split("\n"))
                #     with open(target_ids, 'w') as o:
                #         o.write("\n".join(target_list))

In [None]:
for filename in glob("out/0915/serialization/nodes/*.csv"):
    df = pd.read_csv(filename, index_col=0, low_memory=False)
    orig_columns = df.columns
    if "type_combined" in df.columns:
        dtype = df.type.unique()[0]
        combined = set()
        for i in df.type_combined:
            combined = combined.union(i.split("; "))
        # remove og type
        combined = combined - {dtype}
        columns = [i for i in df.columns if not i == "type_combined"] + list(combined)
        if len(combined) > 0:
            print(filename)
            for i in combined:
                df[i] = False
            for i, row in df.iterrows():
                type_combined = row["type_combined"].split("; ")
                for t in type_combined:
                    col = "is_%s"%t
                    df.at[i, col] = True
        df = df[columns]
        df.to_csv(filename)

In [100]:
with open("output/august_dcc_sabs.txt") as o:
    sabs_to_keep = set(o.read().strip().split("\n"))

In [26]:
import re
import os
edge_pattern = "(?P<directory>.+)/(?P<source_type>.+)\.(?P<relation>.+)\.(?P<target_type>.+)\.(?P<entity>.+)\.csv"


In [105]:
node_base = "out/0915/serialization/nodes/%s.nodes.csv"
new_node_base = "out/0915/filtered/nodes/%s.nodes.csv"
new_edge_base = "out/0915/filtered/edges/%s.%s.%s.edges.csv"
ids_base = "out/0915/filtered/ids/%s.txt"
node_ids = {}
sab_relations = {}
processed = set()

In [106]:
def glygen(s):
    return s.replace("GLYGEN.RESIDUE", "GLYGEN_RESIDUE").replace("GLYCAN.MOTIF", "GLYCAN_MOTIF").replace('GLYCOSYLTRANSFERASE.REACTION', 'GLYCOSYLTRANSFERASE_REACTION').replace("GLYGEN.SRC", "GLYGEN_SRC").replace('GLYGEN.GLYCOSYLATION', 'GLYGEN_GLYCOSYLATION')

def glygen_reverse(s):
    return s.replace("GLYGEN_RESIDUE", "GLYGEN.RESIDUE").replace("GLYCAN_MOTIF", "GLYCAN.MOTIF").replace('GLYCOSYLTRANSFERASE_REACTION', 'GLYCOSYLTRANSFERASE.REACTION').replace("GLYGEN_SRC", "GLYGEN.SRC").replace('GLYGEN_GLYCOSYLATION', 'GLYGEN.GLYCOSYLATION')


In [107]:
for filename in tqdm(glob("out/0915/serialization/edges/*.csv")):
    if filename not in processed:
        match = re.match(edge_pattern, glygen(filename)).groupdict()
        entity = match["entity"]
        source_type = glygen_reverse(match["source_type"])
        relation = match["relation"].replace("_", " ")
        target_type = glygen_reverse(match["target_type"])
        if "inverse" not in relation:
            edge_df = pd.read_csv(filename, low_memory=False)
            # filter for SAB
            sabs = sabs_to_keep.intersection(edge_df.SAB.unique())
            for sab in sabs:
                if sab not in sab_relations:
                    sab_relations[sab] = set()
                sab_relations[sab].add(relation)
            if len(sabs) > 0:
                edge_df = edge_df[edge_df.SAB.isin(sabs)]
                if not os.path.isfile(ids_base%source_type):
                    with open(ids_base%source_type, 'w') as o:
                        o.write("\n".join(edge_df.source))
                else:
                    with open(ids_base%source_type) as o:
                        ids = set(o.read().strip().split("\n"))
                    with open(ids_base%source_type, 'w') as o:
                        ids = ids.union(edge_df.source)
                        o.write("\n".join(ids))
                if not os.path.isfile(ids_base%target_type):
                    with open(ids_base%target_type, 'w') as o:
                        o.write("\n".join(edge_df.target))
                else:
                    with open(ids_base%target_type) as o:
                        ids = set(o.read().strip().split("\n"))
                    with open(ids_base%target_type, 'w') as o:
                        ids = ids.union(edge_df.target)
                        o.write("\n".join(ids))
                # source_df = pd.read_csv(node_base%source_type, index_col=0, low_memory=False)
                # if os.path.isfile(new_node_base%(source_type)):
                #     new_source_df = pd.read_csv(new_node_base%(source_type), index_col=0, low_memory=False)
                #     pd.concat([new_source_df, source_df]).dropna(axis=1).to_csv(new_node_base%(source_type))
                # else:
                #     source_df.dropna(axis=1).to_csv(new_node_base%(source_type))
                
                # target_df = pd.read_csv(node_base%target_type, index_col=0, low_memory=False)
                # if os.path.isfile(new_node_base%(target_type)):
                #     new_target_df = pd.read_csv(new_node_base%(target_type), index_col=0, low_memory=False)
                #     pd.concat([new_target_df, target_df]).dropna(axis=1).to_csv(new_node_base%(target_type))
                # else:
                #     target_df.dropna(axis=1).to_csv(new_node_base%(target_type))
                edge_df.to_csv(new_edge_base%(source_type, relation, target_type), index=False)
        processed.add(filename.replace("GLYGEN_RESIDUE", "GLYGEN.RESIDUE"))
                

100%|██████████| 75759/75759 [06:31<00:00, 193.49it/s] 


In [108]:
count = 0
for filename in tqdm(glob("out/0915/filtered/ids/*.txt")):
    count+=1
count

100%|██████████| 117/117 [00:00<00:00, 2921033.14it/s]


117

In [109]:
id_pattern = "(?P<directory>.+)/(?P<type>.+)\.txt"
for filename in tqdm(glob("out/0915/filtered/ids/*.txt")):
    if not "inverse" in filename and not "isa_" in filename:
        match = re.match(id_pattern, filename).groupdict()
        node_type = match["type"]
        node_df = pd.read_csv(node_base%node_type, index_col=0, low_memory=False)
        with open(filename) as o:
            ids = list(set(o.read().strip().split("\n")).intersection(node_df.index))
        node_df.loc[ids].dropna(axis=1, how="all").to_csv(new_node_base%node_type)



100%|██████████| 117/117 [01:07<00:00,  1.73it/s]


In [110]:
hgnc = pd.read_csv("out/0915/filtered/nodes/Gene.nodes.csv", low_memory=False)

In [111]:
hgnc.head()

Unnamed: 0.1,Unnamed: 0,label,type,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,...,CHEBI,ENSEMBL,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM,UNIPROTKB
0,C1420882,TPT1 gene,Gene,NCI:C101389,,,,,HGNC:12022,MTH:NOCODE,...,,ENSEMBL:ENSG00000133112,17.0,45333471.0,45341284.0,ENTREZ:7178,45333471.0,45341183.0,,P13693
1,C1412662,ATP5MC1 gene,Gene,,,,,,HGNC:841,MTH:NOCODE,...,,ENSEMBL:ENSG00000159199,14.0,48892765.0,48895871.0,ENTREZ:516,48892765.0,48895871.0,,P05496
2,C1824623,CCDC96 gene,Gene,,,,,,HGNC:26900,MTH:NOCODE,...,,ENSEMBL:ENSG00000173013,6.0,7040849.0,7043001.0,ENTREZ:257236,7040849.0,7043001.0,,Q2M329
3,C3543374,RN7SL531P gene,Gene,,,,,,HGNC:46547,MTH:NOCODE,...,,ENSEMBL:ENSG00000264071,2.0,43492032.0,43492313.0,,,,,
4,C1336927,VAV3 gene,Gene,NCI:C24894,,,,,HGNC:12659,MTH:NOCODE,...,,ENSEMBL:ENSG00000134215,16.0,107571161.0,107965180.0,ENTREZ:10451,107571161.0,107965180.0,,Q9UKW4


In [112]:
for i in glob('out/0915/filtered/edges/*'):
    if "UNIPROT" in i:
        print(i)

out/0915/filtered/edges/UNIPROTKB.molecularly interacts with.ENCODE RBS HEPG2 K562.edges.csv
out/0915/filtered/edges/UNIPROTKB.predicted in.Body Substance.edges.csv
out/0915/filtered/edges/UNIPROT.isa.GLYCAN.edges.csv
out/0915/filtered/edges/GLYCAN.isa.UNIPROT.edges.csv
out/0915/filtered/edges/UNIPROTKB.not predicted in.Body Substance.edges.csv
out/0915/filtered/edges/GLYGEN GLYCOSYLATION.has enzyme protein.UNIPROTKB.edges.csv
out/0915/filtered/edges/GLYCOSYLTRANSFERASE REACTION.has enzyme protein.UNIPROTKB.edges.csv
out/0915/filtered/edges/UNIPROTKB.molecularly interacts with.ENCODE RBS HEPG2.edges.csv
out/0915/filtered/edges/Drug.bioactivity.UNIPROTKB.edges.csv
out/0915/filtered/edges/UNIPROTKB.molecularly interacts with.ENCODE RBS K562.edges.csv


In [113]:
concepts.type.unique()

NameError: name 'concepts' is not defined

In [114]:
gene_or_genome_df = pd.read_csv("out/0915/filtered/nodes/Gene or Genome.nodes.csv", index_col=0)
uniprot = pd.read_csv("out/0915/filtered/nodes/UNIPROTKB.nodes.csv", index_col=0)
gene_df = pd.read_csv("out/0915/filtered/nodes/Gene.nodes.csv", index_col=0)

  gene_df = pd.read_csv("out/0915/filtered/nodes/Gene.nodes.csv", index_col=0)


In [115]:
gene_or_genome_df.head()

Unnamed: 0_level_0,label,type,NCI,MTH,PUBCHEM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C2828054,FANCD2 wt Allele,Gene or Genome,NCI:C86550,MTH:NOCODE,PUBCHEM:643975


In [116]:
uniprot.head()

Unnamed: 0_level_0,label,type,UNIPROTKB
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UNIPROTKB:P41182 CUI,BCL6_HUMAN,UNIPROTKB,UNIPROTKB:P41182
UNIPROTKB:P51531 CUI,SMCA2_HUMAN,UNIPROTKB,UNIPROTKB:P51531
UNIPROTKB:P25054 CUI,APC_HUMAN,UNIPROTKB,UNIPROTKB:P25054
UNIPROTKB:Q07817 CUI,B2CL1_HUMAN,UNIPROTKB,UNIPROTKB:Q07817
UNIPROTKB:P19801 CUI,AOC1_HUMAN,UNIPROTKB,UNIPROTKB:P19801


In [117]:
gene_df.head()

Unnamed: 0,label,type,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,OMIM,...,CHEBI,ENSEMBL,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM,UNIPROTKB
C1420882,TPT1 gene,Gene,NCI:C101389,,,,,HGNC:12022,MTH:NOCODE,OMIM:600763,...,,ENSEMBL:ENSG00000133112,17.0,45333471.0,45341284.0,ENTREZ:7178,45333471.0,45341183.0,,P13693
C1412662,ATP5MC1 gene,Gene,,,,,,HGNC:841,MTH:NOCODE,OMIM:603192,...,,ENSEMBL:ENSG00000159199,14.0,48892765.0,48895871.0,ENTREZ:516,48892765.0,48895871.0,,P05496
C1824623,CCDC96 gene,Gene,,,,,,HGNC:26900,MTH:NOCODE,OMIM:619347,...,,ENSEMBL:ENSG00000173013,6.0,7040849.0,7043001.0,ENTREZ:257236,7040849.0,7043001.0,,Q2M329
C3543374,RN7SL531P gene,Gene,,,,,,HGNC:46547,MTH:NOCODE,,...,,ENSEMBL:ENSG00000264071,2.0,43492032.0,43492313.0,,,,,
C1336927,VAV3 gene,Gene,NCI:C24894,,,,,HGNC:12659,MTH:NOCODE,OMIM:605541,...,,ENSEMBL:ENSG00000134215,16.0,107571161.0,107965180.0,ENTREZ:10451,107571161.0,107965180.0,,Q9UKW4


In [118]:
for i, row in gene_or_genome_df.iterrows():
    gene_df.loc[i] = row

In [119]:
for i, row in uniprot.iterrows():
    gene_df.loc[i] = row

In [120]:
gene_df.type = "Gene"

In [121]:
gene_df.to_csv("out/0915/filtered/nodes/Gene.nodes.csv")

In [122]:
import os

In [123]:
os.remove("out/0915/filtered/nodes/UNIPROTKB.nodes.csv")
os.remove("out/0915/filtered/nodes/Gene or Genome.nodes.csv")

In [124]:
for i in glob('out/0915/filtered/edges/*'):
    if "UNIPROTKB" in i:
        os.rename(i, i.replace("UNIPROTKB", "Gene"))
    if "Gene or Genome" in i:
        os.rename(i, i.replace("Gene or Genome", "Gene"))

In [125]:
gene_df.head()

Unnamed: 0,label,type,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,OMIM,...,CHEBI,ENSEMBL,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM,UNIPROTKB
C1420882,TPT1 gene,Gene,NCI:C101389,,,,,HGNC:12022,MTH:NOCODE,OMIM:600763,...,,ENSEMBL:ENSG00000133112,17.0,45333471.0,45341284.0,ENTREZ:7178,45333471.0,45341183.0,,P13693
C1412662,ATP5MC1 gene,Gene,,,,,,HGNC:841,MTH:NOCODE,OMIM:603192,...,,ENSEMBL:ENSG00000159199,14.0,48892765.0,48895871.0,ENTREZ:516,48892765.0,48895871.0,,P05496
C1824623,CCDC96 gene,Gene,,,,,,HGNC:26900,MTH:NOCODE,OMIM:619347,...,,ENSEMBL:ENSG00000173013,6.0,7040849.0,7043001.0,ENTREZ:257236,7040849.0,7043001.0,,Q2M329
C3543374,RN7SL531P gene,Gene,,,,,,HGNC:46547,MTH:NOCODE,,...,,ENSEMBL:ENSG00000264071,2.0,43492032.0,43492313.0,,,,,
C1336927,VAV3 gene,Gene,NCI:C24894,,,,,HGNC:12659,MTH:NOCODE,OMIM:605541,...,,ENSEMBL:ENSG00000134215,16.0,107571161.0,107965180.0,ENTREZ:10451,107571161.0,107965180.0,,Q9UKW4


In [126]:
hgnc_genes = pd.read_csv("dd_data/HGNC_genes.txt", sep="\t")
hgnc_genes.head()

Unnamed: 0,HGNC ID,Status,Approved symbol,Approved name,Enzyme (EC) ID,UniProt accession
0,HGNC:5,Approved,A1BG,alpha-1-B glycoprotein,,P04217
1,HGNC:37133,Approved,A1BG-AS1,A1BG antisense RNA 1,,
2,HGNC:24086,Approved,A1CF,APOBEC1 complementation factor,,Q9NQ94
3,HGNC:7,Approved,A2M,alpha-2-macroglobulin,,P01023
4,HGNC:27057,Approved,A2M-AS1,A2M antisense RNA 1,,


In [127]:
hgnc_mapper = {}
for i, row in hgnc_genes.iterrows():
    hgnc_id = row["HGNC ID"]
    enz_id = row["Enzyme (EC) ID"]
    if hgnc_id not in hgnc_mapper:
        hgnc_mapper[hgnc_id] = {
            "EC ID": enz_id,
            "is_Enzyme": type(enz_id) == str
        }
    

In [128]:
for i, row in gene_df.iterrows():
    hgnc_id = row["HGNC"]
    if hgnc_id in hgnc_mapper:
        gene_df.at[i, "EC ID"] = hgnc_mapper[hgnc_id]["EC ID"]
        gene_df.at[i, "is_Enzyme"] = hgnc_mapper[hgnc_id]["is_Enzyme"]

In [129]:
gene_df[gene_df.is_Enzyme == True]

Unnamed: 0,label,type,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,OMIM,...,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM,UNIPROTKB,EC ID,is_Enzyme
C1826654,PPCS gene,Gene,,,,,,HGNC:25686,MTH:NOCODE,OMIM:609853,...,9.0,42456117.0,42473385.0,ENTREZ:79717,42456117.0,42460081.0,,Q9HAB8,6.3.2.5,True
C1414049,SEPTIN1 gene,Gene,,,,,,HGNC:2879,MTH:NOCODE,OMIM:612897,...,13.0,30378135.0,30395991.0,ENTREZ:1731,30378135.0,30382561.0,,Q8WYJ6,3.1.5.1,True
C1425932,DCXR gene,Gene,,,,,,HGNC:18985,MTH:NOCODE,OMIM:608347,...,8.0,82035136.0,82037709.0,ENTREZ:51181,82035136.0,82036995.0,,Q7Z4W1,1.1.1.10,True
C1842869,INPP5K gene,Gene,,,,,,HGNC:33882,MTH:NOCODE,OMIM:607875,...,20.0,1494577.0,1516742.0,ENTREZ:51763,1494577.0,1516612.0,,Q9BT40,3.1.3.56,True
C1415276,GPX2 gene,Gene,NCI:C104434,,,,,HGNC:4554,MTH:NOCODE,OMIM:138319,...,13.0,64939152.0,64942746.0,ENTREZ:2877,64939152.0,64942745.0,,P18283,1.11.1.9,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C1825958,MECR gene,Gene,NCI:C89008,,,,,HGNC:19691,MTH:NOCODE,OMIM:608205,...,16.0,29192657.0,29230942.0,ENTREZ:51102,29192657.0,29230934.0,,Q9BV79,1.3.1.38,True
C1420620,TBXAS1 gene,Gene,,,,,,HGNC:11609,MTH:NOCODE,OMIM:274180,...,18.0,139777051.0,140020325.0,ENTREZ:6916,139777051.0,139789622.0,,P24557,5.3.99.5,True
C1412639,ATP2A3 gene,Gene,,,,,,HGNC:813,MTH:NOCODE,OMIM:601929,...,18.0,3923870.0,3964464.0,ENTREZ:489,3923870.0,3964464.0,,Q93084,3.6.3.8,True
C1537381,HSD17B6 gene,Gene,NCI:C105152,,,,,HGNC:23316,MTH:NOCODE,OMIM:606623,...,11.0,56752161.0,56787790.0,ENTREZ:8630,56752161.0,56787736.0,,O14756,1.1.1.62,True


In [130]:
for filename in glob('out/0915/filtered/edges/*'):
    df = pd.read_csv(filename)
    if "CMAP" in df.SAB.unique():
        print(filename)
        os.remove(filename)


out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Hormone.edges.csv
out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Amino Acid, Peptide, or Protein.edges.csv
out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Organic Chemical.edges.csv
out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Drug.edges.csv
out/0915/filtered/edges/Gene.positively correlated with chemical or drug.Drug.edges.csv
out/0915/filtered/edges/Gene.positively correlated with chemical or drug.Organic Chemical.edges.csv
out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Pharmacologic Substance.edges.csv
out/0915/filtered/edges/Gene.positively correlated with chemical or drug.Nucleic Acid, Nucleoside, or Nucleotide.edges.csv
out/0915/filtered/edges/Gene.positively correlated with chemical or drug.Pharmacologic Substance.edges.csv
out/0915/filtered/edges/Gene.negatively correlated with chemical or drug.Nucleic 

In [131]:
dcc_mapper = {}
with open('output/sabs_dcc_mapper.txt') as o:
    for line in o:
        r = line.strip().split(":")
        if len(r) == 2:
            dcc_mapper[r[0]] =r[1]
        else:
            dcc_mapper[r[0]] =r[0]

In [132]:
for filename in glob('out/0915/filtered/edges/*'):
    df = pd.read_csv(filename)
    if len(df.SAB.unique()) > 1:
        print(filename, df.SAB.unique())
        df["DCC"] = dcc_mapper[df.SAB.unique()[0]]
    else:
        df["DCC"] = dcc_mapper[df.SAB.unique()[0]]
    df.to_csv(filename)

out/0915/filtered/edges/GLYCAN.isa.UNIPROT.edges.csv ['GLYCORDF' 'GLYCOCOO']
out/0915/filtered/edges/GLYCAN.isa.GLYCAN.edges.csv ['GLYCORDF' 'GLYCOCOO']


In [133]:
gene_df.index.name = "id"
gene_df

Unnamed: 0_level_0,label,type,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,OMIM,...,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM,UNIPROTKB,EC ID,is_Enzyme
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1420882,TPT1 gene,Gene,NCI:C101389,,,,,HGNC:12022,MTH:NOCODE,OMIM:600763,...,17.0,45333471.0,45341284.0,ENTREZ:7178,45333471.0,45341183.0,,P13693,,False
C1412662,ATP5MC1 gene,Gene,,,,,,HGNC:841,MTH:NOCODE,OMIM:603192,...,14.0,48892765.0,48895871.0,ENTREZ:516,48892765.0,48895871.0,,P05496,,False
C1824623,CCDC96 gene,Gene,,,,,,HGNC:26900,MTH:NOCODE,OMIM:619347,...,6.0,7040849.0,7043001.0,ENTREZ:257236,7040849.0,7043001.0,,Q2M329,,False
C3543374,RN7SL531P gene,Gene,,,,,,HGNC:46547,MTH:NOCODE,,...,2.0,43492032.0,43492313.0,,,,,,,False
C1336927,VAV3 gene,Gene,NCI:C24894,,,,,HGNC:12659,MTH:NOCODE,OMIM:605541,...,16.0,107571161.0,107965180.0,ENTREZ:10451,107571161.0,107965180.0,,Q9UKW4,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UNIPROTKB:P16083 CUI,NQO2_HUMAN,Gene,,,,,,,,,...,,,,,,,,UNIPROTKB:P16083,,
UNIPROTKB:P10646 CUI,TFPI1_HUMAN,Gene,,,,,,,,,...,,,,,,,,UNIPROTKB:P10646,,
UNIPROTKB:Q92772 CUI,CDKL2_HUMAN,Gene,,,,,,,,,...,,,,,,,,UNIPROTKB:Q92772,,
UNIPROTKB:P01375 CUI,TNFA_HUMAN,Gene,,,,,,,,,...,,,,,,,,UNIPROTKB:P01375,,


In [134]:
gene_df.to_csv("out/0915/filtered/nodes/Gene.nodes.csv")

In [135]:
filenames = []
for filename in glob('out/0915/filtered/nodes/*'):
    df = pd.read_csv(filename, index_col=0, low_memory=False)
    if "label" not in df.columns:
        filenames.append(filename)
        df['label'] = df.index
        df.to_csv(filename)
        print(filename)

out/0915/filtered/nodes/GLYCOSYLTRANSFERASE REACTION.nodes.csv
out/0915/filtered/nodes/EXPBINS.nodes.csv
out/0915/filtered/nodes/GLYGEN SRC.nodes.csv
out/0915/filtered/nodes/GLYGEN GLYCOSYLATION.nodes.csv
out/0915/filtered/nodes/CHLO.nodes.csv
out/0915/filtered/nodes/ENCODE CCRE CTCF.nodes.csv
out/0915/filtered/nodes/4DNQ.nodes.csv
out/0915/filtered/nodes/4DNL.nodes.csv
out/0915/filtered/nodes/MEDGEN.nodes.csv
out/0915/filtered/nodes/KFVARBIN.nodes.csv
out/0915/filtered/nodes/GTEXEXP.nodes.csv
out/0915/filtered/nodes/PVALUEBINS.nodes.csv
out/0915/filtered/nodes/KFPT.nodes.csv
out/0915/filtered/nodes/GTEXEQTL.nodes.csv
out/0915/filtered/nodes/MOTORPAC.nodes.csv
out/0915/filtered/nodes/ENCODE CCRE H3K4ME3.nodes.csv
out/0915/filtered/nodes/ENCODE CCRE H3K27AC.nodes.csv
out/0915/filtered/nodes/ENCODE CCRE.nodes.csv


In [136]:
len(filenames)

18

In [95]:
from py2neo import Graph
from dotenv import load_dotenv

load_dotenv()
graph = Graph(os.getenv('NEO4j_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD')))

In [97]:
import re

In [105]:
node_pattern = "(?P<directory>.+)/(?P<node_type>.+)\.(?P<entity>.+)\.csv"

for filename in filenames:
    match = re.match(node_pattern, filename).groupdict()
    node_type = match["node_type"]
    print(node_type)
    query = "MATCH (a: `%s`) WHERE a.label IS NULL SET a.label = a.id"%node_type
    graph.run(query)

GLYCOSYLTRANSFERASE REACTION
EXPBINS
GLYGEN SRC
GLYGEN GLYCOSYLATION
CHLO
ENCODE CCRE CTCF
4DNQ
4DNL
MEDGEN
KFVARBIN
GTEXEXP
PVALUEBINS
KFPT
GTEXEQTL
MOTORPAC
ENCODE CCRE H3K4ME3
ENCODE CCRE H3K27AC
ENCODE CCRE


In [137]:
for filename in glob('out/0915/filtered/edges/*'):
    df = pd.read_csv(filename, index_col=0)
    df.to_csv(filename, index=False)

In [138]:
relations = set()
for filename in glob('out/0915/filtered/edges/*'):
    match = re.match(edge_pattern, filename).groupdict()
    relations.add(match['relation'])

In [139]:
from glob import glob
import re

In [140]:
gtex = "GTEXEXP"
# gtex = "GTEXEQTL"

In [141]:
for filename in glob('out/0915/filtered/edges/*'):
    # relations.add(match['relation'])
    if gtex in filename:
        match = re.match(edge_pattern, filename).groupdict()
        print(filename, match["relation"])

out/0915/filtered/edges/GTEXEXP.expressed in.Tissue.edges.csv expressed in
out/0915/filtered/edges/Gene.expresses.GTEXEXP.edges.csv expresses
out/0915/filtered/edges/GTEXEXP.expressed in.Gene.edges.csv expressed in
out/0915/filtered/edges/GTEXEXP.has expression.EXPBINS.edges.csv has expression
out/0915/filtered/edges/GTEXEXP.expressed in.Body Location or Region.edges.csv expressed in
out/0915/filtered/edges/Tissue.expresses.GTEXEXP.edges.csv expresses
out/0915/filtered/edges/Body Part, Organ, or Organ Component.expresses.GTEXEXP.edges.csv expresses
out/0915/filtered/edges/GTEXEXP.expressed in.Body Part, Organ, or Organ Component.edges.csv expressed in
out/0915/filtered/edges/Body Location or Region.expresses.GTEXEXP.edges.csv expresses


In [142]:
gene2gtexp = pd.read_csv("out/0915/filtered/edges/Gene.expresses.GTEXEXP.edges.csv")
tissue2gtexp = pd.read_csv("out/0915/filtered/edges/Tissue.expresses.GTEXEXP.edges.csv")
organ2gtexp = pd.read_csv("out/0915/filtered/edges/Body Part, Organ, or Organ Component.expresses.GTEXEXP.edges.csv")
location2gtexp = pd.read_csv("out/0915/filtered/edges/Body Location or Region.expresses.GTEXEXP.edges.csv")
hasExp = pd.read_csv("out/0915/filtered/edges/GTEXEXP.has expression.EXPBINS.edges.csv")

In [143]:
gene2gtexp.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence,DCC
0,C2239334,expresses,GTEXEXP:ENSG00000223972-5-Adipose-Subcutaneous...,DDX11L1 gene,,GTEXEXP,,GTEX
1,C2239334,expresses,GTEXEXP:ENSG00000223972-5-Adrenal-Gland CUI,DDX11L1 gene,,GTEXEXP,,GTEX
2,C2239334,expresses,GTEXEXP:ENSG00000223972-5-Artery-Aorta CUI,DDX11L1 gene,,GTEXEXP,,GTEX
3,C2239334,expresses,GTEXEXP:ENSG00000223972-5-Artery-Coronary CUI,DDX11L1 gene,,GTEXEXP,,GTEX
4,C2239334,expresses,GTEXEXP:ENSG00000223972-5-Artery-Tibial CUI,DDX11L1 gene,,GTEXEXP,,GTEX


In [144]:
tissue2gtexp.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence,DCC
0,C0222331,expresses,GTEXEXP:ENSG00000223972-5-Adipose-Subcutaneous...,Subcutaneous Fat,,GTEXEXP,,GTEX
1,UBERON:0008367 CUI,expresses,GTEXEXP:ENSG00000223972-5-Breast-Mammary-Tissu...,breast epithelium,,GTEXEXP,,GTEX
2,UBERON:EFO 0002009 CUI,expresses,GTEXEXP:ENSG00000223972-5-Cells-Cultured-fibro...,,,GTEXEXP,,GTEX
3,UBERON:EFO 0000572 CUI,expresses,GTEXEXP:ENSG00000223972-5-Cells-EBV-transforme...,,,GTEXEXP,,GTEX
4,C1707950,expresses,GTEXEXP:ENSG00000223972-5-Esophagus-Mucosa CUI,Esophageal Squamous Epithelium,,GTEXEXP,,GTEX


In [145]:
organ2gtexp.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence,DCC
0,C0001625,expresses,GTEXEXP:ENSG00000223972-5-Adrenal-Gland CUI,Adrenal Glands,,GTEXEXP,,GTEX
1,C0003956,expresses,GTEXEXP:ENSG00000223972-5-Artery-Aorta CUI,Ascending aorta structure,,GTEXEXP,,GTEX
2,C0205042,expresses,GTEXEXP:ENSG00000223972-5-Artery-Coronary CUI,Coronary artery,,GTEXEXP,,GTEX
3,C0085427,expresses,GTEXEXP:ENSG00000223972-5-Artery-Tibial CUI,Tibial Arteries,,GTEXEXP,,GTEX
4,C0005682,expresses,GTEXEXP:ENSG00000223972-5-Bladder CUI,Urinary Bladder,,GTEXEXP,,GTEX


In [146]:
location2gtexp.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence,DCC
0,C0736435,expresses,GTEXEXP:ENSG00000223972-5-Kidney-Medulla CUI,Set of outer region of renal pyramids,,GTEXEXP,,GTEX
1,C0736435,expresses,GTEXEXP:ENSG00000227232-5-Kidney-Medulla CUI,Set of outer region of renal pyramids,,GTEXEXP,,GTEX
2,C0736435,expresses,GTEXEXP:ENSG00000278267-1-Kidney-Medulla CUI,Set of outer region of renal pyramids,,GTEXEXP,,GTEX
3,C0736435,expresses,GTEXEXP:ENSG00000243485-5-Kidney-Medulla CUI,Set of outer region of renal pyramids,,GTEXEXP,,GTEX
4,C0736435,expresses,GTEXEXP:ENSG00000237613-2-Kidney-Medulla CUI,Set of outer region of renal pyramids,,GTEXEXP,,GTEX


In [147]:
gene2gtexp.shape

(1573785, 8)

In [148]:
hasExp.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence,DCC
0,GTEXEXP:ENSG00000223972-5-Testis CUI,has_expression,EXPBINS:0.1.0.2 CUI,,,GTEXEXP,,GTEX
1,GTEXEXP:ENSG00000227232-5-Adipose-Subcutaneous...,has_expression,EXPBINS:4.0.5.0 CUI,,,GTEXEXP,,GTEX
2,GTEXEXP:ENSG00000227232-5-Adrenal-Gland CUI,has_expression,EXPBINS:2.0.3.0 CUI,,,GTEXEXP,,GTEX
3,GTEXEXP:ENSG00000227232-5-Artery-Aorta CUI,has_expression,EXPBINS:4.0.5.0 CUI,,,GTEXEXP,,GTEX
4,GTEXEXP:ENSG00000227232-5-Artery-Coronary CUI,has_expression,EXPBINS:3.0.4.0 CUI,,,GTEXEXP,,GTEX


In [149]:
len(set(tissue2gtexp.target)), len(set(tissue2gtexp.target).intersection(gene2gtexp.target))

(244832, 244811)

In [150]:
gtexp_gene_mapper = {}
for i, row in gene2gtexp.iterrows():
    gene_id = row["source"]
    gene = row["source_label"]
    gtexp = row["target"]
    gtexp_gene_mapper[gtexp] = {
        "gene_id": gene_id,
        "gene": gene,
    }
(i, len(gtexp_gene_mapper))

(1573784, 1573785)

In [151]:
evidence_mapper = {}
for i, row in hasExp.iterrows():
    gtexp = row["source"]
    target = row["target"]
    # EXPBINS:0.1.0.2 CUI	
    score = float(".".join(target.replace("CUI", "").strip().split(".")[2:]))
    evidence_mapper[gtexp] = score

In [152]:
counter = 0
for i, row in tissue2gtexp.iterrows():
    target = row["target"]
    if target in gtexp_gene_mapper:
        val = gtexp_gene_mapper[target]
        tissue2gtexp.at[i, 'target'] = val["gene_id"]
        tissue2gtexp.at[i, 'target_label'] = val["gene"]
        if target in evidence_mapper:
            tissue2gtexp.at[i, 'evidence'] = evidence_mapper[target]
    else:
        counter+=1
print(counter)
tissue2gtexp.head()

21


Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence,DCC
0,C0222331,expresses,C2239334,Subcutaneous Fat,DDX11L1 gene,GTEXEXP,0.0,GTEX
1,UBERON:0008367 CUI,expresses,C2239334,breast epithelium,DDX11L1 gene,GTEXEXP,0.0,GTEX
2,UBERON:EFO 0002009 CUI,expresses,C2239334,,DDX11L1 gene,GTEXEXP,0.0,GTEX
3,UBERON:EFO 0000572 CUI,expresses,C2239334,,DDX11L1 gene,GTEXEXP,0.0,GTEX
4,C1707950,expresses,C2239334,Esophageal Squamous Epithelium,DDX11L1 gene,GTEXEXP,0.0,GTEX


In [153]:
tissue2gtexp.to_csv("out/0915/filtered/edges/Tissue.expresses.Gene.edges.csv", index=False)

In [154]:
counter = 0
for i, row in organ2gtexp.iterrows():
    target = row["target"]
    if target in gtexp_gene_mapper:
        val = gtexp_gene_mapper[target]
        organ2gtexp.at[i, 'target'] = val["gene_id"]
        organ2gtexp.at[i, 'target_label'] = val["gene"]
        if target in evidence_mapper:
            organ2gtexp.at[i, 'evidence'] = evidence_mapper[target]
    else:
        counter+=1
print(counter)
organ2gtexp.head()

111


Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence,DCC
0,C0001625,expresses,C2239334,Adrenal Glands,DDX11L1 gene,GTEXEXP,0.0,GTEX
1,C0003956,expresses,C2239334,Ascending aorta structure,DDX11L1 gene,GTEXEXP,0.0,GTEX
2,C0205042,expresses,C2239334,Coronary artery,DDX11L1 gene,GTEXEXP,0.0,GTEX
3,C0085427,expresses,C2239334,Tibial Arteries,DDX11L1 gene,GTEXEXP,0.0,GTEX
4,C0005682,expresses,C2239334,Urinary Bladder,DDX11L1 gene,GTEXEXP,0.0,GTEX


In [155]:
organ2gtexp.to_csv("out/0915/filtered/edges/Body Part, Organ, or Organ Component.expresses.Gene.edges.csv", index=False)

In [156]:
counter = 0
for i, row in location2gtexp.iterrows():
    target = row["target"]
    if target in gtexp_gene_mapper:
        val = gtexp_gene_mapper[target]
        location2gtexp.at[i, 'target'] = val["gene_id"]
        location2gtexp.at[i, 'target_label'] = val["gene"]
        if target in evidence_mapper:
            location2gtexp.at[i, 'evidence'] = evidence_mapper[target]
    else:
        counter+=1
print(counter)
location2gtexp.head()

3


Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence,DCC
0,C0736435,expresses,C2239334,Set of outer region of renal pyramids,DDX11L1 gene,GTEXEXP,0.0,GTEX
1,C0736435,expresses,C2829144,Set of outer region of renal pyramids,WASH7P gene,GTEXEXP,3.0,GTEX
2,C0736435,expresses,C3815338,Set of outer region of renal pyramids,MIR6859-1 gene,GTEXEXP,0.0,GTEX
3,C0736435,expresses,C4320459,Set of outer region of renal pyramids,MIR1302-2HG gene,GTEXEXP,0.0,GTEX
4,C0736435,expresses,C2239429,Set of outer region of renal pyramids,FAM138A gene,GTEXEXP,0.0,GTEX


In [157]:
location2gtexp.to_csv("out/0915/filtered/edges/Body Location or Region.expresses.Gene.edges.csv", index=False)

In [158]:
os.remove("out/0915/filtered/edges/Gene.expresses.GTEXEXP.edges.csv")
os.remove("out/0915/filtered/edges/Tissue.expresses.GTEXEXP.edges.csv")
os.remove("out/0915/filtered/edges/Body Part, Organ, or Organ Component.expresses.GTEXEXP.edges.csv")
os.remove("out/0915/filtered/edges/Body Location or Region.expresses.GTEXEXP.edges.csv")
os.remove("out/0915/filtered/edges/GTEXEXP.has expression.EXPBINS.edges.csv")

In [159]:
gtex = "GTEXEQTL"
for filename in glob('out/0915/filtered/edges/*'):
    # relations.add(match['relation'])
    match = re.match(edge_pattern, filename).groupdict()
    if gtex in match["target_type"]:
        print(filename, match["relation"])

out/0915/filtered/edges/Gene.positively regulated by.GTEXEQTL.edges.csv positively regulated by
out/0915/filtered/edges/ENTREZ.positively regulated by.GTEXEQTL.edges.csv positively regulated by
out/0915/filtered/edges/Tissue.part of.GTEXEQTL.edges.csv part of
out/0915/filtered/edges/ENSEMBL.positively regulated by.GTEXEQTL.edges.csv positively regulated by
out/0915/filtered/edges/Tissue.location of.GTEXEQTL.edges.csv location of
out/0915/filtered/edges/Gene.location of.GTEXEQTL.edges.csv location of
out/0915/filtered/edges/Body Part, Organ, or Organ Component.location of.GTEXEQTL.edges.csv location of
out/0915/filtered/edges/ENSEMBL.negatively regulated by.GTEXEQTL.edges.csv negatively regulated by
out/0915/filtered/edges/Body Part, Organ, or Organ Component.part of.GTEXEQTL.edges.csv part of
out/0915/filtered/edges/EFO.location of.GTEXEQTL.edges.csv location of
out/0915/filtered/edges/CHLO.location of.GTEXEQTL.edges.csv location of
out/0915/filtered/edges/CLINGEN ALLELE REGISTRY.part 

In [160]:
gtex = "GTEXEQTL"
for filename in glob('out/0915/filtered/edges/*'):
    # relations.add(match['relation'])
    match = re.match(edge_pattern, filename).groupdict()
    if gtex in match["source_type"]:
        print(filename, match["relation"])

out/0915/filtered/edges/GTEXEQTL.has part.Tissue.edges.csv has part
out/0915/filtered/edges/GTEXEQTL.located in.Body Part, Organ, or Organ Component.edges.csv located in
out/0915/filtered/edges/GTEXEQTL.positively regulates.ENSEMBL.edges.csv positively regulates
out/0915/filtered/edges/GTEXEQTL.has part.Body Part, Organ, or Organ Component.edges.csv has part
out/0915/filtered/edges/GTEXEQTL.negatively regulates.Gene.edges.csv negatively regulates
out/0915/filtered/edges/GTEXEQTL.located in.Cell.edges.csv located in
out/0915/filtered/edges/GTEXEQTL.p value.PVALUEBINS.edges.csv p value
out/0915/filtered/edges/GTEXEQTL.has part.CLINGEN ALLELE REGISTRY.edges.csv has part
out/0915/filtered/edges/GTEXEQTL.located in.Gene.edges.csv located in
out/0915/filtered/edges/GTEXEQTL.negatively regulates.ENSEMBL.edges.csv negatively regulates
out/0915/filtered/edges/GTEXEQTL.positively regulates.ENTREZ.edges.csv positively regulates
out/0915/filtered/edges/GTEXEQTL.positively regulates.Gene.edges.csv 

In [161]:
df = pd.read_csv("out/0915/filtered/edges/ENTREZ.positively regulated by.GTEXEQTL.edges.csv")
df.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence,DCC
0,ENSEMBL:ENSG00000274253 CUI,positively_regulated_by,GTEXEQTL:eQTL.chr15.22757689.A.G.b38.Pancreas CUI,ENSG00000274253,,ERCCREG,,ERCC
1,ENSEMBL:ENSG00000274253 CUI,positively_regulated_by,GTEXEQTL:eQTL.chr15.22757689.A.G.b38.Brain.Cau...,ENSG00000274253,,ERCCREG,,ERCC
2,ENSEMBL:ENSG00000274253 CUI,positively_regulated_by,GTEXEQTL:eQTL.chr15.22757689.A.G.b38.Brain.Put...,ENSG00000274253,,ERCCREG,,ERCC
3,ENSEMBL:ENSG00000274253 CUI,positively_regulated_by,GTEXEQTL:eQTL.chr15.22757689.A.G.b38.Brain.Hip...,ENSG00000274253,,ERCCREG,,ERCC
4,ENSEMBL:ENSG00000274253 CUI,positively_regulated_by,GTEXEQTL:eQTL.chr15.22757689.A.G.b38.Brain.Cer...,ENSG00000274253,,ERCCREG,,ERCC


In [162]:
import csv

In [66]:
for filename in glob('out/0915/filtered/edges/*'):
    if 'ENTREZ.' in filename:
        with open(filename) as o:
            csv_reader = csv.reader(o)
            header = True
            for row in csv_reader:
                if header:
                    header = False
                else:
                    print(filename, row[5])
                    df = pd.read_csv(filename)
                    print(df.shape)
                    break

out/0915/filtered/edges/ENCODE CCRE ACTIVITY.regulates.ENTREZ.edges.csv ERCCREG
(107, 8)
out/0915/filtered/edges/ENTREZ.positively regulated by.Drug.edges.csv LINCS
(3, 8)
out/0915/filtered/edges/ENTREZ.positively regulated by.GTEXEQTL.edges.csv ERCCREG
(8, 8)
out/0915/filtered/edges/ENTREZ.regulated by.ENCODE CCRE ACTIVITY.edges.csv ERCCREG
(107, 8)
out/0915/filtered/edges/Drug.positively regulates.ENTREZ.edges.csv LINCS
(3, 8)
out/0915/filtered/edges/GTEXEQTL.positively regulates.ENTREZ.edges.csv ERCCREG
(8, 8)


In [69]:
df = pd.read_csv("out/0915/filtered/edges/ENCODE CCRE ACTIVITY.regulates.ENTREZ.edges.csv")
df.shape

(107, 8)

In [70]:
df.target.unique()

array(['ENSEMBL:ENSG00000274253 CUI'], dtype=object)

In [61]:
for filename in glob('out/0915/filtered/edges/*'):
    with open(filename) as o:
        csv_reader = csv.reader(o)
        header = True
        for row in csv_reader:
            if header:
                header = False
            else:
                if 'ERCC' == row[-1]:
                    print(filename, row[-1])
                break

out/0915/filtered/edges/ENCODE CCRE ACTIVITY.regulates.ENTREZ.edges.csv ERCC
out/0915/filtered/edges/ENCODE CCRE ACTIVITY.isa.ENCODE CCRE H3K27AC.edges.csv ERCC
out/0915/filtered/edges/GTEXEQTL.has part.Tissue.edges.csv ERCC
out/0915/filtered/edges/Gene.molecularly interacts with.ENCODE RBS HEPG2.edges.csv ERCC
out/0915/filtered/edges/CLINGEN ALLELE REGISTRY.located in.ENCODE CCRE.edges.csv ERCC
out/0915/filtered/edges/ENCODE RBS HEPG2 K562.overlaps.Gene.edges.csv ERCC
out/0915/filtered/edges/ENCODE RBS 150 NO OVERLAP.is subsequence of.ENCODE RBS HEPG2 K562.edges.csv ERCC
out/0915/filtered/edges/ENCODE CCRE ACTIVITY.has part.ENCODE CCRE.edges.csv ERCC
out/0915/filtered/edges/ENCODE CCRE ACTIVITY.has part.Tissue.edges.csv ERCC
out/0915/filtered/edges/Gene.molecularly interacts with.ENCODE RBS HEPG2 K562.edges.csv ERCC
out/0915/filtered/edges/ENCODE CCRE ACTIVITY.has part.Body Part, Organ, or Organ Component.edges.csv ERCC
out/0915/filtered/edges/Gene.positively regulated by.GTEXEQTL.edg

In [163]:
# Replace ENTREZ node to gene
entrez = pd.read_csv("out/0915/filtered/nodes/ENTREZ.nodes.csv", index_col=0)
entrez.head()

Unnamed: 0_level_0,label,type,ENSEMBL,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSEMBL:ENSG00000274253 CUI,ENSG00000274253,ENTREZ,ENSEMBL:ENSG00000274253,6.0,22757835.0,22778741.0,ENTREZ::283683


In [164]:
gene_df = pd.read_csv("out/0915/filtered/nodes/Gene.nodes.csv", index_col=0)
gene_df.head()


  gene_df = pd.read_csv("out/0915/filtered/nodes/Gene.nodes.csv", index_col=0)


Unnamed: 0_level_0,label,type,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,OMIM,...,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM,UNIPROTKB,EC ID,is_Enzyme
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1420882,TPT1 gene,Gene,NCI:C101389,,,,,HGNC:12022,MTH:NOCODE,OMIM:600763,...,17.0,45333471.0,45341284.0,ENTREZ:7178,45333471.0,45341183.0,,P13693,,False
C1412662,ATP5MC1 gene,Gene,,,,,,HGNC:841,MTH:NOCODE,OMIM:603192,...,14.0,48892765.0,48895871.0,ENTREZ:516,48892765.0,48895871.0,,P05496,,False
C1824623,CCDC96 gene,Gene,,,,,,HGNC:26900,MTH:NOCODE,OMIM:619347,...,6.0,7040849.0,7043001.0,ENTREZ:257236,7040849.0,7043001.0,,Q2M329,,False
C3543374,RN7SL531P gene,Gene,,,,,,HGNC:46547,MTH:NOCODE,,...,2.0,43492032.0,43492313.0,,,,,,,False
C1336927,VAV3 gene,Gene,NCI:C24894,,,,,HGNC:12659,MTH:NOCODE,OMIM:605541,...,16.0,107571161.0,107965180.0,ENTREZ:10451,107571161.0,107965180.0,,Q9UKW4,,False


In [165]:
gene_df.loc["ENSEMBL:ENSG00000274253 CUI"] = entrez.loc["ENSEMBL:ENSG00000274253 CUI"]

In [166]:
gene_df.type = "Gene"

In [168]:
gene_df.to_csv("out/0915/filtered/nodes/Gene.nodes.csv")

In [167]:
os.remove('out/0915/filtered/nodes/ENTREZ.nodes.csv')

In [170]:
for filename in sorted(glob('out/0915/filtered/edges/*')):
    if 'ENTREZ.' in filename:
        append_file = filename.replace("ENTREZ", "Gene")
        print(append_file)
        with open(append_file, "a") as a:
            csv_writer = csv.writer(a)
            with open(filename) as o:
                csv_reader = csv.reader(o)
                h = True
                for row in csv_reader:
                    if h:
                        h = False
                    else:
                        if not row[5] == "LINCS":
                            csv_writer.writerow(row)
        os.remove(filename)   
                    

out/0915/filtered/edges/Drug.positively regulates.Gene.edges.csv
out/0915/filtered/edges/ENCODE CCRE ACTIVITY.regulates.Gene.edges.csv
out/0915/filtered/edges/Gene.positively regulated by.Drug.edges.csv
out/0915/filtered/edges/Gene.positively regulated by.GTEXEQTL.edges.csv
out/0915/filtered/edges/Gene.regulated by.ENCODE CCRE ACTIVITY.edges.csv
out/0915/filtered/edges/GTEXEQTL.positively regulates.Gene.edges.csv


In [178]:
gtex = "GTEXEQTL"
for filename in glob('out/0915/filtered/edges/*'):
    # relations.add(match['relation'])
    match = re.match(edge_pattern, filename).groupdict()
    if gtex in match["source_type"]:
        df = pd.read_csv(filename)
        print(filename, df.DCC.unique())

out/0915/filtered/edges/GTEXEQTL.has part.Tissue.edges.csv ['ERCC']
out/0915/filtered/edges/GTEXEQTL.located in.Body Part, Organ, or Organ Component.edges.csv ['GTEX']
out/0915/filtered/edges/GTEXEQTL.positively regulates.ENSEMBL.edges.csv ['ERCC']
out/0915/filtered/edges/GTEXEQTL.has part.Body Part, Organ, or Organ Component.edges.csv ['ERCC']
out/0915/filtered/edges/GTEXEQTL.negatively regulates.Gene.edges.csv ['ERCC']
out/0915/filtered/edges/GTEXEQTL.located in.Cell.edges.csv ['GTEX']
out/0915/filtered/edges/GTEXEQTL.p value.PVALUEBINS.edges.csv ['GTEX']
out/0915/filtered/edges/GTEXEQTL.has part.CLINGEN ALLELE REGISTRY.edges.csv ['ERCC']
out/0915/filtered/edges/GTEXEQTL.located in.Gene.edges.csv ['GTEX']
out/0915/filtered/edges/GTEXEQTL.negatively regulates.ENSEMBL.edges.csv ['ERCC']
out/0915/filtered/edges/GTEXEQTL.positively regulates.Gene.edges.csv ['ERCC']
out/0915/filtered/edges/GTEXEQTL.located in.CHLO.edges.csv ['GTEX']
out/0915/filtered/edges/GTEXEQTL.located in.Tissue.edges

In [180]:
df = pd.read_csv("out/0915/filtered/edges/GTEXEQTL.located in.Gene.edges.csv")
df.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence,DCC
0,GTEXEQTL:eQTL.chr7.64957448.C.A.b38.Esophagus....,located_in,C2681341,,CCT6P3 gene,GTEXEQTL,,GTEX
1,GTEXEQTL:eQTL.chr6.32603032.A.G.b38.Brain.Hipp...,located_in,C1415580,,HLA-DRB5 gene,GTEXEQTL,,GTEX
2,GTEXEQTL:eQTL.chr20.35712534.C.T.b38.Adrenal.G...,located_in,C1413675,,CPNE1 gene,GTEXEQTL,,GTEX
3,GTEXEQTL:eQTL.chr2.61320500.T.A.b38.Brain.Fron...,located_in,C2681200,,C2orf74 gene,GTEXEQTL,,GTEX
4,GTEXEQTL:eQTL.chr7.72725280.G.T.b38.Testis CUI,located_in,C2240136,,TYW1B gene,GTEXEQTL,,GTEX


In [2]:
import pandas as pd
from glob import glob
import re


In [4]:
id_mapping = pd.read_csv('dd_data/idmapping_2023_08_24.tsv.gz', sep='\t')
id_mapping.head()

Unnamed: 0,From,To
0,A0A087X1C5,HGNC:2624
1,A0A0B4J2F0,HGNC:50696
2,A0A0C5B5G6,HGNC:7470
3,A0A0K2S4Q6,HGNC:52292
4,A0A0U1RRE5,HGNC:50713


In [6]:
mapper = {}
for i, row in id_mapping.iterrows():
    f = row["From"]
    t = row["To"]
    mapper["UNIPROTKB:%s CUI"%f] = t

In [5]:
genes = pd.read_csv('out/0915/filtered/nodes/Gene.nodes.csv', index_col=0)
genes.head()

  genes = pd.read_csv('out/0915/filtered/nodes/Gene.nodes.csv', index_col=0)


Unnamed: 0_level_0,label,type,NCI,LCH_NW,MSH,CSP,SNOMEDCT_US,HGNC,MTH,OMIM,...,ENSEMBL value,ENSEMBL lowerbound,ENSEMBL upperbound,ENTREZ,ENTREZ lowerbound,ENTREZ upperbound,PUBCHEM,UNIPROTKB,EC ID,is_Enzyme
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1420882,TPT1 gene,Gene,NCI:C101389,,,,,HGNC:12022,MTH:NOCODE,OMIM:600763,...,17.0,45333471.0,45341284.0,ENTREZ:7178,45333471.0,45341183.0,,P13693,,False
C1412662,ATP5MC1 gene,Gene,,,,,,HGNC:841,MTH:NOCODE,OMIM:603192,...,14.0,48892765.0,48895871.0,ENTREZ:516,48892765.0,48895871.0,,P05496,,False
C1824623,CCDC96 gene,Gene,,,,,,HGNC:26900,MTH:NOCODE,OMIM:619347,...,6.0,7040849.0,7043001.0,ENTREZ:257236,7040849.0,7043001.0,,Q2M329,,False
C3543374,RN7SL531P gene,Gene,,,,,,HGNC:46547,MTH:NOCODE,,...,2.0,43492032.0,43492313.0,,,,,,,False
C1336927,VAV3 gene,Gene,NCI:C24894,,,,,HGNC:12659,MTH:NOCODE,OMIM:605541,...,16.0,107571161.0,107965180.0,ENTREZ:10451,107571161.0,107965180.0,,Q9UKW4,,False


In [12]:
hgnc_to_cui = {}
for i, row in genes.iterrows():
    hgnc = row["HGNC"]
    if hgnc and type(hgnc) == str:
        hgnc_to_cui[hgnc] = i

In [17]:
matched = []
counter = 0
hgnc_match = []
for i in genes.index:
    if i in mapper:
        matched.append(i)
        if mapper[i] in hgnc_to_cui:
            hgnc_match.append((i, mapper[i], hgnc_to_cui[mapper[i]]))
    if "UNIPROTKB" in i:
        counter += 1
print(counter, len(matched), len(hgnc_match))

2592 2546 2546


In [22]:
len(set([i[0] for i in hgnc_match]))

2546

In [24]:
genes = genes.drop(labels=[i[0] for i in hgnc_match])

In [28]:
label_mapper = {}
for i in hgnc_match:
    ind = i[2]
    label = genes.at[ind, 'label']
    label_mapper[ind] = label

In [29]:
label_mapper[ind], ind

('CAPN2 gene', 'C1413112')

In [27]:
edge_pattern

'(?P<directory>.+)/(?P<source_type>.+)\\.(?P<relation>.+)\\.(?P<target_type>.+)\\.(?P<entity>.+)\\.csv'

In [33]:
from tqdm import tqdm

In [47]:
with_uniprot = set()

In [49]:
for filename in tqdm(glob("out/0915/serialization/edges/*Gene.*.csv")):
    match = re.match(edge_pattern, filename).groupdict()
    column = "" 
    if match["target_type"] == "Gene":
        column = "target"
    elif match["source_type"] == "Gene":
        column = "source"
    df = pd.read_csv(filename)
    for i, row in df.iterrows():
        gene_id = row[column]
        if gene_id in mapper:
            with_uniprot.add(filename)
            print(filename)
            print(mapper[gene_id], hgnc_to_cui.get(mapper[gene_id], gene_id))
            break


  9%|▉         | 95/1065 [00:48<03:09,  5.11it/s]

out/0915/serialization/edges/ENSEMBL.has_gene_product.Gene.edges.csv
HGNC:56760 UNIPROTKB:A0A1W2PPE3 CUI


 66%|██████▌   | 698/1065 [07:34<03:51,  1.59it/s]

out/0915/serialization/edges/Gene.has_gene_product.Gene.edges.csv
HGNC:14906 UNIPROTKB:Q8N4C6 CUI


 99%|█████████▉| 1052/1065 [18:16<00:41,  3.20s/it]

out/0915/serialization/edges/Gene.gene_product_of.ENSEMBL.edges.csv
HGNC:56760 UNIPROTKB:A0A1W2PPE3 CUI


100%|██████████| 1065/1065 [18:29<00:00,  1.04s/it]


In [41]:
gene_id

'UNIPROTKB:A0A1W2PPE3 CUI'

In [42]:
mapper[gene_id]

'HGNC:56760'

In [44]:
hgnc_to_cui.get(mapper[gene_id], gene_id)

'UNIPROTKB:A0A1W2PPE3 CUI'

In [50]:
for filename in tqdm(glob("out/0915/serialization/edges/*bioactivity*.csv")):
    print(filename)


100%|██████████| 4/4 [00:00<00:00, 81840.08it/s]

out/0915/serialization/edges/Drug.bioactivity.Gene.edges.csv
out/0915/serialization/edges/UNIPROTKB.inverse_bioactivity.Drug.edges.csv
out/0915/serialization/edges/Gene.inverse_bioactivity.Drug.edges.csv
out/0915/serialization/edges/Drug.bioactivity.UNIPROTKB.edges.csv





In [53]:
filename = "out/0915/serialization/edges/Drug.bioactivity.Gene.edges.csv"
match = re.match(edge_pattern, filename).groupdict()
column = "" 
if match["target_type"] == "Gene":
    column = "target"
elif match["source_type"] == "Gene":
    column = "source"
print(column)
df = pd.read_csv(filename)
for i, row in df.iterrows():
    gene_id = row[column]
    if gene_id in mapper:
        with_uniprot.add(filename)
        print(filename)
        print(mapper[gene_id], hgnc_to_cui.get(mapper[gene_id], gene_id))
        break


target


In [55]:
mapper["UNIPROTKB:Q6ZSR9 CUI"]

KeyError: 'UNIPROTKB:Q6ZSR9 CUI'

In [52]:
df.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,PUBCHEM:126565 CUI,bioactivity,UNIPROTKB:Q6ZSR9 CUI,16-hydroxy-16-(hydroxymethyl)-15-methyl-28-oxa...,YJ005_HUMAN,IDGP,Kd
1,PUBCHEM:3035817 CUI,bioactivity,UNIPROTKB:Q6ZSR9 CUI,K-252a,YJ005_HUMAN,IDGP,Kd
2,PUBCHEM:3078519 CUI,bioactivity,UNIPROTKB:Q6ZSR9 CUI,UCN-01,YJ005_HUMAN,IDGP,Kd
3,PUBCHEM:5329102 CUI,bioactivity,UNIPROTKB:Q6ZSR9 CUI,Sunitinib,YJ005_HUMAN,IDGP,Kd
4,PUBCHEM:5494449 CUI,bioactivity,UNIPROTKB:Q6ZSR9 CUI,VX-680/MK-0457,YJ005_HUMAN,IDGP,Kd
