In [1]:
import pandas as pd
from glob import glob
from IPython.display import display, Markdown
from tqdm import tqdm
import csv
import re
import os
pd.__version__

'2.0.3'

In [2]:
sabs = set()
with open("dd_data/20230802/neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in tqdm(csv_reader):
        if not headers:
            headers = row
        else:
            sabs.add(row[3])

156777637it [02:03, 1274158.50it/s]


In [3]:
with open("output/august_sabs.txt", "w") as o:
    o.write("\n".join(sorted(sabs)))

In [4]:
with open("output/august_sabs.txt") as o:
    dcc_sabs = o.read().strip().split("\n")
len(dcc_sabs)

116

# Nodes
## Concept

In [5]:
concepts = pd.read_csv("dd_data/20230802/neo4j/import/CUIs.csv")
concepts = pd.DataFrame(index=concepts["CUI:ID"].unique())
concepts.index.name = "id"
concepts.head()

C0000097
C0000359
C0000610
C0000739
C0000873


In [6]:
concepts.shape

(15527671, 0)

## Semantics

In [7]:
semantics = pd.read_csv("dd_data/20230802/neo4j/import/TUIs.csv", index_col=0)
semantics.head()

Unnamed: 0_level_0,name,STN,DEF
TUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T012,Bird,A1.1.3.1.1.2,A vertebrate having a constant body temperatur...
T014,Reptile,A1.1.3.1.1.5,A cold-blooded vertebrate having an external c...
T015,Mammal,A1.1.3.1.1.4,A vertebrate having a constant body temperatur...
T021,Fully Formed Anatomical Structure,A1.2.3,An anatomical structure in a fully formed orga...
T022,Body System,A2.1.4.1,A complex of anatomical structures that perfor...


In [8]:
concept_semantics = pd.read_csv("dd_data/20230802/neo4j/import/CUI-TUIs.csv")
concept_semantics.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000132,T126
1,C0000246,T116
2,C0000895,T060
3,C0000908,T037
4,C0000931,T067


In [9]:
no_semantics = set(concepts.index) - set(concept_semantics[':START_ID'])
len(no_semantics)

12264238

In [10]:
with open('out/0623/semantics_ranked.tsv') as o:
    ranked_type = [i.strip() for i in o.read().strip().split("\n")]

In [11]:
concept_semantics.columns = ["id", "TUI:ID"]
concept_semantics["type"] = [semantics.at[i, 'name'] for i in concept_semantics['TUI:ID']]
concept_semantics.head()

Unnamed: 0,id,TUI:ID,type
0,C0000132,T126,Enzyme
1,C0000246,T116,"Amino Acid, Peptide, or Protein"
2,C0000895,T060,Diagnostic Procedure
3,C0000908,T037,Injury or Poisoning
4,C0000931,T067,Phenomenon or Process


In [12]:
def fetch_type(v):
    cat = ""
    rank = len(ranked_type)
    for i in v:
        r = ranked_type.index(i)
        if r < rank:
            cat = i
            rank = r
    return cat

In [13]:
cs = concept_semantics.groupby('id')['type'].apply(lambda x: "; ".join(set(x)))
cs.head()

id
C0000005    Pharmacologic Substance; Indicator, Reagent, o...
C0000039            Pharmacologic Substance; Organic Chemical
C0000052              Enzyme; Amino Acid, Peptide, or Protein
C0000074                                     Organic Chemical
C0000084    Biologically Active Substance; Amino Acid, Pep...
Name: type, dtype: object

In [14]:
cs_ranked = concept_semantics.groupby('id')['type'].apply(fetch_type)
cs_ranked.head()

id
C0000005    Amino Acid, Peptide, or Protein
C0000039                   Organic Chemical
C0000052                             Enzyme
C0000074                   Organic Chemical
C0000084    Amino Acid, Peptide, or Protein
Name: type, dtype: object

In [15]:
common = list(set(concepts.index).intersection(cs.index))
cs[common].head()

id
C2149007    Therapeutic or Preventive Procedure
C2920276                         Medical Device
C4734693                          Clinical Drug
C2476818    Therapeutic or Preventive Procedure
C1611435                       Organic Chemical
Name: type, dtype: object

In [16]:
concepts.loc[common, 'type'] = cs_ranked[common]
concepts.loc[common, 'type_combined'] = cs[common]
concepts.head()

Unnamed: 0_level_0,type,type_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1
C0000097,Organic Chemical,Hazardous or Poisonous Substance; Organic Chem...
C0000359,Enzyme,"Enzyme; Amino Acid, Peptide, or Protein"
C0000610,Pharmacologic Substance,Pharmacologic Substance; Vitamin
C0000739,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component"
C0000873,Finding,Finding


In [17]:
concepts.shape

(15527671, 2)

## Terms

In [18]:
terms = pd.read_csv("dd_data/20230802/neo4j/import/SUIs.csv", index_col=0)
terms.head()

Unnamed: 0_level_0,name
SUI:ID,Unnamed: 1_level_1
S0009776,"Acid, 2-Aminohexanedioic"
S7249234,BR CAMP
S11872577,cramps abdominal
S14680596,Retained tissue after pregnancy loss
S3417882,Missed miscarriage


In [19]:
concept_term = pd.read_csv("dd_data/20230802/neo4j/import/CUI-SUIs.csv")
concept_term.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000163,S0007806
1,C0000427,S0008336
2,C0000665,S11922611
3,C0000992,S1577859
4,C0001021,S0797872


In [20]:
concept_term.columns = ["CUI:ID", "SUI:ID"]
concept_term.shape

(7923747, 2)

In [21]:
concept_term = pd.merge(concept_term, terms, on="SUI:ID", how='outer')
concept_term = concept_term.groupby('CUI:ID').first()
concept_term.head()

Unnamed: 0_level_0,SUI:ID,name
CUI:ID,Unnamed: 1_level_1,Unnamed: 2_level_1
4DND:4DNES1JP4KZ1 CUI,aW4gc2l0dSBIaS1DIG9uIEhDVDExNiBjZWxscyAoY29udG...,in situ Hi-C on HCT116 cells (containing AID-t...
4DND:4DNES21D8SP8 CUI,TWljcm8tQyBvbiBIMS1FU0MgY2VsbHMuSDEtRVND,Micro-C on H1-ESC cells.H1-ESC
4DND:4DNES2M5JIGV CUI,aW4gc2l0dSBIaS1DIG9uICBIaS1FU0MgY2VsbHMuSDEtRVND,in situ Hi-C on Hi-ESC cells.H1-ESC
4DND:4DNES2R6PUEK CUI,aW4gc2l0dSBIaS1DIG9uIEhGRmM2IGNlbGxzLkhGRmM2,in situ Hi-C on HFFc6 cells.HFFc6
4DND:4DNES3QAGOZZ CUI,aW4gc2l0dSBIaS1DIG9uIEhDVDExNiBjZWxscyAoY29udG...,in situ Hi-C on HCT116 cells (containing AID-t...


In [22]:
concept_term.shape

(7923727, 2)

In [23]:
concepts.loc[concept_term.index, 'label'] = concept_term.loc[concept_term.index, 'name']
concepts.head()

Unnamed: 0_level_0,type,type_combined,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0000097,Organic Chemical,Hazardous or Poisonous Substance; Organic Chem...,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine"
C0000359,Enzyme,"Enzyme; Amino Acid, Peptide, or Protein","3',5'-Cyclic-Nucleotide Phosphodiesterase"
C0000610,Pharmacologic Substance,Pharmacologic Substance; Vitamin,6-Aminonicotinamide
C0000739,"Body Part, Organ, or Organ Component","Body Part, Organ, or Organ Component",Abdominal Muscles
C0000873,Finding,Finding,Academic Problem


In [24]:
concepts.label = concepts.label.fillna("-")
concepts.shape

(15527671, 3)

In [25]:
# how many -
len([i for i in concepts.label if i == "-"])

7603946

## Code

In [26]:
codes = pd.read_csv("dd_data/20230802/neo4j/import/CODEs.csv", index_col=0)
codes.head()

  codes = pd.read_csv("dd_data/20230802/neo4j/import/CODEs.csv", index_col=0)


Unnamed: 0_level_0,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
CodeID:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTHSPL:J7A92W69L7,MTHSPL,J7A92W69L7,,,,
NCI:C76777,NCI,C76777,,,,
ATC:N07XX07,ATC,N07XX07,,,,
GS:1946,GS,1946,,,,
NOC:040413,NOC,040413,,,,


In [27]:
concept_code = pd.read_csv("dd_data/20230802/neo4j/import/CUI-CODEs.csv")
concept_code.head()

Unnamed: 0,:START_ID,:END_ID
0,C0000294,ATC:V03AF01
1,C0000481,CHV:0000000513
2,C0000661,MSH:D015124
3,C0000665,VANDF:4020847
4,C0000737,LNC:LA15468-4


In [28]:
concept_code.columns = ["id", "CodeID:ID"]
concept_code.head()

Unnamed: 0,id,CodeID:ID
0,C0000294,ATC:V03AF01
1,C0000481,CHV:0000000513
2,C0000661,MSH:D015124
3,C0000665,VANDF:4020847
4,C0000737,LNC:LA15468-4


In [29]:
concept_code = pd.merge(concept_code, codes, on="CodeID:ID", how='left')
concept_code.head()

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
0,C0000294,ATC:V03AF01,ATC,V03AF01,,,,
1,C0000481,CHV:0000000513,CHV,0000000513,,,,
2,C0000661,MSH:D015124,MSH,D015124,,,,
3,C0000665,VANDF:4020847,VANDF,4020847,,,,
4,C0000737,LNC:LA15468-4,LNC,LA15468-4,,,,


In [30]:
concept_code[concept_code.id == 'C0000097']

Unnamed: 0,id,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
623690,C0000097,LCH_NW:sh86002892,LCH_NW,sh86002892,,,,
1108107,C0000097,CSP:2511-0411,CSP,2511-0411,,,,
1246274,C0000097,PSY:31213,PSY,31213,,,,
2179442,C0000097,MSH:D015632,MSH,D015632,,,,
3529094,C0000097,CHV:0000000501,CHV,0000000501,,,,
3736237,C0000097,PSY:32433,PSY,32433,,,,
4116966,C0000097,SNOMEDCT_US:285407008,SNOMEDCT_US,285407008,,,,


In [31]:
with open("output/unique_SABS_of_Concept.txt", "w") as o:
    sabs = [str(i) for i in concept_code[concept_code.id.isin(concepts[concepts.type.isna()].index)].SAB.unique()]
    o.write("\n".join(sabs))

In [32]:
type_mapper = {}
with open("output/unique_SABS_of_Concept_Mapper.txt") as o:
    for line in o:
        r = line.strip().split(":")
        if len(r) == 2:
            type_mapper[r[0]] = r[1]
        elif "MSIGDB" in r[0]:
            type_mapper[r[0]] = "MSIGDB"
        else:
            type_mapper[r[0]] = r[0]
len((set(type_mapper.values()))), type_mapper["PUBCHEM"]

(159, 'Drug')

In [33]:
for i,row in tqdm(concept_code[concept_code.id.isin(concepts[concepts.type.isna()].index)].iterrows()):
    sab = row["SAB"]
    ind = row["id"]
    if type(sab) == str:
        tp = type_mapper[sab]
        if tp:
            concepts.at[ind, "type"] = tp
            concepts.at[ind, "type_combined"] = tp


15721067it [07:33, 34691.41it/s]


In [34]:
concepts.loc[:, "type"] = [i.replace(".", "_") for i in concepts.type]

In [35]:
for type in tqdm(concepts.type.unique()):
    type = type.replace(".", "_")
    con = concepts[concepts.type==type].copy()
    cc = concept_code[concept_code.id.isin(con.index)]
    if 'MSIGDB' in cc.SAB.unique()[0]:
        sab = "MSIGDB"
        cc.loc[:,'SAB'] = sab
    for sab in cc.SAB.unique():
        c = cc[cc.SAB == sab]
        c = c.groupby('id').first()
        common = list(set(con.index).intersection(c.index))
        con.loc[common, sab] = c.loc[common, "CodeID:ID"]
        if c.loc[common, "value:float"].isna().sum() != len(common):
            con.loc[common, "%s value"%sab] = c.loc[common, "value:float"]
        if c.loc[common, "lowerbound:float"].isna().sum() != len(common):
            con.loc[common, "%s lowerbound"%sab] = c.loc[common, "lowerbound:float"]
        if c.loc[common, "upperbound:float"].isna().sum() != len(common):
            con.loc[common, "%s upperbound"%sab] = c.loc[common, "upperbound:float"]
        if c.loc[common, "unit"].isna().sum() != len(common):
            con.loc[common, "%s unit"%sab] = c.loc[common, "unit"]
        if "-" in list(con["label"]):
            tmp = con[con.label == "-"]
            ind = set(tmp.index).intersection(con.index)
            ind2 = set(tmp.index).intersection(c.index)
            if len(ind.intersection(ind2)) > 0:
                l = list(ind.intersection(ind2))
                con.loc[l, "label"] = c.loc[l, 'CodeID:ID']
        con.to_csv("out/0802/serialization/nodes/%s.nodes.csv"%(type))


100%|██████████| 284/284 [16:36<00:00,  3.51s/it] 


In [36]:
with open("output/august_dcc_sabs.txt") as o:
    sabs_to_keep = set(o.read().strip().split("\n"))

In [37]:
row_headers = ["source", "relation", "target", "source_label", "target_label", "SAB", "evidence"]
inverse = set()
with open("dd_data/20230802/neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    headers = None
    for row in tqdm(csv_reader):
        if not headers:
            headers = row
        else:
            source = row[0]
            target = row[1]
            if source in concepts.index and target in concepts.index:
                source_label = concepts.at[source, 'label']
                source_type = concepts.at[source, 'type']
                
                target_label = concepts.at[target, 'label']
                target_type = concepts.at[target, 'type']
                relation = row[2]
                sab = row[3]
                evidence = ''
                if len(row) > 4:
                    evidence = row[4]
                if "inverse" in relation or "_by" in relation:
                    inverse.add(relation)
                if sab in sabs_to_keep and "inverse" not in relation and "_by" not in relation:
                    filename = 'out/0802/serialization/edges/%s.%s.%s.edges.csv'%(source_type, relation, target_type)
                    write_header = False
                    operation = "a"
                    if not os.path.isfile(filename):
                        write_header = True
                        operation = "w"
                    with open(filename, operation) as w:
                        csv_writer = csv.writer(w)
                        if write_header:
                            csv_writer.writerow(row_headers)
                        csv_writer.writerow([source, relation, target, source_label, target_label, sab, evidence])

156777637it [1:09:36, 37540.23it/s] 


In [38]:

edge_pattern = "(?P<directory>.+)/(?P<source_type>.+)\.(?P<relation>.+)\.(?P<target_type>.+)\.(?P<entity>.+)\.csv"


In [39]:
node_base = "out/0802/serialization/nodes/%s.nodes.csv"
new_node_base = "out/0802/filtered/nodes/%s.nodes.csv"
new_edge_base = "out/0802/filtered/edges/%s.%s.%s.edges.csv"
ids_base = "out/0802/filtered/ids/%s.txt"
node_ids = {}
sab_relations = {}
processed = set()

In [40]:
for filename in tqdm(glob("out/0802/serialization/edges/*.csv")):
    if filename not in processed:
        match = re.match(edge_pattern, filename).groupdict()
        entity = match["entity"]
        source_type = match["source_type"]
        relation = match["relation"].replace("_", " ")
        target_type = match["target_type"]
        if "inverse" not in relation:
            edge_df = pd.read_csv(filename, low_memory=False)
            # filter for SAB
            sabs = sabs_to_keep.intersection(edge_df.SAB.unique())
            for sab in sabs:
                if sab not in sab_relations:
                    sab_relations[sab] = set()
                sab_relations[sab].add(relation)
            if len(sabs) > 0:
                edge_df = edge_df[edge_df.SAB.isin(sabs)]
                source_df = pd.read_csv(node_base%source_type, index_col=0, low_memory=False)
                if not os.path.isfile(new_node_base%(source_type)):
                    # new_source_df = pd.read_csv(new_node_base%(source_type), index_col=0, low_memory=False)
                    # pd.concat([new_source_df, source_df]).dropna(axis=1).to_csv(new_node_base%(source_type))
                    source_df.to_csv(new_node_base%(source_type))
                # else:
                #     source_df.dropna(axis=1).to_csv(new_node_base%(source_type))
                
                target_df = pd.read_csv(node_base%target_type, index_col=0, low_memory=False)
                if not os.path.isfile(new_node_base%(target_type)):
                    # new_target_df = pd.read_csv(new_node_base%(target_type), index_col=0, low_memory=False)
                    # pd.concat([new_target_df, target_df]).dropna(axis=1).to_csv(new_node_base%(target_type))
                    target_df.to_csv(new_node_base%(target_type))
                # else:
                #     target_df.dropna(axis=1).to_csv(new_node_base%(target_type))
                edge_df.to_csv(new_edge_base%(source_type, relation, target_type), index=False)
        processed.add(filename)
                

100%|██████████| 586/586 [12:51<00:00,  1.32s/it]


In [None]:
pd.read_csv("out/0802/filtered/edges/GTEXEQTL.p value.PVALUEBINS.edges.csv").head()

In [10]:
gtex = 'EXPBINS'
for i in glob("out/0802/filtered/edges/*%s*edges.csv"%gtex):
    df = pd.read_csv(i)

In [4]:
df.shape

(1573920, 7)

In [11]:
df.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,GTEXEXP:ENSG00000223972-5-Testis CUI,has_expression,EXPBINS:0.1.0.2 CUI,-,-,GTEXEXP,
1,GTEXEXP:ENSG00000227232-5-Adipose-Subcutaneous...,has_expression,EXPBINS:4.0.5.0 CUI,-,-,GTEXEXP,
2,GTEXEXP:ENSG00000227232-5-Adrenal-Gland CUI,has_expression,EXPBINS:2.0.3.0 CUI,-,-,GTEXEXP,
3,GTEXEXP:ENSG00000227232-5-Artery-Aorta CUI,has_expression,EXPBINS:4.0.5.0 CUI,-,-,GTEXEXP,
4,GTEXEXP:ENSG00000227232-5-Artery-Coronary CUI,has_expression,EXPBINS:3.0.4.0 CUI,-,-,GTEXEXP,


In [5]:
i

'out/0802/filtered/edges/GTEXEXP.has expression.EXPBINS.edges.csv'

In [12]:
gtex = 'GTEXEXP'
for i in glob("out/0802/filtered/edges/*%s*edges.csv"%gtex):
    print(i)
    df = pd.read_csv(i)

out/0802/filtered/edges/GTEXEXP.expressed in.Tissue.edges.csv
out/0802/filtered/edges/Gene or Genome.expresses.GTEXEXP.edges.csv
out/0802/filtered/edges/GTEXEXP.has expression.EXPBINS.edges.csv
out/0802/filtered/edges/GTEXEXP.expressed in.Body Location or Region.edges.csv
out/0802/filtered/edges/Tissue.expresses.GTEXEXP.edges.csv
out/0802/filtered/edges/Body Part, Organ, or Organ Component.expresses.GTEXEXP.edges.csv
out/0802/filtered/edges/GTEXEXP.expressed in.Body Part, Organ, or Organ Component.edges.csv
out/0802/filtered/edges/Body Location or Region.expresses.GTEXEXP.edges.csv
out/0802/filtered/edges/GTEXEXP.expressed in.Gene or Genome.edges.csv


In [86]:
filename = "out/0802/filtered/edges/GTEXEXP.has expression.EXPBINS.edges.csv"
scores = pd.read_csv(filename, index_col=0)
scores.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,GTEXEXP:ENSG00000223972-5-Testis CUI,has_expression,EXPBINS:0.1.0.2 CUI,-,-,GTEXEXP,
1,GTEXEXP:ENSG00000227232-5-Adipose-Subcutaneous...,has_expression,EXPBINS:4.0.5.0 CUI,-,-,GTEXEXP,
2,GTEXEXP:ENSG00000227232-5-Adrenal-Gland CUI,has_expression,EXPBINS:2.0.3.0 CUI,-,-,GTEXEXP,
3,GTEXEXP:ENSG00000227232-5-Artery-Aorta CUI,has_expression,EXPBINS:4.0.5.0 CUI,-,-,GTEXEXP,
4,GTEXEXP:ENSG00000227232-5-Artery-Coronary CUI,has_expression,EXPBINS:3.0.4.0 CUI,-,-,GTEXEXP,


In [47]:
scores.evidence = ''
scores.to_csv(filename, index=False)

In [49]:
scores.columns

Index(['source', 'relation', 'target', 'source_label', 'target_label', 'SAB',
       'evidence'],
      dtype='object')

In [53]:
score_dict = {}
for k,v in scores.iterrows():
    source = v["source"]
    score = v["target"]
    score_dict[source] = float(".".join(score.replace("CUI", "").strip().split(".")[2:]))

In [54]:
len(score_dict)

1573920

In [55]:
cols = ['source', 'relation', 'target', 'source_label', 'target_label', 'SAB',
       'evidence']
gtex = 'GTEXEXP'
for i in glob("out/0802/filtered/edges/*%s.*edges.csv"%gtex):
    print(i)

out/0802/filtered/edges/GTEXEXP.expressed in.Tissue.edges.csv
out/0802/filtered/edges/Gene or Genome.expresses.GTEXEXP.edges.csv
out/0802/filtered/edges/GTEXEXP.has expression.EXPBINS.edges.csv
out/0802/filtered/edges/GTEXEXP.expressed in.Body Location or Region.edges.csv
out/0802/filtered/edges/Tissue.expresses.GTEXEXP.edges.csv
out/0802/filtered/edges/Body Part, Organ, or Organ Component.expresses.GTEXEXP.edges.csv
out/0802/filtered/edges/GTEXEXP.expressed in.Body Part, Organ, or Organ Component.edges.csv
out/0802/filtered/edges/Body Location or Region.expresses.GTEXEXP.edges.csv
out/0802/filtered/edges/GTEXEXP.expressed in.Gene or Genome.edges.csv


In [72]:
gene_df = pd.read_csv("out/0802/filtered/edges/GTEXEXP.expressed in.Gene or Genome.edges.csv", index_col=0)
tissue_df = pd.read_csv("out/0802/filtered/edges/Tissue.expresses.GTEXEXP.edges.csv", index_col=0)
organ_df = pd.read_csv("out/0802/filtered/edges/Body Part, Organ, or Organ Component.expresses.GTEXEXP.edges.csv", index_col=0)

In [77]:
gene_df.columns = ['source', 'relation', 'gene_id', 'source_label', 'gene_label', 'SAB',
       'evidence']

In [87]:
scores.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,GTEXEXP:ENSG00000223972-5-Testis CUI,has_expression,EXPBINS:0.1.0.2 CUI,-,-,GTEXEXP,
1,GTEXEXP:ENSG00000227232-5-Adipose-Subcutaneous...,has_expression,EXPBINS:4.0.5.0 CUI,-,-,GTEXEXP,
2,GTEXEXP:ENSG00000227232-5-Adrenal-Gland CUI,has_expression,EXPBINS:2.0.3.0 CUI,-,-,GTEXEXP,
3,GTEXEXP:ENSG00000227232-5-Artery-Aorta CUI,has_expression,EXPBINS:4.0.5.0 CUI,-,-,GTEXEXP,
4,GTEXEXP:ENSG00000227232-5-Artery-Coronary CUI,has_expression,EXPBINS:3.0.4.0 CUI,-,-,GTEXEXP,


In [78]:
gene_df.head()

Unnamed: 0,source,relation,gene_id,source_label,gene_label,SAB,evidence
0,GTEXEXP:ENSG00000223972-5-Adipose-Subcutaneous...,expressed_in,C2239334,-,DDX11L1 gene,GTEXEXP,0.0
1,GTEXEXP:ENSG00000223972-5-Adrenal-Gland CUI,expressed_in,C2239334,-,DDX11L1 gene,GTEXEXP,0.0
2,GTEXEXP:ENSG00000223972-5-Artery-Aorta CUI,expressed_in,C2239334,-,DDX11L1 gene,GTEXEXP,0.0
3,GTEXEXP:ENSG00000223972-5-Artery-Coronary CUI,expressed_in,C2239334,-,DDX11L1 gene,GTEXEXP,0.0
4,GTEXEXP:ENSG00000223972-5-Artery-Tibial CUI,expressed_in,C2239334,-,DDX11L1 gene,GTEXEXP,0.0


In [74]:
tissue_df.head()

Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,C0222331,expresses,GTEXEXP:ENSG00000223972-5-Adipose-Subcutaneous...,Subcutaneous Fat,-,GTEXEXP,0.0
1,UBERON:0008367 CUI,expresses,GTEXEXP:ENSG00000223972-5-Breast-Mammary-Tissu...,breast epithelium,-,GTEXEXP,0.0
2,UBERON:EFO 0002009 CUI,expresses,GTEXEXP:ENSG00000223972-5-Cells-Cultured-fibro...,-,-,GTEXEXP,0.0
3,UBERON:EFO 0000572 CUI,expresses,GTEXEXP:ENSG00000223972-5-Cells-EBV-transforme...,-,-,GTEXEXP,0.0
4,C1707950,expresses,GTEXEXP:ENSG00000223972-5-Esophagus-Mucosa CUI,Esophageal Squamous Epithelium,-,GTEXEXP,0.0


In [83]:
len(tissue_df.target), len(set(tissue_df.target))

(244832, 244832)

In [84]:
len(set(tissue_df.target).intersection(set(gene_df.source)))

244811

In [81]:
len(gene_df.source), len(set(gene_df.source))

(1573785, 1573785)

In [None]:
tissue_df.merge(gene_df)

In [69]:
for i, v in gene_df.iterrows():
    source = v["source"]
    gene_id = v["target"]
    gene_label = v['target_label']
    tissue_df[tissue_df.target == source].target = gene_id
    tissue_df[tissue_df.target == source].target_label = gene_label


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tissue_df[tissue_df.target == source].target = gene_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tissue_df[tissue_df.target == source].target_label = gene_label


KeyboardInterrupt: 

In [50]:
cols = ['source', 'relation', 'target', 'source_label', 'target_label', 'SAB',
       'evidence']
gtex = 'GTEXEXP'
for i in glob("out/0802/filtered/edges/%s.*edges.csv"%gtex):
    print(i)
    df = pd.read_csv(i)
    df[cols].to_csv(i)

for i in glob("out/0802/filtered/edges/*.%s.*edges.csv"%gtex):
    print(i)
    df = pd.read_csv(i)
    df[cols].to_csv(i)


out/0802/filtered/edges/GTEXEXP.expressed in.Tissue.edges.csv
out/0802/filtered/edges/GTEXEXP.has expression.EXPBINS.edges.csv
out/0802/filtered/edges/GTEXEXP.expressed in.Body Location or Region.edges.csv
out/0802/filtered/edges/GTEXEXP.expressed in.Body Part, Organ, or Organ Component.edges.csv
out/0802/filtered/edges/GTEXEXP.expressed in.Gene or Genome.edges.csv
out/0802/filtered/edges/Gene or Genome.expresses.GTEXEXP.edges.csv
out/0802/filtered/edges/Tissue.expresses.GTEXEXP.edges.csv
out/0802/filtered/edges/Body Part, Organ, or Organ Component.expresses.GTEXEXP.edges.csv
out/0802/filtered/edges/Body Location or Region.expresses.GTEXEXP.edges.csv


In [35]:
gtex = 'GTEXEXP'
for i in glob("out/0802/filtered/edges/%s.*edges.csv"%gtex):
    print(i)
    df = pd.read_csv(i)
    for k,v in df.iterrows():
        gid = v["source"]
        if gid in score_dict:
            df.at[k, 'evidence'] = score_dict[gid]
    display(df.head())
    df.to_csv(i)


out/0802/filtered/edges/GTEXEXP.expressed in.Tissue.edges.csv


Unnamed: 0.1,Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,0,GTEXEXP:ENSG00000223972-5-Adipose-Subcutaneous...,expressed_in,C0222331,-,Subcutaneous Fat,GTEXEXP,0.0
1,1,GTEXEXP:ENSG00000223972-5-Breast-Mammary-Tissu...,expressed_in,UBERON:0008367 CUI,-,breast epithelium,GTEXEXP,0.0
2,2,GTEXEXP:ENSG00000223972-5-Cells-Cultured-fibro...,expressed_in,UBERON:EFO 0002009 CUI,-,-,GTEXEXP,0.0
3,3,GTEXEXP:ENSG00000223972-5-Cells-EBV-transforme...,expressed_in,UBERON:EFO 0000572 CUI,-,-,GTEXEXP,0.0
4,4,GTEXEXP:ENSG00000223972-5-Esophagus-Mucosa CUI,expressed_in,C1707950,-,Esophageal Squamous Epithelium,GTEXEXP,0.0


out/0802/filtered/edges/GTEXEXP.has expression.EXPBINS.edges.csv


Unnamed: 0.1,Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,0,GTEXEXP:ENSG00000223972-5-Testis CUI,has_expression,EXPBINS:0.1.0.2 CUI,-,-,GTEXEXP,0.2
1,1,GTEXEXP:ENSG00000227232-5-Adipose-Subcutaneous...,has_expression,EXPBINS:4.0.5.0 CUI,-,-,GTEXEXP,5.0
2,2,GTEXEXP:ENSG00000227232-5-Adrenal-Gland CUI,has_expression,EXPBINS:2.0.3.0 CUI,-,-,GTEXEXP,3.0
3,3,GTEXEXP:ENSG00000227232-5-Artery-Aorta CUI,has_expression,EXPBINS:4.0.5.0 CUI,-,-,GTEXEXP,5.0
4,4,GTEXEXP:ENSG00000227232-5-Artery-Coronary CUI,has_expression,EXPBINS:3.0.4.0 CUI,-,-,GTEXEXP,4.0


out/0802/filtered/edges/GTEXEXP.expressed in.Body Location or Region.edges.csv


Unnamed: 0.1,Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,0,GTEXEXP:ENSG00000223972-5-Kidney-Medulla CUI,expressed_in,C0736435,-,Set of outer region of renal pyramids,GTEXEXP,0.0
1,1,GTEXEXP:ENSG00000227232-5-Kidney-Medulla CUI,expressed_in,C0736435,-,Set of outer region of renal pyramids,GTEXEXP,3.0
2,2,GTEXEXP:ENSG00000278267-1-Kidney-Medulla CUI,expressed_in,C0736435,-,Set of outer region of renal pyramids,GTEXEXP,0.0
3,3,GTEXEXP:ENSG00000243485-5-Kidney-Medulla CUI,expressed_in,C0736435,-,Set of outer region of renal pyramids,GTEXEXP,0.0
4,4,GTEXEXP:ENSG00000237613-2-Kidney-Medulla CUI,expressed_in,C0736435,-,Set of outer region of renal pyramids,GTEXEXP,0.0


out/0802/filtered/edges/GTEXEXP.expressed in.Body Part, Organ, or Organ Component.edges.csv


Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,GTEXEXP:ENSG00000223972-5-Adrenal-Gland CUI,expressed_in,C0001625,-,Adrenal Glands,GTEXEXP,0.0
1,GTEXEXP:ENSG00000223972-5-Artery-Aorta CUI,expressed_in,C0003956,-,Ascending aorta structure,GTEXEXP,0.0
2,GTEXEXP:ENSG00000223972-5-Artery-Coronary CUI,expressed_in,C0205042,-,Coronary artery,GTEXEXP,0.0
3,GTEXEXP:ENSG00000223972-5-Artery-Tibial CUI,expressed_in,C0085427,-,Tibial Arteries,GTEXEXP,0.0
4,GTEXEXP:ENSG00000223972-5-Bladder CUI,expressed_in,C0005682,-,Urinary Bladder,GTEXEXP,0.0


out/0802/filtered/edges/GTEXEXP.expressed in.Gene or Genome.edges.csv


Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,GTEXEXP:ENSG00000223972-5-Adipose-Subcutaneous...,expressed_in,C2239334,-,DDX11L1 gene,GTEXEXP,0.0
1,GTEXEXP:ENSG00000223972-5-Adrenal-Gland CUI,expressed_in,C2239334,-,DDX11L1 gene,GTEXEXP,0.0
2,GTEXEXP:ENSG00000223972-5-Artery-Aorta CUI,expressed_in,C2239334,-,DDX11L1 gene,GTEXEXP,0.0
3,GTEXEXP:ENSG00000223972-5-Artery-Coronary CUI,expressed_in,C2239334,-,DDX11L1 gene,GTEXEXP,0.0
4,GTEXEXP:ENSG00000223972-5-Artery-Tibial CUI,expressed_in,C2239334,-,DDX11L1 gene,GTEXEXP,0.0


In [36]:
gtex = 'GTEXEXP'
for i in glob("out/0802/filtered/edges/*.%s.*edges.csv"%gtex):
    print(i)
    df = pd.read_csv(i)
    for k,v in df.iterrows():
        gid = v["target"]
        if gid in score_dict:
            df.at[k, 'evidence'] = score_dict[gid]
    display(df.head())
    df.to_csv(i)


out/0802/filtered/edges/Gene or Genome.expresses.GTEXEXP.edges.csv


Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,C2239334,expresses,GTEXEXP:ENSG00000223972-5-Adipose-Subcutaneous...,DDX11L1 gene,-,GTEXEXP,0.0
1,C2239334,expresses,GTEXEXP:ENSG00000223972-5-Adrenal-Gland CUI,DDX11L1 gene,-,GTEXEXP,0.0
2,C2239334,expresses,GTEXEXP:ENSG00000223972-5-Artery-Aorta CUI,DDX11L1 gene,-,GTEXEXP,0.0
3,C2239334,expresses,GTEXEXP:ENSG00000223972-5-Artery-Coronary CUI,DDX11L1 gene,-,GTEXEXP,0.0
4,C2239334,expresses,GTEXEXP:ENSG00000223972-5-Artery-Tibial CUI,DDX11L1 gene,-,GTEXEXP,0.0


out/0802/filtered/edges/Tissue.expresses.GTEXEXP.edges.csv


Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,C0222331,expresses,GTEXEXP:ENSG00000223972-5-Adipose-Subcutaneous...,Subcutaneous Fat,-,GTEXEXP,0.0
1,UBERON:0008367 CUI,expresses,GTEXEXP:ENSG00000223972-5-Breast-Mammary-Tissu...,breast epithelium,-,GTEXEXP,0.0
2,UBERON:EFO 0002009 CUI,expresses,GTEXEXP:ENSG00000223972-5-Cells-Cultured-fibro...,-,-,GTEXEXP,0.0
3,UBERON:EFO 0000572 CUI,expresses,GTEXEXP:ENSG00000223972-5-Cells-EBV-transforme...,-,-,GTEXEXP,0.0
4,C1707950,expresses,GTEXEXP:ENSG00000223972-5-Esophagus-Mucosa CUI,Esophageal Squamous Epithelium,-,GTEXEXP,0.0


out/0802/filtered/edges/Body Part, Organ, or Organ Component.expresses.GTEXEXP.edges.csv


Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,C0001625,expresses,GTEXEXP:ENSG00000223972-5-Adrenal-Gland CUI,Adrenal Glands,-,GTEXEXP,0.0
1,C0003956,expresses,GTEXEXP:ENSG00000223972-5-Artery-Aorta CUI,Ascending aorta structure,-,GTEXEXP,0.0
2,C0205042,expresses,GTEXEXP:ENSG00000223972-5-Artery-Coronary CUI,Coronary artery,-,GTEXEXP,0.0
3,C0085427,expresses,GTEXEXP:ENSG00000223972-5-Artery-Tibial CUI,Tibial Arteries,-,GTEXEXP,0.0
4,C0005682,expresses,GTEXEXP:ENSG00000223972-5-Bladder CUI,Urinary Bladder,-,GTEXEXP,0.0


out/0802/filtered/edges/Body Location or Region.expresses.GTEXEXP.edges.csv


Unnamed: 0,source,relation,target,source_label,target_label,SAB,evidence
0,C0736435,expresses,GTEXEXP:ENSG00000223972-5-Kidney-Medulla CUI,Set of outer region of renal pyramids,-,GTEXEXP,0.0
1,C0736435,expresses,GTEXEXP:ENSG00000227232-5-Kidney-Medulla CUI,Set of outer region of renal pyramids,-,GTEXEXP,3.0
2,C0736435,expresses,GTEXEXP:ENSG00000278267-1-Kidney-Medulla CUI,Set of outer region of renal pyramids,-,GTEXEXP,0.0
3,C0736435,expresses,GTEXEXP:ENSG00000243485-5-Kidney-Medulla CUI,Set of outer region of renal pyramids,-,GTEXEXP,0.0
4,C0736435,expresses,GTEXEXP:ENSG00000237613-2-Kidney-Medulla CUI,Set of outer region of renal pyramids,-,GTEXEXP,0.0


In [31]:
filename = "out/0802/filtered/edges/GTEXEQTL"
scores = pd.read_csv(filename)
scores.head()

0.0

In [37]:
gtex = 'GTEXEQTL'
for i in glob("out/0802/filtered/edges/%s.*edges.csv"%gtex):
    print(i)
    # df = pd.read_csv(i)
    # for k,v in df.iterrows():
    #     gid = v["source"]
    #     if gid in score_dict:
    #         df.at[k, 'evidence'] = score_dict[gid]
    # display(df.head())
    # df.to_csv(i)


out/0802/filtered/edges/GTEXEQTL.has part.Tissue.edges.csv
out/0802/filtered/edges/GTEXEQTL.located in.Body Part, Organ, or Organ Component.edges.csv
out/0802/filtered/edges/GTEXEQTL.positively regulates.ENSEMBL.edges.csv
out/0802/filtered/edges/GTEXEQTL.has part.Body Part, Organ, or Organ Component.edges.csv
out/0802/filtered/edges/GTEXEQTL.negatively regulates.Gene.edges.csv
out/0802/filtered/edges/GTEXEQTL.located in.Cell.edges.csv
out/0802/filtered/edges/GTEXEQTL.p value.PVALUEBINS.edges.csv
out/0802/filtered/edges/GTEXEQTL.negatively regulates.ENSEMBL.edges.csv
out/0802/filtered/edges/GTEXEQTL.positively regulates.ENTREZ.edges.csv
out/0802/filtered/edges/GTEXEQTL.positively regulates.Gene.edges.csv
out/0802/filtered/edges/GTEXEQTL.has part.CLINGEN_ALLELE_REGISTRY.edges.csv
out/0802/filtered/edges/GTEXEQTL.located in.CHLO.edges.csv
out/0802/filtered/edges/GTEXEQTL.positively regulates.Gene or Genome.edges.csv
out/0802/filtered/edges/GTEXEQTL.located in.Tissue.edges.csv
out/0802/fil

In [40]:
pd.read_csv("out/0802/filtered/edges/GTEXEQTL.p value.PVALUEBINS.edges.csv").head().loc[1]

source          GTEXEQTL:eQTL.chr2.112358021.G.A.b38.Skin.Sun....
relation                                                  p_value
target                                 PVALUEBINS:1e-12.1e-11 CUI
source_label                                                    -
target_label                                                    -
SAB                                                      GTEXEQTL
evidence                                                      NaN
Name: 1, dtype: object