In [1]:
import os.path as path
import networkx as nx
import pandas as pd
import numpy as np
import time
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib notebook

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Relations-between-chemicals-and-genes" data-toc-modified-id="Relations-between-chemicals-and-genes-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Relations between chemicals and genes</a></span><ul class="toc-item"><li><span><a href="#gene-->-chemical" data-toc-modified-id="gene-->-chemical-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>gene -&gt; chemical</a></span></li><li><span><a href="#chemical-->-gene" data-toc-modified-id="chemical-->-gene-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>chemical -&gt; gene</a></span></li><li><span><a href="#other-relations" data-toc-modified-id="other-relations-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>other relations</a></span></li></ul></li></ul></div>

## Relations between chemicals and genes

In [3]:
import re
import pickle

In [4]:
root = "./raw_data"
chem_gene = pd.read_csv(path.join(root, "CTD_chem_gene_ixns.csv"),
                        header=None,
                        skiprows=range(29),
                        usecols=[0, 3, 8]).to_numpy()

In [8]:
adj_chemgo, adj_chemgene, adj_compgo, adj_compgene = set(), set(), set(), set()
processed = []
errored = []

In [9]:
for i in range(chem_gene.shape[0]):
    try:
        c, g, r = chem_gene[i]
        rr = r.replace('(', ':').replace(')', ':')
        cc = c.replace('(', ':').replace(')', ':')

        # ################################################################
        # chemical process -- gene process
        # ################################################################
        b = re.match(r"\[([^\[\]]*)\] (.*) \[([^\[\]]*)\]", r)  # [chem - relation], [GO]
        if b:
            r = b.group(2).split(' and ')
            for j in r:
                adj_compgo.add((b.group(1), b.group(3), j))
            processed.append(i)
            continue

        # ################################################################
        # chemical process -- gene
        # ################################################################
        b = re.match(r"\[([^\[\]]*)\](.*)", r)  # [chem proc] - [relation, gene]
        if b:
            r = re.match(r" (.*) {}".format(g), b.group(2))
            r = r.group(1).split(' and ')
            for j in r:
                adj_compgene.add((b.group(1), g, j))
            processed.append(i)
            continue
            
        # ################################################################
        # chemical -- gene process
        # ################################################################
        b = re.match(r"(.*)\[([^\[\]]*)\](.*)", rr)  # [chem - relation], [GO]
        if b:
            r = re.match(r"{} (.*) ".format(cc), b.group(1))
            r = r.group(1).split(' and ')
            for j in r:
                adj_chemgo.add((c, b.group(2), j))
            processed.append(i)
            continue

        # ################################################################
        # chemical -- gene
        # ################################################################
        r = re.match(r"{} (.*) {}".format(cc, g), rr)
        r = r.group(1).split(' and ')
        for j in r:
            adj_chemgene.add((c, g, j))
        processed.append(i)
    except AttributeError:
        errored.append(i)


In [10]:
len(errored), len(processed)

(346720, 1614823)

In [11]:
len(adj_chemgene), len(adj_chemgo), len(adj_compgene), len(adj_compgo)

(1099995, 121192, 116677, 3722)

### gene -> chemical

In [38]:
chem_gene[errored[:10]]

array([['10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide',
        'EPHX1',
        '[EPHX1 gene SNP affects the metabolism of carbamazepine epoxide] which affects the chemical synthesis of 10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide'],
       ['10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide',
        'EPHX1',
        '[EPHX1 protein results in increased metabolism of carbamazepine epoxide] which results in increased chemical synthesis of 10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide'],
       ['10,11-dihydro-10-hydroxycarbamazepine', 'ABCB1',
        'ABCB1 protein results in increased transport of 10,11-dihydro-10-hydroxycarbamazepine'],
       ["10-(6'-ubiquinonyl)decyltriphenylphosphonium bromide", 'SOD2',
        "[10-(6'-ubiquinonyl)decyltriphenylphosphonium bromide results in increased expression of SOD2 protein] which results in decreased susceptibility to Dichlorvos"],
       ['10-decarbamoylmitomycin C', 'CHEK1',
    

In [12]:
adj_genechem, errored1 = set(), list()
for i in errored:
    c, g, r = chem_gene[i]
    try:
        rr = r.replace('(', ':').replace(')', ':')
        cc = c.replace('(', ':').replace(')', ':')
        b = re.match(r"(.*)\[([^\[\]]*)\](.*)", rr)
        if not b:
            r = re.match(r"{} (.*) {}".format(g, cc), rr)
            r = r.group(1).split(' and ')
            for j in r:
                adj_genechem.add((g, c, j))
            continue
        errored1.append(i)       
    except:
        errored1.append(i)
len(errored1), len(adj_genechem)

(312924, 32277)

In [None]:
errored = errored1

In [18]:
chem_gene[errored[:10]]

array([['10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide',
        'EPHX1',
        '[EPHX1 gene SNP affects the metabolism of carbamazepine epoxide] which affects the chemical synthesis of 10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide'],
       ['10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide',
        'EPHX1',
        '[EPHX1 protein results in increased metabolism of carbamazepine epoxide] which results in increased chemical synthesis of 10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide'],
       ["10-(6'-ubiquinonyl)decyltriphenylphosphonium bromide", 'SOD2',
        "[10-(6'-ubiquinonyl)decyltriphenylphosphonium bromide results in increased expression of SOD2 protein] which results in decreased susceptibility to Dichlorvos"],
       ['10-decarbamoylmitomycin C', 'CHEK1',
        'carbobenzoxy-leucyl-leucyl-norvalinal inhibits the reaction [10-decarbamoylmitomycin C results in decreased expression of CHEK1 protein]'],
       ['10

In [27]:
tmp_adj, tmp_errored = set(), list()
for i in errored:
    c, g, r = chem_gene[i]
    try:
        rr = r.replace('(', ':').replace(')', ':')
        cc = c.replace('(', ':').replace(')', ':')
        b = re.match(r"(.*)\[{} ([^\[\]]*) {}([^\[\]]*)\]".format(g, cc), rr)
        if b:
            r = b.group(2).split(' and ')
            for j in r:
                tmp_adj.add((g, c, j))
            continue
        b = re.match(r"\[{} ([^\[\]]*) {}([^\[\]]*)\](.*)".format(g, cc), rr)
        if b:
            r = b.group(1).split(' and ')
            for j in r:
                tmp_adj.add((g, c, j))
            continue
        tmp_errored.append(i)       
    except:
        tmp_errored.append(i)
len(tmp_errored), len(tmp_adj)

(231882, 4991)

In [31]:
errored = tmp_errored
print(len(adj_genechem))
adj_genechem.update(tmp_adj)
print(len(adj_genechem))

32277
33310


In [32]:
gc = pd.DataFrame(adj_genechem, columns=['gene', 'chemical', 'relation'])
gc.to_csv('tmp/genechem.csv', index=False)
gc.head()

Unnamed: 0,gene,chemical,relation
0,SMAD3,Phospholipids,protein results in increased metabolism of
1,SLCO1D1,Diclofenac,results in increased transport of
2,CYP2E2,Acetaminophen,results in increased metabolism of
3,PARA-LIKE,ethofenprox,protein affects the susceptibility to
4,ULK1,Paraquat,protein affects the susceptibility to


### chemical -> gene

In [34]:
tmp_adj, tmp_errored = set(), list()
for i in errored:
    c, g, r = chem_gene[i]
    try:
        rr = r.replace('(', ':').replace(')', ':')
        cc = c.replace('(', ':').replace(')', ':')
        b = re.match(r"\[{} ([^\[\]]*) {}([^\[\]]*)\](.*)".format(cc, g), rr)
        if b:
            r = b.group(1).split(' and ')
            for j in r:
                tmp_adj.add((c, g, j))
            continue
        b = re.match(r"(.*)\[{} ([^\[\]]*) {}([^\[\]]*)\]".format(cc, g), rr)
        if b:
            r = b.group(2).split(' and ')
            for j in r:
                tmp_adj.add((c, g, j))
            continue
        tmp_errored.append(chem_gene[i])       
    except:
        tmp_errored.append(chem_gene[i])
len(tmp_errored), len(tmp_adj)

(99581, 58015)

In [35]:
errored = tmp_errored
print(len(adj_chemgene))
adj_chemgene.update(tmp_adj)
print(len(adj_chemgene))

1099995
1104131


In [36]:
1104131 - 1099995

4136

In [37]:
cg = pd.DataFrame(adj_chemgene, columns=['chemical', 'gene', 'relation'])
cg.to_csv('tmp/chemgene.csv', index=False)
cg.head()

Unnamed: 0,chemical,gene,relation
0,Lipopolysaccharides,VIM,results in increased expression of
1,jinfukang,DUSP1,results in decreased expression of
2,methylparaben,NFYB,results in decreased expression of
3,"2-methoxy-5-(2',3',4'-trimethoxyphenyl)tropone",PCDH8,results in decreased expression of
4,2-methoxypropanol,IL1B,results in increased expression of


### other relations

In [39]:
tmp = pd.DataFrame(adj_chemgo, columns=['chemical', 'gene process', 'relation'])
tmp.to_csv('tmp/chemgo.csv', index=False)
tmp.head()

Unnamed: 0,chemical,gene process,relation
0,Dexamethasone,RX3 gene mutant form affects the expression of...,inhibits the reaction
1,palbociclib,benzyloxycarbonylleucyl-leucyl-leucine aldehyd...,promotes the reaction
2,1-Methyl-3-isobutylxanthine,Terbinafine results in decreased secretion of ...,inhibits the reaction
3,Trazodone,CYP3A4 protein results in increased oxidation ...,inhibits the reaction
4,Hyaluronic Acid,Hydrogen Peroxide results in decreased express...,analog inhibits the reaction


In [40]:
tmp = pd.DataFrame(adj_compgo, columns=['chemical process', 'gene process', 'relation'])
tmp.to_csv('tmp/compgo.csv', index=False)
tmp.head()

Unnamed: 0,chemical process,gene process,relation
0,Potassium Dichromate co-treated with potassium...,Benzo(a)pyrene results in increased expression...,inhibits the reaction
1,pralidoxime co-treated with Atropine,methamidophos results in decreased expression ...,inhibits the reaction
2,Linoleic Acid co-treated with Glucose,Azoxymethane affects the expression of NOS2 pr...,affects the reaction
3,Mifepristone co-treated with NFE2L2 protein,Paraquat results in increased expression of TN...,inhibits the reaction
4,"2,4,5,2',5'-pentachlorobiphenyl co-treated wit...",Lipopolysaccharides results in increased expre...,inhibits the reaction


In [43]:
tmp = pd.DataFrame(adj_compgene, columns=['chemical process', 'gene', 'relation'])
tmp.to_csv('tmp/compgene.csv', index=False)
tmp.head()

Unnamed: 0,chemical process,gene,relation
0,Oxaliplatin co-treated with Topotecan,ID3,results in increased expression of
1,Copper co-treated with Diethylnitrosamine,CCL2,results in increased expression of
2,Estradiol co-treated with TGFB1 protein,HTRA3,results in increased expression of
3,Mercuric Chloride co-treated with Ionomycin co...,IFNG,results in increased expression of
4,pirinixic acid co-treated with PPARA,SYCP3,results in increased expression of


In [44]:
tmp

Unnamed: 0,chemical process,gene,relation
0,Oxaliplatin co-treated with Topotecan,ID3,results in increased expression of
1,Copper co-treated with Diethylnitrosamine,CCL2,results in increased expression of
2,Estradiol co-treated with TGFB1 protein,HTRA3,results in increased expression of
3,Mercuric Chloride co-treated with Ionomycin co...,IFNG,results in increased expression of
4,pirinixic acid co-treated with PPARA,SYCP3,results in increased expression of
5,Cisplatin co-treated with jinfukang,CWC27,results in decreased expression of
6,bisphenol A co-treated with Testosterone,BSG,results in increased expression of
7,NFE2L2 protein affects the susceptibility to O...,DNAJC28,which affects the expression of
8,potassium chromate(VI) co-treated with epigall...,AMPD1,results in increased expression of
9,Dietary Fats co-treated with Resveratrol,MNDA,results in decreased expression of
