In [1]:
import os.path as path
import networkx as nx
import pandas as pd
import numpy as np
import time
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib notebook

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Relations-between-chemicals-and-genes" data-toc-modified-id="Relations-between-chemicals-and-genes-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Relations between chemicals and genes</a></span><ul class="toc-item"><li><span><a href="#chemical-->-gene" data-toc-modified-id="chemical-->-gene-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>chemical -&gt; gene</a></span></li><li><span><a href="#gene-->-chemical" data-toc-modified-id="gene-->-chemical-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>gene -&gt; chemical</a></span></li></ul></li></ul></div>

## Relations between chemicals and genes

In [3]:
import re
import pickle

In [4]:
root = "./raw_data"
chem_gene = pd.read_csv(path.join(root, "CTD_chem_gene_ixns.csv"),
                        header=None,
                        skiprows=range(29),
                        usecols=[0, 3, 8]).to_numpy()

In [5]:
adj_chemgo, adj_chemgene, adj_compgo, adj_compgene = set(), set(), set(), set()
processed = []
errored = []

In [12]:
for i in range(chem_gene.shape[0]):
    try:
        c, g, r = chem_gene[i]
        rr = r.replace('(', ':').replace(')', ':')
        cc = c.replace('(', ':').replace(')', ':')

        # ################################################################
        # chemical process -- gene process
        # ################################################################
        b = re.match(r"\[([^\[\]]*)\] (.*) \[([^\[\]]*)\]", r)  # [chem - relation], [GO]
        if b:
            r = b.group(2).split(' and ')
            for j in r:
                ind_r = rela_map.get(j)
                if ind_r is None:
                    rela_map[j] = len(rela_map)
                    ind_r = rela_map[j]
                adj_compgo.add((b.group(1), b.group(3), j))
                adj_compgo_num.add((ind_c, ind_g, ind_r))
            processed.append(i)
            continue

        # ################################################################
        # chemical process -- gene
        # ################################################################
        b = re.match(r"\[([^\[\]]*)\](.*)", r)  # [chem proc] - [relation, gene]
        if b:
            ind_c, ind_g = comp_map.get(b.group(1)), gene_map.get(g)
            if ind_c is None:
                comp_map[b.group(1)] = len(comp_map)
                ind_c = comp_map[b.group(1)]
            if ind_g is None:
                gene_map[g] = len(gene_map)
                ind_g = gene_map[g]
            r = re.match(r" (.*) {}".format(g), b.group(2))
            r = r.group(1).split(' and ')
            for j in r:
                ind_r = rela_map.get(j)
                if ind_r is None:
                    rela_map[j] = len(rela_map)
                    ind_r = rela_map[j]
                adj_compgene.add((b.group(1), g, j))
                adj_compgene_num.add((ind_c, ind_g, ind_r))
            processed.append(i)
            continue
            
        # ################################################################
        # chemical -- gene process
        # ################################################################
        b = re.match(r"(.*)\[([^\[\]]*)\](.*)", rr)  # [chem - relation], [GO]
        if b:
            ind_c, ind_g = chem_map.get(c), go_map.get(b.group(2))
            if ind_c is None:
                chem_map[c] = len(chem_map)
                ind_c = chem_map[c]
            if ind_g is None:
                go_map[b.group(2)] = len(go_map)
                ind_g = go_map[b.group(2)]
            r = re.match(r"{} (.*) ".format(cc), b.group(1))
            r = r.group(1).split(' and ')
            for j in r:
                ind_r = rela_map.get(j)
                if ind_r is None:
                    rela_map[j] = len(rela_map)
                    ind_r = rela_map[j]
                adj_chemgo.add((c, b.group(2), j))
                adj_chemgo_num.add((ind_c, ind_g, ind_r))
            processed.append(i)
            continue

        # ################################################################
        # chemical -- gene
        # ################################################################
        ind_c, ind_g = chem_map.get(c), gene_map.get(g)
        if ind_c is None:
            chem_map[c] = len(chem_map)
            ind_c = chem_map[c]
        if ind_g is None:
            gene_map[g] = len(gene_map)
            ind_g = gene_map[g]
        r = re.match(r"{} (.*) {}".format(cc, g), rr)
        r = r.group(1).split(' and ')
        for j in r:
            ind_r = rela_map.get(j)
            if ind_r is None:
                rela_map[j] = len(rela_map)
                ind_r = rela_map[j]
            adj_chemgene.add((c, g, j))
            adj_chemgene_num.add((ind_c, ind_g, ind_r))
        processed.append(i)
    except AttributeError:
        errored.append(i)


In [13]:
len(errored), len(processed)

(346720, 1614823)

In [14]:
len(adj_chemgene), len(adj_chemgo), len(adj_compgene), len(adj_compgo)

(1099995, 121192, 116677, 3722)

### chemical -> gene

In [28]:
cg = pd.DataFrame(adj_chemgene, columns=['chemical', 'gene', 'relation'])

In [37]:
cg.to_csv('tmp/chemgene.csv', index=False)

### gene -> chemical

In [38]:
chem_gene[errored[:10]]

array([['10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide',
        'EPHX1',
        '[EPHX1 gene SNP affects the metabolism of carbamazepine epoxide] which affects the chemical synthesis of 10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide'],
       ['10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide',
        'EPHX1',
        '[EPHX1 protein results in increased metabolism of carbamazepine epoxide] which results in increased chemical synthesis of 10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine-5-carboxamide'],
       ['10,11-dihydro-10-hydroxycarbamazepine', 'ABCB1',
        'ABCB1 protein results in increased transport of 10,11-dihydro-10-hydroxycarbamazepine'],
       ["10-(6'-ubiquinonyl)decyltriphenylphosphonium bromide", 'SOD2',
        "[10-(6'-ubiquinonyl)decyltriphenylphosphonium bromide results in increased expression of SOD2 protein] which results in decreased susceptibility to Dichlorvos"],
       ['10-decarbamoylmitomycin C', 'CHEK1',
    

In [59]:
adj_genechem, errored1 = set(), list()
for i in errored:
    c, g, r = chem_gene[i]
    try:
        rr = r.replace('(', ':').replace(')', ':')
        cc = c.replace('(', ':').replace(')', ':')
        b = re.match(r"(.*)\[([^\[\]]*)\](.*)", rr)
        if not b:
            r = re.match(r"{} (.*) {}".format(g, cc), rr)
            r = r.group(1).split(' and ')
            for j in r:
                adj_genechem.add((g, c, j))
            continue
        errored1.append(i)       
    except:
        errored1.append(i)
len(errored1), len(adj_genechem)

(312924, 32277)

In [69]:
adj_cg, errored2 = set(), list()
for i in errored1:
    c, g, r = chem_gene[i]
    try:
        rr = r.replace('(', ':').replace(')', ':')
        cc = c.replace('(', ':').replace(')', ':')
        b = re.match(r"\[{} ([^\[\]]*) {} ([^\[\]]*)\](.*)".format(cc, g), rr)
        if b:
            r = b.group(1).split(' and ')
            for j in r:
                adj_cg.add((c, g, j))
            continue
        errored2.append(chem_gene[i])       
    except:
        errored2.append(chem_gene[i])
len(errored2), len(adj_cg)

(296556, 3971)

In [78]:
print(len(adj_chemgene), len(adj_cg))

adj_chemgene.update(adj_cg)
print(len(adj_chemgene))

1099995 3971
1101055


In [79]:
1101055 - 1099995

1060

In [None]:
del rr