In [4]:
import re
import numpy as np
import itertools
import pandas as pd
from tqdm import tqdm
import torch
import scipy.sparse
import networkx as nx

import torch_geometric.data
import networkx as nx
import pandas as pd
from rdkit import Chem

In [7]:
# Location of the training labels CSV, relative to the notebook's working directory.
train_label_path = "./data/train_labels.csv"

In [8]:
# Load the training labels; later cells read the "InChI" column of this frame.
df = pd.read_csv(train_label_path)

In [4]:
# Show the first InChI string as a sample of the format parsed in the next cell.
df["InChI"].iloc[0]

'InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12(13)11(4)14/h5-7,9,11,14H,8H2,1-4H3'

In [5]:
# Collect the element symbols that occur in the training InChI strings.
# The second "/"-separated field of each InChI is taken as the chemical formula
# (e.g. "C13H20OS" for the sample shown above).
chemical_formula_list = [inchi.split("/")[1] for inchi in tqdm(df["InChI"])]
# Splitting on digit runs leaves fragments of adjacent element symbols such as
# "C", "H" and "OS". The pattern is a raw string so that "\d" reaches the regex
# engine verbatim instead of being an invalid string escape.
atom_list_org = [re.split(r"\d+", chemical_formula) for chemical_formula in tqdm(chemical_formula_list)]
bounded_atom_list = set(itertools.chain.from_iterable(atom_list_org))
# An element symbol is one uppercase letter optionally followed by one lowercase
# letter. Matching that shape directly also captures a single-letter symbol that
# ends a fragment (the "S" in "OS", or a lone "C"); the previous hand-written
# scan only emitted such a symbol when another uppercase letter followed it.
atom_list = []
for bounded_atom in bounded_atom_list:
    atom_list.extend(re.findall(r"[A-Z][a-z]?", bounded_atom))
print("atoms including in training chemical substances are follows %s" % set(atom_list))

100%|██████████| 2424186/2424186 [00:00<00:00, 2667874.96it/s]
100%|██████████| 2424186/2424186 [00:03<00:00, 647357.05it/s]

atoms including in training chemical substances are follows {'Si', 'C', 'Br', 'Cl', 'N', 'S', 'O', 'I', 'B', 'P', 'H', 'F'}





In [13]:
!pip install ipywidgets



In [15]:
from ipywidgets import IntProgress

ModuleNotFoundError: No module named 'ipywidgets'

In [11]:
# Install RDKit and NetworkX from the conda-forge channel (-y skips the confirmation prompt).
!conda install -c conda-forge -y rdkit networkx

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/aaron/anaconda3/envs/ml

  added / updated specs:
    - networkx
    - rdkit


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    networkx-2.5               |             py_0         1.2 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         1.2 MB

The following packages will be UPDATED:

  certifi            pkgs/main::certifi-2020.12.5-py38h06a~ --> conda-forge::certifi-2020.12.5-py38h578d9bd_1

The following packages will be SUPERSEDED by a higher-priority channel:

  ca-certificates    pkgs/main::ca-certificates-2021.1.19-~ --> conda-forge::ca-certificates-2020.12.5-ha878542_0
  networkx                                        pkgs/main --> conda-forge
  openssl         

In [5]:
def mol_to_nx(mol: Chem.Mol) -> nx.Graph:
    """Build an undirected NetworkX graph that mirrors an RDKit molecule.

    Every atom becomes a node keyed by its RDKit atom index and carries the
    attributes ``atomic_num``, ``formal_charge``, ``chiral_tag``,
    ``hybridization``, ``num_explicit_hs`` and ``is_aromatic``. Every bond
    becomes an edge between its begin and end atom indices and carries a
    ``bond_type`` attribute.

    Args:
        mol: Molecule whose atoms and bonds are copied into the graph.

    Returns:
        The populated ``nx.Graph``.
    """
    graph = nx.Graph()

    # One (node, attribute-dict) pair per atom.
    graph.add_nodes_from(
        (
            atom.GetIdx(),
            {
                "atomic_num": atom.GetAtomicNum(),
                "formal_charge": atom.GetFormalCharge(),
                "chiral_tag": atom.GetChiralTag(),
                "hybridization": atom.GetHybridization(),
                "num_explicit_hs": atom.GetNumExplicitHs(),
                "is_aromatic": atom.GetIsAromatic(),
            },
        )
        for atom in mol.GetAtoms()
    )

    # One (u, v, attribute-dict) triple per bond.
    graph.add_edges_from(
        (
            bond.GetBeginAtomIdx(),
            bond.GetEndAtomIdx(),
            {"bond_type": bond.GetBondType()},
        )
        for bond in mol.GetBonds()
    )
    return graph

def nx_to_mol(G: nx.Graph) -> Chem.Mol:
    """Rebuild an RDKit molecule from a graph laid out like ``mol_to_nx`` output.

    Nodes must carry ``atomic_num``, ``chiral_tag``, ``formal_charge``,
    ``is_aromatic``, ``hybridization`` and ``num_explicit_hs``; edges must
    carry ``bond_type``. Atoms are added in ``G.nodes()`` order, so the atom
    indices of the result follow that order rather than the node labels.

    Args:
        G: Graph describing the molecule.

    Returns:
        The ``Chem.RWMol`` that was built, after ``Chem.SanitizeMol`` has been
        called on it.

    Raises:
        KeyError: If a node or edge lacks one of the required attributes.
    """
    attribute_names = (
        'atomic_num',
        'chiral_tag',
        'formal_charge',
        'is_aromatic',
        'hybridization',
        'num_explicit_hs',
    )
    rw_mol = Chem.RWMol()
    # attribute name -> {node: value}
    per_node = {name: nx.get_node_attributes(G, name) for name in attribute_names}

    atom_index_of = {}
    for node in G.nodes():
        atom = Chem.Atom(per_node['atomic_num'][node])
        atom.SetChiralTag(per_node['chiral_tag'][node])
        atom.SetFormalCharge(per_node['formal_charge'][node])
        atom.SetIsAromatic(per_node['is_aromatic'][node])
        atom.SetHybridization(per_node['hybridization'][node])
        atom.SetNumExplicitHs(per_node['num_explicit_hs'][node])
        # AddAtom returns the index assigned inside the molecule.
        atom_index_of[node] = rw_mol.AddAtom(atom)

    edge_bond_types = nx.get_edge_attributes(G, 'bond_type')
    for u, v in G.edges():
        rw_mol.AddBond(atom_index_of[u], atom_index_of[v], edge_bond_types[u, v])

    Chem.SanitizeMol(rw_mol)
    return rw_mol

In [9]:
# Round-trip check on the first training example:
# InChI -> RDKit Mol -> NetworkX graph -> RDKit Mol -> InChI must give back the same string.
inch = df.InChI[0]
mol = Chem.MolFromInchi(inch)
graph = mol_to_nx(mol)
restored_inchi = Chem.MolToInchi(nx_to_mol(graph))
# The assert runs before the prints, so on a mismatch neither string is shown.
assert restored_inchi == inch
print(inch)
print(restored_inchi)

InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12(13)11(4)14/h5-7,9,11,14H,8H2,1-4H3
InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12(13)11(4)14/h5-7,9,11,14H,8H2,1-4H3


In [10]:
# Display the graph object built in the round-trip cell.
graph

<networkx.classes.graph.Graph at 0x7fd724cc7fa0>

In [18]:
# Report the torch version from the running kernel rather than from whichever
# "python" a subshell happens to resolve on PATH.
print(torch.__version__)

1.7.0


In [20]:
# Report the CUDA version torch was built against, read from the running kernel
# rather than from whichever "python" a subshell happens to resolve on PATH.
print(torch.version.cuda)

10.2


In [None]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html
pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html
pip install torch-geometric

In [2]:
# Print the LD_LIBRARY_PATH environment variable as seen by a subshell.
!echo $LD_LIBRARY_PATH

/usr/lib/cuda/lib64:


In [11]:
# Convert the NetworkX molecule graph into a torch_geometric Data object.
torch_geometric.utils.convert.from_networkx(graph)

Data(atomic_num=[15], bond_type=[30], chiral_tag=[15], edge_index=[2, 30], formal_charge=[15], hybridization=[15], is_aromatic=[15], num_explicit_hs=[15])