In [22]:
#go to correct folder
%cd "/content/drive/My Drive/Academia/lab projects/Syn_GCN"
!ls

/content/drive/My Drive/Academia/lab projects/Syn_GCN
data					       test		test_confs6.sdf
dataSetB.csv				       test_confs0.sdf	test_confs7.sdf
incompleteIndigoMappings_dataSetB.csv	       test_confs1.sdf	test_confs8.sdf
incompleteIndigoMappings_dataSetB_results.csv  test_confs2.sdf	test_confs9.sdf
manuscript				       test_confs3.sdf	training.csv
Miniconda3-latest-Linux-x86_64.sh	       test_confs4.sdf
research_proposal.docx			       test_confs5.sdf


In [23]:
#install python packages
#install rdkit
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!time conda install -q -y -c conda-forge rdkit
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
sys.path.append('/usr/local/lib/python3.7/site-packages/')

#install pytorch geometrics
%env CUDA=cu101
! pip install torch-scatter==latest+${CUDA} -f https://pytorch-geometric.com/whl/torch-1.5.0.html
! pip install torch-sparse==latest+${CUDA} -f https://pytorch-geometric.com/whl/torch-1.5.0.html
! pip install torch-cluster==latest+${CUDA} -f https://pytorch-geometric.com/whl/torch-1.5.0.html
! pip install torch-spline-conv==latest+${CUDA} -f https://pytorch-geometric.com/whl/torch-1.5.0.html
! pip install torch-geometric


--2020-07-18 20:26:26--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.201.79, 104.18.200.79, 2606:4700::6812:c84f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.201.79|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh [following]
--2020-07-18 20:26:26--  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.130.3, 104.16.131.3, 2606:4700::6810:8203, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.130.3|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | done
Solving environment: - \ | 

In [24]:
# import pandas
import pandas as pd

# import scipy
from scipy import sparse

# import rdkit
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdChemReactions
from rdkit.Chem import rdmolfiles
from rdkit.Chem.Draw import IPythonConsole

# import pytorch and pyG
from torch_geometric.data import InMemoryDataset
from torch_geometric.utils import convert
from torch_geometric.data import Data
from torch_geometric.data import DataLoader
from torch_geometric.nn import GCNConv
import torch

# import print color control
from termcolor import colored, cprint

In [25]:
#function definition
#============================
# Testing Zone
#===========================

def GenerateConformations(m, n): # testing fxn
  m = Chem.AddHs(m)
  ids=AllChem.EmbedMultipleConfs(m, numConfs=n, params=AllChem.ETKDG())
  Chem.rdmolops.Get3DDistanceMatrix(m,ids[0])
  return m, list(ids)

# def SMILES2DF(smiles_str):# SMILES to SDF
#   mol = Chem.MolFromSmiles(smiles_str)
#   mol_H = Chem.AddHs(mol)
#   AllChem.EmbedMolecule(mol_H, AllChem.ETKDG())
#   AllChem.UFFOptimizeMolecule(mol_H,1000)
#   sdf = rdkit.Chem.rdmolops.RemoveAllHs(mol_H)

def testing(smiles_str):# testing fxn
  mol = Chem.MolFromSmiles(smiles_str)
  mol_H = Chem.AddHs(mol)
  AllChem.EmbedMolecule(mol_H, AllChem.ETKDG())
  AllChem.UFFOptimizeMolecule(mol_H,1000)
  sdf = rdkit.Chem.rdmolops.RemoveAllHs(mol_H)

  # write sdf file for visulization externally
  writer=rdkit.Chem.rdmolfiles.SDWriter('test.sdf')
  writer.write(sdf)
  return  Chem.rdmolops.Get3DDistanceMatrix(sdf,-1)

#========================================================
def writeSDF(sdf, file_name): # write a sdf to a file
  writer=rdkit.Chem.rdmolfiles.SDWriter(file_name)
  writer.write(sdf)

def SMILES2Adjacency(smiles_str): # SMILES to adjacency matrix
  mol = Chem.MolFromSmiles(smiles_str)
  if(mol==None):
    cprint('SMILES2Adjacency(smiles_str): cannot convert SMILES to mol object', 'red')
    return None
  else:
    return Chem.rdmolops.GetAdjacencyMatrix(mol)

def SMILES2Distance(smiles_str): # SMILES to 2D distance matrix
  mol= Chem.MolFromSmiles(smiles_str)
  return Chem.rdmolops.GetDistanceMatrix(mol,-1)

def SMILES2Distance3D(smiles_str, n=0): # SMILES to 3D distance matrix; n is the number of conformer
  dist=[]
  if n == 0:
    sdf = SMILES2SDF(smiles_str)
    dist.append(Chem.rdmolops.Get3DDistanceMatrix(sdf, -1))
  else:
    sdfs, ids = SMILES2SDFConformers(smiles_str, n)
    for id in ids:
      dist.append(Chem.rdmolops.Get3DDistanceMatrix(sdfs, ids[id]))
  return dist


def SMILES2SDF(smiles_str): # SMILES to a single structure without multiple conformers
  mol = Chem.MolFromSmiles(smiles_str)
  mol_H = Chem.AddHs(mol)
  AllChem.EmbedMolecule(mol_H, AllChem.ETKDG())
  AllChem.UFFOptimizeMolecule(mol_H,1000)
  sdf = Chem.rdmolops.RemoveAllHs(mol_H)
  return sdf

def SMILES2SDFConformers(smiles_str, n): # SMILES to SDF structure with multiple conformers; n is the number of conformers. Recommendation: n=50 for rotation bonds(RB) <=7; n=200 if 8<=RB<=12; n=300 if RB>=13
  mol = Chem.MolFromSmiles(smiles_str)
  mol = Chem.AddHs(mol)

  ids=AllChem.EmbedMultipleConfs(mol, numConfs=n, params=AllChem.ETKDG())
  sdfs = Chem.rdmolops.RemoveAllHs(mol)
  return sdfs, list(ids)

# network functions
def train():
  model.train()

  loss_all=0
  for data in train_loader:
    data = data.to(device)
    optimizer.zero_grad()
    output = model(data)
    label = data.y.to(device)
    loss.backward()
    loss_all +=data.num
    optimizer.step()
  return loss_all/len(train_dataset)


In [46]:
%cd /content/drive/My Drive/Academia/lab projects/Syn_GCN
# data preprocess
rxn_list = pd.read_csv('dataSetB.csv')['rxnSmiles_Mapping_NameRxn']

# parameters
num_exp_data=40 # num of data for the experiment
byproduct_cutoff=65 # if the smiles string length is less than this number, it will be classified as byproduct; otherwise as main products
exp_data = rxn_list[0:num_exp_data]
# exp_data.to_csv('training.csv')
data_type = torch.long
training_perc = 0.8 # percentage of training data in the dataset
val_perc = 0.1 # percentage of validation data in the dataset
test_perc = 1-training_perc - val_perc # percentage of testing data in the dataset



# first_curated = [] # a list to store data filtered of byproducts
# curated_training = [] # a list to store final curated data



# check the number of products
# for i in range(N):
#   rxn = rdChemReactions.ReactionFromSmarts(exp_data[i])
#   products=rxn.GetProducts()
#   print('reaction '+str(i) +' has ' + str(len(products)) + ' products')
#   if(len(products)>1):
#     num_mul_products_rxn+=1
#     mul_products_rxn =(i, len(products))
#     # print('rxn_id' + ', num_products:'+ str(mul_products_rxn))
#     mul_products_rxn_list.append(mul_products_rxn)

#   for j in range(len(products)):
#     product_smile = rdmolfiles.MolToSmiles(products[j])
#     if len(product_smile)<byproduct_cutoff:
#       # cprint(str(len(product_smile))+':'+ product_smile,'green')
#       continue
#     else:
#       # first_curated.append(product_smile) # add main product to curated training data
#       num_valid_products+=1
#       cprint(str(len(product_smile))+':'+ product_smile,'blue')
#       if SMILES2Adjacency(product_smile) is not None:
#         A = SMILES2Adjacency(product_smile) # get adjacency matrix of the product, in numpy.matrix 
#         num_nodes = len(A) # get the num of nodes
#         sA = sparse.csr_matrix(A) # convert A from numpy.matrix to scipy sparse matrix
#         edge_index, edge_weight=convert.from_scipy_sparse_matrix(sA) # convert from scipy sparse matrix to edge_index
#         x = torch.tensor([[1]*num_nodes]).t()
#         data = Data(x=x, edge_index=edge_index)
#         # print('data:'+str(data))
#         # print(edge_index)
#         curated_training.append(data)
#       else:
#         num_none_mol+=1
#         continue


# print('total num of multi-products rxns:' + str(num_mul_products_rxn))
# print('num_valid_product:' + str(num_valid_products))
# print('num_none_mol: '+str(num_none_mol))
# print('len(curated_training): '+ str(len(curated_training)))



# # preparation of y
# labels=[1]*num_valid_products
# y = torch.tensor(labels, dtype=torch.float)


class SynDataset(InMemoryDataset):
  def __init__(self, root, transform=None, pre_transform=None):
    super(SynDataset, self).__init__(root, transform, pre_transform)
    self.data, self.slices = torch.load(self.processed_paths[0])

  @property
  def raw_file_names(self):
    return ['dataSetB']
  @property
  def processed_file_names(self):
    return ['processed_data.dataset']

  def download(self):
    pass
  
  def process(self):
    num_mul_products_rxn = 0
    mul_products_rxn_list=[]
    num_valid_products = 0 # number of main products
    num_none_mol = 0 # number for SMILES that cannot be convert to adjacency matrix
      
    curated_training = []

    for i in range(num_exp_data):
      rxn = rdChemReactions.ReactionFromSmarts(exp_data[i])
      products=rxn.GetProducts()
      print('reaction '+str(i) +' has ' + str(len(products)) + ' products')
      if(len(products)>1):
        num_mul_products_rxn+=1
        mul_products_rxn =(i, len(products))
        # print('rxn_id' + ', num_products:'+ str(mul_products_rxn))
        mul_products_rxn_list.append(mul_products_rxn)

      for j in range(len(products)):
        product_smile = rdmolfiles.MolToSmiles(products[j])
        if len(product_smile)<byproduct_cutoff:
          # cprint(str(len(product_smile))+':'+ product_smile,'green')
          continue
        else:
          # first_curated.append(product_smile) # add main product to curated training data
          num_valid_products+=1
          cprint(str(len(product_smile))+':'+ product_smile,'blue')
          if SMILES2Adjacency(product_smile) is not None:
            A = SMILES2Adjacency(product_smile) # get adjacency matrix of the product, in numpy.matrix 
            num_nodes = len(A) # get the num of nodes
            sA = sparse.csr_matrix(A) # convert A from numpy.matrix to scipy sparse matrix
            edge_index, edge_weight=convert.from_scipy_sparse_matrix(sA) # convert from scipy sparse matrix to edge_index
            x = torch.tensor([[1]*num_nodes]).t()
            y = 1
            data = Data(x=x, edge_index=edge_index, y=y)
            # print('data:'+str(data))
            # print(edge_index)
            curated_training.append(data)
          else:
            num_none_mol+=1
            continue
    
    data, slices = self.collate(curated_training)
    torch.save((data, slices), self.processed_paths[0])

dataset = SynDataset('data')
print(len(dataset))
dataset = dataset.shuffle()
num_training = int(num_exp_data*training_perc)
num_val = int(num_exp_data*val_perc)
train_dataset = dataset[:num_training]
val_dataset = dataset[num_training:(num_training + num_val)]
test_dataset = dataset[(num_training + num_val):]
print(num_training, num_val)

/content/drive/My Drive/Academia/lab projects/Syn_GCN
10
32 4


In [27]:
# create dataset


In [28]:
# main codes
#==========================
# Testing Zone

# smiles= '[CH3:1][N:10]1[CH2:9][c:7]2[c:6](-[cH:5][cH:4][c:3]([Cl:2])[cH:8]2)-[n:17]2[c:12]([CH2:11]1)-[n:13][n:14][c:15]2'
# #smiles= 'CC(=O)OC1=CC=CC=C1C(=O)O'
# n=10



# k= SMILES2Distance3D(smiles,n)
# for i in range(len(k)):
#   print(k[i][3])
#   print('\n')
# print(len(k))


# sdfs, ids = SMILES2SDFConformers(smiles, n)
# sdfs_list =[]
# for i in range(len(ids)):
#   file = 'test_confs'+str(i)+'.sdf'
#   sdfs_list.append(Chem.rdmolfiles.MolToMolFile(sdfs , file, True, ids[i]))
# # Chem.rdmolops.Get3DDistanceMatrix(sdfs,ids[1])
# # for i in range(len(sdfs_list)):
# #   print(sdfs_list[3])

# %cd test
# # writeSDF(sdfs, 'test_confs.sdf')
# writeSDF(SMILES2SDF(smiles),'test_no_conf.sdf')
# writeSDF(Chem.MolFromSmiles(smiles),'test_2D.sdf')
num_node_features =1
num_classes = 2
batch_size =1
num_epochs = 10
# %cd ..



class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
train_loader = DataLoader(train_dataset, batch_size = batch_size)
for epoch in range(num_epochs):
  train()

#========================





RuntimeError: ignored