In [1]:
import warnings
warnings.filterwarnings('ignore')

import argparse
import time
import random
import os
import io
import logging
import numpy as np
import mxnet as mx
from mxnet import gluon,nd,autograd,npx
import gluonnlp as nlp
import nmt
from gluonnlp.model.transformer import ParallelTransformer, get_transformer_encoder_decoder
import pandas as pd 
nlp.utils.check_version('0.7.0')

In [2]:
np.random.seed(101)
random.seed(101)
mx.random.seed(10001)
ctx = mx.gpu(0)

# parameters for dataset
dataset = 'pubchem'
src_lang, tgt_lang = 'random_smiles', 'rdkit_canonical_smiles'
src_max_len, tgt_max_len = 100, 100

# parameters for model
num_units=128
hidden_size=1024
tf_dropout=0
epsilon=0.1
num_layers=3
num_heads=4
scaled=True
share_embed=True
embed_size=128
tie_weights=True
embed_initializer=None
magnitude = 3.0
lr_update_factor = 0.5
param_file = 'MP/MP_best.params'

In [3]:
def _load_vocab(file_path, **kwargs):
    with open(file_path, 'r') as f:
        return nlp.Vocab.from_json(f.read())

In [4]:
src_vocab = _load_vocab('pubchem/vocab.random_smiles.json')
tgt_vocab = _load_vocab('pubchem/vocab.rdkit_canonical_smiles.json')

In [5]:
encoder, decoder, one_step_ahead_decoder = get_transformer_encoder_decoder(
    units=num_units,
    hidden_size=hidden_size,
    dropout=tf_dropout,
    num_layers=num_layers,
    num_heads=num_heads,
    max_src_length=src_max_len,
    max_tgt_length=tgt_max_len,
    scaled=scaled)
model = nlp.model.translation.NMTModel(src_vocab=src_vocab,
                 tgt_vocab=tgt_vocab,
                 encoder=encoder,
                 decoder=decoder,
                 one_step_ahead_decoder=one_step_ahead_decoder,
                 embed_size=num_units,
                 embed_initializer=None,
                 prefix='transformer_')

[11:40:10] ../src/storage/storage.cc:199: Using Pooled (Naive) StorageManager for CPU


In [6]:
from rdkit import Chem
def canonical_smile(sml):
    try:
        m = Chem.MolFromSmiles(sml)
        return Chem.MolToSmiles(m, canonical=True,isomericSmiles=False)
    except:
        return float('nan')

In [7]:
def no_split(sm):
    arr = []
    i = 0
    try:
        len(sm)
    except:
        print(sm)
    while i < len(sm)-1:
        arr.append(sm[i])
        i += 1
    if i == len(sm)-1:
        arr.append(sm[i])
    return ' '.join(arr)

In [8]:
length_clip = nlp.data.ClipSequence(100)
# Helper function to preprocess a single data point
def preprocess(data):
    # A token index or a list of token indices is
    # returned according to the vocabulary.
    src_sentence = src_vocab[data.split()]
    src_sentence.append(src_vocab[src_vocab.eos_token])
    src_npy = np.array(src_sentence, dtype=np.int32)
    src_nd = mx.nd.array(src_npy)
    return src_nd

# Helper function for getting the length
def get_length(x):
    return float(len(x.split(' ')))

In [9]:
cnn_dropout = 0.1
batch_size = 64
bucket_num, bucket_ratio = 2, 0

In [10]:
class SigmaNet(gluon.HybridBlock):
    """Network for sentiment analysis."""
    def __init__(self, dropout, src_vocab=src_vocab,embed_size=embed_size,output_size=1,
                 num_filters=(100, 200, 200, 200, 200, 100, 100, 100, 100,100), ngram_filter_sizes=(1, 2, 3,4, 5, 6, 7, 8, 9, 10),prefix=None, params=None):
        super(SigmaNet, self).__init__(prefix=prefix, params=params)
        with self.name_scope():
            self.src_embed = None
            self.encoder = None # will set with lm encoder later
            self.textcnn = nlp.model.ConvolutionalEncoder(embed_size=embed_size,
                                                          num_filters=num_filters,
                                                          ngram_filter_sizes=ngram_filter_sizes,
                                                          conv_layer_activation='relu',
                                                          num_highway=1)
            '''
            self.conv = gluon.nn.HybridSequential()
            with self.conv.name_scope():
                self.conv.add(gluon.nn.BatchNorm())
                self.conv.add(gluon.nn.Conv1D(256,kernel_size=3, padding=1,activation= 'relu'))
                self.conv.add(gluon.nn.BatchNorm())
                self.conv.add(gluon.nn.Conv1D(256,kernel_size=3, padding=1,activation= 'relu'))
            self.resnet = gluon.nn.HybridSequential()
            with self.resnet.name_scope():
                self.resnet.add(resnet_block(256, 2, first_block=True),
                resnet_block(256, 2),
                resnet_block(256, 2),
                resnet_block(256, 2),gluon.nn.GlobalAvgPool1D())
            '''
            self.output = gluon.nn.HybridSequential()
            with self.output.name_scope():
                #self.output.add(gluon.nn.Dense(1024))
                #self.output.add(gluon.nn.Dropout(dropout))
                self.output.add(gluon.nn.Dense(512))
                #self.output.add(gluon.nn.Dropout(dropout))
                self.output.add(gluon.nn.Dense(output_size, flatten= False))

    def hybrid_forward(self, F, src_nd, valid_length):# pylint: disable=arguments-differ
        #src_nd = F.reshape(src_nd, (F.shape_array(src_nd), -1))
        src_embed_ = self.src_embed(src_nd)
        encoded,_ = self.encoder(src_embed_,valid_length=valid_length)  # Shape(T, N, C)
        #encoded = F.reshape(encoded,shape= (F.shape_array(encoded)[1],batch_size,-1))
        textcnn = self.textcnn(F.transpose(encoded,axes = (1,0,2)))
        #transformed = self.resnet(mx.symbol.expand_dims(textcnn,axis=1))
        #conv = self.conv(F.transpose(mx.symbol.expand_dims(textcnn,axis=1),axes = (0,2,1)))
        #conv = self.conv(mx.symbol.expand_dims(textcnn,axis=1))
        #transformed = self.resnet(conv)
        out = self.output(textcnn)
        return out

In [11]:
net = SigmaNet(dropout=cnn_dropout)
net.encoder = model.encoder
net.src_embed =  model.src_embed
net.hybridize()
print(net)
#net.textcnn.initialize(mx.init.Xavier(), ctx=ctx)
#net.output.initialize(mx.init.Xavier(), ctx=ctx)

SigmaNet(
  (src_embed): HybridSequential(
    (0): Embedding(72 -> 128, float32)
    (1): Dropout(p = 0.0, axes=())
  )
  (encoder): TransformerEncoder(
    (layer_norm): LayerNorm(eps=1e-05, axis=-1, center=True, scale=True, in_channels=128)
    (transformer_cells): HybridSequential(
      (0): TransformerEncoderCell(
        (attention_cell): MultiHeadAttentionCell(
          (_base_cell): DotProductAttentionCell(
            (_dropout_layer): Dropout(p = 0, axes=())
          )
          (proj_query): Dense(None -> 128, linear)
          (proj_key): Dense(None -> 128, linear)
          (proj_value): Dense(None -> 128, linear)
        )
        (proj): Dense(None -> 128, linear)
        (ffn): PositionwiseFFN(
          (ffn_1): Dense(None -> 1024, linear)
          (activation): Activation(relu)
          (ffn_2): Dense(None -> 128, linear)
          (layer_norm): LayerNorm(eps=1e-05, axis=-1, center=True, scale=True, in_channels=128)
        )
        (layer_norm): LayerNorm(eps=1

In [12]:
net.load_parameters(param_file,ctx=ctx)
net.initialize(init=mx.init.Xavier(magnitude=magnitude), ctx=ctx)

[11:40:13] ../src/storage/storage.cc:199: Using Pooled (Naive) StorageManager for GPU


In [13]:
def predict(net, dataloader,context):
    out = []
    for i, ((data, length), label) in enumerate(dataloader):
        data = data.as_in_context(context)
        length = length.as_in_context(context).astype(np.float32)
        label = label.as_in_context(context)
        output = net(data,length)
        out= out+[f for f in output.asnumpy()]
    return out

In [14]:
def get_dataloader(predict_dataset,predict_smiles_lengths):

    # Pad data, stack label and lengths
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0, pad_val=0, ret_length=True),
        nlp.data.batchify.Stack(dtype='float32'))
    predict_batch_sampler = nlp.data.sampler.FixedBucketSampler(
        predict_smiles_lengths,
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=False)

    predict_dataloader = gluon.data.DataLoader(dataset=predict_dataset,
                                             batch_sampler=predict_batch_sampler,
                                             batchify_fn=batchify_fn)
    return predict_dataloader

In [15]:
to_predict_data = pd.read_excel('mp_test_set.xls')

In [16]:
predict_smiles = to_predict_data['SMILES'].map(canonical_smile).map(no_split).map(preprocess)
predict_smiles_lengths = to_predict_data['SMILES'].map(canonical_smile).map(no_split).map(get_length)
predict_sigma =mx.nd.ones((len(predict_smiles),1))
predict_dataset = gluon.data.SimpleDataset(gluon.data.ArrayDataset(predict_smiles,predict_sigma))
predict_dataloader = get_dataloader(predict_dataset,predict_smiles_lengths)

In [17]:
predicted = predict(net, predict_dataloader,ctx)

[11:40:17] ../src/operator/cudnn_ops.cc:292: Auto-tuning cuDNN op, set MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable
[11:40:18] ../src/operator/cudnn_ops.cc:292: Auto-tuning cuDNN op, set MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable


In [18]:
df = pd.DataFrame(predicted)
df.to_csv('predicted_mp_test_set.csv')
print(df)

              0
0    315.489105
1    346.444946
2    312.293915
3    311.751678
4    315.064545
..          ...
406  275.522919
407  336.472931
408  294.943115
409  334.717194
410  294.337708

[411 rows x 1 columns]


K-fold cross valid avg train loss 1.3489668809973936
K-fold cross valid avg train r2 0.9973347734852677
K-fold cross valid avg test loss 13.18546958523049
K-fold cross valid avg test r2 0.8633920641584876

Total time cost 226.34s
K-fold cross valid avg train loss 1.0208112936765557
K-fold cross valid avg train r2 0.9973985693900078
K-fold cross valid avg test loss 11.410513927679215
K-fold cross valid avg test r2 0.9079264533881546