In [1]:
import warnings
warnings.filterwarnings('ignore')

import argparse
import time
import random
import os
import io
import logging
import numpy as np
import mxnet as mx
from mxnet import gluon,nd,autograd,npx
import gluonnlp as nlp
import nmt
from gluonnlp.model.transformer import ParallelTransformer, get_transformer_encoder_decoder
import pandas as pd 
nlp.utils.check_version('0.7.0')

In [2]:
# Seed every RNG the notebook touches so repeated runs start from the same state.
np.random.seed(100)
random.seed(100)
mx.random.seed(10000)

# Prefer the first GPU, but fall back to CPU when MXNet reports no GPU so the
# notebook can still run on CPU-only machines instead of failing on
# the first device allocation.
ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()

# ---------------------------------------------------------------------------
# Dataset settings
# ---------------------------------------------------------------------------
dataset = 'pubchem'
src_lang = 'random_smiles'
tgt_lang = 'rdkit_canonical_smiles'
# Maximum source / target sequence lengths handed to the transformer builder.
src_max_len = 100
tgt_max_len = 100

# ---------------------------------------------------------------------------
# Model hyperparameters
# ---------------------------------------------------------------------------
num_units = 128        # transformer width; also used as the NMT embedding size
hidden_size = 1024     # hidden size passed to the transformer builder
dropout = 0.1          # dropout used when the transformer is built
epsilon = 0.1
num_layers = 3
num_heads = 4
scaled = True
share_embed = True
embed_size = 128       # default embedding size of ILNet's text CNN
tie_weights = True
embed_initializer = None
magnitude = 3.0        # Xavier initializer magnitude
lr_update_factor = 0.5

# Checkpoint holding the trained network weights.
param_file = 'viscosity/viscosity_best.params'


In [3]:
def _load_vocab(file_path, **kwargs):
    """Load a GluonNLP vocabulary from a JSON file.

    Parameters
    ----------
    file_path : str
        Path to a JSON file whose content is passed to ``nlp.Vocab.from_json``.
    **kwargs
        Accepted for call-site compatibility only; not used.

    Returns
    -------
    The object returned by ``nlp.Vocab.from_json`` for the file's content.
    """
    # Read with an explicit encoding so the result does not depend on the
    # platform's locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        return nlp.Vocab.from_json(f.read())

In [4]:
# Source (random SMILES) and target (RDKit canonical SMILES) vocabularies,
# loaded in that order from the pubchem directory.
src_vocab, tgt_vocab = [
    _load_vocab(vocab_path)
    for vocab_path in (
        'pubchem/vocab.random_smiles.json',
        'pubchem/vocab.rdkit_canonical_smiles.json',
    )
]

In [5]:
# Build the transformer whose encoder and source embedding are later plugged
# into ILNet. Only the third transformer ("transformer_3") is active; the
# triple-quoted strings below hold disabled code for two further transformers
# (transformer_1 / transformer_2) and are plain no-op string expressions.
'''
encoder1, decoder1, one_step_ahead_decoder1 = get_transformer_encoder_decoder(
    units=num_units,
    hidden_size=hidden_size,
    dropout=dropout,
    num_layers=num_layers,
    num_heads=num_heads,
    max_src_length=src_max_len,
    max_tgt_length=tgt_max_len,
    scaled=scaled, prefix='transformer_1')

encoder2, decoder2, one_step_ahead_decoder2 = get_transformer_encoder_decoder(
    units=num_units,
    hidden_size=hidden_size,
    dropout=dropout,
    num_layers=num_layers,
    num_heads=num_heads,
    max_src_length=src_max_len,
    max_tgt_length=tgt_max_len,
    scaled=scaled,prefix='transformer_2')
'''
# Active encoder/decoder pair, configured from the module-level
# hyperparameters. `dropout` here is whatever value is bound when this cell
# runs (the later override cell has not executed yet on a top-to-bottom run).
encoder3, decoder3, one_step_ahead_decoder3 = get_transformer_encoder_decoder(
    units=num_units,
    hidden_size=hidden_size,
    dropout=dropout,
    num_layers=num_layers,
    num_heads=num_heads,
    max_src_length=src_max_len,
    max_tgt_length=tgt_max_len,
    scaled=scaled,prefix='transformer_3')
# Disabled NMT wrappers for transformer_1 / transformer_2 (no-op string).
'''
model1 = nlp.model.translation.NMTModel(src_vocab=src_vocab,
                 tgt_vocab=tgt_vocab,
                 encoder=encoder1,
                 decoder=decoder1,
                 one_step_ahead_decoder=one_step_ahead_decoder1,
                 embed_size=num_units,
                 embed_initializer=None,
                 prefix='transformer_1')
model2 = nlp.model.translation.NMTModel(src_vocab=src_vocab,
                 tgt_vocab=tgt_vocab,
                 encoder=encoder2,
                 decoder=decoder2,
                 one_step_ahead_decoder=one_step_ahead_decoder2,
                 embed_size=num_units,
                 embed_initializer=None,
                 prefix='transformer_2')
'''
# Wrap the active encoder/decoder in an NMTModel so that the model owns a
# source embedding (`model3.src_embed`) sized to `src_vocab`. Later cells use
# only `model3.encoder` and `model3.src_embed`.
# NOTE(review): the prefix presumably has to match the one used when the
# checkpoint was produced — confirm before renaming.
model3 = nlp.model.translation.NMTModel(src_vocab=src_vocab,
                 tgt_vocab=tgt_vocab,
                 encoder=encoder3,
                 decoder=decoder3,
                 one_step_ahead_decoder=one_step_ahead_decoder3,
                 embed_size=num_units,
                 embed_initializer=None,
                 prefix='transformer_3')

[20:06:38] ../src/storage/storage.cc:199: Using Pooled (Naive) StorageManager for CPU


In [6]:
from rdkit import Chem
def canonical_smile(sml):
    """Return the RDKit canonical (isomeric) SMILES for `sml`.

    Parameters
    ----------
    sml : str
        Input SMILES string.

    Returns
    -------
    str or float
        The canonical SMILES on success. On any failure the offending input is
        printed and ``float('nan')`` is returned, so callers can detect bad
        rows (for example with ``pandas.Series.isna``).
    """
    try:
        mol = Chem.MolFromSmiles(sml)
        # RDKit signals an unparsable SMILES by returning None rather than
        # raising; make that failure explicit instead of relying on the
        # follow-up call to blow up.
        if mol is None:
            raise ValueError('unparsable SMILES')
        return Chem.MolToSmiles(mol, canonical=True, isomericSmiles=True)
    except Exception:
        # `Exception` (not a bare `except`) so that KeyboardInterrupt and
        # SystemExit still propagate while mapping over a large column.
        print(sml)
        return float('nan')

In [7]:
def no_split(sm):
    """Insert a single space between consecutive characters of `sm`.

    This turns a SMILES string into a whitespace-delimited sequence of
    single-character tokens, e.g. ``'CCO' -> 'C C O'``. An empty string is
    returned unchanged.

    Parameters
    ----------
    sm : str
        String to tokenize character by character.

    Returns
    -------
    str
        The characters of `sm` joined by single spaces.

    Raises
    ------
    TypeError
        If `sm` is not a string-like sequence (for example the NaN produced
        for a SMILES that failed canonicalization).
    """
    try:
        return ' '.join(sm)
    except TypeError:
        # Same exception type as before, but with the offending value in the
        # message instead of a separate print followed by an opaque failure.
        raise TypeError(
            'no_split expects a string, got {!r}'.format(sm)) from None

In [8]:
# Truncates token lists to at most 100 entries before vocabulary lookup.
# NOTE(review): 100 duplicates `src_max_len`, and the EOS id is appended after
# clipping, so a sequence may end up one element longer — confirm intended.
length_clip = nlp.data.ClipSequence(100)


def preprocess(data):
    """Convert one whitespace-tokenized string into an MXNet array of token ids.

    Parameters
    ----------
    data : str
        Tokens separated by whitespace.

    Returns
    -------
    mx.nd.NDArray
        Ids of the (clipped) tokens looked up in `src_vocab`, followed by the
        id of the vocabulary's EOS token.
    """
    tokens = length_clip(data.split())
    # Indexing the vocabulary with a list yields a list of ids.
    token_ids = src_vocab[tokens]
    eos_id = src_vocab[src_vocab.eos_token]
    token_ids.append(eos_id)
    return mx.nd.array(np.array(token_ids, dtype=np.int32))

def get_length(x):
    """Return, as a float, how many pieces `x` has when split on single spaces.

    Consecutive spaces produce empty pieces that are counted too, and an
    empty string counts as one piece.
    """
    # Splitting on ' ' yields exactly one more piece than there are spaces.
    space_count = x.count(' ')
    return float(space_count + 1)

In [9]:
# Batch size used by the DataLoader built in get_dataloader().
train_batch_size = 128
# Rebinds `dropout` for everything constructed from here on (the ILNet output
# head); objects built in earlier cells keep the value they were created with.
dropout = 0.05

In [10]:
class ILNet(gluon.HybridBlock):
    """Sequence regression network: embedding -> encoder -> text CNN -> dense head.

    The forward pass embeds a batch of token ids, runs them through a sequence
    encoder, pools the encoded sequence with a multi-width convolutional
    encoder, appends one scalar feature ``T`` per sample and feeds the result
    to a small dense head.

    ``IL_src_embed`` and ``IL_encoder`` are created as ``None`` and must be
    assigned by the caller (for example from a pretrained translation model)
    before the first forward pass.

    Parameters
    ----------
    dropout : float
        Rate of the Dropout layer inside the output head.
    src_vocab
        Not used by the current architecture; kept for interface compatibility.
    embed_size : int
        Embedding size given to the convolutional encoder.
    output_size : int
        Number of units of the final Dense layer.
    num_filters, ngram_filter_sizes : tuple of int
        Settings of the former separate cation/anion CNN branches, which are
        no longer built. `num_filters` is still stored on the instance;
        `ngram_filter_sizes` is ignored. Both are kept for compatibility.
    IL_num_filters, IL_ngram_filter_sizes : tuple of int
        Filter counts and n-gram widths of the convolutional encoder.
    prefix, params
        Forwarded to ``gluon.HybridBlock``.
    """
    def __init__(self,
                 dropout,
                 src_vocab=src_vocab,
                 embed_size=embed_size,
                 output_size=1,
                 num_filters=(100, 200, 200, 200, 200, 100,100),
                 ngram_filter_sizes=(1, 2, 3, 4, 5, 6,7),
                 IL_num_filters=(100, 200, 200, 200, 200, 100, 100, 100, 100,100, 160,160),
                 IL_ngram_filter_sizes=(1, 2, 3,4, 5, 6, 7, 8, 9, 10, 15,20),
                 prefix=None,
                 params=None):
        super(ILNet, self).__init__(prefix=prefix, params=params)
        with self.name_scope():
            self.num_filters = num_filters
            self.IL_num_filters = IL_num_filters

            # Placeholders; the caller attaches the real blocks after
            # construction. Declared first so attribute order is unchanged.
            self.IL_src_embed = None
            self.IL_encoder = None

            self.IL_textcnn = nlp.model.ConvolutionalEncoder(
                embed_size=embed_size,
                num_filters=IL_num_filters,
                ngram_filter_sizes=IL_ngram_filter_sizes,
                conv_layer_activation='relu',
                num_highway=1)

            # Dense head applied to [CNN features, T].
            self.output = gluon.nn.HybridSequential()
            with self.output.name_scope():
                self.output.add(gluon.nn.Dense(1024))
                self.output.add(gluon.nn.Activation('relu'))
                self.output.add(gluon.nn.Dropout(dropout))
                self.output.add(gluon.nn.Dense(512))
                self.output.add(gluon.nn.Activation('relu'))
                self.output.add(gluon.nn.Dense(output_size, flatten=False))

    def hybrid_forward(self, F,IL_src_nd, IL_valid_length, T):  # pylint: disable=arguments-differ
        """Compute the network output for one batch.

        Parameters
        ----------
        F
            Operator namespace supplied by Gluon (``mx.nd`` when run
            imperatively, ``mx.sym`` when hybridized).
        IL_src_nd
            Batch of token ids fed to ``IL_src_embed``.
        IL_valid_length
            Per-sample valid lengths, forwarded to the encoder.
        T
            One scalar feature per sample; reshaped to a column vector.

        Returns
        -------
        Output of the dense head for every sample in the batch.
        """
        IL_src_embed_ = self.IL_src_embed(IL_src_nd)
        IL_encoded, _ = self.IL_encoder(
            IL_src_embed_,
            valid_length=IL_valid_length)
        # Swap the first two axes before the CNN.
        # NOTE(review): presumably batch-major -> time-major, as the
        # convolutional encoder expects — confirm against the GluonNLP docs.
        IL_textcnn = self.IL_textcnn(
            F.transpose(IL_encoded, axes=(1, 0, 2)))

        T_ = F.reshape(T, shape=(-1, 1))

        # Use the namespace Gluon passes in (F) rather than mx.symbol, so the
        # block works both imperatively and after hybridize().
        input_vecs = F.concat(
            F.reshape(IL_textcnn,
                      shape=(-1, sum(self.IL_num_filters))),
            T_,
            dim=1)

        out = self.output(input_vecs)
        return out

In [11]:
# Instantiate the prediction network with the dropout value bound at this point.
net = ILNet(dropout=dropout)
# Disabled wiring for separate cation / anion encoders (model1 / model2 are
# not built in this notebook).
#net.cation_encoder = model1.encoder
#net.cation_src_embed =  model1.src_embed

#net.anion_encoder = model2.encoder
#net.anion_src_embed =  model2.src_embed

# Attach the transformer encoder and source embedding from model3; ILNet
# leaves both attributes as None until they are assigned here.
net.IL_encoder = model3.encoder
net.IL_src_embed =  model3.src_embed
# Hybridize before the first forward pass, then show the assembled structure.
net.hybridize()
print(net)
# Disabled per-sub-block initialization; weights are loaded from a checkpoint
# in the next cell instead.
#net.textcnn.initialize(mx.init.Xavier(), ctx=ctx)
#net.output.initialize(mx.init.Xavier(), ctx=ctx)

ILNet(
  (IL_src_embed): HybridSequential(
    (0): Embedding(72 -> 128, float32)
    (1): Dropout(p = 0.0, axes=())
  )
  (IL_encoder): TransformerEncoder(
    (dropout_layer): Dropout(p = 0.1, axes=())
    (layer_norm): LayerNorm(eps=1e-05, axis=-1, center=True, scale=True, in_channels=128)
    (transformer_cells): HybridSequential(
      (0): TransformerEncoderCell(
        (dropout_layer): Dropout(p = 0.1, axes=())
        (attention_cell): MultiHeadAttentionCell(
          (_base_cell): DotProductAttentionCell(
            (_dropout_layer): Dropout(p = 0.1, axes=())
          )
          (proj_query): Dense(None -> 128, linear)
          (proj_key): Dense(None -> 128, linear)
          (proj_value): Dense(None -> 128, linear)
        )
        (proj): Dense(None -> 128, linear)
        (ffn): PositionwiseFFN(
          (ffn_1): Dense(None -> 1024, linear)
          (activation): Activation(relu)
          (ffn_2): Dense(None -> 128, linear)
          (dropout_layer): Dropout(p = 0

In [12]:
# Restore every parameter of the network from the checkpoint directly onto
# `ctx`. load_parameters() fails if the file lacks a parameter, so after this
# call nothing is left uninitialized. The former follow-up
# `net.initialize(Xavier, ...)` was therefore a no-op (already-initialized
# parameters are skipped unless force_reinit=True) and wrongly suggested the
# loaded weights were being re-randomized, so it has been removed.
net.load_parameters(param_file, ctx=ctx)

[20:06:39] ../src/storage/storage.cc:199: Using Pooled (Naive) StorageManager for GPU


In [13]:
def predict(net, dataloader,context):
    """Run `net` over every batch of `dataloader` and collect the outputs.

    Parameters
    ----------
    net : callable
        Called as ``net(IL_data, IL_length, T)`` for each batch; its result
        must provide ``asnumpy()``.
    dataloader : iterable
        Yields ``((IL_data, IL_length), T)`` batches.
    context
        Device that every batch array is moved to before the forward pass.

    Returns
    -------
    list
        One entry per sample, in iteration order: the rows of each batch
        output after conversion with ``asnumpy()``.
    """
    out = []
    for (IL_data, IL_length), T in dataloader:
        IL_data = IL_data.as_in_context(context)
        # Lengths are cast to float32 before being handed to the network.
        IL_length = IL_length.as_in_context(context).astype(np.float32)
        T = T.as_in_context(context)
        output = net(IL_data, IL_length,T)
        # Extend in place: the previous `out = out + [...]` re-copied the whole
        # accumulated list on every batch.
        out.extend(output.asnumpy())
    return out

In [14]:
def get_dataloader(train_dataset, batch_size=None):
    """Wrap a dataset of ``(token_ids, T)`` samples in a batching DataLoader.

    Parameters
    ----------
    train_dataset
        Dataset whose samples are ``(token_id_sequence, scalar)`` pairs.
    batch_size : int, optional
        Number of samples per batch. Defaults to the module-level
        `train_batch_size`, which was the previously hard-wired behaviour.

    Returns
    -------
    gluon.data.DataLoader
        Yields ``((padded_ids, lengths), T)`` batches. No shuffling or sampler
        is requested, so the loader's default ordering applies.
    """
    if batch_size is None:
        batch_size = train_batch_size

    # First field: pad sequences with 0 along axis 0 and also return their
    # original lengths. Second field: stack the scalars as float32.
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0, pad_val=0, ret_length=True),
        nlp.data.batchify.Stack(dtype='float32'))

    train_dataloader = gluon.data.DataLoader(dataset=train_dataset,
                                             batchify_fn=batchify_fn,
                                             batch_size=batch_size)

    return train_dataloader

In [15]:
# Read the 'To_be_calculated' sheet of the workbook; later cells use its
# 'SMILES' and 'normalized_T_vis' columns as model inputs.
viscosity_database = pd.read_excel(
    'smiles.xlsx',
    sheet_name='To_be_calculated',
)

In [16]:
# SMILES column -> canonical SMILES -> space-separated characters -> token ids.
train_IL_smiles = (
    viscosity_database['SMILES']
    .map(canonical_smile)
    .map(no_split)
    .map(preprocess)
)
# Scalar feature fed to the network alongside each sequence.
train_T = viscosity_database['normalized_T_vis']

# Pair each encoded sequence with its scalar and batch the pairs.
train_dataset = gluon.data.SimpleDataset(
    gluon.data.ArrayDataset(train_IL_smiles, train_T)
)
predict_dataloader = get_dataloader(train_dataset)

In [17]:
# Run the loaded network over every batch on the selected device.
predicted = predict(net, predict_dataloader, context=ctx)

[20:06:45] ../src/operator/cudnn_ops.cc:292: Auto-tuning cuDNN op, set MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable
[20:06:46] ../src/operator/cudnn_ops.cc:292: Auto-tuning cuDNN op, set MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable


In [18]:
# One row per input sample; the single column is left with its default label.
df = pd.DataFrame(data=predicted)
# Persist the predictions (index included), then echo them in the cell output.
df.to_csv(path_or_buf='predicted_viscosity.csv')
print(df)

             0
0     7.912554
1     6.842931
2     7.128448
3     5.591094
4     5.442315
...        ...
5513  9.439482
5514  8.890400
5515  7.609926
5516  8.140711
5517  5.647591

[5518 rows x 1 columns]
