In [None]:
from tensorflow import keras
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow import nn
import collections
from rdkit.Chem import Descriptors
from rdkit import Chem
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score, multilabel_confusion_matrix, mean_squared_error
import statsmodels.api as sm

In [None]:
# Training hyper-parameters.
# The epoch budget is deliberately small (20) so prototyping runs finish quickly;
# every model.fit() call below reads this value.
epoch_count: int = 20

In [None]:
# Load the tab-separated dataset into a DataFrame.
# low_memory=False makes pandas read the file in one pass instead of chunks.
data = pd.read_csv(
    'ratio_dataset.csv',
    sep="\t",
    low_memory=False,
)

In [None]:
# Vectorize input: build the character vocabulary and lookup tables

smiles_train = data["RP"]
print (smiles_train.shape)

# Every character that occurs in the RP column, plus the two extra symbols "!" and "E".
charset = set("".join(list(smiles_train)) + "!E")

# Enumerate the characters in sorted order. Iterating the raw set would give an
# order that depends on string hash randomisation, so the same character could
# receive a different index after each kernel restart.
# Indices start at 1; 0 is left free for the zero padding written by vectorize().
char_to_int = {c: i + 1 for i, c in enumerate(sorted(charset))}
int_to_char = {i: c for c, i in char_to_int.items()}

# Length of the longest string; used as the fixed row width of the encoded matrix.
embed = max(len(smile) for smile in smiles_train)
print (str(charset))
print(len(charset), embed)

def vectorize(smiles, max_len=None, mapping=None):
    """Encode strings as fixed-width rows of integer character indices.

    This is an index encoding, not a one-hot encoding: every character is
    replaced by its integer index and each row is right-padded with zeros.
    No start or end marker is added.

    Parameters
    ----------
    smiles : sequence of str
        Strings to encode (NumPy array, list, ...).
    max_len : int, optional
        Row width of the result. Defaults to the module-level ``embed``.
    mapping : dict, optional
        Character -> index lookup. Defaults to the module-level ``char_to_int``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(smiles), max_len)``. The dtype is ``int8`` when
        every index in ``mapping`` fits, otherwise ``int32``.

    Raises
    ------
    ValueError
        If a string is longer than ``max_len``.
    KeyError
        If a string contains a character that is missing from ``mapping``.
    """
    width = embed if max_len is None else max_len
    lookup = char_to_int if mapping is None else mapping

    # int8 is enough for small vocabularies; widen rather than overflow silently.
    largest_index = max(lookup.values(), default=0)
    dtype = np.int8 if largest_index <= np.iinfo(np.int8).max else np.int32

    encoded = np.zeros((len(smiles), width), dtype=dtype)
    print(encoded.shape)
    for row, smile in enumerate(smiles):
        if len(smile) > width:
            raise ValueError(
                f"string at position {row} has length {len(smile)}, "
                f"which exceeds the encoding width {width}"
            )
        for col, ch in enumerate(smile):
            encoded[row, col] = lookup[ch]
    return encoded
# Encode the whole RP column into the integer matrix used as model input.
X_train = vectorize(smiles_train.to_numpy())

In [None]:
# vectorize output
# Build an (n_samples, 1) column of regression targets from the 'rat2' column.
# Each value goes through float(), so numeric strings are accepted too.
# float32 matches the dtype of the model's output layer; the previous float16
# buffer discarded most of the targets' precision.
one_hot_yz = np.asarray(
    [float(nam) for nam in data['rat2']], dtype=np.float32
).reshape(-1, 1)
Y_train = np.array(one_hot_yz)


In [None]:
# split dataset
# Hold out 10% of the samples as a test set. The seed is fixed so that the same
# rows end up in the test set on every run, which keeps the metrics of the
# models below comparable across kernel restarts.
X_train = np.asarray(X_train)
X_train2, X_test2, y_train2_2, y_test2_2 = train_test_split(
    X_train,
    Y_train,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [None]:
# Size of the embedding vocabulary: one slot per character plus one extra,
# because character indices start at 1 and 0 is used as padding.
embed_smiles = 1 + len(charset)

# Width of one encoded row, i.e. the number of columns of X_train.
input_shape = X_train.shape[1]

In [None]:
# Model 1: embedding -> dense layers applied per position -> flatten -> regression head.
# The input width is taken from the encoded data (input_shape) instead of a
# literal 199, so the model still builds when the longest string has a
# different length.
inputA = keras.Input(shape=(input_shape,), name='inputx')

# `input_length` is deprecated in current Keras; the sequence length is already
# fixed by the Input layer above.
x = layers.Embedding(embed_smiles, 12)(inputA)

x = layers.Dense(64, activation="relu")(x)

x = layers.Dense(32, activation="relu")(x)

x = layers.Flatten()(x)

p = layers.Dense(32, activation='relu')(x)

# Single linear unit: the predicted ratio.
p = layers.Dense(1, activation="linear", dtype='float32', name="ratio")(p)


modelx = keras.Model(inputs=inputA, outputs=p)

modelx.compile(loss='mse', optimizer="adam")
# summary() prints the table itself and returns None, so it must not be wrapped in print().
modelx.summary()

In [None]:
# Train the model on the training split; 10% of it is held out for validation.
history = modelx.fit(
    X_train2,
    y_train2_2,
    batch_size=256,
    epochs=epoch_count,
    validation_split=0.1,
)

In [None]:
y_pred2 = modelx.predict(X_test2, verbose=1)

# Flatten targets and predictions to 1-D float64 vectors once.
# Calling float() on each one-element row is deprecated since NumPy 1.25.
y_true_flat = np.asarray(y_test2_2, dtype=np.float64).ravel()
y_pred_flat = np.asarray(y_pred2, dtype=np.float64).ravel()

print('MSE: ', mean_squared_error(y_true_flat, y_pred_flat))

#plt.axis([0, 1.1, 0, 1.1])
#plt.scatter((y_pred2),(y_test2_2))
print('Correlation cof matrix:')
print(np.corrcoef(y_pred_flat, y_true_flat))

# R2 is taken from an ordinary least squares fit of the predictions on the
# true values (with an intercept term).
X_addC = sm.add_constant(y_true_flat)
result = sm.OLS(y_pred_flat, X_addC).fit()
print('R2:', result.rsquared, 'R2 adj:', result.rsquared_adj)

In [None]:
# Model 2: embedding -> LSTM -> two dense layers -> regression head.
# The input width is taken from the encoded data (input_shape) instead of a
# literal 199.
inputA = keras.Input(shape=(input_shape,), name='inputx')

# `input_length` is deprecated in current Keras; the sequence length is already
# fixed by the Input layer above.
x = layers.Embedding(embed_smiles, 12)(inputA)
# NOTE(review): Permute swaps the sequence and embedding axes, so the LSTM steps
# over the 12 embedding channels (each step seeing all positions as features)
# rather than over the characters of the string. Confirm this is intended.
x = layers.Permute((2,1))(x)
x = layers.LSTM(32, return_sequences=False)(x)

p = layers.Dense(64, activation='relu')(x)

# Chain from p, not x: previously this layer was fed x again, which discarded
# the Dense(64) layer above so it never became part of the model.
p = layers.Dense(32, activation='relu')(p)

# Single linear unit: the predicted ratio.
p = layers.Dense(1, activation="linear", dtype='float32', name="ratio")(p)


modelx = keras.Model(inputs=inputA, outputs=p)

modelx.compile(loss='mse', optimizer="adam")
# summary() prints the table itself and returns None, so it must not be wrapped in print().
modelx.summary()

In [None]:
# Train the model on the training split; 10% of it is held out for validation.
history = modelx.fit(
    X_train2,
    y_train2_2,
    batch_size=256,
    epochs=epoch_count,
    validation_split=0.1,
)

In [None]:
y_pred2 = modelx.predict(X_test2, verbose=1)

# Flatten targets and predictions to 1-D float64 vectors once.
# Calling float() on each one-element row is deprecated since NumPy 1.25.
y_true_flat = np.asarray(y_test2_2, dtype=np.float64).ravel()
y_pred_flat = np.asarray(y_pred2, dtype=np.float64).ravel()

print('MSE: ', mean_squared_error(y_true_flat, y_pred_flat))

#plt.axis([0, 1.1, 0, 1.1])
#plt.scatter((y_pred2),(y_test2_2))
print('Correlation cof matrix:')
print(np.corrcoef(y_pred_flat, y_true_flat))

# R2 is taken from an ordinary least squares fit of the predictions on the
# true values (with an intercept term).
X_addC = sm.add_constant(y_true_flat)
result = sm.OLS(y_pred_flat, X_addC).fit()
print('R2:', result.rsquared, 'R2 adj:', result.rsquared_adj)

In [None]:
# Model 3: embedding -> three Conv1D/AveragePooling1D stages -> flatten -> regression head.
# The input width is taken from the encoded data (input_shape) instead of a
# literal 199. The unpadded convolutions and pooling shrink the sequence at
# every stage, so very short inputs cannot pass through all three stages.
inputA = keras.Input(shape=(input_shape,), name='inputx')

# `input_length` is deprecated in current Keras; the sequence length is already
# fixed by the Input layer above.
x = layers.Embedding(embed_smiles, 12)(inputA)

k = layers.Conv1D(32, 12, activation='relu')(x)
k = layers.AveragePooling1D(3)(k)
k = layers.Conv1D(32, 12, activation='relu')(k)
k = layers.AveragePooling1D(3)(k)
k = layers.Conv1D(32, 5, activation='relu')(k)
k = layers.AveragePooling1D(3)(k)

k = layers.Flatten()(k)

p = layers.Dense(32, activation='relu')(k)

#p = layers.Dense(64, activation='relu')(p)

# Single linear unit: the predicted ratio.
p = layers.Dense(1, activation="linear", dtype='float32', name="ratio")(p)


modelx = keras.Model(inputs=inputA, outputs=p)

modelx.compile(loss='mse', optimizer="adam")
modelx.summary()

In [None]:
# Train the model on the training split; 10% of it is held out for validation.
history = modelx.fit(
    X_train2,
    y_train2_2,
    batch_size=256,
    epochs=epoch_count,
    validation_split=0.1,
)


In [None]:
y_pred2 = modelx.predict(X_test2, verbose=1)

# Flatten targets and predictions to 1-D float64 vectors once.
# Calling float() on each one-element row is deprecated since NumPy 1.25.
y_true_flat = np.asarray(y_test2_2, dtype=np.float64).ravel()
y_pred_flat = np.asarray(y_pred2, dtype=np.float64).ravel()

print('MSE: ', mean_squared_error(y_true_flat, y_pred_flat))

#plt.axis([0, 1.1, 0, 1.1])
#plt.scatter((y_pred2),(y_test2_2))
print('Correlation cof matrix:')
print(np.corrcoef(y_pred_flat, y_true_flat))

# R2 is taken from an ordinary least squares fit of the predictions on the
# true values (with an intercept term).
X_addC = sm.add_constant(y_true_flat)
result = sm.OLS(y_pred_flat, X_addC).fit()
print('R2:', result.rsquared, 'R2 adj:', result.rsquared_adj)