In [1]:
import joblib
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules+
from rdkit.Chem import Draw
from rdkit.Chem import AllChem as Chem
import gzip
import os
import sys
import csv
import pandas as pd

from syba.syba import SybaClassifier, SmiMolSupplier
import statsmodels.api as sm
from datetime import datetime
import tensorflow as tf

from tensorflow import keras
import numpy as np

from tensorflow.keras import optimizers

from tensorflow.keras import layers
from tensorflow.keras import models

In [2]:
def create_my_model():
    """Build and compile the small LSTM binary classifier.

    Architecture: Embedding(1024 -> 10) -> LSTM(128) -> Dense(128) ->
    Dense(64) -> Dense(32) -> Dense(1, sigmoid).

    Returns:
        A compiled ``keras.Sequential`` model using RMSprop (learning rate
        0.001), binary cross-entropy loss and the ``mae`` / ``accuracy`` metrics.
    """
    model = keras.Sequential()
    model.add(layers.Embedding(input_dim=1024, output_dim=10))
    model.add(layers.LSTM(128))
    # NOTE(review): the hidden Dense layers are created without an activation
    # argument; left unchanged so the architecture stays as originally defined.
    model.add(layers.Dense(128))
    model.add(layers.Dense(64))
    model.add(layers.Dense(32))
    model.add(layers.Dense(1, activation='sigmoid'))
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by recent Keras releases.
    model.compile(
        optimizer=optimizers.RMSprop(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['mae', 'accuracy'],
    )
    return model

In [3]:
def create_my_model11():
    """Build and compile the larger ("V11") LSTM binary classifier.

    Architecture: Embedding(1024 -> 264) -> LSTM(264) -> Dense(264) ->
    Dense(128) -> Dense(64) -> Dense(32) -> Dense(1, sigmoid).

    Returns:
        A compiled ``keras.Sequential`` model using RMSprop (learning rate
        0.001), binary cross-entropy loss and the ``mae`` / ``accuracy`` metrics.
    """
    model = keras.Sequential()
    model.add(layers.Embedding(input_dim=1024, output_dim=264))
    model.add(layers.LSTM(264))
    # NOTE(review): the hidden Dense layers are created without an activation
    # argument; left unchanged so previously saved weights keep their meaning.
    model.add(layers.Dense(264))
    model.add(layers.Dense(128))
    model.add(layers.Dense(64))
    model.add(layers.Dense(32))
    model.add(layers.Dense(1, activation='sigmoid'))
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by recent Keras releases.
    model.compile(
        optimizer=optimizers.RMSprop(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['mae', 'accuracy'],
    )
    return model

In [4]:
# Rebuild the V11 architecture, then restore weights from a local HDF5 file.
# The weights file must match the layer layout created by create_my_model11().
rnn_load = create_my_model11()
rnn_load.load_weights('./rnn_lstm_V11.1000.10_weights.h5')
# Print the layer/parameter table for the restored model.
rnn_load.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 264)         270336    
_________________________________________________________________
lstm (LSTM)                  (None, 264)               558624    
_________________________________________________________________
dense (Dense)                (None, 264)               69960     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               33920     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 3

# Specialization for alcohols
First, get the data.

In [5]:
# Load the "easy" alcohol training file into a DataFrame for a quick visual check.
df = pd.read_csv("../data/transfer/Banques/train_alcool_faciles.csv")
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,isosmiles
0,C[C@@]1([C@H]2[C@@H]([C@H]3[C@@H](C(=O)C(=C([C...
1,CC1(C2C(C3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C...
2,C[C@@]1([C@H]2C([C@H]3C(C(=O)C(=C([C@]3(C(=O)C...
3,C[C@]1([C@@H]2[C@H]([C@@H]3[C@H](C(=O)C(=C([C@...
4,C[C@@]1([C@@H]2[C@@H]([C@@H]3[C@@H](C(=O)C(=C(...
...,...
110,C[C@]1([C@H]2[C@H]([C@H]3[C@H](C(=O)C(=C([C@]3...
111,C[C@@]1([C@H]2[C@@H]([C@H]3[C@@H](C(=O)C(=C(C3...
112,C[C@@]1([C@@H]2[C@@H]([C@H]3[C@H](C(=O)C(=C([C...
113,C[C@@]1([C@H]2C([C@@H]3C(C(=O)C(=C([C@]3(C(=O)...


In [6]:
# Morgan fingerprints (radius 2, nBits bits) for the full easy / difficult
# alcohol banks.
nBits = 1024

# Context managers close each CSV handle as soon as its fingerprints are built;
# the original passed a bare open() to SmiMolSupplier and never closed it.
with open("../data/transfer/Banques/Alcools_faciles.csv", mode="rt") as easy_file:
    # The first element of every supplied record is used as the molecule.
    syn_fps = np.array([
        np.array(Chem.GetMorganFingerprintAsBitVect(spls[0], 2, nBits=nBits))
        for spls in SmiMolSupplier(easy_file, header=True, smi_col=0)
    ])
n = len(syn_fps)

with open("../data/transfer/Banques/Alcools_difficiles.csv", mode="rt") as hard_file:
    # Stored as an array, like syn_fps, so both classes share one container type.
    non_fps = np.array([
        np.array(Chem.GetMorganFingerprintAsBitVect(spls[0], 2, nBits=nBits))
        for spls in SmiMolSupplier(hard_file, header=True, smi_col=0)
    ])
m = len(non_fps)

In [7]:
# Build a class-balanced 80/20 train/test split from the easy (label 1) and
# difficult (label 0) alcohol training files.
nBits = 1024

# Context managers close each CSV handle as soon as its fingerprints are built;
# the original passed a bare open() to SmiMolSupplier and never closed it.
with open("../data/transfer/Banques/train_alcool_faciles.csv", mode="rt") as easy_file:
    # The first element of every supplied record is used as the molecule.
    syn_fps = np.array([
        np.array(Chem.GetMorganFingerprintAsBitVect(spls[0], 2, nBits=nBits))
        for spls in SmiMolSupplier(easy_file, header=True, smi_col=0)
    ])

with open("../data/transfer/Banques/train_alcool_difficiles.csv", mode="rt") as hard_file:
    # Stored as an array, like syn_fps, so both classes share one container type.
    non_fps = np.array([
        np.array(Chem.GetMorganFingerprintAsBitVect(spls[0], 2, nBits=nBits))
        for spls in SmiMolSupplier(hard_file, header=True, smi_col=0)
    ])

n = len(syn_fps)
m = len(non_fps)

# Balance the classes by truncating the larger set to the size of the smaller.
n = min(n, m)
if n == 0:
    raise ValueError("At least one molecule per class is required to build the split")
syn_fps = syn_fps[:n]
non_fps = non_fps[:n]

# Index separating the first 80% (train) from the remaining 20% (test).
# No shuffling is done: the split follows the row order of the input files.
split = int(n * 0.8)

syn_classes = np.ones(n)
non_classes = np.zeros(n)

x_train_1, x_test_1 = syn_fps[:split], syn_fps[split:]
y_train_1, y_test_1 = syn_classes[:split], syn_classes[split:]

x_train_0, x_test_0 = non_fps[:split], non_fps[split:]
y_train_0, y_test_0 = non_classes[:split], non_classes[split:]

# Positive examples first, then negative examples, in both partitions.
x_train = np.concatenate((x_train_1, x_train_0))
y_train = np.concatenate((y_train_1, y_train_0))
x_test = np.concatenate((x_test_1, x_test_0))
y_test = np.concatenate((y_test_1, y_test_0))

print(len(x_train), len(y_train), len(x_test), len(y_test))

184 184 46 46


# Entraîner un nouveau modèle :

In [8]:
# Baseline: a freshly built network (no weights loaded), trained for 4 epochs
# on the alcohol split and validated on the held-out part.
rnn_new = create_my_model11()
history = rnn_new.fit(
    x=x_train,
    y=y_train,
    epochs=4,
    batch_size=10,
    validation_data=(x_test, y_test),
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


Après 4 epochs, l'accuracy reste assez faible, alors que si on utilise notre réseau déjà entraîné, il suffit d'une seule epoch de 40 secondes pour qu'il se spécialise dans les alcools :

# Ré-entraîner notre modèle pour plus de spécialisation :

In [None]:
# Fine-tuning: rebuild the V11 network, restore the saved weights, show the
# layer table, then train for a single epoch on the alcohol split.
rnn_load = create_my_model11()
rnn_load.load_weights('./rnn_lstm_V11.1000.10_weights.h5')
rnn_load.summary()
history = rnn_load.fit(
    x=x_train,
    y=y_train,
    epochs=1,
    batch_size=10,
    validation_data=(x_test, y_test),
)

# Comparer à l'accuracy de SYBA sur un autre fichier de test

In [10]:
# First, load SYBA (the test data are read in a later cell).
syba = SybaClassifier()
# Must run before syba.predict() is used below.
# NOTE(review): presumably loads SYBA's bundled default scoring data — confirm.
syba.fitDefaultScore()


In [None]:

# Score both held-out test files with SYBA and the fine-tuned LSTM, writing
# one output CSV per input CSV with the two scores appended to every record.
inpath = "../data/transfer/Banques/"
outpath = "../out/RNN/V8/"
files = ["test_alcool_faciles.csv", "test_alcool_difficiles.csv"]

# open(..., "w") does not create missing directories.
os.makedirs(outpath, exist_ok=True)

for f in files:
    with open(inpath + f, mode="rt") as inp, open(outpath + f, "w") as out:
        header = inp.readline().strip()
        # Extend the input header with the two score columns on one
        # newline-terminated line. The original wrote the header with no
        # newline and then a second, hard-coded header right after it.
        out.write(header + ",SybaScore,LstmScore\n")

        # Parse every record once so fingerprints, SYBA scores and LSTM scores
        # all come from the same rows in the same order. Blank lines are skipped.
        rows = [line.strip().split(",") for line in inp if line.strip()]
        if not rows:
            continue

        # Fingerprint the SMILES column that is also passed to SYBA. The
        # original fingerprinted column 0, which this cell parses as `idx`.
        # NOTE(review): assumes records are `idx,smiles,atoms` — confirm.
        mols = []
        for idx, smi, atoms in rows:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                raise ValueError("Cannot parse SMILES {!r} (idx {}) in {}".format(smi, idx, f))
            mols.append(mol)
        data = np.array([
            np.array(Chem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nBits))
            for mol in mols
        ])

        # The network ends in a single output unit; flatten the predictions so
        # each score is written as a plain number instead of a 1-element array.
        pr = rnn_load.predict(data).reshape(-1)
        for (idx, smi, atoms), lstm_score in zip(rows, pr):
            out.write("{},{},{},{},{}\n".format(idx, smi, atoms, syba.predict(smi), float(lstm_score)))