# Description

This notebook is used to process and evaluate the initial universe of generated SMILES, and then retrain the network, applying techniques and principles from both transfer learning and genetic algorithms, to progressively improve molecule generation for the specific task of binding with the coronavirus protease.

## First process initial generated smiles for PyRx analysis

In [1]:
import pandas as pd
from rdkit import Chem, DataStructs
import random
import numpy as np
import rdkit.Chem.PropertyMol
import rdkit.Chem.Descriptors

In [2]:
# Read the generation-0 SMILES file (no header row) and keep at most the first 10000 entries of column 0
gen0_table = pd.read_csv('./generations/gen0_smiles.smi',sep=',', header=None)
gen0 = gen0_table[0].head(10000).tolist()
len(gen0)

9703

In [3]:
def validate_mols(list_of_smiles):
    """Parse SMILES strings with RDKit and return only the molecules that parsed.

    Strings for which ``Chem.MolFromSmiles`` returns ``None`` are silently dropped,
    so the result may be shorter than the input. Input order is preserved.
    """
    parsed = (Chem.MolFromSmiles(smi) for smi in list_of_smiles)
    return [mol for mol in parsed if mol is not None]

def convert_mols_to_smiles(list_of_mols):
    """Return the ``Chem.MolToSmiles`` string for every molecule, in input order."""
    valid_smiles = []
    for mol in list_of_mols:
        valid_smiles.append(Chem.MolToSmiles(mol))
    return valid_smiles

In [4]:
# Parse the raw SMILES into RDKit molecules; entries that fail to parse are dropped
gen0_mols = validate_mols(gen0)
len(gen0_mols)

9703

In [5]:
def initialize_generation_from_mols(list_of_mols,desired_length):
    """Pick a structurally diverse subset of molecules.

    The molecules are shuffled, the first thirty are taken as a seed set, and the
    rest are admitted in passes: a molecule is added when its maximum Tanimoto
    similarity (RDKit fingerprints) to everything already selected is at or below
    the current threshold. The threshold starts at 0.05 and is raised by 0.05
    after every pass until at least ``desired_length`` molecules are selected.

    Args:
        list_of_mols: RDKit molecules to choose from. The list is not modified.
        desired_length: minimum number of molecules wanted; must be > 30.

    Returns:
        A new list of selected molecules. It can be longer than
        ``desired_length`` because a pass is always run to completion, and
        shorter when the input does not hold enough distinct molecules.
    """
    assert desired_length >30
    # Shuffle a copy so the caller's list keeps its original order
    shuffled_mols = list(list_of_mols)
    random.shuffle(shuffled_mols)

    # Prepare fingerprints for similarity calcs
    mol_fingerprints = [Chem.RDKFingerprint(mol) for mol in shuffled_mols]

    selected_mols = shuffled_mols[0:30]
    selected_fingerprints = mol_fingerprints[0:30]
    remaining_mols = shuffled_mols[30:]
    remaining_fingerprints = mol_fingerprints[30:]

    similarity_threshold = .05
    while len(selected_mols) < desired_length:
        for fingerprint, mol in zip(remaining_fingerprints, remaining_mols):
            max_similarity = np.max(DataStructs.BulkTanimotoSimilarity(fingerprint,selected_fingerprints))
            # `< 1` skips molecules already selected (similarity 1 with themselves) and exact duplicates
            if (max_similarity <= similarity_threshold) and (max_similarity < 1):
                selected_fingerprints.append(fingerprint)
                selected_mols.append(mol)
        print("Completed loop with threshold at: ", similarity_threshold, ". Length is currently: ", len(selected_mols))
        if similarity_threshold >= 1:
            # Every candidate with similarity below 1 has now been admitted, so
            # further passes cannot add anything; stop instead of looping forever.
            break
        similarity_threshold += .05
    return selected_mols

In [6]:
# Down-select to a diverse starting population; the result can exceed the requested 1000
# because the selection pass that crosses the target is run to completion.
gen0_mols = initialize_generation_from_mols(gen0_mols,1000)
print(len(gen0_mols))

Completed loop with threshold at:  0.05 . Length is currently:  30
Completed loop with threshold at:  0.1 . Length is currently:  32
Completed loop with threshold at:  0.15000000000000002 . Length is currently:  33
Completed loop with threshold at:  0.2 . Length is currently:  39
Completed loop with threshold at:  0.25 . Length is currently:  72
Completed loop with threshold at:  0.3 . Length is currently:  201
Completed loop with threshold at:  0.35 . Length is currently:  528
Completed loop with threshold at:  0.39999999999999997 . Length is currently:  1132
1132


In [7]:
# Load the running results table and show how many molecules it already tracks
master_table = pd.read_csv('./generations/master_results_table.csv',sep=',')
len(master_table)

9354

In [8]:
def iterate_alpha(alpha_code):
    """Return the four-letter code that follows ``alpha_code`` (e.g. 'AAAZ' -> 'AABA').

    Certainly not optimized and not strictly necessary, but in the PyRx GUI
    molecule names would sort oddly when in any numeric order, so molecules are
    ordered by a four-letter code instead. This function advances that code by one.

    Raises:
        ValueError: when all four positions are already at 'Z'.
    """
    code_points = [ord(letter) for letter in alpha_code]

    # Walk from the last position to the first: a letter past 'Z' (90) wraps
    # to 'A' (65) and carries into the position to its left.
    for position in (3, 2, 1, 0):
        if code_points[position] + 1 > 90:
            code_points[position] = 65
        else:
            code_points[position] += 1
            break
    else:
        # The carry ran off the left end: no code is left
        raise ValueError('Too long for alpha code')

    return "".join(chr(point) for point in code_points)
iterate_alpha('AAAA')

'AAAB'

In [9]:
def append_to_tracking_table(master_table,mols_to_append, source, generation,
                             training_smiles_path='./datasets/all_smiles_clean.smi',
                             verbose=True):
    """Assign tracking IDs to molecules and build their tracking-table rows.

    IDs continue from the highest ID already present in ``master_table`` for
    ``generation`` ('AAAA' when that generation has no rows yet).

    Args:
        master_table: DataFrame with at least 'id' and 'gen' columns.
        mols_to_append: RDKit molecules to register.
        source: label stored in the 'source' column. Unless it is 'hiv',
            'manual' or 'baseline', a molecule whose SMILES occurs in the
            training data is labelled 'training' instead.
        generation: generation number stored in 'gen' and in the title.
        training_smiles_path: header-less file whose first column holds the
            training SMILES. Only read when ``source`` is not exempt.
        verbose: print each assigned title (the original behaviour).

    Returns:
        ``(df, mols_to_export)``: a DataFrame with columns id, gen, smile,
        source, score (score initialised to the 99.9 placeholder) and a list of
        PropertyMol objects whose 'Title' property is ``id<ID>gen<generation>``.
    """
    exempt_sources = ('hiv', 'manual', 'baseline')

    master_table_gen = master_table[master_table['gen'] == generation]
    if master_table_gen.shape[0] == 0:
        id_code = 'AAAA'
    else:
        # Codes are fixed-width upper-case strings, so the string maximum is the latest ID
        latest_id = master_table_gen['id'].dropna().astype(str).max()
        id_code = iterate_alpha(str(latest_id))

    # The training set is only consulted for non-exempt sources, so skip the file read otherwise
    if source in exempt_sources:
        training_set = frozenset()
    else:
        training_data = pd.read_csv(training_smiles_path, header=None)
        training_set = set(training_data[0])

    mols_to_export = []
    rows_list = []
    for mol in mols_to_append:
        pm = Chem.PropertyMol.PropertyMol(mol)
        title = 'id' + str(id_code) + 'gen'+ str(generation)
        if verbose:
            print(title)
        # Enables tracking which molecule is which in the PyRx GUI and PyRx results export
        pm.SetProp('Title', title)
        mols_to_export.append(pm)

        # And track in pandas
        smile = Chem.MolToSmiles(mol)
        assert isinstance(smile, str)
        seen_in_training = (source not in exempt_sources) and (smile in training_set)
        rows_list.append({
            'id': id_code,
            'gen': generation,
            'smile': smile,
            'source': 'training' if seen_in_training else source,
            'score': 99.9,
        })
        id_code = iterate_alpha(id_code)

    df = pd.DataFrame(rows_list)
    return df, mols_to_export

In [10]:
# Register the selected generation-0 molecules and add their rows to the tracking table
new_mols_to_test = append_to_tracking_table(master_table,gen0_mols, 'generated', 0)
mols_for_pd, mols_for_export = new_mols_to_test
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
master_table = pd.concat([master_table, mols_for_pd])
len(mols_for_export)

idANVUgen0
idANVVgen0
idANVWgen0
idANVXgen0
idANVYgen0
idANVZgen0
idANWAgen0
idANWBgen0
idANWCgen0
idANWDgen0
idANWEgen0
idANWFgen0
idANWGgen0
idANWHgen0
idANWIgen0
idANWJgen0
idANWKgen0
idANWLgen0
idANWMgen0
idANWNgen0
idANWOgen0
idANWPgen0
idANWQgen0
idANWRgen0
idANWSgen0
idANWTgen0
idANWUgen0
idANWVgen0
idANWWgen0
idANWXgen0
idANWYgen0
idANWZgen0
idANXAgen0
idANXBgen0
idANXCgen0
idANXDgen0
idANXEgen0
idANXFgen0
idANXGgen0
idANXHgen0
idANXIgen0
idANXJgen0
idANXKgen0
idANXLgen0
idANXMgen0
idANXNgen0
idANXOgen0
idANXPgen0
idANXQgen0
idANXRgen0
idANXSgen0
idANXTgen0
idANXUgen0
idANXVgen0
idANXWgen0
idANXXgen0
idANXYgen0
idANXZgen0
idANYAgen0
idANYBgen0
idANYCgen0
idANYDgen0
idANYEgen0
idANYFgen0
idANYGgen0
idANYHgen0
idANYIgen0
idANYJgen0
idANYKgen0
idANYLgen0
idANYMgen0
idANYNgen0
idANYOgen0
idANYPgen0
idANYQgen0
idANYRgen0
idANYSgen0
idANYTgen0
idANYUgen0
idANYVgen0
idANYWgen0
idANYXgen0
idANYYgen0
idANYZgen0
idANZAgen0
idANZBgen0
idANZCgen0
idANZDgen0
idANZEgen0
idANZFgen0
idANZGgen0

idAPAXgen0
idAPAYgen0
idAPAZgen0
idAPBAgen0
idAPBBgen0
idAPBCgen0
idAPBDgen0
idAPBEgen0
idAPBFgen0
idAPBGgen0
idAPBHgen0
idAPBIgen0
idAPBJgen0
idAPBKgen0
idAPBLgen0
idAPBMgen0
idAPBNgen0
idAPBOgen0
idAPBPgen0
idAPBQgen0
idAPBRgen0
idAPBSgen0
idAPBTgen0
idAPBUgen0
idAPBVgen0
idAPBWgen0
idAPBXgen0
idAPBYgen0
idAPBZgen0
idAPCAgen0
idAPCBgen0
idAPCCgen0
idAPCDgen0
idAPCEgen0
idAPCFgen0
idAPCGgen0
idAPCHgen0
idAPCIgen0
idAPCJgen0
idAPCKgen0
idAPCLgen0
idAPCMgen0
idAPCNgen0
idAPCOgen0
idAPCPgen0
idAPCQgen0
idAPCRgen0
idAPCSgen0
idAPCTgen0
idAPCUgen0
idAPCVgen0
idAPCWgen0
idAPCXgen0
idAPCYgen0
idAPCZgen0
idAPDAgen0
idAPDBgen0
idAPDCgen0
idAPDDgen0
idAPDEgen0
idAPDFgen0
idAPDGgen0
idAPDHgen0
idAPDIgen0
idAPDJgen0
idAPDKgen0
idAPDLgen0
idAPDMgen0
idAPDNgen0
idAPDOgen0
idAPDPgen0
idAPDQgen0
idAPDRgen0
idAPDSgen0
idAPDTgen0
idAPDUgen0
idAPDVgen0
idAPDWgen0
idAPDXgen0
idAPDYgen0
idAPDZgen0
idAPEAgen0
idAPEBgen0
idAPECgen0
idAPEDgen0
idAPEEgen0
idAPEFgen0
idAPEGgen0
idAPEHgen0
idAPEIgen0
idAPEJgen0

1132

In [11]:
# Renumber the rows after the append and persist the updated tracking table
master_table = master_table.reset_index(drop=True)
master_table.to_csv(r'./generations/master_results_table.csv', index=False)

In [12]:
# Add HIV inhibitors manually into the table
hiv_table = pd.read_csv('./datasets/hiv_inhibitors_cleaned.smi',sep=',', header=None)
hiv_smiles = list(hiv_table[0])
hiv_mols = validate_mols(hiv_smiles)

master_table = pd.read_csv('./generations/master_results_table.csv',sep=',')
new_mols_to_test = append_to_tracking_table(master_table,hiv_mols, 'hiv', 0)
mols_for_pd = new_mols_to_test[0]
# Extend the export list built so far (re-running this cell adds the molecules again)
mols_for_export = mols_for_export + new_mols_to_test[1]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
master_table = pd.concat([master_table, mols_for_pd]).reset_index(drop=True)
master_table.to_csv(r'./generations/master_results_table.csv', index=False)

idAPNIgen0
idAPNJgen0
idAPNKgen0
idAPNLgen0
idAPNMgen0
idAPNNgen0
idAPNOgen0
idAPNPgen0
idAPNQgen0
idAPNRgen0
idAPNSgen0
idAPNTgen0
idAPNUgen0


In [13]:
# Add a few other smiles manually into the table ("control group" of training smiles)
manual_table = pd.read_csv('./datasets/manual_testing_cleaned.smi',sep=',', header=None)
manual_smiles = list(manual_table[0])
# Validate the manual SMILES themselves (this previously re-validated hiv_smiles,
# which registered the HIV inhibitors a second time under the 'manual' source)
manual_mols = validate_mols(manual_smiles)

master_table = pd.read_csv('./generations/master_results_table.csv',sep=',')
new_mols_to_test = append_to_tracking_table(master_table,manual_mols, 'manual', 0)
mols_for_pd = new_mols_to_test[0]
# Extend the export list built so far (re-running this cell adds the molecules again)
mols_for_export = mols_for_export + new_mols_to_test[1]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
master_table = pd.concat([master_table, mols_for_pd]).reset_index(drop=True)
master_table.to_csv(r'./generations/master_results_table.csv', index=False)

idAPNVgen0
idAPNWgen0
idAPNXgen0
idAPNYgen0
idAPNZgen0
idAPOAgen0
idAPOBgen0
idAPOCgen0
idAPODgen0
idAPOEgen0
idAPOFgen0
idAPOGgen0
idAPOHgen0


In [14]:
def _write_sdf(path, mols):
    """Write ``mols`` to one SDF file at ``path`` and always close the writer."""
    writer = Chem.SDWriter(path)
    try:
        for mol in mols:
            writer.write(mol)
    finally:
        # Closing flushes the final record. Leaving writers open is what made
        # the last molecule appear truncated until another write happened.
        writer.close()


def write_gen_to_sdf(mols_for_export, generation, batch_size):
    """Export a generation's molecules to SDF for docking in PyRx.

    When there are more than ``batch_size`` molecules they are split into
    consecutive files ``./generations/gen<generation>_batch_<k>.sdf`` (k starts
    at 1), each holding at most ``batch_size`` molecules. Otherwise a single
    ``./generations/gen<generation>.sdf`` is written.

    Args:
        mols_for_export: molecules to write.
        generation: generation number used in the file name.
        batch_size: maximum number of molecules per file; must be >= 1.

    Returns:
        ``mols_for_export`` unchanged.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    base_path = './generations/gen' +str(generation)
    if len(mols_for_export) > batch_size:
        # Step by batch_size itself: the earlier hard-coded 1000 produced the wrong
        # number of batches for any other batch size.
        starts = range(0, len(mols_for_export), batch_size)
        for batch_number, start in enumerate(starts, start=1):
            batch_to_export = mols_for_export[start:start + batch_size]
            _write_sdf(base_path + '_batch_' + str(batch_number) + '.sdf', batch_to_export)
    else:
        _write_sdf(base_path + '.sdf', mols_for_export)

    return mols_for_export

In [15]:
# Export everything registered for generation 0, with a batch size of 2000
write_gen_to_sdf(mols_for_export, 0, 2000)
print('ok')

ok


## NOW GO TO PyRx: Analyze the SDF file and create a csv of binding score results


## Afterwards, process binding simulation results to 'evolve' the molecules

In [16]:
# This number must be MANUALLY iterated each generation. The whole process has not
# been wrapped into a single function or loop yet; that would be the next step.
GLOBAL_GENERATION = 11

In [17]:
# Load the tracking table saved for the previous generation (GLOBAL_GENERATION - 1)
master_table = pd.read_csv('./generations/master_results_table_gen' + str(GLOBAL_GENERATION-1) + '.csv',sep=',')
master_table.tail()

Unnamed: 0,id,gen,smile,source,weight,score
3015,AAAM,10,O=C(O)C(Cc1ccccc1)NC(=O)C(Cc1ccccc1)NC(=O)C1C2...,generated,625.722,99.9
3016,AAAO,10,O=C(NC1CC2CCCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(...,generated,704.912,99.9
3017,AABB,0,Cc1noc(CCCn2ccc(CN3CCCC3)n2)n1,generated,275.356,99.9
3018,AAJU,0,COc1ccc(NC(=O)C(O)C(C)N2CCOCC2)cc1Cl,generated,328.796,99.9
3019,AACU,7,O=CC(Cc1ccccc1)NC(=O)C(Cc1ccccc1)NC(=O)C(Cc1cc...,generated,1049.326,99.9


In [18]:
# Load the PyRx docking results exported for the previous generation;
# the preview below shows several rows per ligand
new_scores = pd.read_csv('./generations/results/results_gen' + str(GLOBAL_GENERATION-1) + '.csv',sep=',')
new_scores.head()

Unnamed: 0,Ligand,Binding Affinity,rmsd/ub,rmsd/lb
0,6lu7_idAACHgen10,-12.3,0.0,0.0
1,6lu7_idAACHgen10,-11.2,8.18,1.483
2,6lu7_idAACHgen10,-10.6,5.671,2.828
3,6lu7_idAACHgen10,-10.3,5.683,2.447
4,6lu7_idAACHgen10,-10.2,15.747,14.575


In [19]:
# Keep the best (lowest) binding affinity per ligand
new_scores = new_scores.groupby("Ligand")["Binding Affinity"].min().reset_index()

# Ligand names look like '6lu7_idAACHgen10': split out the tracking id and the generation
ligand_tag = new_scores['Ligand'].str.split("_").str[1]
tag_parts = ligand_tag.str.split("gen")
new_scores['id'] = tag_parts.str[0].str.split("id").str[1]
new_scores['gen'] = tag_parts.str[1]
new_scores['score'] = new_scores["Binding Affinity"]
new_scores = new_scores[['id','gen','score']]
new_scores.head()

Unnamed: 0,id,gen,score
0,AAAA,10,-14.9
1,AAAB,10,-10.5
2,AAAC,10,-16.3
3,AAAD,10,-12.4
4,AAAE,10,-11.9


In [20]:
# Align key dtypes on both sides so the merge on (id, gen) matches
new_scores = new_scores.astype({'id': str, 'gen': int})
master_table = master_table.astype({'id': str, 'gen': int})

merged_scores = pd.merge(master_table, new_scores, on=['id','gen'], suffixes=('_old','_new'), how='left')
new_table = (
    merged_scores
    # Prefer the freshly docked score; fall back to the stored one where none was produced
    .assign(score=merged_scores['score_new'].fillna(merged_scores['score_old']))
    .drop(['score_old','score_new'], axis=1)
    # Recompute the molecular weight from the SMILES for every row
    .assign(weight=lambda table: table['smile'].apply(lambda x: Chem.Descriptors.MolWt(Chem.MolFromSmiles(x))))
    .sort_values('score', ascending=True)
)
new_table.head()

Unnamed: 0,id,gen,smile,source,weight,score
0,AABL,10,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,generated,786.926,-17.9
1,AADG,9,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7
2,AACA,10,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7
3,AABO,9,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,generated,853.077,-17.6
4,AABK,10,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,generated,853.077,-17.6


In [21]:
# Write the re-scored table back over the previous generation's file (the one loaded above)
new_table.to_csv(r'./generations/master_results_table_gen' + str(GLOBAL_GENERATION-1) + '.csv', index=False)

In [22]:
# Select top X ranked by score for training data to refine the molecule generator RNN.
# drop_duplicates keeps rank order; list(set(...)) ordered the SMILES differently
# on every kernel restart, so the training file was not reproducible.
training_smiles = new_table.head(35)['smile'].drop_duplicates().tolist()
len(training_smiles)

21

In [23]:
# RDKit fingerprints of the SMILES already chosen for training, used as the similarity reference set
training_fingerprints = [
    Chem.RDKFingerprint(Chem.MolFromSmiles(smile)) for smile in training_smiles
]

def calc_similarity_score(row):
    """Scale a row's score by how dissimilar it is to the current training set.

    The factor is ``(1 / s) ** .333`` where ``s`` is the highest Tanimoto
    similarity between the row's molecule and any entry of the module-level
    ``training_fingerprints``; a lower similarity gives a larger factor.

    Args:
        row: mapping with a 'smile' string and a numeric 'score'.

    Returns:
        The score multiplied by the dissimilarity factor.
    """
    row_mol = Chem.MolFromSmiles(row['smile'])
    row_fingerprint = Chem.RDKFingerprint(row_mol)
    closest_match = np.max(DataStructs.BulkTanimotoSimilarity(row_fingerprint,training_fingerprints))
    return row['score'] * (1 / closest_match) **.333

# Rank molecules lighter than 900 by their similarity-adjusted score
similarity_adjusted = (
    new_table.copy(deep=True)
    .loc[lambda table: table['weight'] < 900]
    .assign(similarity_adj_score=lambda table: table.apply(calc_similarity_score, axis=1))
    .sort_values('similarity_adj_score', ascending=True)
)
similarity_adjusted.head()

Unnamed: 0,id,gen,smile,source,weight,score,similarity_adj_score
396,AAGJ,6,CC(C)(C)C1CCC(C2CCCCCC2(C)C)CCCC2(C)C(CCC3CCCC...,generated,661.2,-11.5,-24.697105
198,AABP,5,CC1(CC(CC(O)C2(O)CCCC2)C(O)CO)CCCCCCCCCCCCCCCC...,generated,763.286,-13.0,-23.467864
642,AAAE,7,CC1(C)CCCC2(C)C(CCC3CCCC3)CCCCCC(CCC3CCCC3)CCC...,generated,661.2,-10.3,-22.120016
648,AAAL,7,CC(C)(C)C1CC2(C(C)(C)C)CCCC2(C)C1CCC12CC3CC(CC...,generated,398.719,-10.3,-21.819331
649,AAAI,8,CC(C)(C)C1CC2(C(C)(C)C)CCCC2(C)C1CCC12CC3CC(CC...,generated,398.719,-10.3,-21.819331


In [24]:
# Select top X ranked by similarity adjusted score for training data to refine the molecule generator RNN (ensures diversity)
# Note: this extends the list in place, so re-running the cell adds the same SMILES again.
training_smiles += list(similarity_adjusted.head(5)['smile'])
len(training_smiles)

26

In [25]:
def calc_weight_score(row):
    """Scale a row's score in favour of lighter molecules.

    The factor is ``(900 / weight) ** .333``. When that factor is below 1
    (weight above 900) the adjusted score is 0 instead.

    Args:
        row: mapping with numeric 'weight' and 'score' entries.

    Returns:
        ``score * factor``, or 0 when the factor is below 1.
    """
    weight_factor = (900 / row['weight']) ** .333
    if weight_factor < 1:
        return 0
    return row['score'] * weight_factor

# Rank all molecules by their weight-adjusted score
weight_adjusted = (
    new_table.copy(deep=True)
    .assign(weight_adj_score=lambda table: table.apply(calc_weight_score, axis=1))
    .sort_values('weight_adj_score', ascending=True)
)
weight_adjusted.head()

Unnamed: 0,id,gen,smile,source,weight,score,weight_adj_score
0,AABL,10,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,generated,786.926,-17.9,-18.718446
1,AADG,9,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7,-18.125294
2,AACA,10,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7,-18.125294
15,AABX,9,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,generated,772.899,-17.2,-18.094489
13,AACC,10,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,generated,772.899,-17.2,-18.094489


In [26]:
# Select top X ranked by weight adjusted score for training data to refine the molecule generator RNN
# Note: this extends the list in place, so re-running the cell adds the same SMILES again.
training_smiles += list(weight_adjusted.head(5)['smile'])
len(training_smiles)

31

In [27]:
import tensorflow
# tf.test.is_gpu_available() is deprecated; TensorFlow's own warning points to
# tf.config.list_physical_devices('GPU'). True when at least one GPU is visible.
len(tensorflow.config.list_physical_devices('GPU')) > 0

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [28]:
import numpy as np
from copy import copy

import keras

from lstm_chem.utils.config import process_config
from lstm_chem.model import LSTMChem
from lstm_chem.generator import LSTMChemGenerator
from lstm_chem.trainer import LSTMChemTrainer
from lstm_chem.data_loader import DataLoader

Using TensorFlow backend.


In [29]:
# Generate some with the base original model
# (the config points at the 2020-04-21 experiment; the recorded output shows its checkpoint being loaded)
CONFIG_FILE = 'experiments/2020-04-21/LSTM_Chem/config.json'
config = process_config(CONFIG_FILE)
modeler = LSTMChem(config, session='generate')
generator = LSTMChemGenerator(modeler)

Loading model architecture from experiments/2020-04-21/LSTM_Chem/model_arch.json ...
Loading model checkpoint from experiments/2020-04-21/LSTM_Chem/checkpoints/LSTM_Chem-22-0.46.hdf5 ...
Loaded the Model.


In [30]:
# Number of SMILES to sample from the base model (reassigned to 5000 further down for the finetuned model)
sample_number = 20

In [31]:
# Sample SMILES strings from the base (not finetuned) generator
base_generated = generator.sample(num=sample_number)

100%|██████████| 20/20 [00:14<00:00,  1.37it/s]


In [32]:
# Keep only parseable molecules and convert them back to SMILES via RDKit
base_generated_mols = validate_mols(base_generated)
base_generated_smiles = convert_mols_to_smiles(base_generated_mols)
# Shuffle so the five picked below are a random draw (no seed is set, so the draw varies per run)
random.shuffle(base_generated_smiles)
random.shuffle(base_generated_smiles)
# Select X for training data to refine the molecule generator RNN (ensures diversity)
training_smiles += base_generated_smiles[0:5]
len(training_smiles)

36

In [33]:
# Reload the previous generation's table from disk (now containing the re-scored rows saved above)
master_table = pd.read_csv('./generations/master_results_table_gen' + str(GLOBAL_GENERATION-1) + '.csv',sep=',')
master_table.head()

Unnamed: 0,id,gen,smile,source,weight,score
0,AABL,10,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,generated,786.926,-17.9
1,AADG,9,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7
2,AACA,10,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7
3,AABO,9,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,generated,853.077,-17.6
4,AABK,10,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,generated,853.077,-17.6


In [34]:
# Save the list of smiles to train on, one per line
training_path = './generations/training/gen' + str(GLOBAL_GENERATION) + '_training.smi'
with open(training_path, 'w') as smi_file:
    smi_file.writelines("%s\n" % item for item in training_smiles)


## Retrain the network to create molecules more like those selected above

In [35]:
from lstm_chem.finetuner import LSTMChemFinetuner

In [36]:
# Start from the 2019-12-23 experiment config, then point it at the weights saved for the
# previous generation and at the training SMILES written above for this generation
config = process_config('experiments/2019-12-23/LSTM_Chem/config.json')
config['model_weight_filename'] = 'experiments/2019-12-23/LSTM_Chem/checkpoints/finetuned_gen' + str(GLOBAL_GENERATION-1) + '.hdf5'
config['finetune_data_filename'] = './generations/training/gen' + str(GLOBAL_GENERATION) + '_training.smi'
print(config)

batch_size: 512
checkpoint_dir: experiments/2020-04-22/LSTM_Chem/checkpoints/
checkpoint_mode: min
checkpoint_monitor: val_loss
checkpoint_save_best_only: false
checkpoint_save_weights_only: true
checkpoint_verbose: 1
config_file: experiments/2019-12-23/LSTM_Chem/config.json
data_filename: ./datasets/all_smiles_clean.smi
data_length: 0
exp_dir: experiments/2020-04-22/LSTM_Chem
exp_name: LSTM_Chem
finetune_batch_size: 1
finetune_data_filename: ./generations/training/gen11_training.smi
finetune_epochs: 5
model_arch_filename: experiments/2019-12-23/LSTM_Chem/model_arch.json
model_weight_filename: experiments/2019-12-23/LSTM_Chem/checkpoints/finetuned_gen10.hdf5
num_epochs: 42
optimizer: adam
sampling_temp: 0.75
seed: 71
smiles_max_length: 128
tensorboard_log_dir: experiments/2020-04-22/LSTM_Chem/logs/
tensorboard_write_graph: true
train_smi_max_len: 128
units: 256
validation_split: 0.1
verbose_training: true



In [37]:
# Build the model in finetune mode and a loader for the finetune data set named in the config
modeler = LSTMChem(config, session='finetune')
finetune_dl = DataLoader(config, data_type='finetune')

# Run the finetuning (the recorded output shows 5 epochs, matching finetune_epochs in the config)
finetuner = LSTMChemFinetuner(modeler, finetune_dl)
finetuner.finetune()

Loading model architecture from experiments/2019-12-23/LSTM_Chem/model_arch.json ...


100%|██████████| 36/36 [00:00<00:00, 1723.53it/s]

Loading model checkpoint from experiments/2019-12-23/LSTM_Chem/checkpoints/finetuned_gen10.hdf5 ...
Loaded the Model.
loading SMILES...
done.
tokenizing SMILES...
done.
Instructions for updating:
Please use Model.fit, which supports generators.
  ...
    to  
  ['...']
Train for 36 steps
Epoch 1/5





Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fa00a59de50>

In [38]:
# Save the finetuned weights under this generation's number so the next generation can start from them
finetuner.model.save_weights('experiments/2019-12-23/LSTM_Chem/checkpoints/finetuned_gen' + str(GLOBAL_GENERATION) + '.hdf5')

In [39]:
# Point the config at the weights just saved and rebuild the generator from them
config['model_weight_filename'] = 'experiments/2019-12-23/LSTM_Chem/checkpoints/finetuned_gen' + str(GLOBAL_GENERATION) + '.hdf5'
modeler = LSTMChem(config, session='generate')
generator = LSTMChemGenerator(modeler)
print(config)

Loading model architecture from experiments/2019-12-23/LSTM_Chem/model_arch.json ...
Loading model checkpoint from experiments/2019-12-23/LSTM_Chem/checkpoints/finetuned_gen11.hdf5 ...
Loaded the Model.
batch_size: 512
checkpoint_dir: experiments/2020-04-22/LSTM_Chem/checkpoints/
checkpoint_mode: min
checkpoint_monitor: val_loss
checkpoint_save_best_only: false
checkpoint_save_weights_only: true
checkpoint_verbose: 1
config_file: experiments/2019-12-23/LSTM_Chem/config.json
data_filename: ./datasets/all_smiles_clean.smi
data_length: 0
exp_dir: experiments/2020-04-22/LSTM_Chem
exp_name: LSTM_Chem
finetune_batch_size: 1
finetune_data_filename: ./generations/training/gen11_training.smi
finetune_epochs: 5
model_arch_filename: experiments/2019-12-23/LSTM_Chem/model_arch.json
model_weight_filename: experiments/2019-12-23/LSTM_Chem/checkpoints/finetuned_gen11.hdf5
num_epochs: 42
optimizer: adam
sampling_temp: 0.75
seed: 71
smiles_max_length: 128
tensorboard_log_dir: experiments/2020-04-22/LST

In [40]:
# Sample from the finetuned generator. The recorded run took about 3h40m for 5000 samples.
sample_number = 5000
sampled_smiles = generator.sample(num=sample_number)

100%|██████████| 5000/5000 [3:39:35<00:00,  2.64s/it]  


In [41]:
# Validity: share of sampled strings that RDKit can parse into a molecule
valid_mols = validate_mols(sampled_smiles)
print('Validity: ', f'{len(valid_mols) / sample_number:.2%}')

# Uniqueness: share of the valid molecules that are distinct once converted back to SMILES.
# max(..., 1) keeps the cell from crashing when nothing valid was generated (0/1 -> 0.00%).
valid_smiles = convert_mols_to_smiles(valid_mols)
unique_smiles = set(valid_smiles)
print('Uniqueness: ', f'{len(unique_smiles) / max(len(valid_smiles), 1):.2%}')

# Originality: of the distinct valid smiles, how many are truly original vs occurring in the training data
training_data = pd.read_csv('./datasets/all_smiles_clean.smi', header=None)
training_set = set(training_data[0])
original = [smile for smile in unique_smiles if smile not in training_set]
print('Originality: ', f'{len(original) / max(len(unique_smiles), 1):.2%}')

Validity:  89.26%
Uniqueness:  16.40%
Originality:  99.45%


In [42]:
# Drop duplicates while keeping first-seen order. list(set(...)) ordered the SMILES
# differently on every kernel restart, which made the ID assignment below arbitrary.
valid_smiles = list(dict.fromkeys(valid_smiles))
len(valid_smiles)

732

In [43]:
# Take the valid smiles from above, add them to the tracking table and prepare the next PyRx testing data
mols_for_next_generation = validate_mols(valid_smiles)

master_table = pd.read_csv('./generations/master_results_table_gen' + str(GLOBAL_GENERATION-1) +'.csv',sep=',')
new_mols_to_test = append_to_tracking_table(master_table,mols_for_next_generation, 'generated', GLOBAL_GENERATION)
mols_for_pd, mols_for_export = new_mols_to_test

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
master_table = pd.concat([master_table, mols_for_pd]).reset_index(drop=True)
# Saved under the NEW generation number; the previous generation's file is left as it was
master_table.to_csv(r'./generations/master_results_table_gen' + str(GLOBAL_GENERATION) + '.csv', index=False)

idAAAAgen11
idAAABgen11
idAAACgen11
idAAADgen11
idAAAEgen11
idAAAFgen11
idAAAGgen11
idAAAHgen11
idAAAIgen11
idAAAJgen11
idAAAKgen11
idAAALgen11
idAAAMgen11
idAAANgen11
idAAAOgen11
idAAAPgen11
idAAAQgen11
idAAARgen11
idAAASgen11
idAAATgen11
idAAAUgen11
idAAAVgen11
idAAAWgen11
idAAAXgen11
idAAAYgen11
idAAAZgen11
idAABAgen11
idAABBgen11
idAABCgen11
idAABDgen11
idAABEgen11
idAABFgen11
idAABGgen11
idAABHgen11
idAABIgen11
idAABJgen11
idAABKgen11
idAABLgen11
idAABMgen11
idAABNgen11
idAABOgen11
idAABPgen11
idAABQgen11
idAABRgen11
idAABSgen11
idAABTgen11
idAABUgen11
idAABVgen11
idAABWgen11
idAABXgen11
idAABYgen11
idAABZgen11
idAACAgen11
idAACBgen11
idAACCgen11
idAACDgen11
idAACEgen11
idAACFgen11
idAACGgen11
idAACHgen11
idAACIgen11
idAACJgen11
idAACKgen11
idAACLgen11
idAACMgen11
idAACNgen11
idAACOgen11
idAACPgen11
idAACQgen11
idAACRgen11
idAACSgen11
idAACTgen11
idAACUgen11
idAACVgen11
idAACWgen11
idAACXgen11
idAACYgen11
idAACZgen11
idAADAgen11
idAADBgen11
idAADCgen11
idAADDgen11
idAADEgen11
idAA

In [44]:
# Sanity check: number of molecules queued for export
len(mols_for_export)

732

In [45]:
# Export the new generation to SDF for the next round of PyRx docking, with a batch size of 2000
write_gen_to_sdf(mols_for_export, GLOBAL_GENERATION, 2000)
print('ok')

ok
