# Description

This notebook tabulates the final results and compares the novel generated molecules against known HIV inhibitors and against Remdesivir, which entered clinical trials just a few days ago.

## Select the best results and save them to SDF for a final double-check tabulation in PyRx

In [1]:
import pandas as pd
from rdkit import Chem, DataStructs
import random
import numpy as np
import rdkit.Chem.PropertyMol

In [2]:
# Load the cumulative results table and order it by docking score,
# lowest (most negative) score first.
final = (
    pd.read_csv('./generations/master_results_table_gen10.csv', sep=',')
    .sort_values('score', ascending=True)
)
print(final.shape)
final.head()

(3020, 6)


Unnamed: 0,id,gen,smile,source,weight,score
0,AABL,10,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,generated,786.926,-17.9
1,AADG,9,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7
2,AACA,10,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7
3,AABO,9,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,generated,853.077,-17.6
4,AABK,10,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,generated,853.077,-17.6


In [3]:
# One row per unique SMILES, carrying the column-wise maximum of `score` and `gen`.
# Only the two needed columns are aggregated, so the string columns (`id`,
# `source`) are never pushed through `max()` just to be thrown away.
# NOTE(review): the scores shown are negative, so `max` picks the *weakest* score
# seen for a SMILES. Cell 4 only uses (smile, gen) from this frame as merge keys
# and drops this score again - confirm that "latest generation" is the intent.
final_max = final.groupby("smile")[["score", "gen"]].max().reset_index()
final_max = final_max.sort_values('score', ascending=True)
print(final_max.shape)
final_max.head()

(2728, 3)


Unnamed: 0,smile,score,gen
2437,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,-17.9,10
2474,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,-17.7,10
2492,O=C(NC1CC2CCCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C...,-17.6,10
2482,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,-17.6,10
2480,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CC(C1)C2C(=...,-17.5,10


In [4]:
# Recover the full row (id, source, weight, score) for each (smile, gen) pair
# selected in final_max. Overlapping `score` columns are suffixed: `score_old`
# comes from final_max, `score_new` from the matching row of `final`.
final_joined = pd.merge(final_max, final, on=['smile','gen'], suffixes=('_old','_new'), how='left')

# Keep only strong binders (score <= -15.0) below the 900 weight cut-off.
keep = (final_joined['score_new'] <= -15.0) & (final_joined['weight'] < 900)

# Build the result in one chain instead of assigning a new column onto a filtered
# slice, which raises SettingWithCopyWarning. Renaming `score_new` keeps `score`
# as the last column, exactly where the original assignment placed it.
final_joined = (
    final_joined.loc[keep]
    .drop('score_old', axis=1)
    .rename(columns={'score_new': 'score'})
)
print(final_joined.shape)
final_joined.head()

(41, 6)


Unnamed: 0,smile,gen,id,source,weight,score
0,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,10,AABL,generated,786.926,-17.9
1,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,10,AACA,generated,838.062,-17.7
2,O=C(NC1CC2CCCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C...,10,AABP,generated,867.104,-17.6
3,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,10,AABK,generated,853.077,-17.6
4,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CC(C1)C2C(=...,10,AAAF,generated,839.05,-17.5


In [5]:
# Reference set: every row of the results table whose source is 'hiv'.
is_hiv = final['source'].eq('hiv')
hiv = final[is_hiv]
hiv.head()

Unnamed: 0,id,gen,smile,source,weight,score
548,ABSI,0,CC(C)(C)NC(=O)C1CC2CCCCC2CN1CC(O)C(Cc1ccccc1)N...,hiv,670.855,-10.8
713,ABSH,0,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S...,hiv,756.004,-10.0
832,ABSN,0,COC(=O)NC(C(=O)NC(Cc1ccccc1)C(O)CN(Cc1ccc(-c2c...,hiv,704.869,-9.5
892,ABSK,0,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S...,hiv,547.674,-9.2
1001,ABSL,0,Cc1c(O)cccc1C(=O)NC(CSc1ccccc1)C(O)CN1CC2CCCCC...,hiv,567.796,-8.5


In [6]:
# Stack the selected generated molecules on top of the HIV reference rows.
# `DataFrame.append` is deprecated and was removed in pandas 2.0; `pd.concat`
# is the drop-in replacement (columns are aligned by name, original index
# labels are kept, column order follows the first frame).
final_table = pd.concat([final_joined, hiv])
final_table.head()

Unnamed: 0,smile,gen,id,source,weight,score
0,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,10,AABL,generated,786.926,-17.9
1,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,10,AACA,generated,838.062,-17.7
2,O=C(NC1CC2CCCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C...,10,AABP,generated,867.104,-17.6
3,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,10,AABK,generated,853.077,-17.6
4,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CC(C1)C2C(=...,10,AAAF,generated,839.05,-17.5


In [7]:
# Note: Remdesivir was added to the table by hand, because it entered clinical
# trials while this work was in progress.
# https://en.wikipedia.org/wiki/Remdesivir
# The table is written to `..._final2.csv`; the next cell reads `..._final.csv`.
# NOTE(review): presumably `..._final.csv` is the hand-edited copy - confirm.
final_table.to_csv(r'./generations/master_results_table_final2.csv', index=False)

In [8]:
# Reload the final table from disk, replacing the in-memory frame built above.
# NOTE(review): this is a different file from the `..._final2.csv` written in the
# previous cell, and it is the same path the last cell of this notebook writes to.
# A fresh "Restart & Run All" therefore depends on this file already existing.
final_table = pd.read_csv('./generations/master_results_table_final.csv')
final_table.tail()

Unnamed: 0,smile,gen,id,source,weight,score_best,score_avg,similarity_to_hiv_inhibitors,similarity_to_remdesivir
50,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(...,0,ABSM,hiv,505.637,-7.7,-7.288889,0.558467,0.404068
51,CCCC1(CCc2ccccc2)CC(O)=C(C(CC)c2cccc(NS(=O)(=O...,0,ABSP,hiv,602.675,-7.6,-7.2,0.493003,0.562334
52,O=C1Nc2ccc(Cl)cc2C(C#CC2CC2)(C(F)(F)F)O1,0,ABSQ,hiv,315.678,-6.4,-6.055556,0.433602,0.481928
53,CC(C)c1nc(CN(C)C(=O)NC(C(=O)NC(Cc2ccccc2)CC(O)...,0,ABSO,hiv,720.962,,,0.451159,0.476061
54,Cc1cccc(C)c1OCC(=O)NC(Cc1ccccc1)C(O)CC(Cc1cccc...,0,ABSJ,hiv,628.814,,,0.430199,0.422064


In [9]:
def set_molecule(row):
    """Build an RDKit PropertyMol for one table row and tag it with a traceable title.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'smile' (SMILES string), 'id' and 'gen'.

    Returns
    -------
    Chem.PropertyMol.PropertyMol
        The parsed molecule with a 'Title' property of the form 'id<id>gen<gen>'.

    Raises
    ------
    ValueError
        If RDKit cannot parse the SMILES string.
    """
    title = 'id' + str(row['id']) + 'gen' + str(row['gen'])
    mol = Chem.MolFromSmiles(row['smile'])
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here with
        # the offending row named instead of an opaque error further down.
        raise ValueError('Could not parse SMILES for ' + title + ': ' + repr(row['smile']))
    pm = Chem.PropertyMol.PropertyMol(mol)
    # Enables tracking which molecule is which in the PyRx GUI and PyRx results export
    pm.SetProp('Title', title)
    return pm

In [10]:
# Convert every row of the final table into a titled molecule (one per row, via set_molecule).
mols_for_export = final_table.apply(set_molecule, axis=1)
# Debug aid: uncomment to materialise and inspect the molecules.
# list(mols_for_export)

In [11]:
# Write all molecules to a single SDF file for docking in PyRx.
# The writer is closed explicitly (even if a write fails) so the file is
# flushed and released; the original never closed it.
w = Chem.SDWriter('./generations/genfinal.sdf')
try:
    for m in mols_for_export:
        w.write(m)
finally:
    w.close()

## Rerun everything through PyRx once more to double check, then publish final metrics

In [12]:
# Load the PyRx results export for the SDF written above. The columns used
# below are 'Ligand' and 'Binding Affinity'; a ligand may appear on several rows.
new_scores = pd.read_csv('./generations/results/results_genfinal.csv',sep=',')
new_scores.head()

Unnamed: 0,Ligand,Binding Affinity,rmsd/ub,rmsd/lb
0,6lu7_idAABLgen10,-17.9,0.0,0.0
1,6lu7_idAACAgen10,-17.7,0.0,0.0
2,6lu7_idAABPgen10,-17.6,0.0,0.0
3,6lu7_idAABKgen10,-17.6,0.0,0.0
4,6lu7_idAACAgen10,-17.5,7.073,2.871


In [13]:
# Collapse the PyRx rows to one row per ligand: best (minimum) and mean affinity.
# The sub-column labels are pinned with (label, function) pairs. Passing the
# callables np.min / np.average let pandas derive the labels from the functions'
# __name__, and the 'amin' label that the next cell looks up is not stable across
# NumPy versions. 'min' / 'mean' also use pandas' built-in aggregations.
# Note: 'mean' skips missing affinities, whereas np.average would return NaN.
new_scores = new_scores.groupby("Ligand").agg(
    {'Binding Affinity': [('amin', 'min'), ('average', 'mean')]}
).reset_index()
new_scores.head()

Unnamed: 0_level_0,Ligand,Binding Affinity,Binding Affinity
Unnamed: 0_level_1,Unnamed: 1_level_1,amin,average
0,6lu7_idAAAAgen11,-13.5,-12.666667
1,6lu7_idAAACgen10,-16.3,-14.511111
2,6lu7_idAAAFgen10,-17.5,-15.677778
3,6lu7_idAAAGgen10,-16.2,-14.311111
4,6lu7_idAAAHgen10,-16.3,-14.357143


In [14]:
# Ligand names look like '6lu7_idAABLgen10': drop the prefix before the first
# underscore, then split what is left on 'gen' into the id part and the generation.
ligand_label = new_scores['Ligand'].str.split("_").str[1]
label_parts = ligand_label.str.split("gen")

new_scores['id'] = label_parts.str[0].str.split("id").str[1]
new_scores['gen'] = label_parts.str[1]

# Lift the two aggregated affinity sub-columns to plainly named columns.
affinity = new_scores["Binding Affinity"]
new_scores['score_best'] = affinity["amin"]
new_scores['score_avg'] = affinity["average"]

new_scores = new_scores[['id','gen','score_best','score_avg']]
new_scores.head()

Unnamed: 0,id,gen,score_best,score_avg
,,,,
0.0,AAAA,11.0,-13.5,-12.666667
1.0,AAAC,10.0,-16.3,-14.511111
2.0,AAAF,10.0,-17.5,-15.677778
3.0,AAAG,10.0,-16.2,-14.311111
4.0,AAAH,10.0,-16.3,-14.357143


In [15]:
# The merge keys must have the same dtype on both sides, so cast 'id' and 'gen'
# to strings in both frames before joining.
for frame in (new_scores, final_table):
    for key in ('id', 'gen'):
        frame[key] = frame[key].astype(str)

In [16]:
# new_scores still carries the two-level column labels left by the groupby/agg
# step (e.g. ('score_best', '')). Merging that against the single-level
# final_table is warned about by older pandas and rejected by newer releases,
# so flatten the labels first.
rescored = new_scores.copy()
if isinstance(rescored.columns, pd.MultiIndex):
    rescored.columns = rescored.columns.get_level_values(0)
# Give the re-docked scores distinct names so they cannot collide with any
# 'score_best' / 'score_avg' columns already present in final_table.
rescored = rescored[['id', 'gen', 'score_best', 'score_avg']].rename(
    columns={'score_best': 'score_best_new', 'score_avg': 'score_avg_new'}
)

new_table = pd.merge(final_table, rescored, on=['id','gen'], how='left')
# Overwrite (or create) the score columns with the re-docked values. Rows with
# no match in the PyRx export get NaN.
new_table['score_best'] = new_table['score_best_new']
new_table['score_avg'] = new_table['score_avg_new']
new_table = new_table.drop(['score_best_new', 'score_avg_new'], axis=1)
new_table = new_table.sort_values('score_best', ascending=True)
new_table.tail()



Unnamed: 0,smile,gen,id,source,weight,score_best,score_avg,similarity_to_hiv_inhibitors,similarity_to_remdesivir
50,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(...,0,ABSM,hiv,505.637,-7.7,-7.288889,0.558467,0.404068
51,CCCC1(CCc2ccccc2)CC(O)=C(C(CC)c2cccc(NS(=O)(=O...,0,ABSP,hiv,602.675,-7.6,-7.2,0.493003,0.562334
52,O=C1Nc2ccc(Cl)cc2C(C#CC2CC2)(C(F)(F)F)O1,0,ABSQ,hiv,315.678,-6.4,-6.055556,0.433602,0.481928
53,CC(C)c1nc(CN(C)C(=O)NC(C(=O)NC(Cc2ccccc2)CC(O)...,0,ABSO,hiv,720.962,,,0.451159,0.476061
54,Cc1cccc(C)c1OCC(=O)NC(Cc1ccccc1)C(O)CC(Cc1cccc...,0,ABSJ,hiv,628.814,,,0.430199,0.422064


In [17]:
# Sanity check: list the column labels of the merged table.
new_table.columns

Index(['smile', 'gen', 'id', 'source', 'weight', 'score_best', 'score_avg',
       'similarity_to_hiv_inhibitors', 'similarity_to_remdesivir'],
      dtype='object')

In [18]:
# HIV reference molecules that actually received a re-docked score.
is_scored_hiv = new_table['source'].eq('hiv') & new_table['score_best'].notna()
hiv_smiles = new_table[is_scored_hiv]
hiv_smiles_list = hiv_smiles['smile'].tolist()
hiv_smiles.head(20)

Unnamed: 0,smile,gen,id,source,weight,score_best,score_avg,similarity_to_hiv_inhibitors,similarity_to_remdesivir
42,CC(C)(C)NC(=O)C1CC2CCCCC2CN1CC(O)C(Cc1ccccc1)N...,0,ABSI,hiv,670.855,-10.7,-9.633333,0.513706,0.489396
43,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S...,0,ABSH,hiv,756.004,-9.9,-9.355556,0.588024,0.589407
44,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S...,0,ABSK,hiv,547.674,-9.7,-8.655556,0.592052,0.470364
45,COC(=O)NC(C(=O)NC(Cc1ccccc1)C(O)CN(Cc1ccc(-c2c...,0,ABSN,hiv,704.869,-9.1,-8.366667,0.501215,0.45357
46,CC(C)(C)NC(=O)C1CN(Cc2cccnc2)CCN1CC(O)CC(Cc1cc...,0,ABSR,hiv,613.803,-8.5,-8.133333,0.474714,0.473684
47,Cc1c(O)cccc1C(=O)NC(CSc1ccccc1)C(O)CN1CC2CCCCC...,0,ABSL,hiv,567.796,-8.1,-7.833333,0.472333,0.432787
48,CCOP(=O)(COc1ccc(CC(NC(=O)OC2COC3OCCC23)C(O)CN...,0,ABSS,hiv,728.798,-8.1,-7.733333,0.585561,0.530075
49,COC(=O)NC(C(=O)NCCCCC(CO)N(CC(C)C)S(=O)(=O)c1c...,0,ABST,hiv,624.804,-7.9,-7.677778,0.511851,0.43382
50,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(...,0,ABSM,hiv,505.637,-7.7,-7.288889,0.558467,0.404068
51,CCCC1(CCc2ccccc2)CC(O)=C(C(CC)c2cccc(NS(=O)(=O...,0,ABSP,hiv,602.675,-7.6,-7.2,0.493003,0.562334


In [19]:
# RDKit fingerprints of the scored HIV reference molecules, in list order.
hiv_fingerprints = [
    Chem.RDKFingerprint(Chem.MolFromSmiles(hiv_smile))
    for hiv_smile in hiv_smiles_list
]

def calc_hiv_similarity_score(row):
    """Return the mean Tanimoto similarity between a row's molecule and the HIV set.

    The molecule is parsed from row['smile'], fingerprinted with RDKFingerprint
    and compared against every entry of the module-level `hiv_fingerprints` list.
    The result is the arithmetic mean of those similarities.
    """
    query_fp = Chem.RDKFingerprint(Chem.MolFromSmiles(row['smile']))
    per_inhibitor = DataStructs.BulkTanimotoSimilarity(query_fp, hiv_fingerprints)
    return np.mean(per_inhibitor)

# Reference fingerprint for Remdesivir, built from its SMILES string.
remdesivir_mol = Chem.MolFromSmiles('CCC(CC)COC(=O)[C@H](C)NP(=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1')
remdesivir_fingerprint = Chem.RDKFingerprint(remdesivir_mol)


def _similarity_to_remdesivir(smiles):
    """Tanimoto similarity between one SMILES string and the Remdesivir fingerprint."""
    candidate_fp = Chem.RDKFingerprint(Chem.MolFromSmiles(smiles))
    return DataStructs.TanimotoSimilarity(candidate_fp, remdesivir_fingerprint)


# Add both similarity columns to the table (row-wise for the HIV mean,
# element-wise on the SMILES column for Remdesivir).
new_table['similarity_to_hiv_inhibitors'] = new_table.apply(calc_hiv_similarity_score, axis=1)
new_table['similarity_to_remdesivir'] = new_table['smile'].apply(_similarity_to_remdesivir)
new_table.head(50)

Unnamed: 0,smile,gen,id,source,weight,score_best,score_avg,similarity_to_hiv_inhibitors,similarity_to_remdesivir
0,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,10,AABL,generated,786.926,-17.9,-16.075,0.375563,0.300506
1,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,10,AACA,generated,838.062,-17.7,-15.911111,0.411519,0.350693
2,O=C(NC1CC2CCCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C...,10,AABP,generated,867.104,-17.6,-15.555556,0.422139,0.380663
3,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,10,AABK,generated,853.077,-17.6,-15.7125,0.423334,0.383214
4,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CC(C1)C2C(=...,10,AAAF,generated,839.05,-17.5,-15.677778,0.42618,0.386439
5,O=C(NC1CC2CCCC(C1)N2CCc1ccccc1)C1CC2CCC(C1)C2C...,10,AAAJ,generated,866.116,-17.5,-15.711111,0.40625,0.343143
6,O=C(NC1CC2CCCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(...,10,AAAN,generated,852.089,-17.4,-15.722222,0.408519,0.346644
7,O=C(NC1CC2CCCC(C1)N2CCc1ccccc1)C1COC2CC(C1)C2C...,10,AABF,generated,868.088,-17.3,-15.388889,0.443731,0.425683
8,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CCC(C1)C2C(...,10,AAAU,generated,852.089,-17.2,-15.611111,0.408807,0.346837
9,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,10,AACC,generated,772.899,-17.2,-15.7,0.377883,0.304666


In [20]:
# Persist the final metrics table (index omitted).
# NOTE(review): this overwrites the same file that the earlier reload cell reads
# as its input, so each run feeds on the previous run's output - confirm this
# round-trip is intended.
new_table.to_csv(r'./generations/master_results_table_final.csv', index=False)