In [1]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, QED
from rdkit.Chem import Draw
from rdkit.Chem import rdMolDescriptors
from rdkit import RDLogger
from envs.sascorer import calculateScore
from docking.docking_modif import dock_score
from envs.environment import qed, penalized_logp
import argparse

In [21]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import json
from tqdm import tqdm
import re
import torch
from datetime import datetime
import matplotlib.pyplot as plt

In [3]:
def _reference_fingerprint():
    """Return the radius-2 Morgan fingerprint of the fixed reference molecule.

    The fingerprint is computed on first use and stored on the function
    object, so repeated `tanimoto` calls do not re-parse the reference SMILES.
    """
    cached = getattr(_reference_fingerprint, "_fp", None)
    if cached is None:
        mol_dmnp = Chem.MolFromSmiles('CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O')
        cached = AllChem.GetMorganFingerprint(mol_dmnp, 2)
        _reference_fingerprint._fp = cached
    return cached


def tanimoto(sml):
    """Tanimoto similarity between `sml` and the fixed reference molecule.

    Both molecules are compared through their radius-2 Morgan fingerprints.

    Args:
        sml: SMILES string of the molecule to compare.

    Returns:
        The value of `DataStructs.TanimotoSimilarity` for the two fingerprints.

    Raises:
        ValueError: if RDKit cannot parse `sml`.
    """
    mol_sml = Chem.MolFromSmiles(sml)
    if mol_sml is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with a readable message instead of inside the fingerprint call.
        raise ValueError('Invalid SMILES: {!r}'.format(sml))
    fp_sml = AllChem.GetMorganFingerprint(mol_sml, 2)

    return DataStructs.TanimotoSimilarity(_reference_fingerprint(), fp_sml)

In [4]:
# Candidate molecules, as SMILES strings, to be scored.
# The list is not de-duplicated: some strings appear more than once, so a loop
# over `grand` processes each repeat as a separate entry.
grand = [
    'CCC(=O)CC(C)CC1CCC(C)c2ccc(C)cc21',
    'Cc1ccc2c(c1)C(CCCC(=O)O)CCC2C',
    'C=CCCC(C)Cc1cc(CC)ccc1C(C)CC',
    'C=CCCC(C)C1CCC(C)c2ccc(CC)cc21',
    'CCC(C)C(CC)c1ccc(C)cc1',
    'Cc1ccc2c(c1)C(C(C)CCC1=CS1)CCC2C',
    'Cc1ccc2c(c1)CCCC2C',
    'CC=C(O)CCC(C)CCC(CC)c1cccc(C)c1',
    'C=C(C)CC=CCCC(C)(C)CC(C)CCCC',
    'Cc1ccccc1C(C)C1CC1',
    'CC(=O)CCC(C)C1CCC(C)c2ccc(C)cc21',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)CCC(C)C1CCC(C)c2ccc(C)cc21',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)C1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)C1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O',
    'C=C(C)CCC(C)C1CCC(C)c2ccc(C)cc21',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=C)CCC(=O)CCC(=O)O',
    'Cc1ccc2c(c1)C(C(C)CCCC(=O)O)CCC2C',
    'Cc1ccc2c(c1)C(C(C)CCC(N)=O)CCC2CC(C)N1CCC(C)c2ccc(C)cc21',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(C)C(C)CCC(=O)O',
    'CCC(=O)CC(C)CC(C)CC=CCCC(O)CCC(C)CCCc1ccc2c(c1)C(C(C)CCC1=CS1)CCC2C',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCCC(C)CCC(=O)O',
    'COC(CC(=O)CCC(C)C1OCC(C)c2ccc(C)cc21)c1cccc(C)c1',
    'Cc1ccc2c(c1)C(C(C)C)CCC2C',
    'C=CCCC(C)C1CCC(C)C2:ccc(OC)cc:21',
    'Cc1ccc2c(c1)C(C(C)CCC1=CS1)CCC2C',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)CCC(=O)O',
    'Cc1ccc2c(c1)C(C(C)CCC(=O)CCc1ccc3c(c1)CCC3C(C)C1CCC(C)c3ncc(C)cc31)CCC2C',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O',
    'Cc1ccc2c(c1)C(CC1CCC(C(C)CCC(=O)O)c3cc(C)ccc31)CCC2C',
    'CC1CCC(C2=C1C=CC(=C2)C)C1CCC(C)c2ccc(C)cc21',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)C1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O',
    'CC1CCC(CCCC(=O)O)c2c:N(C):cccccc(C)cc21',
    'CC1CCCC(N)=Cc2ccc(O):C:c2C1C(C)CCC(=O)CO',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCCCC(=O)O',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CC=CCCC(O)CCC(C)CCCC1:ccc2c(c:1)C(C(C)CCC1=CS1)CCC2C',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)C1:ccc2c(c:1)C(C(C)C(C)CNC(=O)O)CCC2C',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)CCC(C)C1CCC(C)c2ccc(C)cc21',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(C)CCC(=O)O',
    'Cc1ccc2c(c1)C(C(C)CCCC(=O)O)CCC2C',
    'Cc1ccc2c(c1)C(C(C)CN)CCC2CN(C)C1CCC(C2=C1C=CC(=C2)C)C1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)C1CCC(C)c2ccc(C)cc21',
    'CC1CCC(C2=C1C=CC(=C2)C)C1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O',
    'Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C(C)C1CCC(C)c2ccc(C)cc21',
    'Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C1CCC(C)c2ccc(C)cc21',
    'CC1CCC(C2=C1C=CC(=C2)C)C1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCCC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O',
    'Cc1cc:C2:c(c1)C(C(C)CCCC1CCC(C(C)CCC(=O)O)c3cc(C)ccc31)CCC2C',
    'Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2CC(C)C1CCC(C)c2ccc(C)cc21',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)C1CCC(C)c2ccc(C)cc21',
    'C=C(CSc1cnn:N:1c1cc(NC(C)=C2CCC(C)c3ccc(C)cc32)ccn1)c1ccc(F)c(F)c1',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCc1ccc2c(c1)C(C(C)CNC(=O)O)CCC2C',
    'C=C(O)OCC(C)C1CCC(C)c2ccc(CC(C)C3CCC(C)c4ccc(C)cc43)cc21',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)Cc1ccc2c(:C:1)C(C(C)CCC(=O)O)CCC2C',
    'Cc1ccc2c(c1)C(C(C)OCC(=O)O)CCC2C(C)C1CCC(C)c2ccc(C)cc21',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCCc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C',
    'CC1CCC(C2=C1C=CC(=C2)C)C(C)C1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O'
]

# One row per candidate:
# (smiles, penalized_logp(mol), calculateScore(mol), dock_score(smiles, 8), tanimoto(smiles)).
scores = []

for sml in tqdm(grand):
    try:
        mol = Chem.MolFromSmiles(sml)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None, not by raising.
            raise ValueError('RDKit could not parse SMILES')
        # NOTE(review): the meaning of the literal 8 passed to dock_score is not
        # visible from this notebook; confirm against docking.docking_modif.
        scores.append((sml, penalized_logp(mol), calculateScore(mol), dock_score(sml, 8), tanimoto(sml)))
    except Exception as err:
        # Best effort: keep the long run going, but report the failure so the
        # all-zero placeholder row is not mistaken for a genuine result.
        tqdm.write('Scoring failed for {}: {!r}'.format(sml, err))
        scores.append((sml, 0, 0, 0, 0))


100%|██████████| 58/58 [26:15<00:00, 27.16s/it]


Docking MolHF

1. Cc1ccc2c(c1)C(C(C)CCC(=O)CCc1ccc3c(c1)CCC3C(C)C1CCC(C)c3ncc(C)cc31)CCC2C
2. Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C(C)C1CCC(C)c2ccc(C)cc21
3. CC1CCC(C2=C1C=CC(=C2)C)C1CCC(C)c2ccc(C)cc21

plogP GA

1. CC1CCC(C2=C1C=CC(=C2)C)C(C)CCCC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O
2. Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2CC(C)C1CCC(C)c2ccc(C)cc21
3. CC1CCC(C2=C1C=CC(=C2)C)C(C)C1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O

Tanimoto GA

1. CC1CCC(C2=C1C=CC(=C2)C)C(C)CCCCC(=O)O
2. CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(C)CCC(=O)O
3. CC1CCC(C2=C1C=CC(=C2)C)C1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O

Docking GA

1. CC1CCC(C2=C1C=CC(=C2)C)C(C)CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O
2. Cc1ccc2c(c1)C(C(C)OCC(=O)O)CCC2C(C)C1CCC(C)c2ccc(C)cc21
3. CC1CCC(C2=C1C=CC(=C2)C)C(C)CCc1ccc2c(c1)C(C(C)CNC(=O)O)CCC2C

In [36]:
# Rank candidates by docking score in ascending order (index 3 of each row),
# then print rank, penalized logP (index 1), SMILES, docking and Tanimoto values.
# Placeholder rows written with zeros by the scoring loop are ranked as 0.
scores.sort(key=lambda x: x[3])
for rank, fnd in enumerate(scores, start=1):
    print('Top {}: {:.2f} - {}, docking {:.2f}, tanimoto {:.2f}'.format(rank, fnd[1], fnd[0], fnd[3], fnd[4]))

Top 1: 3.75 - Cc1ccc2c(c1)C(C(C)CCC(=O)CCc1ccc3c(c1)CCC3C(C)C1CCC(C)c3ncc(C)cc31)CCC2C, docking -10.50, tanimoto 0.37
Top 2: 2.58 - Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C(C)C1CCC(C)c2ccc(C)cc21, docking -9.90, tanimoto 0.58
Top 3: 2.66 - CC1CCC(C2=C1C=CC(=C2)C)C1CCC(C)c2ccc(C)cc21, docking -9.90, tanimoto 0.35
Top 4: 2.67 - Cc1ccc2c(c1)C(CC1CCC(C(C)CCC(=O)O)c3cc(C)ccc31)CCC2C, docking -9.80, tanimoto 0.60
Top 5: 2.88 - CC1CCC(C2=C1C=CC(=C2)C)C(C)CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O, docking -9.80, tanimoto 0.57
Top 6: 2.72 - CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)C1CCC(C)c2ccc(C)cc21, docking -9.80, tanimoto 0.53
Top 7: 2.63 - CC1CCC(C2=C1C=CC(=C2)C)C1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O, docking -9.70, tanimoto 0.62
Top 8: 2.63 - Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C1CCC(C)c2ccc(C)cc21, docking -9.60, tanimoto 0.62
Top 9: 3.47 - CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)CCC(C)C1CCC(C)c2ccc(C)cc21, docking -9.60, tanimoto 0.47
Top 10: 2.63 - CC1CCC(C2=C1C=CC(=C2)C)C1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O, docking -9.40,

In [13]:
# Read one SMILES string per line from the ZINC250k file; the context manager
# guarantees the handle is closed even if reading fails.
file_path = './dataset/zinc250k/zinc250k.smi'
with open(file_path, 'r') as fp:
    smiles_list = [smiles.strip() for smiles in fp]

# Apply `tanimoto` element-wise. Declaring the output type up front stops
# np.vectorize from making an extra probing call on the first element and
# lets it handle an empty input list.
tanimoto_vec = np.vectorize(tanimoto, otypes=[float])
tan = tanimoto_vec(smiles_list)

In [31]:
# Largest value in `tan`, the array of Tanimoto similarities computed for `smiles_list`.
tan.max()

0.3870967741935484