In [1]:
import os
import sys
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"

sys.path.insert(0,'..')
import argparse
import json
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm import tqdm
from rdkit import Chem
from copy import deepcopy
from dataloader import PretrainDataset
from models.MolHF import MolHF
from torch.utils.data import DataLoader
from multiprocessing import Pool
from distutils.util import strtobool
from time import time, ctime
import optimize_property as op
from envs import environment as env
from envs.timereport import TimeReport
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from utils import set_random_seed
from envs.environment import qed, penalized_logp
from envs.sascorer import calculateScore

import warnings
warnings.filterwarnings("ignore")

In [2]:
def arg_parse():
    """Load and return the stored run configuration.

    The object is read with ``torch.load`` from ``MolHF_conf.json`` in the
    current working directory (so the file is a torch-serialised object, not
    plain JSON text). The rest of the notebook accesses it via attributes
    such as ``args.device`` and ``args.dataset``.
    """
    config = torch.load('MolHF_conf.json')
    return config

class PropNet(nn.Module):
    """Fully connected regression head mapping a latent vector to one scalar.

    The network is ``Linear -> activation`` for every hidden layer followed by
    a final ``Linear`` with a single output and no activation.

    Args:
        input_size: width of the input latent vector.
        hidden_size: sequence of hidden layer widths; defaults to ``[128, 32]``.
        activ: sequence of activation modules, one per hidden layer (extra
            entries are ignored); defaults to one ``nn.Tanh()`` per hidden layer.
    """

    def __init__(self, input_size=512, hidden_size=None, activ=None):
        super(PropNet, self).__init__()

        # Defaults are built per instance: list defaults in the signature would
        # be shared by every PropNet, and `hidden_size` is stored on `self`, so
        # mutating it on one instance would silently change later instances.
        if hidden_size is None:
            hidden_size = [128, 32]
        if activ is None:
            activ = [nn.Tanh() for _ in hidden_size]

        self.latent_size = input_size
        self.hidden_size = hidden_size

        # Layer widths from input to the single scalar output.
        vh = (self.latent_size,) + tuple(hidden_size) + (1,)
        modules = []
        for i in range(len(vh)-1):
            modules.append(nn.Linear(vh[i], vh[i+1]))
            # No activation after the final (output) layer.
            if i < len(vh) - 2:
                modules.append(activ[i])
        self.net = nn.Sequential(*modules)

    def forward(self, h):
        """Return the network output for latent input ``h``."""
        output = self.net(h)
        return output


class OptimModel(nn.Module):
    """A MolHF generative model combined with three property regression heads.

    The heads (``ds_model``, ``sa_model``, ``td_model``) are ``PropNet``
    instances that all read the same flattened latent vector, whose size is
    the sum of the generator's node and edge latent lengths.
    """

    def __init__(self, gen_model:MolHF, hidden_size, activ):
        super().__init__()

        self.model = gen_model

        node_len = gen_model.latent_node_length
        edge_len = gen_model.latent_edge_length
        self.latent_node_length = node_len
        self.latent_edge_length = edge_len
        self.latent_size = node_len + edge_len

        self.hidden_size = hidden_size

        # Registration order (ds, sa, td) is kept so state_dict keys line up.
        self.ds_model = PropNet(self.latent_size, hidden_size, activ)
        self.sa_model = PropNet(self.latent_size, hidden_size, activ)
        self.td_model = PropNet(self.latent_size, hidden_size, activ)

    def encode(self, x, adj):
        """Run the generator forward and return only its latent output."""
        latents, _, _ = self.model(x, adj)  # latents = [h, adj_h]
        return latents

    def forward(self, x, adj):
        """Return the (DS, SA, TD) head outputs for a batch of molecules."""
        flat = self.model.to_latent_format(self.encode(x, adj))
        return self.ds_model(flat), self.sa_model(flat), self.td_model(flat)

    def reverse(self, z):
        """Decode flattened latent vectors ``z`` back to ``(x, adj)``."""
        mol_format = self.model.to_molecule_format(z)
        nodes, edges = self.model.reverse(mol_format, true_adj=None)
        return nodes, edges
    

def train_model(opt_model, optimizer, train_loader, metrics, tr, epoch, lrn_set=['DS']):
    '''
    Run one training epoch, taking an optimizer step after every batch.

    Args:
        opt_model: model whose forward returns the (DS, SA, TD) predictions and
            whose ``.model`` attribute is the generative flow.
        optimizer: optimizer stepped once per batch.
        train_loader: iterable of batches with 'node', 'adj' and 'property' entries.
        metrics: loss callable applied to (prediction, target); also used for
            the end-of-epoch MSE.
        tr: time reporter; ``update()`` is called once per batch.
        epoch: zero-based epoch index, used only in log lines.
        lrn_set: names of the loss terms to include ('Gen', 'DS', 'SA', 'TD').
            The default list is never mutated here.

    Reads the module-level ``args`` and the ``GEN_RATIO`` / ``DS_RATIO`` /
    ``SA_RATIO`` / ``TD_RATIO`` loss weights. Appends the epoch R^2 of the DS
    head to ``r2_tr_{lrn_set}.txt``. Returns nothing.
    '''
    log_step = 20
    train_iter_per_epoch = len(train_loader)
    global GEN_RATIO, DS_RATIO, SA_RATIO, TD_RATIO, args

    def _zero_loss():
        # Placeholder for a disabled loss term so logging and the weighted sum still work.
        return torch.tensor([0], requires_grad=False).to(args.device)

    print("Training...")
    opt_model.train()

    # DS predictions/targets of the whole epoch, stored detached so the
    # per-batch autograd graphs can be freed after each step.
    total_pd_y = []
    total_true_y = []

    for i, batch in enumerate(train_loader):

        x = batch['node'].to(args.device)   # node features, batch first
        adj = batch['adj'].to(args.device)   # adjacency tensors, batch first
        # Property columns: 0 -> DS target, 1 -> SA target, 2 -> TD target.
        true_y = batch['property'][:,0].float().unsqueeze(1).to(args.device)
        true_sa_y = batch['property'][:,1].float().unsqueeze(1).to(args.device)
        true_td_y = batch['property'][:,2].float().unsqueeze(1).to(args.device)

        # model and loss
        optimizer.zero_grad()
        y, sa_y, td_y = opt_model(x, adj)

        total_pd_y.append(y.detach())
        total_true_y.append(true_y)

        if 'Gen' in lrn_set:
            # Second pass through the generator to obtain the log-determinant
            # needed by the likelihood loss.
            out_z, out_logdet, _ = opt_model.model(x, adj)
            loss_node, loss_edge = opt_model.model.log_prob(out_z, out_logdet)
            loss_gen = loss_node + loss_edge
        else:
            loss_gen = _zero_loss()

        loss_ds = metrics(y, true_y) if 'DS' in lrn_set else _zero_loss()
        loss_sa = metrics(sa_y, true_sa_y) if 'SA' in lrn_set else _zero_loss()
        loss_td = metrics(td_y, true_td_y) if 'TD' in lrn_set else _zero_loss()

        loss = loss_gen * GEN_RATIO + loss_ds * DS_RATIO + loss_sa * SA_RATIO + loss_td * TD_RATIO

        loss.backward()
        optimizer.step()
        tr.update()

        # Print log info
        if (i + 1) % log_step == 0:  # i % args.log_step == 0:
            print('Epoch [{}/{}], Iter [{}/{}], loss: {:.5f}, loss_gen: {:.5f}, loss_prop: {:.5f}, loss_sa: {:.5f}, loss_td: {:.5f}, {:.2f} sec/iter, {:.2f} iters/sec: '.
                    format(epoch + 1, args.max_epochs, i + 1, train_iter_per_epoch,
                            loss.item(), loss_gen.item(), loss_ds.item(), loss_sa.item(), loss_td.item(),
                            tr.get_avg_time_per_iter(), tr.get_avg_iter_per_sec()))

            # Stack along the sample axis: each batch is (bs, 1), so dim=0 gives
            # one column of samples. Concatenating on the last axis would make
            # r2_score treat every batch as a separate output and average them.
            t_pd_y = torch.cat(total_pd_y, dim=0)
            t_true_y = torch.cat(total_true_y, dim=0)
            print('Current R^2 score: ', r2_score(t_true_y.cpu().numpy(), t_pd_y.cpu().numpy()))

            tr.print_summary()

    total_pd_y = torch.cat(total_pd_y, dim=0)
    total_true_y = torch.cat(total_true_y, dim=0)

    mse = metrics(total_pd_y, total_true_y)
    mae = mean_absolute_error(total_true_y.cpu().numpy(), total_pd_y.cpu().numpy())
    r2 = r2_score(total_true_y.cpu().numpy(), total_pd_y.cpu().numpy())

    with open(f'r2_tr_{lrn_set}.txt', 'a') as f:
        f.write(f'{r2},')

    # loss_gen / loss_ds are the values from the last batch of the epoch.
    print("Training, loss_mle:{}, loss_prop:{}, mse:{}, mae:{}, r2:{}".format(loss_gen.item(), loss_ds.item(), mse, mae, r2))


def validate_model(model, valid_loader, metrics, col, tr, epoch):
    """Evaluate one property head on the validation set and return its R^2.

    Args:
        model: model whose forward returns a tuple of per-property predictions,
            ordered like the columns of ``batch['property']``.
        valid_loader: iterable of batches with 'node', 'adj' and 'property' entries.
        metrics: loss callable applied to (prediction, target).
        col: index of the property column to validate; the prediction with the
            same index is compared against it.
        tr: time reporter; ``update()`` is called once per batch.
        epoch: zero-based epoch index, used only in log lines.

    Reads the module-level ``args``. Appends the R^2 to ``r2_val_mlp.txt``.

    Returns:
        The R^2 score over all validation samples.
    """
    log_step = 20
    valid_iter_per_epoch = len(valid_loader)

    print("Validating...")
    model.eval()
    total_pd_y = []
    total_true_y = []
    with torch.no_grad():
        for i, batch in enumerate(valid_loader):

            x = batch['node'].to(args.device)   # node features, batch first
            adj = batch['adj'].to(args.device)   # adjacency tensors, batch first
            true_y = batch['property'][:, col].unsqueeze(1).float().to(args.device)
            # model and loss
            # Pick the head that matches the requested target column; always
            # taking the first output would score the DS head against the
            # SA/TD targets whenever col != 0.
            y = model(x, adj)[col]
            total_pd_y.append(y)
            total_true_y.append(true_y)
            loss_prop = metrics(y, true_y)
            tr.update()
            # Print log info
            if (i + 1) % log_step == 0:  # i % args.log_step == 0:
                print('Epoch [{}/{}], Iter [{}/{}], loss_prop: {:.5f}, {:.2f} sec/iter, {:.2f} iters/sec: '.
                        format(epoch + 1, args.max_epochs, i + 1, valid_iter_per_epoch,
                                loss_prop.item(),
                                tr.get_avg_time_per_iter(), tr.get_avg_iter_per_sec()))
                tr.print_summary()
        # Stack along the sample axis so the metrics see one output column with
        # all samples, not one output per batch.
        total_pd_y = torch.cat(total_pd_y, dim=0)
        total_true_y = torch.cat(total_true_y, dim=0)
        mse = metrics(total_pd_y, total_true_y)
        mae = mean_absolute_error(total_true_y.cpu().detach().numpy(), total_pd_y.cpu().detach().numpy())
        r2 = r2_score(total_true_y.cpu().detach().numpy(), total_pd_y.cpu().detach().numpy())

        with open('r2_val_mlp.txt', 'a') as f:
            f.write(f'{r2},')

        # loss_prop is the value from the last validation batch.
        print("Validating, loss_prop:{}, mse:{}, mae:{}, r2:{}".format(loss_prop.item(), mse, mae, r2))

    return r2


def fit_model(opt_model, train_loader, valid_loader, args, property_model_path, lrn_set=['DS']):
    """Train ``opt_model`` for ``args.max_epochs`` epochs and checkpoint the best one.

    Sub-modules whose name ('Gen', 'DS', 'SA', 'TD') is not in ``lrn_set`` get
    ``requires_grad`` switched off before training. After every epoch the model
    is validated on property column 0; whenever the returned R^2 improves, the
    state dict is saved to ``property_model_path``.

    Returns:
        ``opt_model`` as it is after the final epoch.
    """
    start = time()
    print("Start at Time: {}".format(ctime()))
    print('Moduls for learning: ', lrn_set)

    # Loss and optimizer
    metrics = nn.MSELoss()
    best_metrics = float('-inf')
    optimizer = torch.optim.Adam(opt_model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    iters_per_epoch = len(train_loader) + len(valid_loader)
    tr = TimeReport(total_iter = args.max_epochs * iters_per_epoch)

    moduls_dict = {'Gen': opt_model.model, 'DS': opt_model.ds_model, 'SA': opt_model.sa_model, 'TD': opt_model.td_model}

    # Freeze every sub-module that was not selected for learning.
    for name, module in moduls_dict.items():
        if name in lrn_set:
            continue
        for weight in module.parameters():
            weight.requires_grad_(False)

    for epoch in range(args.max_epochs):
        print("In epoch {}, Time: {}".format(epoch + 1, ctime()))
        # Disabled: op.generate_molecule(model, train_loader, args, epoch)
        # (check of the current generation quality, printing validity etc.)

        train_model(opt_model, optimizer, train_loader, metrics, tr, epoch, lrn_set)
        cur_metrics = validate_model(opt_model, valid_loader, metrics, 0, tr, epoch)

        if cur_metrics > best_metrics:
            best_metrics = cur_metrics
            print("Epoch {}, saving {} regression model to: {}".format(epoch+1, args.hidden, property_model_path))
            torch.save(opt_model.state_dict(), property_model_path)

    tr.print_summary()
    tr.end()

    elapsed = time() - start
    print("The model's training is done. Start at {}, End at {}, Total {:.2f}".
          format(ctime(start), ctime(), elapsed))
    return opt_model


def load_property_csv(filename, normilize=True):
    """Read a property CSV and return one tuple of property values per row.

    The file is expected to have the columns ``smiles, DS, SA, TD`` in that
    order; the first column is dropped from the returned tuples.

    Args:
        filename: path of the CSV file.
        normilize: when true, ``DS`` is clipped to [-10, 5] and ``DS``, ``SA``
            and ``TD`` are min-max scaled to [0, 1]. (The parameter name is
            kept as spelled for existing callers.)

    Returns:
        A list with one tuple per row holding every column after the first.
    """

    df = pd.read_csv(filename)  # smiles, DS, SA, TD

    def _min_max(column):
        # Scale to [0, 1]. A constant column has zero range; dividing by it
        # would yield NaN targets, so map it to zeros instead.
        lowest = column.min()
        span = column.max() - lowest
        if span == 0:
            return column * 0.0
        return (column - lowest) / span

    if normilize:
        df['DS'] = _min_max(df['DS'].clip(-10, 5))

        # df['SA'] = df['SA'].clip(-12, -5)
        df['SA'] = _min_max(df['SA'])

        df['TD'] = _min_max(df['TD'])

    tuples = [tuple(x[1:]) for x in df.values]

    print('Load {} done, length: {}'.format(filename, len(tuples)))
    return tuples


def optimize_mol(optimize_model:OptimModel, smiles, data_config, args, random=False):
    """Walk one molecule through latent space and collect the decoded candidates.

    The molecule is encoded, then for ``args.num_iter`` steps its flattened
    latent vector is moved either along the normalised gradient of the DS head
    (``random=False``) or to a random offset from the start vector
    (``random=True``). The step size starts at ``args.opt_lr`` and is
    multiplied by ``args.lr_decay`` after every step. All visited vectors are
    decoded and checked for validity.

    Reads the module-level ``num2atom`` and ``atom_valency`` tables.

    Returns:
        A list of four lists, for similarity thresholds 0, 0.2, 0.4 and 0.6.
        Each entry is ``(new_smiles, penalized_logp, sa_score, similarity,
        start_smiles)``; every list is sorted by ``penalized_logp`` descending.
    """
    lr = args.opt_lr
    num_iter = args.num_iter

    optimize_model.eval()

    with torch.no_grad():
        atoms, bond = op.smiles_to_adj(smiles, args.dataset)
        print(atoms.shape, bond.shape)
        atoms, bond = op.get_mol_data(atoms, bond, data_config)
        print(atoms.shape, bond.shape)
        atoms, bond = torch.from_numpy(atoms).unsqueeze(0), torch.from_numpy(bond).unsqueeze(0)
        print(atoms.shape, bond.shape)
        atoms, bond = atoms.to(args.device), bond.to(args.device)
        mol_z = optimize_model.encode(atoms, bond)

    mol = op.Chem.MolFromSmiles(smiles)
    fp1 = op.AllChem.GetMorganFingerprint(mol, 2)
    mol_x, mol_adj = mol_z

    # One molecule, split into its per-layer latent pieces.
    cur_xs = [x.clone().detach().requires_grad_(True).to(args.device) for x in mol_x]
    cur_adjs = [adj.clone().detach().requires_grad_(True).to(args.device) for adj in mol_adj]
    cur_vec = optimize_model.model.to_latent_format([cur_xs, cur_adjs])

    start_xs = [x.clone().detach().requires_grad_(True).to(args.device) for x in mol_x]
    start_adjs = [adj.clone().detach().requires_grad_(True).to(args.device) for adj in mol_adj]
    start_vec = optimize_model.model.to_latent_format([start_xs, start_adjs])

    # Lower bound for the gradient norm so a zero gradient gives a zero step, not NaN.
    eps = 1e-12

    visited = []
    # Iterate over the optimisation steps.
    for _ in range(num_iter):

        if random:
            # TODO (from the original author): replace the unit variance with the model's one.
            rad = torch.randn_like(cur_vec.data)
            # NOTE(review): sqrt(rad * rad) is the element-wise absolute value,
            # so this moves every coordinate by +/- lr rather than along a
            # unit-norm direction — confirm this is intended.
            cur_vec = start_vec.data + lr * rad / torch.sqrt(rad * rad)
        else:
            total_val = optimize_model.ds_model(cur_vec).squeeze()
            grad = torch.autograd.grad(total_val, cur_vec)[0]
            # keepdim keeps the norm broadcastable per row of cur_vec.
            grad_norm = torch.norm(grad.data, dim=-1, keepdim=True).clamp_min(eps)
            cur_vec = cur_vec.data + lr * grad.data / grad_norm

        lr = lr*args.lr_decay
        cur_vec = cur_vec.clone().detach().requires_grad_(True).to(args.device)
        visited.append(cur_vec)

    # Decoding is inference only: stack detached copies of all visited vectors
    # and run the reverse pass without autograd tracking.
    with torch.no_grad():
        hidden_z = torch.cat([vec.detach() for vec in visited], dim=0).to(args.device)
        x, adj = optimize_model.reverse(hidden_z)

    val_res = op.check_validity(x, adj, num2atom, atom_valency, debug=True)
    print('Check validity done')
    valid_mols = val_res['valid_mols']
    valid_smiles = val_res['valid_smiles']

    thresholds = (0, 0.2, 0.4, 0.6)
    results = [[] for _ in thresholds]
    # Seeded with the start molecule so it is never reported as a candidate.
    sm_set = set()
    sm_set.add(smiles)
    min_ds = 0
    for m, s in zip(valid_mols, valid_smiles):
        if s in sm_set or s == "":
            continue
        sm_set.add(s)
        plogp = penalized_logp(m)
        sa = calculateScore(m)
        # Original author's note: this may be where the MorganFingerprint warning arises.
        fp2 = op.AllChem.GetMorganFingerprint(m, 2)
        # Original author's note: plug a custom distance in here.
        sim = op.DataStructs.TanimotoSimilarity(fp1, fp2)
        print(s, plogp, sa, sim)
        record = (s, plogp, sa, sim, smiles)
        for bucket, threshold in zip(results, thresholds):
            if sim >= threshold:
                bucket.append(record)
    # smile, property, similarity, mol
    print(min_ds)
    for bucket in results:
        bucket.sort(key=lambda tup: tup[1], reverse=True)
    return results


def optimize_selected_mol(optimize_model, start_smiles, data_config, args, optim_mode='grad'):
    """Optimise one starting molecule and print a summary per similarity bucket.

    Args:
        optimize_model: model passed through to ``optimize_mol``.
        start_smiles: sequence whose first three items are
            ``(smiles, property_value, sa_value)``; further items are ignored.
        data_config: dataset configuration passed through to ``optimize_mol``.
        args: run configuration passed through to ``optimize_mol``.
        optim_mode: currently unused; kept for interface compatibility.

    For each of the four buckets returned by ``optimize_mol`` the best
    candidate is kept if its property value exceeds the starting value;
    otherwise a failure is counted. Results are printed; nothing is returned.
    """
    print('Optiimization {} for better {} score'.format(start_smiles[0], 'docking'))

    result_list = [[],[],[],[]]
    nfail = [0, 0, 0, 0]

    # Only the first three fields are used, so both 3-field and 4-field
    # start tuples are accepted.
    smile, qed1, sa1 = start_smiles[:3]
    results = optimize_mol(optimize_model, smile,  data_config, args, random=False)

    # Post-process the generated candidates.
    for t in range(len(results)):
        if len(results[t]) > 0:
            # Buckets are sorted best-first, so index 0 is the top candidate.
            smile2, qed2, sa2, sim, _ = results[t][0]
            ds_delta = qed2 - qed1
            if ds_delta > 0: #and sa2 < sa1:
                print('The candidate', smile2, qed1, qed2, sa1, sa2, sim)
                result_list[t].append((smile2, qed1, qed2, sa1, sa2, sim))
            else:
                nfail[t] += 1
                print('Failure: for dist {} the best values less than initial'.format(t))
        else:
            nfail[t] += 1
            # The format string previously had no placeholder, so the bucket
            # index was never shown.
            print('Failure: there is no moleculars with dist {}'.format(t))

    for i in range(len(result_list)):
        df = pd.DataFrame(result_list[i],
                        columns=['smile_new', 'qed_old', 'qed_new', 'sa_old', 'sa_new', 'sim'])

        print(df.describe())
        print("For sim > {}:".format(0.2*i))
        # print('nfail:{} in total:{}'.format(nfail[i], args.topk))
        # print('success rate: {}'.format((args.topk-nfail[i])*1.0/args.topk))

In [7]:
# Scratch check: 0.99 raised to the 100th power, i.e. the factor left after 100
# multiplications by 0.99 (presumably the step-size decay over 100 steps — confirm).
.99**100

0.3660323412732292

In [12]:
start = time()
print("Start at Time: {}".format(ctime()))
args = arg_parse()

# Overrides on top of the stored configuration. Because property_model_path is
# set here, the optimisation branch at the bottom of this cell is the one that runs.
args.property_model_path = 'plogp_32_10_[\'Gen\', \'DS\']_idx.pth'
args.opt_lr = 0.9
args.lr_decay = 0.99
# set_random_seed(args.seed)
# configuration
# Class index -> atomic number, and atomic number -> valence
# (presumably C, N, O, F, P, S, Cl, Br, I — confirm).
num2atom = {0: 6, 1: 7, 2: 8, 3: 9, 4: 15, 5: 16, 6: 17, 7: 35, 8: 53}
atom_valency = {6: 4, 7: 3, 8: 2, 9: 1, 15: 3, 16: 2, 17: 1, 35: 1, 53: 1}

data_path = os.path.join('./data_preprocessed', args.dataset)
with open(os.path.join(data_path, 'config.txt'), 'r') as f:
    # NOTE(review): eval executes whatever is in config.txt; only safe for a
    # trusted file. Consider ast.literal_eval if the file holds a plain literal.
    data_config = eval(f.read())

# NOTE(review): the index file path is hard-coded to zinc250k while the other
# paths follow args.dataset — confirm.
with open("./data_preprocessed/zinc250k/idx.json", "r") as f:
    train_idx, valid_idx = json.load(f)
dataset = PretrainDataset("./data_preprocessed/{}".format(args.dataset), data_config, args)
train_dataset = deepcopy(dataset)
train_dataset._indices = train_idx # all data is kept, but only the items listed in the index list are used
valid_dataset = deepcopy(dataset)
valid_dataset._indices = valid_idx # same as above

if args.hidden in ('', ','):
    hidden = []
else:
    hidden = [int(d) for d in args.hidden.strip(',').split(',')]
print('Hidden dim for output regression: ', hidden)

if args.ratio in ('',','):
    # NOTE(review): with an empty ratio the *_RATIO names stay undefined, which
    # the training branch below needs — confirm the intended defaults.
    ratio = []
else:
    GEN_RATIO, DS_RATIO, SA_RATIO, TD_RATIO = [float(d) for d in args.ratio.strip(',').split(',')]

if args.moduls in ('',','):
    raise ValueError('empty moduls list')
else:
    moduls_list = [mod for mod in args.moduls.strip(',').split(',')]

# This guard reports an empty activation list, so it must test args.activation
# (it previously re-tested args.moduls and could never fire).
if args.activation in ('',','):
    raise ValueError('empty activation list')
else:
    acti_dict = {'tanh': nn.Tanh(), 'sigm': nn.Sigmoid(), 'relu': nn.ReLU()}
    activ = [acti_dict[acti] for acti in args.activation.strip(',').split(',')]

if args.property_model_path is None:
    # Training branch: fit the property heads on the docking dataset.
    print('in')
    mol_property = load_property_csv('./docking/DS_data/docking_dataset.csv')

    train_dataset.is_mol_property = True
    train_dataset.mol_property = mol_property
    valid_dataset.is_mol_property = True
    valid_dataset.mol_property = mol_property

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,collate_fn=PretrainDataset.collate_fn, num_workers=args.num_workers, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size,collate_fn=PretrainDataset.collate_fn, num_workers=args.num_workers, drop_last=True)

    property_model_path = os.path.join(args.model_dir, '{}_{}-{}-{}-{}_{}.pth'.format(args.hidden, GEN_RATIO, DS_RATIO, SA_RATIO, TD_RATIO, moduls_list))

    gen_model = MolHF(data_config, args).to(args.device)
    op.initialize_from_checkpoint(gen_model, args)

    opti_model = OptimModel(gen_model, hidden, activ).to(args.device)
    property_model = fit_model(opti_model, train_loader, valid_loader, args, property_model_path, moduls_list)
else:
    # Optimisation branch: load a trained model and optimise the DMNP molecule.
    # DMNP
    dmnp_smiles = 'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O'
    # dmnp_smiles = 'O=C1NC(=S)NC(=O)C1=CNC1=CC=C([N+](=O)[O-])C=C1O'
    dmnp_start = tuple([dmnp_smiles, 1.2139587974886887, 3.3559737244164882, 1])

    property_model_path = os.path.join(args.model_dir, args.property_model_path)

    gen_model = MolHF(data_config, args).to(args.device)
    op.initialize_from_checkpoint(gen_model, args)

    opti_model = OptimModel(gen_model, hidden, activ).to(args.device)
    opti_model.load_state_dict(torch.load(property_model_path, map_location=args.device))

    opti_model.eval()

    optimize_selected_mol(opti_model, dmnp_start, data_config, args)

Start at Time: Wed Apr 30 20:20:52 2025
reading data from ./data_preprocessed/zinc250k
Atom order: bfs
Hidden dim for output regression:  [32]
initialize from ./save_pretrain/zinc250k_model/checkpoint.pth Done!
Optiimization CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O for better docking score
(40,) (3, 40, 40)
(40, 10) (4, 40, 40)
torch.Size([1, 40, 10]) torch.Size([1, 4, 40, 40])
nan
nan
nan
valid molecules: 200/200
[0] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[1] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[2] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[3] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[4] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[5] Cc1ccc2c(c1)C1(CCC2C)CC1CCC(=O)O
[6] Cc1ccc2c(c1)C1(CCC2C)CC1CCC(=O)O
[7] Cc1ccc2c(c1)C1(CCC2C)CC1CCC(=O)O
[8] Cc1ccc2c(c1)C1(CCC2C)CC1CCC(=O)O
[9] Cc1ccc2c(c1)C1(CCC2C)CC1CCC(=O)O
[10] Cc1ccc2c(c1)C1(CCC2C)CC1CCC(=O)O
[11] Cc1ccc2c(c1)C1(CCC2C)CC1CCC(=O)O
[12] Cc1ccc2c(c1)C1(CCC2C)CC1CCC(=O)O
[13] Cc1ccc2c(c1)C1(CCC2C)CC1CCC(=O)O
[14] Cc1ccc2c(c1)C1(CCC2C)CC1CCC(=O)O
[15] Cc1ccc2c(c1)C1(CCC2C)C

In [43]:
file_path = os.path.join('./dataset/zinc250k/zinc250k.smi')
# The context manager closes the SMILES file once the list has been read.
with open(file_path, 'r') as fp:
    smiles_list = [smiles.strip() for smiles in fp]
# One row of 6800 latent values per molecule.
all_z = np.empty((len(smiles_list), 6800), dtype=float)

# Encoding is inference only, so the whole loop runs without autograd tracking.
with torch.no_grad():
    for i, sml in enumerate(tqdm(smiles_list)):
        atoms, bond = op.smiles_to_adj(sml, args.dataset)
        atoms, bond = op.get_mol_data(atoms, bond, data_config)
        atoms, bond = torch.from_numpy(atoms).unsqueeze(0), torch.from_numpy(bond).unsqueeze(0)
        # atoms, bond = atoms.to(args.device), bond.to(args.device)
        mol_z, _, _ = gen_model(atoms, bond)
        h = gen_model.to_latent_format(mol_z)
        all_z[i] = h.ravel().numpy()

# Written with np.save rather than ndarray.dump: dump pickles the whole array,
# and the recorded run ended in MemoryError after the loop had finished.
np.save('z_dataset.npy', all_z)

100%|██████████| 249456/249456 [6:59:19<00:00,  9.92it/s]  


MemoryError: 

In [49]:
# Write the latent matrix to disk in .npy format under a separate file name.
np.save('z_dataset_safety.npy', all_z)

In [51]:
# float32 copy of the latent matrix; all_z itself is left unchanged.
new_all_z = all_z.astype(np.float32)

In [None]:
# Copy the latent matrix into a tensor on the 'cuda' device and display it.
# NOTE(review): this reads `all_z`, not the float32 copy `new_all_z` created in
# the preceding cell — confirm which array is meant to go to the GPU.
tch_all_z = torch.tensor(all_z, device='cuda')
tch_all_z

In [None]:
# NOTE(review): imported here rather than in the import cell at the top of the notebook.
from torch_pca import PCA

# Fit a PCA with the full SVD solver on the first 10000 latent vectors and keep
# the transformed data. n_components=None presumably keeps every component —
# confirm against the torch_pca documentation.
pca_model = PCA(n_components=None, svd_solver='full')
new_train_data = pca_model.fit_transform(tch_all_z[:10000])

In [None]:
from optimize_property import initialize_from_checkpoint, smiles_to_adj, get_mol_data
from utils import construct_mol, correct_mol, valid_mol_can_with_seg
import random
import numpy as np
import torch
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem, Descriptors
from rdkit.DataStructs import FingerprintSimilarity, TanimotoSimilarity
from envs.environment import qed


def get_initial_vec(sml):
    """Encode one SMILES string into the generator's flattened latent vector.

    Uses the module-level ``args``, ``data_config`` and ``gen_model``. The
    molecule is converted to node/adjacency arrays, given a leading batch
    dimension of one, moved to ``args.device`` and passed through the
    generator without gradient tracking.
    """
    with torch.no_grad():
        node_arr, edge_arr = smiles_to_adj(sml, args.dataset)
        node_arr, edge_arr = get_mol_data(node_arr, edge_arr, data_config)

        node_batch = torch.from_numpy(node_arr).unsqueeze(0).to(args.device)
        edge_batch = torch.from_numpy(edge_arr).unsqueeze(0).to(args.device)

        latents, _, _ = gen_model(node_batch, edge_batch)
        return gen_model.to_latent_format(latents)

# Class index -> atomic number, and atomic number -> valence
# (presumably C, N, O, F, P, S, Cl, Br, I — confirm).
num2atom = {0: 6, 1: 7, 2: 8, 3: 9, 4: 15, 5: 16, 6: 17, 7: 35, 8: 53}
atom_valency = {6: 4, 7: 3, 8: 2, 9: 1, 15: 3, 16: 2, 17: 1, 35: 1, 53: 1}


# Read the dataset configuration that sits next to the preprocessed data.
data_path = os.path.join('./data_preprocessed', args.dataset)
with open(os.path.join(data_path, 'config.txt'), 'r') as f:
    # NOTE(review): eval executes the file contents; only safe for a trusted config file.
    data_config = eval(f.read())

# Build a fresh generator on args.device and load its checkpoint.
gen_model = MolHF(data_config, args).to(args.device)
initialize_from_checkpoint(gen_model, args)

# Reference molecule: RDKit mol, latent vector and 1024-bit Morgan fingerprint (radius 2).
dmnp_smiles = 'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O'
dmnp_mol = Chem.MolFromSmiles(dmnp_smiles)
dmnp_vec = get_initial_vec(dmnp_smiles)
dmnp_fp = AllChem.GetMorganFingerprintAsBitVect(dmnp_mol, 2, nBits=1024)

def tanimoto(smiles):
    """Tanimoto similarity between ``smiles`` and the reference fingerprint ``dmnp_fp``.

    The molecule is fingerprinted with a 1024-bit Morgan bit vector of radius 2.
    Returns 0 when the SMILES string cannot be parsed.
    """
    candidate = Chem.MolFromSmiles(smiles)
    if candidate is not None:
        candidate_fp = AllChem.GetMorganFingerprintAsBitVect(candidate, 2, nBits=1024)
        return TanimotoSimilarity(candidate_fp, dmnp_fp)
    return 0  # penalty for invalid molecules

# The prior is the same for every sample, so it is built once outside the loop:
# a 6800-dimensional normal with zero mean and standard deviation 0.6.
prior_dist = torch.distributions.normal.Normal(torch.zeros(
    [6800]), 0.6*torch.ones([6800]))

for i in range(20):
    # Sampling and decoding are inference only; skip autograd tracking.
    with torch.no_grad():
        z = prior_dist.sample((1,)).to(gen_model.device)
        # Perturb the reference latent vector by a scaled noise sample.
        samp_z = dmnp_vec + 0.2*z

        out = gen_model.to_molecule_format(samp_z)
        x, adj = gen_model.reverse(out, true_adj=None)
    mol = construct_mol(x[0], adj[0], num2atom, atom_valency)[0]
    mol = correct_mol(mol)
    mol = valid_mol_can_with_seg(mol, largest_connected_comp=True)
    smiles = Chem.MolToSmiles(mol)
    # NOTE(review): dock_score is not defined in any visible cell of this
    # notebook; it must exist in the kernel for this line to run — confirm.
    print('i: {}, DS: {}, SA: {}, TN: {}, QED: {}'.format(i, dock_score(smiles), calculateScore(mol), tanimoto(smiles), qed(mol)))

initialize from ./save_pretrain/zinc250k_model/checkpoint.pth Done!
i: 0, DS: -7.9, SA: 3.3559737244164882, TN: 1.0, QED: 0.8664348917225079
i: 1, DS: -7.6, SA: 4.091265736250809, TN: 0.6458333333333334, QED: 0.5105887538679946
i: 2, DS: -7.8, SA: 3.3559737244164882, TN: 1.0, QED: 0.8664348917225079
i: 3, DS: -6.5, SA: 3.4358198556037554, TN: 0.22413793103448276, QED: 0.5660950660130104
i: 4, DS: -6.3, SA: 4.224563124834523, TN: 0.1076923076923077, QED: 0.27676697933115507
i: 5, DS: -7.8, SA: 3.3559737244164882, TN: 1.0, QED: 0.8664348917225079
i: 6, DS: -7.2, SA: 3.21366683467482, TN: 0.6041666666666666, QED: 0.9033392141044173
i: 7, DS: -7.7, SA: 3.3559737244164882, TN: 1.0, QED: 0.8664348917225079
i: 8, DS: -6.3, SA: 3.4358198556037554, TN: 0.22413793103448276, QED: 0.5660950660130104
i: 9, DS: -7.7, SA: 3.3559737244164882, TN: 1.0, QED: 0.8664348917225079
i: 10, DS: -7.8, SA: 3.3559737244164882, TN: 1.0, QED: 0.8664348917225079
i: 11, DS: -7.7, SA: 3.3559737244164882, TN: 1.0, QED:

In [15]:
# Display calculateScore (from envs.sascorer) and qed for the reference molecule.
calculateScore(dmnp_mol), qed(dmnp_mol)

(3.3559737244164882, 0.866434891722508)

In [None]:
# np.save needs the array as its second argument; the latent matrix all_z is
# the data this file name refers to elsewhere in the notebook.
np.save('z_dataset.npy', all_z)

MemoryError: 

In [None]:
gen_model.eval()

# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
    for i, batch in enumerate(tqdm(train_loader)):

        # Batches go to args.device, the device gen_model was moved to, rather
        # than a hard-coded 'cuda'.
        x = batch['node'].to(args.device)   # node features, batch first
        adj = batch['adj'].to(args.device)   # adjacency tensors, batch first
        true_y = batch['property'].float().to(args.device)

        # z is overwritten on every iteration; after the loop it holds the
        # latents of the last batch only.
        z, _, _ = gen_model(x, adj)

In [None]:
# z was produced by gen_model in the preceding cell; no module-level name
# `model` is defined in this notebook.
gen_model.to_latent_format(z)

In [None]:
# Loading of stored property arrays is disabled; random placeholders are drawn instead.
# ds = np.load('qed_like_ds.npy', allow_pickle=True)
# sa = np.load('SAS.npy', allow_pickle=True)

file_path = os.path.join('./dataset/zinc250k/zinc250k.smi')
# The context manager closes the SMILES file once the list has been read.
with open(file_path, 'r') as fp:
    smiles_list = [smiles.strip() for smiles in fp]

# One uniform random value per molecule: DS in [-10, -3), SA in [1, 10).
ds = np.random.uniform(low=-10, high=-3, size=len(smiles_list))
sa = np.random.uniform(low=1, high=10, size=len(smiles_list))

In [None]:
# Assemble the SMILES strings and the DS / SA arrays into one table.
dset = pd.DataFrame({'SMILES': np.array(smiles_list), 'DS': ds, 'SA': sa})

In [None]:
# Write the table to a temporary CSV without the index column.
dset.to_csv('tmp_docking_dataset.csv', index=False)

In [None]:

# Load the temporary CSV back; as the cell's last expression, the full result list is displayed.
# NOTE(review): the CSV written above has only SMILES/DS/SA columns, whereas
# load_property_csv as defined in this notebook also scales a 'TD' column — confirm.
load_property_csv('tmp_docking_dataset.csv', )

Load tmp_docking_dataset.csv done, length: 249456


[('CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1',
  0.7319008436872337,
  2.084094572072681),
 ('C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1',
  0.9411116113894996,
  3.4320038192747795),
 ('N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)cc2)cc1',
  0.626104542084784,
  2.4706326078252),
 ('CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c2CCCCC3)C1',
  0.716224500059291,
  2.822753311170967),
 ('N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#N)C12CCCCC2',
  0.8095718119534945,
  4.035182138302743),
 ('CC[NH+](CC)[C@](C)(CC)[C@H](O)c1cscc1Br',
  0.8271497220047328,
  5.091437549913934),
 ('COc1ccc(C(=O)N(C)[C@@H](C)C/C(N)=N/O)cc1O',
  0.3227319993935653,
  2.8523163989668188),
 ('O=C(Nc1nc[nH]n1)c1cccnc1Nc1cccc(F)c1',
  0.6867867336169149,
  2.627857491802141),
 ('Cc1c(/C=N/c2cc(Br)ccn2)c(O)n2c(nc3ccccc32)c1C#N',
  0.5036973395225374,
  3.073935310669482),
 ('C[C@@H]1CN(C(=O)c2cc(Br)cn2C)CC[C@H]1[NH3+]',
  0.8223798445329892,
  3.947933469646377),
 ('CCOc1ccc(OCC)c([C@H]2C(C#N)=C(N)N(c3ccccc3C(

In [18]:
from envs.sascorer import calculateScore

def dock(mol):
    """Return ``calculateScore(mol)`` (from ``envs.sascorer``) for an RDKit molecule.

    NOTE(review): presumably a placeholder for a real docking score — confirm.
    """
    score = calculateScore(mol)
    return score

def MFp_dist(x, y):
    """Stub distance: ignores both arguments and always returns 1."""
    return 1

def Fr_dist(x, y):
    """Stub distance: ignores both arguments and always returns 1."""
    return 1

In [23]:
# Reference tuple for DMNP: the values from op.get_mol_property (which must be a
# list for this concatenation to work), followed by -10 and the SMILES string itself.
dmnp = tuple(op.get_mol_property('CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O') + [-10] + ['CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O'])

In [90]:
dmnp = 'CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O'
# The start tuple uses the same four-field layout as dmnp_start
# (smiles, property value, SA value, trailing 1).
# NOTE(review): property_model is only assigned in the training branch of the
# setup cell — confirm it exists before running this cell.
optimize_selected_mol(property_model, (dmnp, -8.2, 3.4, 1), data_config, args)

Optiimization CC1CCC(C2=C1C=CC(=C2)C)C(C)CCC(=O)O for better docking score
(40,) (3, 40, 40)
(40, 10) (4, 40, 40)
torch.Size([1, 40, 10]) torch.Size([1, 4, 40, 40])
nan
nan
nan
nan
nan
nan
nan
nan
nan
valid molecules: 100/100
[0] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[1] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[2] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[3] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[4] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[5] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[6] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[7] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[8] Cc1ccc2c(c1)C(C(C)CCC(=O)O)CCC2C
[9] Cc1ccc2c3c1N(CC2C)C3C(C)CCC(=O)O
[10] Cc1ccc2c3c1N(CC2C)C3C(C)CCC(=O)O
[11] CC1=CN2CC(C)C(=NC2C(C)CCC(=O)O)C=C1
[12] CC1=CN2CC(C)C(=NC2C(C)CCC(=O)O)C=C1
[13] CC1=CN2CC(C)C(=NC2C(C)CCC=O)C=C1
[14] CC1=CN2CC(C)C(=NC2C(C)CCC=O)C=C1
[15] CCCCC(C)C1N=C2C=CC(C)=CN1CC2C
[16] CCCCC(C)C1N=C2C=CC(C)=CN1CC2C
[17] CCCCC(C)C1N=C2C=CC(C)=CN1CC2C
[18] CCC1CC(C2N=C3C=CC(C)=CN2CC3C)C1
[19] CCC1CC(C2N=C3C=C[SH](C)OC2NC3C)C1
[20] CCC1CC(C2N=C3C=C[SH](C