In [177]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import math

In [178]:
from joblib import dump, load
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import Descriptors
from rdkit.Contrib.SA_Score import sascorer
from rdkit import DataStructs
# RDKit fingerprint generator instance.
# NOTE(review): `fpgen` is not referenced by any cell visible in this notebook — confirm it is still needed.
fpgen = AllChem.GetRDKitFPGenerator()

In [179]:
# Load the ligand table from a CSV located in the working directory (relative path).
# The rendered frame below shows the columns `id`, `smiles` and `lgK`.
df = pd.read_csv("Информация о связывании медицинского радионуклида различными молекулами.csv")

In [180]:
# Preview the loaded table (the rendered output elides the middle rows).
df

Unnamed: 0,id,smiles,lgK
0,0,CC(C)(O)C(=O)O,3.41
1,1,CCC(O)(CC)C(=O)O,3.25
2,2,O=C(O)C1(O)CCCC1,3.22
3,3,O=C(O)C1(O)CCCCC1,2.78
4,4,O=C(O)C(O)c1ccccc1,2.82
...,...,...,...
242,244,O=C(O)c1cccc([N+](=O)[O-])c1,1.64
243,245,CCC(O)C(=O)O,3.13
244,246,CCCC(O)C(=O)O,2.76
245,247,CC(C)C(O)C(=O)O,2.99


In [181]:
# Collect every SMILES string of the table as a plain Python list.
# The column is selected by name: this is vectorised and avoids the row-by-row
# positional `Series[1]` lookup, which newer pandas versions deprecate.
base_smiles = df["smiles"].tolist()

In [182]:
# Inspect the extracted list of SMILES strings.
base_smiles

['CC(C)(O)C(=O)O',
 'CCC(O)(CC)C(=O)O',
 'O=C(O)C1(O)CCCC1',
 'O=C(O)C1(O)CCCCC1',
 'O=C(O)C(O)c1ccccc1',
 'CC(O)C(C)(O)C(=O)O',
 'O=C(O)C(O)(CO)CO',
 'O=C(O)[C@H](O)[C@@H](O)[C@H](O)[C@H](O)CO',
 'O=C([O-])C1(O)C[C@@H](O)C(O)[C@H](O)C1',
 'O=C(O)CCO',
 'CC(CO)(CO)C(=O)O',
 'COc1ccccc1C(=O)O',
 'COc1cccc(C(=O)O)c1',
 'COc1ccc(C(=O)O)cc1',
 'O=C(O)CC(=O)O',
 'CC(C(=O)O)C(=O)O',
 'CCC(C(=O)O)C(=O)O',
 'CCCC(C(=O)O)C(=O)O',
 'CCCCC(C(=O)O)C(=O)O',
 'CCCCCC(C(=O)O)C(=O)O',
 'CC(C)(C(=O)O)C(=O)O',
 'CCC(CC)(C(=O)O)C(=O)O',
 'CCCC(CCC)(C(=O)O)C(=O)O',
 'CCCCC(CCCC)(C(=O)O)C(=O)O',
 'O=C(O)C1(C(=O)O)CCC1',
 'O=C(O)C1(C(=O)O)CCCC1',
 'O=C(O)CCC(=O)O',
 'CC(CC(=O)O)C(=O)O',
 'O=C(O)/C=C\\C(=O)O',
 'O=C(O)/C=C/C(=O)O',
 'O=C(O)CCCC(=O)O',
 'O=C(O)CCCCC(=O)O',
 'O=C(O)[C@H]1CC[C@H](C(=O)O)CC1',
 'O=C(O)c1ccccc1C(=O)O',
 'O=C(O)c1cccc(C(=O)O)c1',
 'O=C(O)C[C@H](O)C(=O)O',
 'O=C(O)[C@H](O)[C@@H](O)C(=O)O',
 'O=C(O)COCC(=O)O',
 'CC(OC(C)C(=O)O)C(=O)O',
 'O=C(O)c1cccc(C(=O)O)c1C(=O)O',
 'O=C(O)CC(O)(

In [183]:
from joblib import load
import catboost

# Load the pre-trained regressor used by lgK_calculator from the working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load "cb.joblib" from a trusted source.
# Presumably a CatBoost model (catboost is imported in this cell) — confirm.
MODEL_LOGK = load("cb.joblib")

In [184]:
def number_of_atoms(atom_list, df):
    """
    Add one ``num_of_<pattern>_atoms`` column to ``df`` per entry of ``atom_list``.

    Each new column holds, for every molecule in ``df['mol']``, the number of
    substructure matches of the pattern built from the corresponding SMILES
    string. The frame is modified in place and nothing is returned.

    :param atom_list: iterable of SMILES strings used as match patterns, e.g. ["C", "O"]
    :param df: DataFrame with a 'mol' column holding RDKit molecules
    """
    for symbol in atom_list:
        # Build the query molecule once per symbol instead of once per row.
        pattern = Chem.MolFromSmiles(symbol)
        df['num_of_{}_atoms'.format(symbol)] = df['mol'].apply(
            lambda mol, pattern=pattern: len(mol.GetSubstructMatches(pattern))
        )

# Patterns whose match counts become per-molecule features
# (passed to number_of_atoms from lgK_calculator).
symbols = ["C", "O", "N", "P", "S"]

def lgK_calculator(smiles):
    """
    Predict lgK for a single molecule with the pre-loaded MODEL_LOGK model.

    A one-row feature frame is built from RDKit descriptors of the
    hydrogen-expanded molecule plus the per-element match counts added by
    number_of_atoms, and passed to ``MODEL_LOGK.predict``.

    :param smiles: smiles string like "c1ccccc1"
    :return: predicted lgK value (first element of the model's prediction)
    :raises ValueError: if RDKit cannot parse ``smiles``
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        # Fail with a readable message instead of an opaque error from Chem.AddHs(None).
        raise ValueError("Invalid SMILES string: {!r}".format(smiles))

    # Build the frame from the argument itself. The previous version read an
    # undefined global (`data_example`) here, so the input was ignored.
    df = pd.DataFrame([[smiles]], columns=['smiles'])
    df['mol'] = [Chem.AddHs(parsed)]

    # Column order below is the feature order handed to the model; keep it stable.
    df['num_of_atoms'] = df['mol'].apply(lambda x: x.GetNumAtoms())
    df['num_of_heavy_atoms'] = df['mol'].apply(lambda x: x.GetNumHeavyAtoms())
    df['tpsa'] = df['mol'].apply(lambda x: Descriptors.TPSA(x))
    df['mol_w'] = df['mol'].apply(lambda x: Descriptors.ExactMolWt(x))
    df['num_valence_electrons'] = df['mol'].apply(lambda x: Descriptors.NumValenceElectrons(x))
    df['num_heteroatoms'] = df['mol'].apply(lambda x: Descriptors.NumHeteroatoms(x))
    df['num_rings'] = df['mol'].apply(lambda x: Descriptors.RingCount(x))
    df['Fp_Density_Morgan1'] = df['mol'].apply(lambda x: Descriptors.FpDensityMorgan1(x))
    df['HeavyAtomMolWt'] = df['mol'].apply(lambda x: Descriptors.HeavyAtomMolWt(x))

    # Adds the num_of_<symbol>_atoms columns in place.
    number_of_atoms(symbols, df)

    # Only numeric feature columns go to the model.
    df = df.drop(columns=['smiles', 'mol'])

    return MODEL_LOGK.predict(df)[0]

In [224]:
# Collects every mutant SMILES appended by search_step (repeats are possible).
# Re-running this cell resets the collection to an empty list.
new_mols_gen = []

In [225]:
import random
import re
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit import RDLogger


def mutate_population(
    top_worst_mols: [str], top_best_mols: [str], num_iterations=5
) -> str:
    """
    Split 'good' and 'bad' molecules in fragments and replace one fragment in a 'bad' molecule with a fragment from a
    'good' molecule, make a population of such mutant molecules and return the one with the lowest predicted lgK.
    :param top_worst_mols: smiles strings of the molecules to mutate
    :param top_best_mols: smiles strings of the molecules that donate fragments
    :param num_iterations: amount of single exchanges between 'good' and 'bad' fragments per mutated molecule
    :return: smiles string of the RDKit-parsable mutant with the lowest lgK_calculator value
    :raises ValueError: if no parsable mutant could be produced
    """
    # The donor fragments depend only on top_best_mols, so build the bank once
    # instead of once per mutated molecule.
    replacement_genes_bank = []
    for donor in top_best_mols:
        replacement_genes_bank.extend(
            get_branches_from_smiles(
                Chem.MolToSmiles(
                    Chem.MolFromSmiles(donor), canonical=False, isomericSmiles=False
                )
            )
        )

    raw_generation = []
    for mol in top_worst_mols:
        # Re-serialise without canonicalisation/stereo so fragments are cut from
        # the same kind of string the donors were cut from.
        mol = Chem.MolToSmiles(
            Chem.MolFromSmiles(mol), canonical=False, isomericSmiles=False
        )
        target_mol_genes = get_branches_from_smiles(mol)
        # random.choice raises IndexError on an empty list; skip explicitly
        # rather than hiding every possible error behind a bare `except`.
        if not target_mol_genes or not replacement_genes_bank:
            continue
        for _ in range(num_iterations):
            raw_generation.append(
                mol.replace(
                    random.choice(target_mol_genes),
                    random.choice(replacement_genes_bank),
                    1,
                )
            )

    # Keep only the strings RDKit can parse back into a molecule.
    raw_generation = [i for i in raw_generation if Chem.MolFromSmiles(i)]
    if not raw_generation:
        raise ValueError("No valid mutant molecule could be generated")

    # NOTE(review): the mutant with the LOWEST predicted value is returned —
    # confirm that lower lgK is the intended optimisation direction.
    return min(raw_generation, key=lgK_calculator)


def search_step(smiles_pop: [str], top_to_change=5) -> [str]:
    """
    Run one mutation round on the population, modifying the list in place.

    The list is shuffled, its first ``top_to_change`` entries are handed to
    mutate_population as fragment donors and its last ``top_to_change`` entries
    as mutation targets, and the returned mutant overwrites the final entry.
    The mutant is also appended to the module-level ``new_mols_gen`` list.
    Note that the two groups are picked by the random shuffle, not by ranking.
    :param smiles_pop: population of smiles strings
    :param top_to_change: size of the donor slice and of the target slice
    :return: the same list object after the update
    """
    random.shuffle(smiles_pop)
    donors = smiles_pop[:top_to_change]
    targets = smiles_pop[-top_to_change:]

    mutant = mutate_population(targets, donors)
    new_mols_gen.append(mutant)

    # Swap the last population member for the freshly created mutant.
    smiles_pop[-1] = mutant
    return smiles_pop


def get_branches_from_smiles(smiles: str) -> [str]:
    """
    Get all parenthesised branches from a smiles string, nested ones included.

    Every returned fragment is a balanced ``(...)`` group copied verbatim from
    the input, so it can be located in the input with ``str.replace``. The
    previous regex-based version cut nested branches at the first ``)`` and
    padded them with extra ``)``, yielding fragments that do not occur in the
    molecule.
    :param smiles: smiles string, e.g. "CC(C(=O)O)C"
    :return: list of branch substrings ordered by the position of their
        opening bracket; unmatched brackets are ignored
    """
    open_positions = []
    found = []
    for position, char in enumerate(smiles):
        if char == "(":
            open_positions.append(position)
        elif char == ")" and open_positions:
            start = open_positions.pop()
            found.append((start, smiles[start:position + 1]))
    # Inner groups close first; restore left-to-right order of the openings.
    found.sort(key=lambda item: item[0])
    return [branch for _, branch in found]


In [226]:
# Work on a copy: search_step shuffles and overwrites entries of the list it is given,
# so base_smiles itself stays untouched.
cur_smiles = base_smiles.copy()

In [227]:
# Keep mutating until new_mols_gen contains 100 distinct SMILES strings.
# NOTE(review): there is no iteration cap — the loop ends only once enough distinct
# mutants have appeared, and new_mols_gen persists between runs of this cell.
while len(set(new_mols_gen)) < 100:
#     print(len(set(new_mols_gen)))
    cur_smiles = search_step(cur_smiles)

[18:06:36] Explicit valence for atom # 1 N, 4, is greater than permitted
[18:06:36] Explicit valence for atom # 1 N, 4, is greater than permitted
[18:06:36] SMILES Parse Error: unclosed ring for input: 'CCCCc1ccc2c(O)OCCOCCOCCOCCO2'
[18:06:36] SMILES Parse Error: unclosed ring for input: 'CCCCc1ccc2c(=O)OCCOCCOCCOCCO2'
[18:06:36] SMILES Parse Error: unclosed ring for input: 'CCCCc1ccc2c(CP(=O))OCCOCCOCCOCCO2'
[18:06:36] SMILES Parse Error: unclosed ring for input: 'CCCCc1ccc2c(N=Nc3ccccc3[As](=O))OCCOCCOCCOCCO2'
[18:06:36] SMILES Parse Error: unclosed ring for input: 'CCCCc1ccc2c(O)OCCOCCOCCOCCO2'
[18:06:37] Explicit valence for atom # 1 C, 5, is greater than permitted
[18:06:37] Explicit valence for atom # 1 C, 5, is greater than permitted
[18:06:37] Explicit valence for atom # 1 C, 5, is greater than permitted
[18:06:37] Explicit valence for atom # 1 C, 5, is greater than permitted
[18:06:37] Explicit valence for atom # 2 N, 4, is greater than permitted
[18:06:37] Explicit valence fo

In [230]:
# Sanity check: number of distinct generated SMILES strings.
print(len(set(new_mols_gen)))

100


In [229]:
# The generation loop counts *distinct* SMILES, but new_mols_gen can hold repeats.
# Drop them (keeping first-seen order) so the submission has one row per string.
send_df = pd.DataFrame(list(dict.fromkeys(new_mols_gen)))

In [231]:
# Write the generated SMILES to the submission file, one per line, without index or header row.
send_df.to_csv('base_tune_first_submit.csv', index=False, header=False)