In [34]:
import random
import numpy as np
from deap import base, creator, tools, algorithms
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem.QED import qed
import joblib
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast
import torch
import pandas as pd
from tqdm.auto import tqdm

from src.llm_interface import PROMPT, get_answer
from src.utils import all_rools_valid

In [2]:
# Hugging Face identifier of the chemistry LLM used for mutation and crossover.
model_name_or_id = "OpenDFM/ChemDFM-v1.5-8B"

# Half-precision weights; devices 0 and 1 are each capped at 10GB.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_id,
    torch_dtype=torch.float16,
    device_map="balanced",
    max_memory={device_index: '10GB' for device_index in (0, 1)},
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.58s/it]


In [35]:
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
# The stored mapping is unpacked positionally: first value is the fitted
# regressor, second is the list of descriptor names it expects.
estimation_model, feature_names = joblib.load('models/model_iter_1').values()

# Name -> callable lookup for every RDKit descriptor.
desc_dict = dict(Descriptors.descList)

# (name, function) pairs, in the same order as the model's feature names.
descriptor_funcs = [(f_name, desc_dict[f_name]) for f_name in feature_names]

In [55]:
# Seed pool: de-duplicated SMILES from the first CSV followed by every SMILES
# from the second CSV (the second source is not de-duplicated).
initial_molecules = [
    *pd.read_csv('data/quantitive_neftekod25_data.csv')['SMILES'].drop_duplicates(),
    *pd.read_csv('data/export.csv')['SMILES'],
]

In [56]:
# Keep only the seed molecules accepted by the project's validity check.
initial_molecules = list(filter(all_rools_valid, initial_molecules))

In [57]:
# Echo the filtered seed SMILES list as the cell output.
initial_molecules

['C1=CC=C(C=C1)NC2=CC=CC=C2',
 'CC(C)(C)CC(C)(C)C1=CC=CC=C1NC2=CC=CC3=CC=CC=C32',
 'C1(=CC=CC=C1N(C2=CC=CC=C2CCCCCCCCC)[H])CCCCCCCCC',
 'C1=CC=C(C=C1)NC2=CC=CC3=CC=CC=C32',
 'CC1=C(C(=CC=C1)O)C',
 'CC1=CC(=C(C=C1)C)O',
 'CC1=C(C=C(C=C1)O)C',
 'CC1=CC(=CC(=C1)O)C',
 'CCC1=CC=C(C=C1)O',
 'CC1=CC(=C(C(=C1)C(C)(C)C)O)C(C)(C)C',
 'CC1=CC(=C(C(=C1)C(C)(C)C)O)CC2=C(C(=CC(=C2)C)C(C)(C)C)O',
 'CC(C)(C)C1=CC(=CC(=C1O)C(C)(C)C)CC2=CC(=C(C(=C2)C(C)(C)C)O)C(C)(C)C',
 'C1(=C(C=C(C=C1C(C)(C)C)CCC(=O)OC)C(C)(C)C)O[H]',
 'Cc1cc(O)c(C(C)(C)C)c(C)c1O',
 'Cc1cc(C(C)(C)C)c(O)c(C(C)(C)C)c1N',
 'CSc1cc(C(C)(C)C)c(N)c(C(C)(C)C)c1',
 'CSc1cc(C(C)(C)C)c(O)c(C(C)(C)C)c1N',
 'CNC(=O)c1cc(C(C)(C)C)c(N)c(C(C)(C)C)c1',
 'CNC(=O)c1cc(C(C)(C)C)c(O)c(C(C)(C)C)c1N',
 'CCOC(=O)c1cc(C(C)(C)C)c(N)c(C(C)(C)C)c1',
 'CCOC(=O)c1cc(C(C)(C)C)c(O)c(C(C)(C)C)c1N',
 'CCOC(=O)c1cc(C(C)(C)C)c(N)c(C(C)(C)C)c1O',
 'CC1=CC(=C(C(=C1)C(C)(C)C)O)CC2=C(C(=CC(=C2)C)C(C)(C)C)OCC1=CC(=C(C(=C1)C(C)(C)C)O)CC2=C(C(=CC(=C2)C)C(C)(C)C)O',
 'Nc1cc(-

In [58]:
class GeneticAlgorithm:
    """Elitist genetic algorithm driven by user-supplied operators.

    Each generation scores the current population, logs summary statistics,
    checkpoints the scored population to ``cache_path``, creates offspring via
    crossover and mutation, and keeps the ``population_size`` fittest unique
    individuals among parents and offspring (higher fitness is better).

    Individuals must be hashable because duplicates are removed with a dict.

    Args:
        mutation_func: Callable taking one individual and returning a new one.
        crossover_func: Callable taking two individuals and returning exactly
            two children.
        fitness_func: Callable taking one individual and returning a number.
            It is assumed to be deterministic, because each individual is
            scored only once per ``run`` call.
        population_size: Maximum number of individuals kept per generation.
        mutation_prob: Per-slot probability of producing a mutated child.
        crossover_prob: Per-slot probability of producing two crossover children.
    """

    # Where the scored population is checkpointed; may be overridden per instance.
    cache_path = 'gen_algo_cache/population.csv'

    def __init__(self, mutation_func, crossover_func, fitness_func, population_size, mutation_prob, crossover_prob):

        self.mutation_func = mutation_func
        self.crossover_func = crossover_func
        self.fitness_func = fitness_func
        self.population_size = population_size
        self.mutation_prob = mutation_prob
        self.crossover_prob = crossover_prob
        # One stats dict per generation; accumulates across run() calls.
        self.log = []

    def run(self, initial_population, iterations):
        """Evolve ``initial_population`` for ``iterations`` generations.

        Args:
            initial_population: List of starting individuals (not modified).
            iterations: Number of generations to run.

        Returns:
            Tuple ``(population, log)``: the surviving individuals (sorted by
            descending fitness once at least one generation has run) and the
            list of per-generation stats dicts.

        Raises:
            ValueError: If a generation starts with an empty population.
        """
        population = initial_population.copy()

        # Memoise scores so each individual is evaluated once per run instead of
        # once for the statistics and again while sorting.
        fitness_cache = {}

        def fitness_of(individual):
            if individual not in fitness_cache:
                fitness_cache[individual] = self.fitness_func(individual)
            return fitness_cache[individual]

        for _ in range(iterations):

            fitness_values = [fitness_of(ind) for ind in population]

            self.log.append(self._summarize(fitness_values))
            print(self.log[-1])
            self._save_population(population, fitness_values)

            offspring = []
            for i in tqdm(range(min(len(population), self.population_size))):

                if random.random() < self.crossover_prob:
                    parent1 = self._tournament_selection(population, fitness_values)
                    parent2 = self._tournament_selection(population, fitness_values)
                    child1, child2 = self.crossover_func(parent1, parent2)

                    offspring.extend([child1, child2])

                if random.random() < self.mutation_prob:
                    offspring.append(self.mutation_func(population[i]))

            # dict.fromkeys de-duplicates while keeping first-seen order, so ties
            # in fitness are resolved the same way on every run (a set is not ordered).
            candidates = list(dict.fromkeys(population + offspring))
            candidates.sort(key=fitness_of, reverse=True)
            population = candidates[:self.population_size]

        if iterations > 0:
            # Persist the final survivors too; otherwise the checkpoint would lag
            # one generation behind the returned population.
            self._save_population(population, [fitness_of(ind) for ind in population])

        return population, self.log

    @staticmethod
    def _summarize(fitness_values):
        """Return min / max / mean / median of a non-empty list of fitness values."""
        if not fitness_values:
            raise ValueError('cannot summarize an empty population')

        ordered = sorted(fitness_values)
        middle, is_odd = divmod(len(ordered), 2)
        median = ordered[middle] if is_odd else (ordered[middle - 1] + ordered[middle]) / 2

        return {
            'min': min(fitness_values),
            'max': max(fitness_values),
            'avg': sum(fitness_values) / len(fitness_values),
            'median': median,
        }

    def _save_population(self, population, fitness_values):
        """Write the individuals and their scores to ``self.cache_path`` as CSV."""
        pd.DataFrame({'SMILES': population, 'pred_pdsc': fitness_values}).to_csv(self.cache_path, index=False)

    def _tournament_selection(self, population, fitness_values, tournament_size=3):
        """Return the fittest of up to ``tournament_size`` randomly drawn individuals.

        The tournament is shrunk to the population size when the population is
        smaller than ``tournament_size`` (``random.sample`` would otherwise raise).
        """
        scored = list(zip(population, fitness_values))
        participants = random.sample(scored, min(tournament_size, len(scored)))
        # max() returns the first maximal entry, matching a stable descending sort.
        return max(participants, key=lambda pair: pair[1])[0]

In [59]:
def calculate_fitness(smiles: str) -> float:
    """Predict the fitness of a molecule with the loaded regression model.

    The SMILES string is parsed with RDKit, every descriptor listed in
    ``descriptor_funcs`` is computed, and the resulting single-row frame is
    passed to ``estimation_model.predict``.

    Args:
        smiles: SMILES string of the molecule to score.

    Returns:
        The model's prediction for this molecule as a plain ``float``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here with a
        # clear message instead of letting a descriptor function choke on None.
        raise ValueError(f"Cannot compute fitness: RDKit failed to parse SMILES {smiles!r}")

    # Column order follows descriptor_funcs, i.e. the model's feature order.
    descriptors = {desc_name: desc_f(mol) for desc_name, desc_f in descriptor_funcs}
    features = pd.DataFrame([descriptors])

    return float(estimation_model.predict(features)[0])

def mutation_llm(smiles: str, max_attempts: int = 10) -> str:
    """Ask the LLM for a variant of ``smiles``, retrying until one is accepted.

    Args:
        smiles: SMILES string passed to the LLM as the example molecule.
        max_attempts: Maximum number of LLM queries before giving up.

    Returns:
        The first generated SMILES accepted by ``all_rools_valid``, or the
        unchanged input ``smiles`` if no attempt succeeds.
    """
    for _ in range(max_attempts):
        try:
            candidate = get_answer(prompt=PROMPT, n_molecules=1, example_smiles=smiles, model=model)[0]

            # Explicit check rather than `assert`: assertions are stripped under
            # `python -O`, which would let invalid molecules through.
            if all_rools_valid(candidate):
                return candidate

        except Exception:
            # Best effort: a failed generation or validation just consumes one attempt.
            continue

    return smiles

def crossover_llm(smiles1: str, smiles2: str, max_attempts=10):
    """Ask the LLM for two children derived from a pair of parent molecules.

    Every attempt queries the LLM twice with both parents as examples. The
    pair is returned only when both children pass ``all_rools_valid``; any
    exception simply consumes the attempt. If all attempts fail, the parents
    are returned unchanged.
    """
    parents = [smiles1, smiles2]

    for _ in range(max_attempts):
        try:
            children = [
                get_answer(prompt=PROMPT, n_molecules=1, example_smiles=[smiles1, smiles2], model=model)[0]
                for _ in parents
            ]

            if all(all_rools_valid(child) for child in children):
                first_child, second_child = children
                return first_child, second_child

        except Exception:
            continue

    return smiles1, smiles2

In [60]:
# LLM-driven GA: mutation is attempted far more often than crossover.
llm_genetic_algo = GeneticAlgorithm(
    mutation_func=mutation_llm,
    crossover_func=crossover_llm,
    fitness_func=calculate_fitness,
    population_size=100,
    mutation_prob=0.8,
    crossover_prob=0.2,
)

# Evolve the seed molecules for 10 generations; the (population, log) tuple is the cell output.
llm_genetic_algo.run(initial_population=initial_molecules, iterations=10)

{'min': 0.34483998527576176, 'max': 226.32312679309524, 'avg': 54.497945873930995, 'median': 14.04786935417129}


 67%|██████▋   | 29/43 [02:11<00:42,  3.01s/it][18:14:24] SMILES Parse Error: unclosed ring for input: 'Cc1cc(C)c(N)c(C)c1Nc1ccc(Nc2ccc(Nc3c(C)cc(C)c(N)c3C)cc1)c1c(C)c(C)cc(C)c1N'
 72%|███████▏  | 31/43 [02:18<00:38,  3.17s/it][18:16:02] non-ring atom 441 marked aromatic
 74%|███████▍  | 32/43 [04:17<04:50, 26.36s/it][18:16:29] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
[18:16:31] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
[18:16:36] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
[18:16:40] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
[18:16:42] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 16 17 18 19 20 21
 79%|███████▉  | 34/43 [04:48<03:13, 21.50s/it][18:16:58] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 16 17 18 19 20 21
 81%|████████▏ | 35/43 [04:55<02:22, 17.80s/it][18:17:10] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8 10 11 12 13 14 15 16 17
 84%|████████▎ | 36/43 [05:06<01:51, 15.89s/

{'min': -4.20219574133273, 'max': 226.35277583914765, 'avg': 55.878034596109146, 'median': 19.819478662020536}


  6%|▌         | 4/71 [00:22<06:50,  6.13s/it][18:18:24] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
  7%|▋         | 5/71 [00:29<07:06,  6.46s/it][18:18:33] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 11%|█▏        | 8/71 [00:44<05:33,  5.29s/it][18:18:45] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 16 17 18 19 20 21
 17%|█▋        | 12/71 [01:14<06:16,  6.38s/it][18:19:16] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 8 9 16 24
[18:19:18] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
[18:19:21] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 25%|██▌       | 18/71 [02:29<14:54, 16.87s/it][18:20:30] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 31%|███       | 22/71 [03:12<10:00, 12.25s/it][18:21:14] SMILES Parse Error: unclosed ring for input: 'c1ccc2c(c1)cc1c3ccccc3c3cc4c(cc3c2c1)cccc4c1cc2c(cc1-3)c1ccccc1-2'
 38%|███▊      | 27/71 [03:51<06:24,  8.73s/it][18:21:52] Can't kekulize mol.  Unkekulize

{'min': 9.230725270546763, 'max': 226.35277583914765, 'avg': 77.85233991510022, 'median': 59.77228960756985}


  7%|▋         | 7/100 [01:20<25:44, 16.61s/it][18:27:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 10%|█         | 10/100 [01:37<14:09,  9.44s/it][18:27:20] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 16 17 18 19 20 21
[18:27:25] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
[18:27:27] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16 17 18 19 20
[18:27:31] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
[18:27:42] SMILES Parse Error: extra open parentheses while parsing: c1ccc2c(c1)sc(Nc1ccccc1)nc2-c1cc(Nc2cccc(C(F)(F)F)c2)nc(-c2cc(Nc3cccc(C(F)(F)F)c3)nc(-c3cc(Nc4cccc(C(F)(F)F)c4)nc(-c4cc(Nc5cccc(C(F)(F)F)c5)nc(C(F)(F)F)c4)c3)c1
[18:27:42] SMILES Parse Error: check for mistakes around position 56:
[18:27:42] 2cccc(C(F)(F)F)c2)nc(-c2cc(Nc3cccc(C(F)(F
[18:27:42] ~~~~~~~~~~~~~~~~~~~~^
[18:27:42] SMILES Parse Error: Failed parsing SMILES 'c1ccc2c(c1)sc(Nc1ccccc1)nc2-c1cc(Nc2cccc(C(F)(F)F)c2)nc(-c2cc(Nc3cccc(C(F)(F)F)c3)nc(-c

{'min': 23.67350077685592, 'max': 226.35277583914765, 'avg': 97.6117735583234, 'median': 83.37933971371737}


 14%|█▍        | 14/100 [00:48<03:07,  2.18s/it][18:39:05] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 27%|██▋       | 27/100 [01:20<02:22,  1.95s/it][18:40:10] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 8 9 24 25
 30%|███       | 30/100 [02:13<08:43,  7.48s/it][18:40:30] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16 17 18 19 20
 39%|███▉      | 39/100 [03:10<06:25,  6.32s/it][18:41:41] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
[18:41:43] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
 45%|████▌     | 45/100 [03:51<05:58,  6.51s/it][18:42:09] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
[18:42:12] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 15 16 17 18 19 20 21 22 23 24 25
 56%|█████▌    | 56/100 [04:41<02:14,  3.06s/it][18:42:59] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 16 17
[18:43:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 1

{'min': 44.51966421472663, 'max': 226.35277583914765, 'avg': 112.46921153976088, 'median': 101.09147343396677}


  6%|▌         | 6/100 [00:13<03:04,  1.96s/it][18:49:53] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 15%|█▌        | 15/100 [02:12<10:57,  7.73s/it][18:55:13] non-ring atom 461 marked aromatic
[18:56:22] non-ring atom 461 marked aromatic
[18:57:32] SMILES Parse Error: extra open parentheses while parsing: Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4ccccc4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)cc(-c2ccccc2)c1O.Nc1cc(S(=O)(=O)Nc2ccc(-c3ccc(N)cc3S(=O)(=O)Nc3ccccc3)cc2)cc2c1-c1cccc(S(=O)(=O)Nc3ccc(-c4ccc(N)cc4S(=O)(=O)Nc4ccccc4)cc3)c1N2.Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4ccccc4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)cc(-c2ccccc2)c1O.Nc1cc(S(=O)(=O)Nc2ccc(-c3ccc(N)cc3S(=O)(=O)Nc3ccccc3)cc2)cc2c1-c1cccc(S(=O)(=O)Nc3ccc(-c4ccc(N)cc4S(=O)(=O)Nc4ccccc4)cc3)c1N2.Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4ccccc4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)cc(-c2ccccc2)c1O.Nc1cc(S(=O)(=O)Nc2ccc(-c3ccc(N)cc3S(=O)(=O)Nc3ccccc3)cc2)cc2c1-c1cccc(S(=O)(=O)Nc3ccc(-c4ccc(N)cc4S(=O)(=O)Nc4ccccc4)cc3)c1N2.Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4ccccc4)c(-c4ccccc4)c

{'min': 63.82112893669073, 'max': 226.35277583914765, 'avg': 126.1112934694906, 'median': 118.49311678440839}


  6%|▌         | 6/100 [00:52<11:20,  7.24s/it][19:17:36] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
[19:17:43] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
  9%|▉         | 9/100 [01:27<13:12,  8.71s/it][19:18:03] Can't kekulize mol.  Unkekulized atoms: 1 2 3 33 34 36 37 38 39
 18%|█▊        | 18/100 [02:44<09:03,  6.62s/it][19:19:13] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
[19:19:16] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
[19:19:19] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 16 17 18 19 20 21
[19:19:21] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 16 17 18 19 20 21
[19:19:24] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 16 17 18 19 20 21
[19:19:28] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 15 16
 19%|█▉        | 19/100 [03:04<13:47, 10.21s/it][19:19:35] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 22%|██▏       | 22/100 [03:23<11:02,  8.49s/it][19:19:53] Can'

{'min': 74.84114499893197, 'max': 226.35277583914765, 'avg': 135.71704820665906, 'median': 131.26204185455856}


  0%|          | 0/100 [00:00<?, ?it/s][19:35:48] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
  3%|▎         | 3/100 [00:18<09:56,  6.14s/it][19:35:53] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
  8%|▊         | 8/100 [00:46<09:06,  5.94s/it][19:36:31] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 16 17 18 19 20 21
 11%|█         | 11/100 [01:07<08:42,  5.88s/it][19:36:50] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 12%|█▏        | 12/100 [01:24<13:34,  9.25s/it][19:37:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 17%|█▋        | 17/100 [01:49<08:54,  6.44s/it][19:37:32] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
 18%|█▊        | 18/100 [02:02<11:01,  8.07s/it][19:38:51] SMILES Parse Error: extra open parentheses while parsing: c1ccc(Nc2cc(-c3ccc(Nc4cc(-c5ccc(Nc6cc(-c7ccc(Nc8cc(-c9ccc(Nc%10cc(-c%11ccc(Nc%12cc(-c%13ccc(Nc%14cc(-c%15ccc(Nc%16cc(-c%17ccc(Nc%18cc(-c%19ccc(Nc%20cc(

{'min': 88.52686944111589, 'max': 233.04558899989934, 'avg': 147.6745481705356, 'median': 142.31099796997054}


  0%|          | 0/100 [00:00<?, ?it/s][19:53:32] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 10%|█         | 10/100 [00:36<06:07,  4.09s/it][19:54:08] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 11%|█         | 11/100 [00:40<06:06,  4.12s/it][19:54:14] Can't kekulize mol.  Unkekulized atoms: 1 2 3 29 30 32 33
 15%|█▌        | 15/100 [02:12<32:53, 23.21s/it][19:57:00] SMILES Parse Error: extra open parentheses while parsing: Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4cc(N)c(-c5ccccc5)c(-c5ccccc5)c4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)cc(-c2ccccc2)c1O.Cc1cc(N)c(C)c(C)c1O.Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4cc(N)c(-c5ccccc5)c(-c5ccccc5)c4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)cc(-c2ccccc2)c1O.Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4cc(N)c(-c5ccccc5)c(-c5ccccc5)c4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)cc(-c2ccccc2)c1O.Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4cc(N)c(-c5ccccc5)c(-c5ccccc5)c4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)cc(-c2ccccc2)c1O.Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4cc(N)c(-c5ccccc5)c(-c5ccccc5)c4)c(-c4ccccc4)c3)c(-c3ccccc3

{'min': 100.91741392519373, 'max': 233.04558899989934, 'avg': 154.20434616204335, 'median': 145.71946828596018}


  0%|          | 0/100 [00:00<?, ?it/s][20:18:50] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
 10%|█         | 10/100 [01:12<08:21,  5.57s/it][20:20:12] Can't kekulize mol.  Unkekulized atoms: 4 5 6 28 50
[20:20:27] Can't kekulize mol.  Unkekulized atoms: 4 5 6 60 67
 11%|█         | 11/100 [02:04<23:56, 16.14s/it][20:20:52] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 13%|█▎        | 13/100 [02:21<17:41, 12.20s/it][20:21:09] SMILES Parse Error: unclosed ring for input: 'c1ccc2c(c1)cc1c3ccccc3c(N)c3c(N)cc4c5ccccc5c(ccc4c23)c1c1cc2ccccc2c(N)c15'
 15%|█▌        | 15/100 [02:47<18:28, 13.04s/it][20:21:36] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 17%|█▋        | 17/100 [02:59<12:51,  9.29s/it][20:21:59] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 9 10
 20%|██        | 20/100 [03:27<11:03,  8.29s/it][20:23:21] SMILES Parse Error: extra open parentheses while parsing: c1ccc2c(c1)sc1

{'min': 109.35727087851915, 'max': 233.04558899989934, 'avg': 157.83248152285935, 'median': 148.60558380074718}


  5%|▌         | 5/100 [00:15<03:50,  2.43s/it][20:45:45] SMILES Parse Error: syntax error while parsing: c1ccc(Nc2cc(-c3ccc(Nc4cc(-c5ccc(Nc6cc(-c7ccc(Nc8cc(-c9ccc(Nc%10cc(-c%11ccc(Nc%12cc(-c%13ccc(N)cc%13)cc%12)cc%11)cc%10)cc%9)cc%8)cc%7)cc%6)cc%5)cc%4)cc%3)cc%2)cc%1)cc2cc(-c3ccccc3)cc(N)c12
[20:45:45] SMILES Parse Error: check for mistakes around position 123:
[20:45:45] %12)cc%11)cc%10)cc%9)cc%8)cc%7)cc%6)cc%5)
[20:45:45] ~~~~~~~~~~~~~~~~~~~~^
[20:45:45] SMILES Parse Error: Failed parsing SMILES 'c1ccc(Nc2cc(-c3ccc(Nc4cc(-c5ccc(Nc6cc(-c7ccc(Nc8cc(-c9ccc(Nc%10cc(-c%11ccc(Nc%12cc(-c%13ccc(N)cc%13)cc%12)cc%11)cc%10)cc%9)cc%8)cc%7)cc%6)cc%5)cc%4)cc%3)cc%2)cc%1)cc2cc(-c3ccccc3)cc(N)c12' for input: 'c1ccc(Nc2cc(-c3ccc(Nc4cc(-c5ccc(Nc6cc(-c7ccc(Nc8cc(-c9ccc(Nc%10cc(-c%11ccc(Nc%12cc(-c%13ccc(N)cc%13)cc%12)cc%11)cc%10)cc%9)cc%8)cc%7)cc%6)cc%5)cc%4)cc%3)cc%2)cc%1)cc2cc(-c3ccccc3)cc(N)c12'
  9%|▉         | 9/100 [00:38<06:20,  4.18s/it][20:46:00] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3

(['Nc1cc(N)c(-c2cc(N)c(-c3cc(N)c(-c4cc(N)c(-c5ccccc5)c(-c5ccccc5)c4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)c(O)c1O',
  'Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4cc(N)c(-c5ccccc5)c(-c5ccccc5)c4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)c(O)c(N)c1O',
  'Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4ccc(N)cc4)c(-c4ccc(N)cc4)c3)c(-c3ccc(N)cc3)c2)cc(-c2ccc(N)cc2)c1O',
  'Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4ccccc4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)cc(-c2ccccc2)c1O.Cc1cc(N)c(C)c(C)c1O',
  'Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4cc(N)c(-c5ccccc5)c(-c5ccccc5)c4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)c(O)c(N)c1',
  'Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4cc(N)c(-c5ccccc5)c(-c5ccccc5)c4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)cc(-c2ccccc2)c1',
  'Nc1cc(N)c(-c2cc(N)c(-c3cc(N)c(-c4cc(N)c(-c5ccccc5)c(-c5ccccc5)c4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)cc1-c1ccccc1',
  'Nc1cc(N)c(-c2cc(N)c(-c3cc(N)c(-c4ccc(N)cc4)c(-c4ccc(N)cc4)c3)c(-c3ccc(N)cc3)c2)c(-c2ccc(N)cc2)c1O',
  'Nc1cc(-c2cc(N)c(-c3cc(N)c(-c4cc(N)c(-c5ccccc5)c(-c5ccccc5)c4)c(-c4ccccc4)c3)c(-c3ccccc3)c2)cc(-c2ccccc2)c1O',
  'Nc1cc(-c2cc(-c3cc(-c4cc(-

In [62]:
# Reload the GA checkpoint and drop every molecule that was already in the seed pool.
new_mols_df = (
    pd.read_csv('gen_algo_cache/population.csv')
    .loc[lambda scored: ~scored['SMILES'].isin(initial_molecules)]
)

In [63]:
# Export the first 20 novel SMILES (in checkpoint order) as the submission file.
(
    new_mols_df.loc[:, 'SMILES']
    .iloc[:20]
    .to_csv('submit.csv', index=False, encoding='UTF-8')
)

In [70]:
import os
import zipfile
# Bundle the submission CSV, both notebooks and the helper modules into one archive.
with zipfile.ZipFile('submission.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Every file is stored flat at the archive root under its base name.
    for member in (
        'submit.csv',
        '4_ml_PDSC_prediction.ipynb',
        '5_rl_genetic_algo.ipynb',
        'src/utils.py',
        'src/llm_interface.py',
    ):
        zipf.write(member, os.path.basename(member))