Because of the computational power needed, this code was run in Google Colab.

In [None]:
#Imports needed 
import time 
import selfies
import rdkit
import random
import numpy as np
import random
from rdkit import Chem
from selfies import encoder, decoder
from rdkit.Chem import MolFromSmiles as smi2mol
from rdkit.Chem import AllChem
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from rdkit.Chem import Mol
from rdkit.Chem.AtomPairs.Sheridan import GetBPFingerprint, GetBTFingerprint
from rdkit.Chem.Pharm2D import Generate, Gobbi_Pharm2D
from rdkit.Chem import Draw

from rdkit.Chem import MolToSmiles as mol2smi
from rdkit import RDLogger

In [66]:
# Helper functions needed for the STONED-SELFIES algorithm, adapted from
# https://github.com/aspuru-guzik-group/stoned-selfies/blob/main/stoned_selfies_tut.ipynb

def randomize_smiles(mol):
    """Return a randomly ordered SMILES string for ``mol``.

    Parameters
    ----------
    mol : rdkit.Chem.Mol or None
        Molecule to serialise. Any falsy value yields ``None``.

    Returns
    -------
    str or None
        Output of ``MolToSmiles`` called with ``canonical=False``,
        ``doRandom=True``, ``isomericSmiles=False`` and ``kekuleSmiles=True``,
        or ``None`` when no molecule was supplied.

    Notes
    -----
    ``Chem.Kekulize`` is called directly on the object passed in (no copy is
    made here).
    """
    if mol:
        Chem.Kekulize(mol)
        smiles_options = dict(
            canonical=False,
            doRandom=True,
            isomericSmiles=False,
            kekuleSmiles=True,
        )
        return rdkit.Chem.MolToSmiles(mol, **smiles_options)
    return None


def sanitize_smiles(smi):
    """Parse a SMILES string and return its canonical, non-isomeric form.

    Parameters
    ----------
    smi : str
        SMILES string to parse.

    Returns
    -------
    tuple
        ``(mol, smi_canon, True)`` on success, where ``mol`` is the parsed
        molecule and ``smi_canon`` the canonical SMILES produced with
        ``isomericSmiles=False``; ``(None, None, False)`` when parsing or
        conversion fails.
    """
    try:
        mol = smi2mol(smi, sanitize=True)
        if mol is None:
            # The parser produced no molecule; report failure explicitly
            # instead of relying on the conversion below to raise.
            return (None, None, False)
        smi_canon = mol2smi(mol, isomericSmiles=False, canonical=True)
    except Exception:
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate out of long-running mutation loops.
        return (None, None, False)
    return (mol, smi_canon, True)
    

def get_selfie_chars(selfie):
    """Split a SELFIES string into its bracketed symbols.

    Parameters
    ----------
    selfie : str
        SELFIES string, e.g. ``'[C][=O]'``.

    Returns
    -------
    list of str
        The ``[...]`` symbols in order of appearance. Text outside brackets
        (such as a ``.`` between fragments) is dropped, and scanning stops at
        the first ``[`` that has no closing ``]``.
    """
    chars_selfie = []  # A list of all SELFIES symbols from string selfie
    position = 0
    while True:
        start = selfie.find('[', position)
        if start == -1:
            # No further symbol; trailing unbracketed text is ignored rather
            # than looping forever on it.
            break
        end = selfie.find(']', start)
        if end == -1:
            # Unterminated symbol: nothing more can be extracted.
            break
        chars_selfie.append(selfie[start:end + 1])
        position = end + 1
    return chars_selfie


class _FingerprintCalculator:


    def get_fingerprint(self, mol: Mol, fp_type: str):
        method_name = 'get_' + fp_type
        method = getattr(self, method_name)
        if method is None:
            raise Exception(f'{fp_type} is not a supported fingerprint type.')
        return method(mol)

    def get_AP(self, mol: Mol):
        return AllChem.GetAtomPairFingerprint(mol, maxLength=10)

    def get_PHCO(self, mol: Mol):
        return Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory)

    def get_BPF(self, mol: Mol):
        return GetBPFingerprint(mol)

    def get_BTF(self, mol: Mol):
        return GetBTFingerprint(mol)

    def get_PATH(self, mol: Mol):
        return AllChem.RDKFingerprint(mol)

    def get_ECFP4(self, mol: Mol):
        return AllChem.GetMorganFingerprint(mol, 2)

    def get_ECFP6(self, mol: Mol):
        return AllChem.GetMorganFingerprint(mol, 3)

    def get_FCFP4(self, mol: Mol):
        return AllChem.GetMorganFingerprint(mol, 2, useFeatures=True)

    def get_FCFP6(self, mol: Mol):
        return AllChem.GetMorganFingerprint(mol, 3, useFeatures=True)


def get_fingerprint(mol: Mol, fp_type: str):
    """Compute the ``fp_type`` fingerprint of ``mol``.

    Convenience wrapper that builds a fresh ``_FingerprintCalculator`` and
    forwards both arguments to its ``get_fingerprint`` method.
    """
    calculator = _FingerprintCalculator()
    return calculator.get_fingerprint(mol=mol, fp_type=fp_type)

def mutate_selfie(selfie, max_molecules_len, write_fail_cases=False):
    """Apply one random insert/replace/delete mutation to a SELFIES string.

    A mutation is drawn and applied to the original symbol list repeatedly
    until the mutated string decodes to a SMILES that ``sanitize_smiles``
    accepts, whose canonical form is non-empty, and whose symbol count does
    not exceed ``max_molecules_len``.

    Parameters
    ----------
    selfie : str
        SELFIES string to mutate.
    max_molecules_len : int
        Maximum number of SELFIES symbols allowed in the mutated string.
    write_fail_cases : bool, optional
        When true, mutations that raise during decoding/sanitising (from the
        second attempt onwards) are appended to ``selfie_failure_cases.txt``.

    Returns
    -------
    tuple
        ``(selfie_mutated, smiles_canon)``: the accepted mutated SELFIES
        string and the canonical SMILES it decodes to.
    """
    chars_selfie = get_selfie_chars(selfie)

    # Loop-invariant: build the symbol alphabet and the operation list once.
    alphabet = list(selfies.get_semantic_robust_alphabet())
    choice_ls = [1, 2, 3]  # 1=Insert; 2=Replace; 3=Delete

    fail_counter = 0
    valid = False
    while not valid:
        fail_counter += 1

        random_choice = np.random.choice(choice_ls, 1)[0]
        if not chars_selfie:
            # With no symbols there is nothing to replace or delete, and
            # np.random.randint(0) would raise; only insertion is possible.
            random_choice = 1

        # Insert a character in a random location
        if random_choice == 1:
            random_index = np.random.randint(len(chars_selfie) + 1)
            random_character = np.random.choice(alphabet, size=1)[0]
            selfie_mutated_chars = (
                chars_selfie[:random_index]
                + [random_character]
                + chars_selfie[random_index:]
            )

        # Replace a random character
        elif random_choice == 2:
            random_index = np.random.randint(len(chars_selfie))
            random_character = np.random.choice(alphabet, size=1)[0]
            # Slicing already handles index 0 (the prefix is just empty).
            selfie_mutated_chars = (
                chars_selfie[:random_index]
                + [random_character]
                + chars_selfie[random_index + 1:]
            )

        # Delete a random character
        elif random_choice == 3:
            random_index = np.random.randint(len(chars_selfie))
            selfie_mutated_chars = (
                chars_selfie[:random_index] + chars_selfie[random_index + 1:]
            )

        else:
            raise Exception('Invalid Operation trying to be performed')

        selfie_mutated = "".join(selfie_mutated_chars)

        try:
            smiles = decoder(selfie_mutated)
            mol, smiles_canon, done = sanitize_smiles(smiles)
            if len(selfie_mutated_chars) > max_molecules_len or smiles_canon == "":
                done = False
            valid = done
        except Exception:
            # `Exception` rather than a bare except so Ctrl-C can still
            # interrupt the retry loop.
            valid = False
            if fail_counter > 1 and write_fail_cases:
                sf = "".join(chars_selfie)
                # Context manager guarantees the log file is closed even if
                # the write itself fails.
                with open("selfie_failure_cases.txt", "a+") as f:
                    f.write('Tried to mutate SELFIE: '+str(sf)+' To Obtain: '+str(selfie_mutated) + '\n')

    return (selfie_mutated, smiles_canon)

def get_mutated_SELFIES(selfies_ls, num_mutations):
    """Mutate every SELFIES string in a list ``num_mutations`` times.

    Parameters
    ----------
    selfies_ls : list of str
        SELFIES strings to mutate.
    num_mutations : int
        Number of successive mutation rounds applied to each string.

    Returns
    -------
    list of str
        The mutated SELFIES strings, in the same order as the input. With
        ``num_mutations == 0`` the input list is returned as-is.
    """
    for _round in range(num_mutations):
        # Each string may grow by at most `num_mutations` symbols relative to
        # its length at the start of the current round.
        selfies_ls = [
            mutate_selfie(current, len(get_selfie_chars(current)) + num_mutations)[0]
            for current in selfies_ls
        ]
    return selfies_ls


def get_fp_scores(smiles_back, target_smi, fp_type):
    """Score SMILES strings against a target by Tanimoto similarity.

    Parameters
    ----------
    smiles_back : iterable of str
        SMILES strings to score.
    target_smi : str
        SMILES string of the reference molecule.
    fp_type : str
        Fingerprint type name passed through to ``get_fingerprint``.

    Returns
    -------
    list
        One ``TanimotoSimilarity`` value per entry of ``smiles_back``, in
        input order.
    """
    reference_fp = get_fingerprint(Chem.MolFromSmiles(target_smi), fp_type)

    return [
        TanimotoSimilarity(
            get_fingerprint(Chem.MolFromSmiles(candidate), fp_type),
            reference_fp,
        )
        for candidate in smiles_back
    ]

In [None]:
import pandas as pd
import time
from rdkit import Chem

# Stoned-Selfies algorithm
# For every SMILES in the input CSV: build randomised SMILES orderings,
# encode them as SELFIES, mutate them 1..5 times, decode back to SMILES and
# keep the unique canonical SMILES that sanitise successfully.
#
# Raw strings keep the Windows backslashes literal; sequences such as "\Q" or
# "\P" in a normal string are invalid escapes and trigger warnings on recent
# Python versions.
ligand_bindingbd_csv = r"C:\Quantum_Computing\Project\Data\EFGR\EFGR_Inhibitors_SMILES_DB.csv"
ligand_selfies_new = r"C:\Quantum_Computing\Project\Data\EFGR\EFGR_Inhibitors_SMILES_Selfies_1.csv"

df = pd.read_csv(ligand_bindingbd_csv)

# One dict per successfully processed input row; the "SMILE" value is the
# list of unique canonical SMILES generated from that row.
new_data = []

for index, row in df.iterrows():
    smi = row['SMILE']
    fp_type = 'ECFP4'

    try:
        total_time = time.time()

        num_random_samples = 10
        num_mutation_ls = [1, 2, 3, 4, 5]

        # Check if the SMILES is valid
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            print(f"Skipping invalid SMILES at index {index}: {smi}")
            continue

        randomized_smile_orderings = [randomize_smiles(mol) for _ in range(num_random_samples)]
        selfies_ls = [encoder(x) for x in randomized_smile_orderings]

        all_smiles_collect = []         # flat list over all mutation counts
        all_smiles_collect_broken = []  # same SMILES, grouped per mutation count

        for num_mutations in num_mutation_ls:
            selfies_mut = get_mutated_SELFIES(selfies_ls.copy(), num_mutations=num_mutations)
            smiles_back = [decoder(x) for x in selfies_mut]
            all_smiles_collect.extend(smiles_back)
            all_smiles_collect_broken.append(smiles_back)

        canon_smi_ls = []
        for item in all_smiles_collect:
            # Separate name so the parent molecule in `mol` is not overwritten.
            mutated_mol, smi_canon, did_convert = sanitize_smiles(item)
            if mutated_mol is None or smi_canon == '' or not did_convert:
                print(f"Invalid mutated SMILES encountered: {item}, skipping...")
                continue
            canon_smi_ls.append(smi_canon)

        # Drop duplicates (order is not preserved by the set round-trip).
        canon_smi_ls = list(set(canon_smi_ls))
        new_data.append({"SMILE": canon_smi_ls})

        print(f"Total time for index {index}: {time.time() - total_time}")

    except Exception as e:
        # Best effort: report the failing row and carry on with the next one.
        print(f"Error processing row {index}: {e}")
        continue

# Convert to a DataFrame and save to CSV
df_new = pd.DataFrame(new_data)
df_new.to_csv(ligand_selfies_new, index=False)

print(f"SMILES strings saved to {ligand_selfies_new}")


In [None]:
# Re-formatting the CSV
# Each "SMILE" cell of the input file holds the text form of a Python list of
# SMILES strings; this cell flattens those lists into one SMILES per row.
import ast

ligand_selfies_new = r"C:\Quantum_Computing\Project\Data\EFGR\EFGR_Inhibitors_SMILES_Selfies_1.csv"
df = pd.read_csv(ligand_selfies_new)

data = []
csv_new = r"C:\Quantum_Computing\Project\Data\EFGR\EFGR_Inhibitors_SMILES_Selfies_2.csv"
for entry in df["SMILE"]:
    # Parse the stored list literal instead of splitting on ',' and deleting
    # '[' / ']' characters: brackets are part of SMILES syntax (e.g. "[nH]",
    # "[N+]"), so stripping them corrupts molecules, and the surrounding
    # quote characters would otherwise be left in every string.
    try:
        smile_list = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        print(f"Skipping unparsable entry: {entry}")
        continue

    for smile in smile_list:
        data.append({"SMILE": smile})

# Explicit column so the header is written even when no SMILES were found.
df_2 = pd.DataFrame(data, columns=["SMILE"])

df_2.to_csv(csv_new, index=False)
print(f"SMILES strings saved to {csv_new }")