In [16]:
import os
from rdkit import Chem
import pandas as pd
from tqdm import tqdm

In [17]:
# Project root on the local machine; every other location is derived from it.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept unchanged here because other cells may refer to it.
dir = r'D:/R/PycharmProjects/interest/Tox2024/Tox_3'

# Switch the working directory to the project's python folder
# (presumably so the local `util` package can be imported - confirm).
python_dir = f'{dir}/python'
os.chdir(python_dir)

# Input CSV files and the destination for the augmented output.
csv_dir = f'{dir}/data/CSV'
aug_dir = f'{dir}/data/AUG'

In [18]:
# import SmilesEnumerator
from util.SmilesEnumerator import SmilesEnumerator

In [21]:
# Smoke test: count how many distinct SMILES strings 100 random
# enumerations of a single molecule produce.
sme = SmilesEnumerator(canonical=False, enum=True)
test_smi = "CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2"

# Collect straight into a set so repeated strings are dropped as they arrive.
tries = set()
for i in tqdm(range(100)):
    tries.add(sme.randomize_smiles(test_smi))

tries = list(tries)
print(len(tries))

100%|██████████| 100/100 [00:00<00:00, 4810.09it/s]

69





In [26]:
# SMILES augmentation with a held-out test split.
#
# For every input CSV (columns 'SMILES' and 'ACTIVITY'):
#   * each parsable molecule is expanded into a set of distinct SMILES strings
#     (the original string plus randomised variants from `sme`);
#   * positives (ACTIVITY == 1) aim for `enum_factor_positive` variants, all
#     other rows for a proportionally smaller number so that both classes end
#     up with roughly the same number of augmented rows;
#   * every 10th valid molecule of each class is routed, with all of its
#     variants, to the test file; the rest go to the augmented training file.
#     Keeping all variants of a molecule in the same split avoids leaking a
#     molecule between the two files.
#
# NOTE(review): no random seed is set here, so the generated variants may
# differ between runs - confirm how SmilesEnumerator draws its randomness.

test_pos = 0  # valid positive molecules seen so far; 1 of 10 used for test data
test_neg = 0  # valid negative molecules seen so far; 1 of 10 used for test data
MAX_TRIES = 200  # upper bound on randomisation attempts per molecule
enum_factor_positive = 35  # target number of distinct SMILES per positive molecule
sme = SmilesEnumerator(canonical=False, enum=True)

# Make sure the destination exists before any (slow) enumeration starts.
os.makedirs(aug_dir, exist_ok=True)

# All CSV files are discovered, but the run is deliberately restricted to a
# single file by the override on the next line.
csv_file = [x for x in os.listdir(csv_dir) if x.endswith('.csv')]
csv_file = ['ahr.csv']
for nr in csv_file:
    print(f"Processing {nr}:")
    df = pd.read_csv(os.path.join(csv_dir, nr))

    pos = int((df['ACTIVITY'] == 1).sum())
    neg = int((df['ACTIVITY'] == 0).sum())
    if pos == 0 or neg == 0:
        # Class balancing divides by both counts; a single-class file cannot
        # be balanced, so skip it instead of failing with ZeroDivisionError.
        print(f"Positive: {pos}, Negative: {neg} - skipping {nr}: both classes are required")
        continue

    print(f"Positive: {pos}, Negative: {neg}, Ratio: {neg/pos}")
    # Scale the negative target so that neg * factor == pos * enum_factor_positive.
    enum_factor_negative = enum_factor_positive * (pos/neg)

    print(pos * enum_factor_positive)
    print(neg * enum_factor_negative)

    # Rows are accumulated in plain lists and converted to DataFrames once at
    # the end; growing a DataFrame row by row with pd.concat copies the whole
    # frame on every append.
    aug_rows = []
    test_rows = []

    for smi, act in tqdm(zip(df['SMILES'], df['ACTIVITY']), total=df['SMILES'].shape[0], desc="Enumerating SMILES"):
        # Skip entries RDKit cannot parse (e.g. invalid valences).
        if Chem.MolFromSmiles(smi) is None:
            continue

        target = enum_factor_positive if act == 1 else enum_factor_negative

        # The original string always counts as the first variant.
        tries = {smi}
        for _ in range(MAX_TRIES):
            tries.add(sme.randomize_smiles(smi))
            if len(tries) >= target:
                break

        # Per-class counters decide the split: molecules 0, 10, 20, ... of
        # each class are held out for testing.
        if act == 1:
            held_out = test_pos % 10 == 0
            test_pos += 1
        else:
            held_out = test_neg % 10 == 0
            test_neg += 1

        destination = test_rows if held_out else aug_rows
        destination.extend((variant, act) for variant in tries)

    df_aug = pd.DataFrame(aug_rows, columns=['SMILES', 'ACTIVITY'])
    df_test = pd.DataFrame(test_rows, columns=['SMILES', 'ACTIVITY'])

    # splitext strips only the final extension, so file names that contain
    # additional dots keep their full stem and cannot collide.
    stem = os.path.splitext(nr)[0]
    df_aug.to_csv(os.path.join(aug_dir, f'{stem}_{enum_factor_positive}x_aug.csv'), index=False)
    df_test.to_csv(os.path.join(aug_dir, f'{stem}_{enum_factor_positive}x_test.csv'), index=False)

Processing ahr.csv:
Positive: 950, Negative: 7219, Ratio: 7.598947368421053
33250
33250.0


Enumerating SMILES:  19%|█▉        | 1557/8169 [00:11<00:36, 179.78it/s][23:54:45] Explicit valence for atom # 3 Si, 8, is greater than permitted
Enumerating SMILES:  31%|███       | 2496/8169 [00:16<00:27, 209.51it/s][23:54:50] Explicit valence for atom # 0 Cl, 2, is greater than permitted
Enumerating SMILES:  52%|█████▏    | 4285/8169 [00:28<00:38, 101.24it/s][23:55:03] Explicit valence for atom # 2 Cl, 2, is greater than permitted
Enumerating SMILES:  80%|███████▉  | 6505/8169 [00:41<00:07, 211.67it/s][23:55:15] Explicit valence for atom # 3 Si, 8, is greater than permitted
Enumerating SMILES: 100%|██████████| 8169/8169 [00:48<00:00, 166.98it/s]
