In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from rdkit import Chem
from rdkit.Chem import Draw
import random

In [2]:
def Smiles2Img(smis, size=224, savePath=None):
    '''
    Render a molecule given as a SMILES string to an image file.

    Args:
        smis: SMILES string, e.g. COC1=C(C=CC(=C1)NS(=O)(=O)C)C2=CN=CN3C2=CC=C3
        size: edge length in pixels of a square image, or a (width, height)
            pair. Defaults to 224.
        savePath: destination image path, e.g. E:/a/b/c.png

    Returns:
        True if the image was written; False if no path was given, the
        SMILES could not be parsed, or drawing/saving failed.
    '''
    if savePath is None:
        return False

    # Accept either a single edge length or an explicit (width, height) pair.
    try:
        dims = tuple(size)
    except TypeError:
        dims = (size, size)

    try:
        mol = Chem.MolFromSmiles(smis)
        # RDKit signals an unparsable SMILES by returning None, not by raising.
        if mol is None:
            return False
        Draw.MolToFile(mol, savePath, size=dims)
    except Exception:
        # Best-effort rendering: report failure to the caller, but let
        # KeyboardInterrupt/SystemExit propagate so long runs can be stopped.
        return False
    return True

In [3]:
# Load the positive pool and pull its SMILES strings and labels out as arrays.
df_positive = pd.read_csv('./data/positive.csv')
smiles_positive, y_positive = (
    df_positive[column].values for column in ('smiles', 'label')
)

# Same for the negative pool (file and variable names keep the 'negtive' spelling).
df_negtive = pd.read_csv('./data/negtive.csv')
smiles_negtive, y_negtive = (
    df_negtive[column].values for column in ('smiles', 'label')
)

In [4]:
# Row positions 0..n-1 of each pool, used as the populations to sample from.
idx_positive = list(range(len(smiles_positive)))
idx_negtive = list(range(len(smiles_negtive)))

In [5]:
# Number of samples drawn from each pool per generated dataset: 5, 10, ..., 50.
sample_list = list(range(5, 51, 5))

In [7]:
# Base training split: read the CSV and keep its 'name' column as a plain list.
# (An earlier revision read './data/3CL_enzymatic_activity/train.csv' instead.)
train_data = pd.read_csv('./data/3CL_enzymatic/train.csv')
train_data_list = train_data.loc[:, 'name'].tolist()

In [8]:
# For every sample size, draw that many positive and negative molecules,
# render them to 224x224 PNGs, and write (a) the raw processed table with the
# extra rows appended and (b) a train.csv listing the base training names plus
# the names of the images that rendered successfully.

# Added to every extra sample's id; presumably keeps the new ids clear of the
# ids already present in the raw table -- TODO confirm against the raw data.
INDEX_OFFSET = 20000

# The raw processed table is identical for every sample size, so read it once.
pd_raw = pd.read_csv('./data/processed/3CL_processed_ac.csv')

for sample_num in sample_list:
    # Re-seed on every pass so each sample size is drawn reproducibly,
    # regardless of which other sizes were generated before it.
    random.seed(60)
    train_names = train_data_list.copy()

    idx_p = random.sample(idx_positive, sample_num)
    idx_n = random.sample(idx_negtive, sample_num)

    extra_smiles = np.concatenate([smiles_positive[idx_p], smiles_negtive[idx_n]])
    extra_lables = np.concatenate([y_positive[idx_p], y_negtive[idx_n]])

    # Positive and negative row positions both start at 0, so giving them the
    # same offset lets a positive and a negative molecule share one id (and
    # therefore one PNG file name, with the second render overwriting the
    # first). Shift the negatives past the positive range to keep ids unique.
    extra_idx = np.concatenate([
        np.asarray(idx_p, dtype=np.int64),
        np.asarray(idx_n, dtype=np.int64) + len(idx_positive),
    ]) + INDEX_OFFSET

    pd_extra = pd.DataFrame({
        'smiles': extra_smiles,
        'index': extra_idx,
        'label': extra_lables,
    })

    dir_name = f'3CL2_extra_{sample_num*2}'
    # makedirs creates the dataset dir, 'processed' and 'processed/224' in one
    # call and tolerates existing directories, so the cell can be re-run.
    os.makedirs(f'./data/3CL_extra/{dir_name}/processed/224', exist_ok=True)

    for smiles, idx in zip(pd_extra['smiles'], pd_extra['index']):
        savePath = f'./data/3CL_extra/{dir_name}/processed/224/{idx}.png'
        # Only molecules whose image was actually written join the train list.
        if Smiles2Img(smiles, size=224, savePath=savePath):
            train_names.append(f'{idx}.png')

    outputs = pd.concat([pd_raw, pd_extra])
    outputs.to_csv(f'./data/3CL_extra/{dir_name}/processed/{dir_name}_processed_ac.csv', index=False)

    train_data = pd.DataFrame(data=train_names, columns=['name'])
    train_data.to_csv(f'./data/3CL_extra/{dir_name}/train.csv', index=False)