In [5]:
import logging
import multiprocessing
import os
import sys
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
from IPython.display import clear_output
from rdkit import Chem

sys.path.append("../")
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)

In [6]:
# --- Run configuration -------------------------------------------------------
# When True the notebook works on a random sample of SUBSET_SIZE rows instead
# of the complete CID-SMILES table.
USE_SUBSET = True
SUBSET_SIZE = 10_000
# Number of worker processes for the batch conversion: all cores but two,
# never fewer than one. (The name keeps its historical spelling because other
# cells reference it.)
THREAD_COUND = max(1, cpu_count() - 2)

# --- Paths -------------------------------------------------------------------
SOURCE_FILE = "../dev/black_box/notebooks/data/CID-SMILES"
# The working directory can be redirected with the ECDF_DATA_FOLDER environment
# variable; without it the original machine-specific location is used.
DATA_FOLDER = os.environ.get("ECDF_DATA_FOLDER", "/media/julian/ex1ext4_1/temp/")

LOG_FOLDER = os.path.join(DATA_FOLDER, "logs")
SUBSET_DATA = os.path.join(DATA_FOLDER, "CID-SMILES_subset_{}.pckl".format(SUBSET_SIZE))
FULL_DATA = os.path.join(DATA_FOLDER, "CID-SMILES.pckl")
MOL_FOLDER = os.path.join(DATA_FOLDER, "CID-MOLES")

for _folder in (DATA_FOLDER, LOG_FOLDER, MOL_FOLDER):
    os.makedirs(_folder, exist_ok=True)

# --- Logging -----------------------------------------------------------------
logger = logging.getLogger("ecdf_1")
logger.setLevel(logging.DEBUG)
# Re-running this cell must not stack handlers (duplicate log lines). Handlers
# are also closed after removal so the previous log file descriptor is
# released instead of leaking once per re-run.
for h in list(logger.handlers):
    logger.removeHandler(h)
    h.close()

# Everything at DEBUG and above goes to the log file ...
fh = logging.FileHandler(os.path.join(LOG_FOLDER, "ecdf_1.log"))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

# ... and is echoed to the notebook output.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)


In [7]:
def load_df(random_state=None):
    """Load the SMILES table, using pickle caches where possible.

    Lookup order:

    1. The cache for the current mode (``SUBSET_DATA`` when ``USE_SUBSET`` is
       set, otherwise ``FULL_DATA``).
    2. On any failure, the full pickle ``FULL_DATA``.
    3. If that fails too, the tab-separated text file ``SOURCE_FILE`` (first
       column becomes the index, the remaining column is named ``"smiles"``);
       the result is then written to ``FULL_DATA``.

    In subset mode, steps 2/3 are followed by drawing a random sample of
    ``SUBSET_SIZE`` rows, which is written to ``SUBSET_DATA``.

    Parameters
    ----------
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the subset can be made
        reproducible. The default ``None`` keeps the sampling unseeded.

    Returns
    -------
    pandas.DataFrame
        The loaded (and, in subset mode, sampled) table.

    Notes
    -----
    ``pd.read_pickle`` can execute arbitrary code contained in the file; the
    cache paths should only ever point at files written by this notebook.
    """
    try:
        logger.info("load pickle")
        dataset = pd.read_pickle(SUBSET_DATA if USE_SUBSET else FULL_DATA)
        # Only a subset cache has an expected size. It must match exactly,
        # otherwise a stale cache created with a different SUBSET_SIZE (or an
        # oversized one) would be accepted here.
        if USE_SUBSET and len(dataset) != SUBSET_SIZE:
            raise ValueError("wrong size")
    except Exception as e:
        # Cache miss or unusable cache: log it and rebuild from the next source.
        logger.exception(e)
        try:
            logger.info("load bu pickle")
            dataset = pd.read_pickle(FULL_DATA)
        except Exception as e:
            logger.exception(e)
            logger.info("reload textfile")
            dataset = pd.read_csv(SOURCE_FILE, sep="\t", header=None, index_col=0)
            dataset.columns = ["smiles"]
            logger.info("save bu pickle")
            dataset.to_pickle(FULL_DATA)
        if USE_SUBSET:
            # DataFrame.sample raises when n exceeds the number of rows, so
            # fall back to "all rows" for a source smaller than the subset.
            sample_size = min(SUBSET_SIZE, len(dataset))
            if sample_size < SUBSET_SIZE:
                logger.warning(
                    "only {} rows available, subset is smaller than {}".format(
                        sample_size, SUBSET_SIZE
                    )
                )
            dataset = dataset.sample(n=sample_size, random_state=random_state)
            logger.info("save subset pickle")
            dataset.to_pickle(SUBSET_DATA)
    return dataset

In [8]:
%%time
# Reuse a `dataset` that is still alive in the kernel from an earlier run of
# this cell; only load from disk when the name does not exist yet.
try:
    dataset
except NameError:
    dataset = load_df()
# The size check is only meaningful in subset mode. Without the USE_SUBSET
# guard the full table (which never has exactly SUBSET_SIZE rows) was reported
# as "wrong size" and loaded a second time on every run.
if USE_SUBSET and len(dataset) != SUBSET_SIZE:
    logger.error("wrong size")
    dataset = load_df()

CPU times: user 33 µs, sys: 3 µs, total: 36 µs
Wall time: 40.8 µs


In [9]:
def load_mol_file(path):
    """Read the bytes stored at ``path`` and build an RDKit ``Chem.Mol`` from them.

    The file is expected to hold the output of ``Mol.ToBinary()``, which is
    what ``smiles_to_molfile`` writes.
    """
    with open(path, "rb") as handle:
        payload = handle.read()
    return Chem.Mol(payload)

def smiles_to_molfile(smiles,uuid,path,clean=True,load_exists=True):
    """Return the molecule for ``smiles``, caching it as ``<path>/<uuid>.mol``.

    Parameters
    ----------
    smiles : str
        SMILES string used when the molecule has to be (re)built.
    uuid
        Identifier that becomes the file name (``"{uuid}.mol"``).
    path : str
        Directory holding the cached binary molecules.
    clean : bool
        When True the molecule must pass ``has_confomers``; it is tried with
        ``iterations`` 0, then 1, then 10, stopping at the first success.
    load_exists : bool
        When True an already existing file is loaded instead of parsing the
        SMILES again.

    Returns
    -------
    The RDKit molecule.

    Raises
    ------
    ValueError
        ``"cannot generate mol"`` if the SMILES cannot be parsed,
        ``"cannot clean"`` if every ``has_confomers`` attempt fails.

    Notes
    -----
    ``has_confomers`` is not defined in the visible part of this notebook and
    must be in scope before this function is called. NOTE(review): it
    presumably modifies ``mol`` in place when ``iterations`` > 0 — confirm.
    """
    target = os.path.join(path, "{}.mol".format(uuid))

    # Prefer the cached binary when that is allowed and the file is present.
    if load_exists and os.path.exists(target):
        mol = load_mol_file(target)
    else:
        mol = None

    # `needs_write` tracks whether the file on disk is missing or out of date.
    needs_write = mol is None
    if needs_write:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError("cannot generate mol")

    if clean and not has_confomers(mol, iterations=0):
        # The check with zero iterations failed, so the stored molecule (if
        # any) has to be rewritten after the retries below.
        needs_write = True
        # any() short-circuits: the 10-iteration attempt only runs when the
        # 1-iteration attempt failed.
        if not any(has_confomers(mol, iterations=n) for n in (1, 10)):
            raise ValueError("cannot clean")

    if needs_write:
        with open(target, "w+b") as out:
            out.write(mol.ToBinary())
    return mol

In [10]:
def sgar_smiles_to_molfile(arg):
    """Single-argument wrapper around ``smiles_to_molfile`` for ``Pool.map``.

    ``arg`` is the tuple of positional arguments for ``smiles_to_molfile``.
    Returns True when a molecule was produced and False when
    ``smiles_to_molfile`` raised ``ValueError``; any other exception
    propagates to the caller.
    """
    try:
        mol = smiles_to_molfile(*arg)
    except ValueError:
        return False
    return mol is not None
    
def batch_smiles_to_molfile(smiles,uuids,path,clean=True,processes=0,load_exist=True):
    """Convert many SMILES strings to cached mol files, optionally in parallel.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to convert.
    uuids : iterable
        Identifiers paired element-wise with ``smiles``; each becomes the
        file name of the corresponding molecule.
    path : str
        Directory the mol files are written to.
    clean : bool
        Forwarded to ``smiles_to_molfile``.
    processes : int
        Number of worker processes. Values <= 0 select ``THREAD_COUND``;
        1 runs everything serially in the current process.
    load_exist : bool
        Forwarded to ``smiles_to_molfile`` as ``load_exists``.

    Returns
    -------
    numpy.ndarray of bool
        One entry per input pair, True where the conversion succeeded.
    """
    if processes <= 0:
        processes = THREAD_COUND
    logger.info("Using {} cpu(s)".format(processes))

    # Build the per-molecule argument tuples directly. This replaces the former
    # ``[x]*len(smiles)`` padding lists, so the inputs no longer need len().
    jobs = [(smi, uid, path, clean, load_exist) for smi, uid in zip(smiles, uuids)]

    if processes <= 1:
        succs = [sgar_smiles_to_molfile(job) for job in jobs]
    else:
        with Pool(processes=processes) as pool:
            succs = pool.map(sgar_smiles_to_molfile, jobs)

    return np.array(succs, dtype=bool)

In [11]:
%%time
def clean_up(df,redo=False,check_file_exist=True,min_valid_fraction=0.97):
    """Make sure every usable row of ``df`` has a mol file and a ``valid`` flag.

    ``df`` is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``"smiles"`` column; the index supplies the mol file
        names. Index labels are assumed to be unique, since two rows with the
        same label would share one file.
    redo : bool
        Force a full conversion run that ignores existing mol files.
    check_file_exist : bool
        When no full run is needed, verify that the mol file of every valid
        row exists and regenerate the missing ones.
    min_valid_fraction : float
        A full run is triggered when fewer than this fraction of the rows is
        flagged valid.
    """
    needs_full_run = (
        redo
        or "valid" not in df.columns
        or df["valid"].sum() < len(df) * min_valid_fraction
    )
    if needs_full_run:
        df["valid"] = batch_smiles_to_molfile(
            df["smiles"], df.index, MOL_FOLDER, True, processes=0, load_exist=not redo
        )
        return
    if not check_file_exist:
        return

    def _mol_file_exists(uuid):
        return os.path.exists(os.path.join(MOL_FOLDER, "{}.mol".format(uuid)))

    valid_rows = df[df["valid"]]
    exists = valid_rows.index.map(_mol_file_exists).values.astype(bool)
    fm = len(exists) - exists.sum()
    logger.info("{},files missing".format(fm))
    if fm == 0:
        return

    missingdf = valid_rows.loc[~exists]
    if fm < 100:
        # Few files: starting a process pool is not worth it. The wrapper is
        # used so that one unconvertible molecule yields False instead of
        # aborting the whole clean-up, matching the batch path.
        regenerated = [
            sgar_smiles_to_molfile((row["smiles"], uuid, MOL_FOLDER, True, True))
            for uuid, row in missingdf.iterrows()
        ]
    else:
        # `redo` is necessarily False in this branch, so existing files may be
        # loaded.
        regenerated = batch_smiles_to_molfile(
            missingdf["smiles"], missingdf.index, MOL_FOLDER, True, processes=0, load_exist=True
        )
    # Record the outcome so rows whose file could not be rebuilt stop being
    # reported as valid.
    df.loc[missingdf.index, "valid"] = regenerated
        

# Clean-up is only run in subset mode; with USE_SUBSET set to False this cell
# does nothing.
if USE_SUBSET:
    # clean_up works on `dataset` in place (it may add or refresh the "valid"
    # column and regenerate mol files under MOL_FOLDER).
    clean_up(dataset)
    # Write the frame back to the subset cache so the result of the clean-up
    # is available the next time the subset pickle is loaded.
    dataset.to_pickle(SUBSET_DATA)

0,files missing


CPU times: user 29.1 ms, sys: 23.4 ms, total: 52.5 ms
Wall time: 50.4 ms
