In [96]:
import sys
sys.path.append("../")
import logging
import pandas as pd
import numpy as np
import molNet
import matplotlib.pyplot as plt
import os
from rdkit import Chem
import multiprocessing
from multiprocessing import Pool, cpu_count
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)

In [2]:
# ---- run configuration -------------------------------------------------
# Work on a random sample of the table instead of every row.
USE_SUBSET = True
SUBSET_SIZE = 10_000
# Number of worker processes: all cores but two, never fewer than one.
THREAD_COUND = max(1, cpu_count() - 2)
STEPSIZE = int(np.ceil(SUBSET_SIZE / THREAD_COUND))  # 1_000_000

# ---- input / output locations -------------------------------------------
SOURCE_FILE = "../dev/black_box/notebooks/data/CID-SMILES"
DATA_FOLDER = "/media/julian/ex1ext4_1/temp/"

FULL_DATA = os.path.join(DATA_FOLDER, "CID-SMILES.pckl")
SUBSET_DATA = os.path.join(DATA_FOLDER, "CID-SMILES_subset_{}.pckl".format(SUBSET_SIZE))
MOL_FOLDER = os.path.join(DATA_FOLDER, "CID-MOLES")
PROPVEC_FOLDER = os.path.join(DATA_FOLDER, "prop_vecs")
VAL_DIST_FILE = os.path.join(DATA_FOLDER, "val_dist.pckl")
GLF_PARAMS_FILE = os.path.join(DATA_FOLDER, "glf_params.pckl")

# Create every output directory up front so later cells can write freely.
for _folder in (DATA_FOLDER, MOL_FOLDER, PROPVEC_FOLDER):
    os.makedirs(_folder, exist_ok=True)


In [3]:
from molNet.featurizer.molecule_featurizer import (
    molecule_mol_wt,
    molecule_num_atoms,
    molecule_num_rings,
    molecule_asphericity,
)
from molNet.utils.mol.properties import has_confomers
from molNet.featurizer import molecule_featurizer
from molNet.featurizer.molecule_featurizer import *

# Registry mapping a property name to the featurizer callable that computes it.
# get_mol_data_dict() calls each registered callable with a molecule and stores
# the result under the same key.
# NOTE(review): `molecule_autocorr2d` is not in the explicit import list of this
# cell, so it can only come from the star import of
# molNet.featurizer.molecule_featurizer.
to_ecdf={
    "molecule_mol_wt":molecule_mol_wt,
    "molecule_num_atoms":molecule_num_atoms,
    "molecule_num_rings":molecule_num_rings,
    "molecule_asphericity":molecule_asphericity,
    "molecule_autocorr2d":molecule_autocorr2d 
        }

In [4]:
from rdkit.Chem.rdMolDescriptors import CalcNumAtomStereoCenters
# Quick check that an RDKit descriptor can be evaluated on a benzene molecule
# (the result is not used).
testmol = Chem.MolFromSmiles("c1ccccc1")
CalcNumAtomStereoCenters(testmol)

# Add every featurizer whose registered name starts with "molecule_" to the
# registry, overwriting entries that are already present under the same name.
to_ecdf.update(
    {
        name: featurizer
        for name, featurizer in molecule_featurizer._available_featurizer.items()
        if name.startswith("molecule_")
    }
)
#to_ecdf

In [5]:
def load_df():
    """Load the SMILES table, rebuilding whichever cache file is missing.

    Lookup order:

    1. The cached pickle (``SUBSET_DATA`` when ``USE_SUBSET`` is set, otherwise
       ``FULL_DATA``). It is rejected when it holds fewer than ``SUBSET_SIZE``
       rows.
    2. The full pickle ``FULL_DATA``.
    3. The tab-separated text file ``SOURCE_FILE`` (no header, first column is
       the index, the remaining column is named ``"smiles"``); the parsed table
       is saved to ``FULL_DATA``.

    Whenever step 1 fails and ``USE_SUBSET`` is set, ``SUBSET_SIZE`` rows are
    sampled from the full table and saved to ``SUBSET_DATA``.

    Returns:
        pandas.DataFrame: the loaded (or freshly sampled) table.

    Note:
        ``pd.read_pickle`` executes arbitrary code from the file; only point the
        cache paths at files this notebook wrote itself.
    """
    try:
        print("load pickle")
        dataset = pd.read_pickle(SUBSET_DATA if USE_SUBSET else FULL_DATA)
        if len(dataset) < SUBSET_SIZE:
            raise ValueError("wrong size")
    # `except Exception` instead of a bare `except`: a KeyboardInterrupt during
    # the load must stop the cell, not trigger a reload of the full table.
    except Exception:
        try:
            print("load bu pickle")
            dataset = pd.read_pickle(FULL_DATA)
        except Exception:
            print("reload textfile")
            dataset = pd.read_csv(SOURCE_FILE, sep="\t", header=None, index_col=0)
            dataset.columns = ["smiles"]
            print("save bu pickle")
            dataset.to_pickle(FULL_DATA)
        if USE_SUBSET:
            dataset = dataset.sample(n=SUBSET_SIZE)
            print("save subset pickle")
            dataset.to_pickle(SUBSET_DATA)

    return dataset

In [6]:
%%time
# Keep an in-memory `dataset` only if it has exactly SUBSET_SIZE rows.
# On a fresh kernel the name does not exist yet; the resulting NameError is
# caught below and triggers the load as well.
try:
    if len(dataset) != SUBSET_SIZE:
        raise ValueError("wrong size")
except Exception as err:
    print(err)
    dataset = load_df()
len(dataset)

name 'dataset' is not defined
load pickle
CPU times: user 0 ns, sys: 10.8 ms, total: 10.8 ms
Wall time: 141 ms


10000

In [7]:
def load_mol_file(path):
    """Build an RDKit ``Chem.Mol`` from the raw bytes stored in the file at ``path``."""
    with open(path, "rb") as handle:
        payload = handle.read()
    return Chem.Mol(payload)

def smiles_to_molfile(smiles,uuid,path,clean=True,load_exists=True):
    """Return the molecule for ``smiles``, caching it as ``<path>/<uuid>.mol``.

    Args:
        smiles: SMILES string, parsed only when no cached file is used.
        uuid: identifier used as the file stem of the cached molecule.
        path: directory holding the cached ``.mol`` files.
        clean: when true, require ``has_confomers`` to succeed for the molecule,
            trying ``iterations`` of 0, then 1, then 10.
            NOTE(review): ``has_confomers`` comes from molNet; presumably it
            embeds conformers into ``mol`` as a side effect - confirm.
        load_exists: when true, reuse an already existing cached file.

    Returns:
        The molecule (loaded from the cache or freshly parsed).

    Raises:
        ValueError: if the SMILES cannot be parsed ("cannot generate mol") or
            none of the ``has_confomers`` attempts succeeds ("cannot clean").
    """
    target = os.path.join(path, "{}.mol".format(uuid))
    needs_write = False

    use_cache = load_exists and os.path.exists(target)
    mol = load_mol_file(target) if use_cache else None

    if mol is None:
        mol = Chem.MolFromSmiles(smiles)
        needs_write = True
        if mol is None:
            raise ValueError("cannot generate mol")

    if clean and not has_confomers(mol, iterations=0):
        # The molecule had to be touched, so the cached file must be rewritten.
        needs_write = True
        if not any(has_confomers(mol, iterations=attempts) for attempts in (1, 10)):
            raise ValueError("cannot clean")

    if needs_write:
        with open(target, "w+b") as out:
            out.write(mol.ToBinary())
    return mol


In [8]:
def sgar_smiles_to_molfile(arg):
    """Single-argument adapter around ``smiles_to_molfile`` for map-style calls.

    Args:
        arg: sequence of positional arguments forwarded to ``smiles_to_molfile``.

    Returns:
        bool: True when a molecule was produced, False when
        ``smiles_to_molfile`` raised ``ValueError``. Other exceptions propagate.
    """
    try:
        mol = smiles_to_molfile(*arg)
    except ValueError:
        return False
    return mol is not None
    
def batch_smiles_to_molfile(smiles,uuids,path,clean=True,processes=0,load_exist=True):
    """Run ``sgar_smiles_to_molfile`` over paired SMILES / identifiers.

    Args:
        smiles: SMILES strings.
        uuids: identifiers, paired positionally with ``smiles``.
        path: directory for the ``.mol`` files.
        clean: forwarded as the ``clean`` argument of ``smiles_to_molfile``.
        processes: worker count; values <= 0 select ``THREAD_COUND``. With one
            process the work runs in the calling process, otherwise in a
            ``multiprocessing.Pool``.
        load_exist: forwarded as the ``load_exists`` argument.

    Returns:
        numpy.ndarray of bool, one success flag per input pair, in input order.
    """
    if processes <= 0:
        processes = THREAD_COUND
    print("Using {} cpu(s)".format(processes))

    shared_args = (path, clean, load_exist)
    jobs = [(smi, uid) + shared_args for smi, uid in zip(smiles, uuids)]

    if processes <= 1:
        outcomes = list(map(sgar_smiles_to_molfile, jobs))
    else:
        with Pool(processes=processes) as pool:
            outcomes = pool.map(sgar_smiles_to_molfile, jobs)

    return np.array(outcomes, dtype=bool)

In [9]:
%%time
def clean_up(df,redo=False,check_file_exist=True):
    run_since_files_missing=False
    
    if redo or "valid" not in df.columns or df["valid"].sum()<len(df)*0.97:
        succs=batch_smiles_to_molfile(df["smiles"],df.index,MOL_FOLDER,True,processes=0,load_exist=not redo)
        df["valid"]=succs
    elif check_file_exist:
        def _c(uuid):
            return os.path.exists(os.path.join(MOL_FOLDER,"{}.mol".format(uuid)))
        e=df[df["valid"]].index.map(_c).values.astype(bool)
        fm=len(e)-e.sum()
        print(fm,"files missing")
        if fm>0:
            missingdf=df[df["valid"]].loc[~e]
            if fm<100:
                for i,d in missingdf.iterrows():
                    smiles_to_molfile(d["smiles"],i,MOL_FOLDER)
            else:
                batch_smiles_to_molfile(missingdf["smiles"],missingdf.index,MOL_FOLDER,True,processes=0,load_exist=not redo)
        

# Only the sampled subset is cleaned here. clean_up() may write the "valid"
# column into `dataset`, so the frame is pickled again afterwards.
if USE_SUBSET:
    clean_up(dataset)
    dataset.to_pickle(SUBSET_DATA)

0 files missing
CPU times: user 70.4 ms, sys: 115 ms, total: 185 ms
Wall time: 1.06 s


In [56]:
from io import StringIO, BytesIO
# NOTE(review): `pass_on_ex` is not read by any code visible in this notebook;
# possibly a leftover flag - confirm before removing.
pass_on_ex=True

def get_mol_data_dict(mol,skip_errors=False,restrict=None):
    """Evaluate the featurizers registered in ``to_ecdf`` on one molecule.

    Args:
        mol: molecule passed unchanged to every featurizer.
        skip_errors: when true, a featurizer that raises is left out of the
            result; when false, its exception propagates.
        restrict: optional container of property names; only those names are
            evaluated. ``None`` selects every registered featurizer.

    Returns:
        dict mapping property name to the featurizer's return value, in
        registry order.
    """
    wanted = to_ecdf.keys() if restrict is None else restrict
    values = {}
    for name, featurizer in to_ecdf.items():
        if name not in wanted:
            continue
        try:
            values[name] = featurizer(mol)
        except Exception:
            if not skip_errors:
                raise
    return values

def molfile_to_prop_dict(molfile,skip_errors=False, skip_fragmented=True,restrict=None):
    """Load a stored molecule and compute its property dictionary.

    Args:
        molfile: path of the stored molecule, read with ``load_mol_file``.
        skip_errors: forwarded to ``get_mol_data_dict``.
        skip_fragmented: when true, molecules for which
            ``Chem.rdmolops.GetMolFrags`` reports more than one fragment are
            skipped.
        restrict: forwarded to ``get_mol_data_dict``.

    Returns:
        The property dict, or ``None`` for a skipped fragmented molecule.
    """
    mol = load_mol_file(molfile)
    is_skipped = skip_fragmented and len(Chem.rdmolops.GetMolFrags(mol)) > 1
    if is_skipped:
        return None
    return get_mol_data_dict(mol, skip_errors=skip_errors, restrict=restrict)

def molfile_to_prop_file(molfile,propfile,target_size=None,ignore_errors=True,skip_errors=False, skip_fragmented=True):
    """Compute the properties of one stored molecule and save them as an ``.npz`` archive.

    Args:
        molfile: path of the stored molecule.
        propfile: path of the archive to write.
        target_size: expected size of ``propfile`` in bytes. An existing file
            is reused only if its size matches; with ``None`` any existing file
            is reused.
        ignore_errors: when true, a failure while computing the properties
            yields ``False``; when false, the exception propagates.
        skip_errors: forwarded to ``molfile_to_prop_dict``.
        skip_fragmented: forwarded to ``molfile_to_prop_dict``.

    Returns:
        bool: True when ``propfile`` exists afterwards (reused or written),
        False when the molecule was skipped or failed with ``ignore_errors``.

    Raises:
        Exception: whatever the property computation raised, if
            ``ignore_errors`` is false; errors while writing always propagate.
    """
    if os.path.exists(propfile):
        if target_size is None or os.path.getsize(propfile) == target_size:
            return True

    try:
        d = molfile_to_prop_dict(molfile, skip_errors=skip_errors, skip_fragmented=skip_fragmented)
    except Exception:
        if ignore_errors:
            return False
        raise
    if d is None:
        return False

    # Write to a temporary sibling and rename it into place. An interrupted or
    # failed write then never leaves a truncated `propfile` behind that a later
    # call (with target_size=None) would accept as complete.
    tmpfile = "{}.{}.tmp".format(propfile, os.getpid())
    try:
        with open(tmpfile, "w+b") as f:
            np.savez(f, **d)
        os.replace(tmpfile, propfile)
    except BaseException:
        try:
            os.remove(tmpfile)
        except OSError:
            pass
        raise
    return True

def determine_size(df,n_samples=100):
    """Estimate the byte size of one saved property archive.

    Walks the index of ``df``, loads ``<MOL_FOLDER>/<index>.mol`` for each
    entry, computes all registered properties and measures the size of the
    resulting ``np.savez`` archive in memory. Entries are skipped when the mol
    file is missing, the molecule has more than one fragment, or a featurizer
    raises. Sampling stops once ``n_samples`` sizes were collected.

    Args:
        df: DataFrame whose index holds the molecule identifiers.
        n_samples: number of archive sizes to collect before stopping.

    Returns:
        int: median of the collected sizes. Minimum, median, mean and maximum
        are also printed.

    Raises:
        ValueError: if not a single molecule could be sampled.
    """
    samples = []
    # Only the index is needed, so iterate over it directly instead of
    # materialising a Series per row with iterrows().
    for uuid in df.index:
        try:
            mol = load_mol_file(os.path.join(MOL_FOLDER, "{}.mol".format(uuid)))
        except FileNotFoundError:
            continue
        if len(Chem.rdmolops.GetMolFrags(mol)) > 1:
            continue
        try:
            props = get_mol_data_dict(mol)
        except Exception:
            continue

        file_obj = BytesIO()
        np.savez(file_obj, **props)
        samples.append(len(file_obj.getvalue()))
        if len(samples) >= n_samples:
            break

    if not samples:
        raise ValueError("no molecule could be sampled to determine the property file size")
    print(np.min(samples), np.median(samples), np.mean(samples), np.max(samples))
    return int(np.median(samples))

def iter_mol_props_file(molfile,target_size=None):
    """Create the ``.mdt`` property file next to ``molfile`` and classify failures.

    The property file path is ``molfile`` with its extension replaced by
    ``.mdt``. Failures are recognised by the start of their error message:

    * messages in ``keep_prefixes``: the exception is returned, the mol file
      is kept;
    * messages in ``drop_prefixes``: the mol file is deleted (best effort) and
      the exception is returned;
    * anything else: the message is printed.

    Args:
        molfile: path of the stored molecule.
        target_size: forwarded to ``molfile_to_prop_file``.

    Returns:
        The caught exception for a recognised failure, otherwise ``True``.
        NOTE(review): ``True`` is also returned after an unrecognised error and
        when ``molfile_to_prop_file`` itself returns False; callers cannot tell
        these apart from success.
    """
    keep_prefixes = (
        "ERROR: No Gasteiger Partial Charge",
        "too few atoms",
    )
    drop_prefixes = (
        "numStereoCenters called without stereo being assigned",
        "numUnspecifiedStereoCenters called without stereo being assigned",
        "Can't kekulize mol",
        "Explicit valence for atom",
        "Bad pickle format",
    )

    datafile = os.path.splitext(molfile)[0] + ".mdt"
    try:
        molfile_to_prop_file(
            molfile=molfile,
            propfile=datafile,
            target_size=target_size,
            ignore_errors=False,
        )
    except Exception as e:
        message = str(e)
        if message.startswith(drop_prefixes):
            try:
                os.remove(molfile)
            except OSError:
                # Already gone or not removable: deletion is best effort.
                pass
            return e
        if message.startswith(keep_prefixes):
            return e
        print(message)
    return True

In [11]:
# Estimate the byte size of one property archive from rows flagged valid;
# `datasize` is used further down as the `target_size` of a property file.
datasize=determine_size(dataset[dataset["valid"]])
datasize

157066 157066.0 157066.0 157066


157066

In [5]:
# Mol files in MOL_FOLDER that do not yet have a matching .mdt property file.
mol_stems = {name.rsplit(".mol")[0] for name in os.listdir(MOL_FOLDER) if name.endswith(".mol")}
mdt_stems = {name.rsplit(".mdt")[0] for name in os.listdir(MOL_FOLDER) if name.endswith(".mdt")}
rem = [os.path.join(MOL_FOLDER, stem + ".mol") for stem in mol_stems - mdt_stems]

In [18]:
def _iter_mol_props_file(f):
    """Pool worker: build the property file for mol file ``f``.

    Passes the notebook-level ``datasize`` as ``target_size`` and returns the
    result of ``iter_mol_props_file`` so ``pool.map`` can collect it.
    """
    return iter_mol_props_file(f,target_size=datasize)

# Map the wrapper (not the bare function) so that `target_size` is actually
# applied; `res` holds one entry per mol file in `rem`.
with Pool(processes=THREAD_COUND) as pool:
    res = pool.map(_iter_mol_props_file, rem)

float division by zero


In [7]:
# NOTE(review): reading top to bottom, `mdts` is first assigned in the cell
# after this one, so on a fresh kernel this line raises NameError.
len(rem),len(mdts)

(142273, 1957269)

In [66]:
# Full paths of all property files (*.mdt) currently present in MOL_FOLDER.
mdts=[os.path.join(MOL_FOLDER,f) for f in os.listdir(MOL_FOLDER) if f.endswith(".mdt")]

In [94]:
# Memory budget for stacking one block of properties at a time.
MAX_RAM=10 * 2**30 # gb * fac
n_files = len(mdts)

# Estimated bytes needed to hold each property for all files, extrapolated
# from the array sizes in the first property file. The archive is opened in a
# `with` block so its file handle is closed again.
with np.load(mdts[0]) as first_props:
    item_sizes = {
        k: v.size * v.itemsize * n_files
        for k, v in first_props.items()
        if k in to_ecdf
    }

# Drop properties whose stacked .npy file already exists with a size within
# 5% of the estimate.
for item, size in list(item_sizes.items()):
    f = os.path.join(PROPVEC_FOLDER, item + ".npy")
    if os.path.exists(f):
        s = os.path.getsize(f)
        if min(s, size) / max(s, size) > 0.95:
            del item_sizes[item]

# Smallest properties first.
item_sizes = dict(sorted(item_sizes.items(), key=lambda item: item[1]))

# Greedy packing: start a new block whenever the next property would exceed
# the remaining budget (the first property always opens a block).
blocks = []
block_sizes = []
remaining_ram = 0
for n, az in item_sizes.items():
    if not blocks or remaining_ram - az < 0:
        blocks.append([])
        block_sizes.append(0)
        remaining_ram = MAX_RAM
    blocks[-1].append(n)
    block_sizes[-1] += az
    remaining_ram -= az

len(blocks),block_sizes,THREAD_COUND

(4, [9181548879, 10154311572, 8016973824, 4008486912], 26)

In [95]:
# Display the property names assigned to each block.
blocks

[['molecule_fr_sulfone',
  'molecule_fr_ar_n',
  'molecule_fr_ester',
  'molecule_vsa_e_state7',
  'molecule_fr_lactam',
  'molecule_fr_aniline',
  'molecule_radius_of_gyration',
  'molecule_hall_kier_alpha',
  'molecule_fr_nitro_arom_nonortho',
  'molecule_fr_ndealkylation1',
  'molecule_e_state_vsa9',
  'molecule_slog_p_vsa10',
  'molecule_heavy_atom_mol_wt',
  'molecule_pmi2',
  'molecule_chi2v',
  'molecule_fr_nh2',
  'molecule_chi4n',
  'molecule_chi3n',
  'molecule_chi4v',
  'molecule_num_bridgehead_atoms',
  'molecule_smr_vsa8',
  'molecule_pmi3',
  'molecule_num_radical_electrons',
  'molecule_chi1n',
  'molecule_chi0',
  'molecule_peoe_vsa2',
  'molecule_fr_ndealkylation2',
  'molecule_fr_oxazole',
  'molecule_balaban_j',
  'molecule_num_aliphatic_carbocycles',
  'molecule_slog_p_vsa2',
  'molecule_e_state_vsa1',
  'molecule_kappa3',
  'molecule_mol_log_p',
  'molecule_fr_c_s',
  'molecule_slog_p_vsa9',
  'molecule_bcut2d_mwhi',
  'molecule_vsa_e_state1',
  'molecule_pmi1',
  

In [None]:
def load_block(block):
    """Stack the given properties over all property files and save them.

    For every path in the notebook-level ``mdts`` the arrays named in ``block``
    are read; afterwards each property is stacked along a new first axis and
    written to ``<PROPVEC_FOLDER>/<name>.npy``. A file that cannot be read, or
    that lacks any of the requested properties, is skipped for *all*
    properties so that row ``i`` of every saved array refers to the same file.

    Args:
        block: iterable of property names (keys of the ``.mdt`` archives).
    """
    data = {b: [] for b in block}

    # Progress is printed roughly 10^4 times over the whole run, but at least
    # every file; the guard also covers an empty `mdts` (log10(0) is -inf).
    n_total = len(mdts)
    deltaprint = 10 ** max(0, int(np.log10(n_total)) - 4) if n_total else 1

    for i, mdt in enumerate(mdts):
        if i % deltaprint == 0:
            print(i, end="\r")
        try:
            # Read every requested array before appending anything, and close
            # the archive right away.
            with np.load(mdt) as d:
                row = [d[b] for b in block]
        except Exception:
            continue
        for b, value in zip(block, row):
            data[b].append(value)

    for b in block:
        print(b)
        np.save(os.path.join(PROPVEC_FOLDER, b), np.stack(data[b]))
        
# Process one block at a time; each call stacks and saves the properties
# named in that block.
for b in blocks:
    load_block(b)

244100

In [12]:
# Full paths of all stored molecules (*.mol) in MOL_FOLDER.
# NOTE(review): the chained assignment also rebinds `mdts` to this list of
# .mol paths, replacing any list of .mdt paths held under that name; code that
# reads `mdts` afterwards (e.g. load_block) then sees mol files.
mol_files=mdts=[os.path.join(MOL_FOLDER,f) for f in os.listdir(MOL_FOLDER) if f.endswith(".mol")]

In [43]:
# Use the first mol file that loads and featurizes without error as the
# template for the per-file size of every property.
for mf in mol_files:
    try:
        mol = load_mol_file(mf)
        mdict = get_mol_data_dict(mol)
        break
    except Exception:
        pass
else:
    # Without this, `mdict` would be undefined (or stale from an earlier run).
    raise RuntimeError("no mol file in mol_files could be loaded and featurized")

MAX_RAM=14 * 2**30 # gb * fac
# Bytes per file for each property, smallest first.
item_sizes = {k: (v.size * v.itemsize) for k, v in mdict.items() if k in to_ecdf}
item_sizes = dict(sorted(item_sizes.items(), key=lambda item: item[1]))
# The blocks are planned for the mol files, so count those directly instead of
# relying on `mdts` still being bound to the same list.
n_files = len(mol_files)

# Greedy packing: open a new block whenever the next property, held for all
# files, would exceed the remaining budget.
blocks = [[]]
remaining_ram = MAX_RAM
block_index = 0
block_sizes = [0]
for n, az in item_sizes.items():
    tot_size = n_files * az
    if remaining_ram - tot_size < 0:
        block_index += 1
        blocks.append([])
        block_sizes.append(0)
        remaining_ram = MAX_RAM
    blocks[block_index].append(n)
    remaining_ram -= tot_size
    block_sizes[block_index] += tot_size

len(blocks)

6

In [65]:
# Dry run of the progress printing on the first 1000 mol files; only the
# first block is visited and no properties are computed here.
_mol_files = mol_files[:1000]
_magnitude = max(0, int(np.log10(len(_mol_files))))
deltaprint = max(1, 10 ** (_magnitude - 4))
print(deltaprint)
for block in blocks[:1]:
    for i, mf in enumerate(_mol_files):
        if not i % deltaprint:
            print(i, end="\r")

1
999