In [1]:
import os,sys
# Locate the directory that contains the local ``molNet`` package: start at
# the parent of the current working directory and walk upwards until a
# directory listing contains an entry named "molNet".
modp = os.path.dirname(os.path.abspath(""))
while "molNet" not in os.listdir(modp):
    # A directory that is its own parent is the filesystem root. Test for it
    # only after the listing above, so the root itself is inspected too.
    if os.path.dirname(modp) == modp:
        raise ValueError("connot determine local molNet")
    modp = os.path.dirname(modp)
if modp not in sys.path:
    # Inserting at position 0 gives this directory priority during module
    # resolution; appending the same path a second time would add nothing.
    sys.path.insert(0, modp)

import molNet
import molNet.dataloader.molecular as mol_dl_mod
from molNet.dataloader.molecular.dataloader import MolDataLoader
import inspect
import numpy as np
import pickle
import matplotlib.pyplot as plt

INFO:rdkit:Enabling RDKit 2021.09.2 jupyter extensions


In [2]:
# Folder the feature files are read from:
# <molNet user folder>/autodata/feats_raw_filebased
BASEDIR = os.path.abspath(
    os.path.join(molNet.get_user_folder(), "autodata", "feats_raw_filebased")
)

# Folder all results are written to; "ecdf_data" is resolved against the
# current working directory and created on first use.
OUT_DIR = os.path.abspath("ecdf_data")
os.makedirs(OUT_DIR, exist_ok=True)

In [3]:


# Every class that is reachable through a sub-module of
# ``molNet.dataloader.molecular`` and derives from MolDataLoader
# (the base class itself is excluded).
possible_dataloader = {
    c
    for _, mod in inspect.getmembers(mol_dl_mod, inspect.ismodule)
    for _, c in inspect.getmembers(mod, inspect.isclass)
    if issubclass(c, MolDataLoader) and c != MolDataLoader
}

# Of those, keep the ones that have a directory named "<module>.<class>"
# directly below BASEDIR.
found_dataloader_dirs = set(
    filter(
        os.path.isdir,
        (
            os.path.join(BASEDIR, f"{pdl.__module__}.{pdl.__name__}")
            for pdl in possible_dataloader
        ),
    )
)
found_dataloader_dirs

{'C:\\Users\\be34gof\\.molNet\\autodata\\feats_raw_filebased\\molNet.dataloader.molecular.ESOL.ESOL'}

In [4]:
def _accumulate_value_counts(featpath):
    """Count, per feature column, how often each distinct value occurs.

    Every file in ``featpath`` whose name starts with ``feats_`` and has a
    digit right before its 4-character extension is loaded with ``np.load``.
    Rows listed in the matching ``<name>_ignored_indices.npy`` file are
    dropped and the per-file counts are merged.

    Args:
        featpath: directory containing the feature files.

    Returns:
        list[dict]: one ``{value: count}`` dict per column.
    """
    counter = []
    for fname in os.listdir(featpath):
        if not fname.startswith("feats_"):
            continue
        try:
            # Only names ending in "<digit>.xxx" are data files; this also
            # skips the "*_ignored_indices.npy" companion files.
            int(fname[-5:-4])
        except ValueError:
            continue

        data_array = np.load(os.path.join(featpath, fname))
        ignored_array = np.load(
            os.path.join(featpath, fname[:-4] + "_ignored_indices.npy")
        )
        keep = np.ones(data_array.shape[0], dtype=bool)
        keep[ignored_array] = False
        data_array = data_array[keep]

        for col in range(data_array.shape[1]):
            if len(counter) <= col:
                counter.append({})
            count_dict = counter[col]
            values, counts = np.unique(data_array[:, col], return_counts=True)
            for value, count in zip(values.tolist(), counts.tolist()):
                count_dict[value] = count_dict.get(value, 0) + count
    return counter


def _save_weighted_hist(values, weights, n_bins, path):
    """Draw a weighted histogram of the sorted ``values`` and save it to ``path``."""
    fig, ax = plt.subplots()
    try:
        # ``values`` is sorted, so its first/last entries are min/max.
        ax.hist(values, bins=n_bins, weights=weights, range=(values[0], values[-1]))
        fig.savefig(path)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)


def try_generate_distribution(featpath):
    """Build value-count data and histogram plots for one feature directory.

    The output directory mirrors ``featpath`` relative to ``BASEDIR`` below
    ``OUT_DIR``. Written there are:

    * ``histo_data.pckl`` - pickled list with one
      ``{"data": values, "counts": counts}`` dict per feature column,
    * ``histos/<i>.png`` - histogram with one bin per distinct value,
    * ``histos/<i>_bin30.png`` - histogram with at most 30 bins.

    Columns whose ``<i>_bin30.png`` already exists are not plotted again.
    Columns without any plottable (non-NaN) value are skipped.

    Args:
        featpath: directory containing the ``feats_*`` files of one featurizer.
    """
    outpath = os.path.join(OUT_DIR, os.path.relpath(featpath, BASEDIR))
    os.makedirs(outpath, exist_ok=True)

    np_counter = [
        {
            "data": np.array(list(c.keys())),
            "counts": np.array(list(c.values())),
        }
        for c in _accumulate_value_counts(featpath)
    ]

    with open(os.path.join(outpath, "histo_data.pckl"), "wb") as f:
        pickle.dump(np_counter, f)

    histo_path = os.path.join(outpath, "histos")
    os.makedirs(histo_path, exist_ok=True)
    for i, d in enumerate(np_counter):
        path1 = os.path.join(histo_path, f"{i}.png")
        path2 = os.path.join(histo_path, f"{i}_bin30.png")
        # path2 is written last, so its existence marks a finished column.
        if os.path.exists(path2):
            continue

        # Work on a float copy: the arrays in np_counter stay untouched, and
        # the range arithmetic below also works for bool-typed values (numpy
        # does not support subtraction on booleans).
        values = np.array(d["data"], dtype=float)
        weights = np.asarray(d["counts"])

        # NaN cannot be placed on a histogram axis and would make the
        # requested range non-finite.
        plottable = ~np.isnan(values)
        values, weights = values[plottable], weights[plottable]
        if values.size == 0:
            continue

        # Map +/-inf to positions just outside the finite value range so they
        # still show up as the outermost bars.
        finite = values[np.isfinite(values)]
        if finite.size == 0:
            values[np.isneginf(values)] = -1e32
            values[np.isposinf(values)] = 1e32
        else:
            low, high = finite.min(), finite.max()
            pad = (high - low) * 0.1
            values[np.isneginf(values)] = low - pad
            values[np.isposinf(values)] = high + pad

        order = np.argsort(values)
        values, weights = values[order], weights[order]

        _save_weighted_hist(values, weights, len(weights), path1)
        _save_weighted_hist(values, weights, min(30, len(weights)), path2)

In [76]:
def generate_ecdf(data, resolution_y=None, smooth=False, unique_only=False):
    """Compute the empirical cumulative distribution function of ``data``.

    Args:
        data: array-like of samples. Length-1 axes are squeezed away; if more
            than one axis remains, one ECDF is computed for every index of
            the last axis. Non-finite samples (NaN, +/-inf) are ignored.
        resolution_y: if truthy, the curve is reduced to the points whose
            ``y`` is closest to each of ``resolution_y`` evenly spaced levels
            in [0, 1] (duplicate picks are dropped).
        smooth: if True, tied samples are merged into one point carrying the
            mean of their ECDF steps, and the first/last ``y`` are pinned to
            0 and 1. Implies ``unique_only``.
        unique_only: if True, only the first point of every distinct ``x`` is
            kept.

    Returns:
        ``(x, y)`` - two 1-D arrays of equal length (both empty when there is
        no finite sample), or a list of such tuples for multi-column input.
    """
    data = np.asarray(data)
    if data.ndim > 1:
        data = np.squeeze(data)
        if data.ndim > 1:
            return [
                generate_ecdf(
                    data[..., i],
                    resolution_y=resolution_y,
                    smooth=smooth,
                    unique_only=unique_only,
                )
                for i in range(data.shape[-1])
            ]

    # atleast_1d covers input that squeezed down to a single scalar.
    x = np.sort(np.atleast_1d(data))
    x = x[np.isfinite(x)]
    n = len(x)
    if n == 0:
        # Nothing to build a curve from; the steps below need >= 1 sample.
        return x, np.empty(0, dtype=float)

    y = np.arange(1, n + 1) / n
    if smooth:
        unique_only = True
        x, uindices = np.unique(x, return_index=True)
        # One y per distinct x: the mean over the steps of its tied samples.
        y = np.array([a.mean() for a in np.split(y, uindices[1:])])
        y[0] = 0
        y[-1] = 1
    if resolution_y:
        ylin = np.linspace(0, 1, resolution_y)
        # Builds a (resolution_y, len(y)) distance matrix and takes, per
        # level, the index of the nearest y.
        indices = np.unique(np.abs(np.subtract.outer(ylin, y)).argmin(1))

        y = y[indices]
        x = x[indices]

    if unique_only:
        x, uindices = np.unique(x, return_index=True)
        y = y[uindices]
    return x, y

def _ecdf_cache_is_current(ecdf_data_path, res):
    """Return True if ``ecdf_data_path`` stores a result computed with ``res``."""
    if not os.path.exists(ecdf_data_path):
        return False
    try:
        # NOTE: unpickling can execute arbitrary code - only load files that
        # were produced locally by this notebook.
        with open(ecdf_data_path, "rb") as f:
            cached = pickle.load(f)
    except (pickle.UnpicklingError, EOFError):
        # A truncated/corrupt cache file is treated as "no cache".
        return False
    return (
        isinstance(cached, dict)
        and "resolution" in cached
        and cached["resolution"] == res
    )


def try_generate_ecdf(featpath, res=5000, redo=False):
    """Compute, plot and store smoothed ECDFs for one feature directory.

    Reads ``histo_data.pckl`` from the output directory that mirrors
    ``featpath`` (relative to ``BASEDIR``) below ``OUT_DIR``, expands the
    value counts back into samples and computes a raw and a smoothed ECDF per
    feature column. One plot per column goes to ``escf_plots/<i>.png`` and the
    smoothed curves are pickled to ``ecdf_data.pckl`` as
    ``{"data": [array of shape (2, n), ...], "resolution": res}``.

    Args:
        featpath: feature directory that was processed into ``histo_data.pckl``.
        res: number of y-levels the smoothed ECDF is resampled to.
        redo: recompute even if a stored result with the same ``res`` exists.

    Returns:
        False if there is no histogram data for ``featpath``; True if ECDF
        data is available (freshly computed or already stored).
    """
    outpath = os.path.join(OUT_DIR, os.path.relpath(featpath, BASEDIR))

    if not os.path.isdir(outpath):
        return False
    histo_data_path = os.path.join(outpath, "histo_data.pckl")
    if not os.path.isfile(histo_data_path):
        return False
    escf_plots = os.path.join(outpath, "escf_plots")
    os.makedirs(escf_plots, exist_ok=True)

    ecdf_data_path = os.path.join(outpath, "ecdf_data.pckl")
    if not redo and _ecdf_cache_is_current(ecdf_data_path, res):
        return True

    with open(histo_data_path, "rb") as f:
        histo_data = pickle.load(f)

    featurname = os.path.basename(outpath)
    dataset_name = os.path.basename(os.path.dirname(outpath))

    # NOTE(review): columns without usable data are skipped, so positions in
    # ``ecdf_data`` do not necessarily match column indices - confirm that
    # consumers of ecdf_data.pckl expect this.
    ecdf_data = []
    announced = False
    for i, d in enumerate(histo_data):
        # Skip empty columns and columns that hold only NaN/inf values; an
        # ECDF cannot be built without at least one finite sample.
        if len(d["data"]) == 0 or not np.isfinite(d["data"]).any():
            continue
        if not announced:
            # Report the directory once, on the first column that is processed.
            print(outpath)
            announced = True

        unpacked_data = np.repeat(d["data"], d["counts"], axis=0)
        x1, y1 = generate_ecdf(unpacked_data)
        x3, y3 = generate_ecdf(unpacked_data, smooth=True, resolution_y=res)
        xy = np.concatenate([[x3], [y3]], axis=0)
        ecdf_data.append(xy)

        fig, ax = plt.subplots()
        try:
            ax.plot(x1, y1, label="ecdf")
            ax.plot(xy[0], xy[1], label="smoothed ecdf")
            # The labels above are only visible once a legend is drawn.
            ax.legend()
            ax.set_title(f"{dataset_name}\n{featurname} {i}", fontdict={'fontsize': 8})
            fig.tight_layout()
            fig.savefig(os.path.join(escf_plots, f"{i}.png"), bbox_inches='tight')
        finally:
            # Release the figure even if plotting or saving fails.
            plt.close(fig)

    with open(ecdf_data_path, "wb") as f:
        pickle.dump({"data": ecdf_data, "resolution": res}, f)
    return True

In [78]:
import molNet.featurizer
# Table describing the available molecule featurizers; its index entries are
# used below as sub-directory names inside each dataloader directory.
featurizer = molNet.featurizer.get_molecule_featurizer_info()

for idx in featurizer.index:
    for dldir in found_dataloader_dirs:
        featpath = os.path.join(dldir, idx)
        if not os.path.isdir(featpath):
            # No feature files for this featurizer/dataset pair.
            continue
        # Histogram data first: the ECDF step reads the file it writes.
        try_generate_distribution(featpath)
        try_generate_ecdf(featpath)

In [79]:
# Bare last expression: shows the featurizer info table as the cell output.
featurizer

Unnamed: 0,length,dtype,instance,class,module
molNet.featurizer._manual_molecule_featurizer.molecule_num_fragments,1,<class 'numpy.int32'>,NumFragments_Featurizer,<class 'molNet.featurizer._manual_molecule_fea...,molNet.featurizer._manual_molecule_featurizer
molNet.featurizer._molecule_featurizer.molecule_functional_group_12_Diamines_featurizer,1,<class 'bool'>,"1, 2-Diamines",<class 'molNet.featurizer._molecule_featurizer...,molNet.featurizer._molecule_featurizer
molNet.featurizer._molecule_featurizer.molecule_functional_group_1H_Azirines_Hs_featurizer,1,<class 'bool'>,1H-Azirines (HS),<class 'molNet.featurizer._molecule_featurizer...,molNet.featurizer._molecule_featurizer
molNet.featurizer._molecule_featurizer.molecule_functional_group_1H_Diazirenes_Hs_featurizer,1,<class 'bool'>,1H-Diazirenes (HS),<class 'molNet.featurizer._molecule_featurizer...,molNet.featurizer._molecule_featurizer
molNet.featurizer._molecule_featurizer.molecule_functional_group_1H_Triazirene_Hs_featurizer,1,<class 'bool'>,1H-Triazirene (HS),<class 'molNet.featurizer._molecule_featurizer...,molNet.featurizer._molecule_featurizer
...,...,...,...,...,...
molNet.featurizer._autogen_rdkit_feats_vec_molecule_featurizer.molecule_AllChem_RDKFingerprint_featurizer,2048,<class 'bool'>,Molecule_AllChem_RDKFingerprint_Featurizer,<class 'molNet.featurizer._autogen_rdkit_feats...,molNet.featurizer._autogen_rdkit_feats_vec_mol...
molNet.featurizer._autogen_rdkit_feats_vec_molecule_featurizer.molecule_Chem_LayeredFingerprint_featurizer,2048,<class 'bool'>,Molecule_Chem_LayeredFingerprint_Featurizer,<class 'molNet.featurizer._autogen_rdkit_feats...,molNet.featurizer._autogen_rdkit_feats_vec_mol...
molNet.featurizer._autogen_rdkit_feats_vec_molecule_featurizer.molecule_Chem_PatternFingerprint_featurizer,2048,<class 'bool'>,Molecule_Chem_PatternFingerprint_Featurizer,<class 'molNet.featurizer._autogen_rdkit_feats...,molNet.featurizer._autogen_rdkit_feats_vec_mol...
molNet.featurizer._autogen_rdkit_feats_vec_molecule_featurizer.molecule_Chem_RDKFingerprint_featurizer,2048,<class 'bool'>,Molecule_Chem_RDKFingerprint_Featurizer,<class 'molNet.featurizer._autogen_rdkit_feats...,molNet.featurizer._autogen_rdkit_feats_vec_mol...
