# Preprocessing for Smart PMI Random Forest Model

In [1]:
import numpy as np
import pandas as pd
import sys, os
import random
import re

from rdkit import Chem
from rdkit.Chem import PandasTools
from mordred import Calculator, descriptors

## Preprocessing

In [3]:
# -- reproducibility: seed NumPy's and the stdlib's global RNGs
np.random.seed(42)
random.seed(42)

# -- get RDKit featurizers
# names of every property RDKit's `Properties` class reports as available;
# featurize() reuses this list as the column labels of the RDKit block
descriptor_names = list(Chem.rdMolDescriptors.Properties.GetAvailableProperties())
# property calculator built from exactly that list of names
get_descriptors = Chem.rdMolDescriptors.Properties(descriptor_names)
# -- get Mordred featurizers
# calculator over the whole `mordred.descriptors` module, created with ignore_3D=True
calc = Calculator(descriptors, ignore_3D=True)

def smi_to_descriptors(smile):
    """Compute the RDKit property vector for a single SMILES string.

    Parameters
    ----------
    smile : str
        SMILES string handed to ``Chem.MolFromSmiles``.

    Returns
    -------
    numpy.ndarray or list
        Array of the values returned by ``get_descriptors.ComputeProperties``
        when a molecule could be built; an empty list otherwise.
    """
    # -- RDK helper func
    molecule = Chem.MolFromSmiles(smile)
    if not molecule:
        # -- no molecule could be built from this SMILES: nothing to featurize
        return []
    # -- use RDKit featurizers
    return np.array(get_descriptors.ComputeProperties(molecule))

# -- apply molecular featurization on SMILES
def featurize(df, use_rdk = False, use_m = False):
    """Append molecular descriptor columns computed from ``df.SMILES``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``SMILES`` column. It is copied, never modified.
    use_rdk : bool, default False
        Append the RDKit properties named in ``descriptor_names``.
    use_m : bool, default False
        Append the descriptors produced by the Mordred calculator ``calc``.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed column-wise by the requested descriptor blocks, on
        ``df``'s own index. Rows whose SMILES RDKit cannot parse keep their
        position and hold NaN in every descriptor column, so callers can
        find them with ``isna()``.
    """
    df = df.copy()
    smiles = df.SMILES
    append = [df]

    # -- use RDKit featurizers
    if use_rdk:
        rdkit_desc = list(smiles.apply(smi_to_descriptors))
        desc_df = pd.DataFrame(rdkit_desc, columns=descriptor_names)
        # rows were produced in df order; give them df's labels so the
        # column-wise concat pairs them by row instead of by a fresh 0..n-1
        # index (which silently misaligns for any non-default df index)
        desc_df.index = df.index
        append.append(desc_df)
    # -- use Mordred featurizers
    if use_m:
        # parse each SMILES once and remember the position of every
        # molecule RDKit could build
        parsed = [Chem.MolFromSmiles(smi) for smi in smiles]
        valid_pos = [pos for pos, mol in enumerate(parsed) if mol is not None]
        # drop = ['SpAbs_Dt', 'SpMax_Dt', 'SpDiam_Dt', 'SpAD_Dt', 'SpMAD_Dt', 'LogEE_Dt', 'SM1_Dt', 'VE1_Dt', 'VE2_Dt',
        #         'VE3_Dt', 'VR1_Dt', 'VR2_Dt', 'VR3_Dt', 'DetourIndex']
        mord_df = calc.pandas([parsed[pos] for pos in valid_pos]) #.select_dtypes(include=['int64', 'float64'])
        # Mordred only saw the parsable molecules, so its output is shorter
        # than df whenever a SMILES failed. Put every result back on the row
        # it was computed from (unparsable rows become all-NaN) before
        # adopting df's labels; concatenating the short frame directly would
        # shift descriptors onto the wrong molecules.
        mord_df.index = valid_pos
        mord_df = mord_df.reindex(range(len(df)))
        mord_df.index = df.index
        append.append(mord_df)

    return pd.concat(append, axis=1)

# -- find columns uncommon to dfs
def validate_cols(df1, df2):
    """Print the column labels that occur in only one of two frames.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames whose column sets are compared.

    Returns
    -------
    None
        The result is only printed; an empty set means both frames carry
        the same column labels.
    """
    # symmetric difference: labels present in exactly one of the two frames
    uncommon = set(df1.columns) ^ set(df2.columns)
    print('outlying features (empty is good):', uncommon)

In [4]:
# test imports : TODO: preprocess X

## get SMILES data from test
test_smi_path = './data/testdata.sdf'
# read the SDF into a frame (SMILES written to a 'SMILES' column), then keep
# only the identifier and SMILES columns
test_smis = PandasTools.LoadSDF(test_smi_path, smilesName='SMILES')
test_smis = test_smis[['ID', 'SMILES']]

## get attrs from test
attr_path = './data/parsed_attributes.csv'
test_attr = pd.read_csv(attr_path)

## naming
# side-by-side join of the two frames, matched on their row index
test = pd.concat([test_smis, test_attr], axis=1)
# hand-entered values, one per row of `test`, stored as the 'TRUE' column
# (presumably the reference complexity scores -- confirm against the data source)
true_test = [
    3.1, 3.0, 3.1, 3.3, 1.9, 4.0, 3.1,
    4.1, 3.4, 2.5, 3.5, 2.8, 4.2, 2.5,
    3.9, 2.8, 2.9, 3.3, 4.7, 3.9, 2.9,
    4.3, 4.2, 2.9, 3.5, 4.4, 4.3, 3.2,
]
test['TRUE'] = true_test

In [5]:
# -- load training data [train_y, train_x]
train_path = './data/ci5001778_si_001.txt'
train = pd.read_csv(train_path).drop(['MOLECULE', 'Source', 'Votes'], axis=1)

## -- naming
# round the target to 3 decimals in one vectorised call instead of a
# per-element Python lambda
train['meanComplexity'] = train.meanComplexity.round(3)
# strip the two descriptor-family prefixes from the column labels
con_pre = [col.replace('DESCRIPTORCOMPLEXITY_','') for col in train.columns]
con_pre2 = [col.replace('SP3CARBONS_', '') for col in con_pre]
# cleaned labels of every column whose name does not contain 'MOE_2D_'
no_moe_cols = [col for col in con_pre2 if 'MOE_2D_' not in col]
## -- install the cleaned labels; the MOE columns stay in `train` and are
##    left out only where a frame is selected with `no_moe_cols`
train.columns = con_pre2

In [6]:
# featurize the non-MOE training columns with both descriptor sets, then
# record the index labels of every row that has at least one missing value
flag_nulls = featurize(train[no_moe_cols], use_rdk=True, use_m=True)
exclude = flag_nulls[flag_nulls.isna().any(axis=1)].index


def trim(df):
    """Return the rows of ``df`` whose index label is not in ``exclude``.

    ``exclude`` is looked up when the function is called, so every frame
    passed through here is filtered by the same set of labels.
    """
    return df[~df.index.isin(exclude)]

[11:20:08] Explicit valence for atom # 14 N, 4, is greater than permitted
[11:20:08] Explicit valence for atom # 9 N, 4, is greater than permitted
[11:20:08] Explicit valence for atom # 22 N, 4, is greater than permitted
[11:20:08] Explicit valence for atom # 17 N, 4, is greater than permitted
[11:20:09] Explicit valence for atom # 26 N, 4, is greater than permitted
[11:20:09] Explicit valence for atom # 21 N, 4, is greater than permitted
[11:20:09] Explicit valence for atom # 11 N, 4, is greater than permitted
[11:20:09] Explicit valence for atom # 21 N, 4, is greater than permitted
[11:20:09] Explicit valence for atom # 8 N, 4, is greater than permitted
[11:20:09] Explicit valence for atom # 12 N, 4, is greater than permitted
[11:20:09] Explicit valence for atom # 7 N, 4, is greater than permitted
[11:20:09] Explicit valence for atom # 18 N, 4, is greater than permitted
[11:20:09] Explicit valence for atom # 28 N, 4, is greater than permitted
[11:20:09] Explicit valence for atom # 17

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 17%|█▋        | 301/1731 [00:17<02:34,  9.24it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 21%|██        | 362/1731 [00:21<01:54, 11.92it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 32%|███▏      | 556/1731 [00:30<00:53, 22.02it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 40%|████      | 698/1731 [00:39<00:45, 22.55it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 65%|██████▌   | 1129/1731 [01:10<00:43, 13.99it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 74%|███████▍  | 1284/1731 [01:22<00:53,  8.29it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 75%|███████▍  | 1293/1731 [01:23<00:47,  9.30it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 78%|███████▊  | 1357/1731 [01:30<00:41,  8.92it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 1731/1731 [01:53<00:00, 15.30it/s]


In [None]:
# -- target values
test_y = test['TRUE'].to_numpy()
train_y = trim(train)['meanComplexity'].to_numpy()

# same targets rounded to a multiple of 0.5 (double, round, halve)
test_yr = np.round(2 * test_y) / 2
train_yr = np.round(2 * train_y) / 2

# -- feature engineered dfs
# both training frames lose the SMILES string and the two target-statistic
# columns, then the rows flagged in `exclude`
no_moe = trim(
    train[no_moe_cols].drop(columns=['SMILES', 'meanComplexity', 'stdevComplexity'])
)
big_train = trim(
    flag_nulls.drop(columns=['SMILES', 'meanComplexity', 'stdevComplexity'])
)

# featurize the test set the same way and keep exactly big_train's columns,
# in big_train's order
big_test = featurize(test, use_rdk=True, use_m=True)[big_train.columns]

In [None]:
# -- print the shapes of the training feature frame and the training target
#    vector; both were passed through trim(), so presumably their row counts
#    are expected to agree
print(big_train.shape, train_y.shape)