In [6]:
import pandas as pd
import numpy as np
from IPython.display import display
# import all rdkit needed libraries
import rdkit
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors

In [18]:
def load_data(file):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file
        Location of the CSV; forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table parsed with ``read_csv`` defaults.
    """
    return pd.read_csv(file)

def get_mol(df):
    """Attach a molecule column built from the frame's SMILES strings.

    Each entry of ``df['SMILES']`` is passed to ``rdkit.Chem.MolFromSmiles``
    and the results are stored in a ``'mol'`` column. The frame is modified
    in place, and the same object is returned so calls can be chained.

    NOTE(review): RDKit is documented to return ``None`` for SMILES it cannot
    parse; confirm that downstream code tolerates such rows.
    """
    smiles = df['SMILES']
    df['mol'] = smiles.apply(rdkit.Chem.MolFromSmiles)
    return df

def feature_extraction(df, radius=2, n_bits=124):
    """Add descriptor and fingerprint columns derived from ``df['mol']``.

    The frame is modified in place and also returned so calls can be chained.

    Columns written
    ---------------
    num_atoms        : ``mol.GetNumAtoms()``
    num_heavy_atoms  : ``mol.GetNumHeavyAtoms()``
    exact_mol_wt     : ``Descriptors.ExactMolWt(mol)``
    AI_COO           : ``Descriptors.fr_Al_COO(mol)``
    morgan_fp        : ``AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'mol'`` column of molecule objects.
    radius : int, default 2
        Radius passed to the Morgan fingerprint call.
    n_bits : int, default 124
        Length of the fingerprint bit vector (``nBits``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above added or overwritten.
    """
    mols = df['mol']

    df['num_atoms'] = mols.apply(lambda mol: mol.GetNumAtoms())
    df['num_heavy_atoms'] = mols.apply(lambda mol: mol.GetNumHeavyAtoms())
    df['exact_mol_wt'] = mols.apply(Descriptors.ExactMolWt)
    # The column is spelled 'AI_COO' (capital i) although the descriptor is
    # fr_Al_COO; the name is kept because later code may reference it.
    df['AI_COO'] = mols.apply(Descriptors.fr_Al_COO)
    df['morgan_fp'] = mols.apply(
        lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    )

    return df
# Input CSV locations, relative to the notebook's working directory.
training = 'Resources/training_smiles.csv'
testing = 'Resources/test_smiles.csv'

# Run both files through the same load -> parse SMILES -> featurize pipeline,
# training first and then testing.
df_training, df_testing = (
    feature_extraction(get_mol(load_data(csv_path)))
    for csv_path in (training, testing)
)



## Data Exploration

In [21]:
# Preview the first rows of the training frame, including the derived feature columns.
df_training.head()

Unnamed: 0,INDEX,SMILES,ACTIVE,mol,num_atoms,num_heavy_atoms,exact_mol_wt,AI_COO,morgan_fp
0,1,CC(C)N1CC(=O)C(c2nc3ccccc3[nH]2)=C1N,0.0,<rdkit.Chem.rdchem.Mol object at 0x000002027CD...,19,19,256.132411,0,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..."
1,2,COc1ccc(-c2ccc3c(N)c(C(=O)c4ccc(OC)c(OC)c4)sc3...,0.0,<rdkit.Chem.rdchem.Mol object at 0x000002027CD...,30,30,420.114378,0,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, ..."
2,3,CCc1ccc(C(=O)COC(=O)CCc2nc(=O)c3ccccc3[nH]2)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x000002027CD...,27,27,364.142307,0,"[0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, ..."
3,4,O=C(CN1CCOCC1)Nc1ccc(S(=O)(=O)N2CCCCCC2)cc1,0.0,<rdkit.Chem.rdchem.Mol object at 0x000002027CD...,26,26,381.172227,0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
4,5,C=CCC(Nc1ccccc1)c1ccc(OC)c(OC)c1,0.0,<rdkit.Chem.rdchem.Mol object at 0x000002027CD...,21,21,283.157229,0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, ..."


In [22]:
# Inspect the dtype of every column in the training frame.
df_training.dtypes

INDEX                int64
SMILES              object
ACTIVE             float64
mol                 object
num_atoms            int64
num_heavy_atoms      int64
exact_mol_wt       float64
AI_COO               int64
morgan_fp           object
dtype: object

In [14]:
# Count the non-null entries in each column.
df_training.count()

INDEX              156258
SMILES             156258
ACTIVE             156258
mol                156258
num_atoms          156258
num_heavy_atoms    156258
exact_mol_wt       156258
AI_COO             156258
morgan_fp          156258
dtype: int64

In [23]:
# Class balance of the ACTIVE label.
df_training["ACTIVE"].value_counts()

0.0    154528
1.0      1730
Name: ACTIVE, dtype: int64

The classes are heavily imbalanced (1,730 active vs. 154,528 inactive, i.e. about 1.1% active). Consider a resampling strategy such as the Synthetic Minority Over-sampling Technique (SMOTE), random over-sampling, or random under-sampling.

In [27]:
# Summary statistics for the numeric columns.
df_training.describe()

Unnamed: 0,INDEX,ACTIVE,num_atoms,num_heavy_atoms,exact_mol_wt,AI_COO
count,156258.0,156258.0,156258.0,156258.0,156258.0,156258.0
mean,78129.5,0.011071,24.351892,24.351822,348.31262,0.031307
std,45107.943519,0.104637,5.559917,5.559868,78.296593,0.181454
min,1.0,0.0,2.0,2.0,33.021464,0.0
25%,39065.25,0.0,21.0,21.0,294.121572,0.0
50%,78129.5,0.0,24.0,24.0,344.173607,0.0
75%,117193.75,0.0,28.0,28.0,399.079518,0.0
max,156258.0,1.0,101.0,101.0,1447.4302,4.0


This means that the columns num_atoms, num_heavy_atoms, exact_mol_wt and AI_COO can be normalized. Discretization can probably be done for num_atoms, num_heavy_atoms and exact_mol_wt, but not for AI_COO. There is also a decision to be made regarding whether to keep num_atoms or num_heavy_atoms, since the two columns are nearly identical.

In [28]:
# Number of missing values per column.
df_training.isnull().sum()

INDEX              0
SMILES             0
ACTIVE             0
mol                0
num_atoms          0
num_heavy_atoms    0
exact_mol_wt       0
AI_COO             0
morgan_fp          0
dtype: int64

According to the output above, no column contains null values, so no imputation should be needed.

## Data Preparation

In [31]:
from sklearn.preprocessing import KBinsDiscretizer

# Bin continuous features into 10 equal-width intervals (strategy='uniform').
# `encode` selects the output format and must be 'onehot', 'onehot-dense' or
# 'ordinal' ('uniform' is only a valid value for `strategy`). 'ordinal'
# returns one integer bin index per input feature.
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal',
                               strategy='uniform')

In [32]:
from sklearn.model_selection import train_test_split

# TODO: unsure whether this split is needed, since the data sets are already pre-split.
# Possibly only the second split (into validation/test) is required.

#df1_training= df_training.copy()
#y = df1_training['ACTIVE']
#X = df1_training.drop(columns = 'ACTIVE')

#X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size = 0.8)
#X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)


## Build a binary classifier