In [11]:
from rdkit import Chem
from rdkit.Chem import MACCSkeys, AllChem
from rdkit.Avalon import pyAvalonTools as fpAvalon
from rdkit.Chem.AtomPairs import Pairs, Torsions
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.ChemicalFeatures import BuildFeatureFactory
from rdkit.Chem import rdMolDescriptors
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
import numpy as np
from rdkit.Chem import MACCSkeys
from rdkit import Chem, DataStructs
import numpy as np
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, average_precision_score, f1_score
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split

In [12]:
def mol2fp(mol):
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) # rdkit.DataStructs => convert to np
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

def mol2maccs(mol):
    fp = MACCSkeys.GenMACCSKeys(mol) # rdkit.DataStructs => convert to np
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

def mol2rdkit(mol):
    des_list = [x[0] for x in Descriptors._descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(des_list)
    d = calculator.CalcDescriptors(mol)
    return d

In [3]:
def target_bin(data, thresh, target_col):
        t1 = data[target_col] < thresh
        data.loc[t1, target_col] = 0
        t2 = data[target_col] >= thresh
        data.loc[t2, target_col] = 1
        data[target_col] = data[target_col].astype('int64')
        return data


df = pd.read_csv('./Data/Data.csv').drop(['Unnamed: 0'], axis =1)
df['Molecule'] = df['Canomicalsmiles'].apply(Chem.MolFromSmiles)
df0 = target_bin(df, thresh = 7, target_col ='pChEMBL')
df0.head(2)

Unnamed: 0,pChEMBL,Canomicalsmiles,Molecule
0,1,CCc1cc2c(=O)c3c4ccc(C#N)cc4[nH]c3n(C(C)C)c2cc1...,<rdkit.Chem.rdchem.Mol object at 0x7fd317031540>
1,1,CCc1cc2c(=O)c3c4ccc(C#N)cc4[nH]c3n(C(C)C)c2cc1...,<rdkit.Chem.rdchem.Mol object at 0x7fd317031620>


In [None]:
rdk=  df0.copy()
rdk["RDK"] = rdk.Molecule.apply(mol2rdkit)

X = np.stack(rdk.RDK.values)

X_df = pd.DataFrame(X, columns = [x[0] for x in Descriptors._descList])
rdk_df = pd.concat([rdk, X_df], axis = 1).drop(["RDK",'Canomicalsmiles','Molecule'], axis =1)
display(rdk_df.head(2))

train, test = train_test_split(rdk_df, random_state = 42, stratify = maccs['pChEMBL'])
X_train = train.drop(['pChEMBL'], axis =1)
X_test = test.drop(['pChEMBL'], axis =1)
y_train = train['pChEMBL']
y_test = test['pChEMBL']
clf = RandomForestClassifier(random_state=42)

clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))
print('AP', average_precision_score(y_test, clf.predict_proba(X_test)[:,1]))
print('F1', f1_score(y_test, clf.predict(X_test)))


cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
score1 = cross_val_score(clf, X_train, y_train,cv=cv, scoring ='f1')
score2 = cross_val_score(clf, X_train, y_train,cv=cv, scoring ='average_precision')
f1_pd = pd.DataFrame(score1, columns = ['f1'])
ap_pd = pd.DataFrame(score2, columns = ['ap'])
ecfp4_df = pd.concat([f1_pd, ap_pd], axis =1)
display(ecfp4_df.describe())

In [13]:
maccs=  df0.copy()

maccs["maccs"] = maccs.Molecule.apply(mol2maccs)
X = np.stack(maccs.maccs.values)
X_df = pd.DataFrame(X)
maccs0 = pd.concat([maccs, X_df], axis = 1).drop(["maccs",'Canomicalsmiles','Molecule'], axis =1)
display(maccs0.head(2))

train, test = train_test_split(maccs0, random_state = 42, stratify = maccs['pChEMBL'])
X_train = train.drop(['pChEMBL'], axis =1)
X_test = test.drop(['pChEMBL'], axis =1)
y_train = train['pChEMBL']
y_test = test['pChEMBL']
clf = RandomForestClassifier(random_state=42)

clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))
print('AP', average_precision_score(y_test, clf.predict_proba(X_test)[:,1]))
print('F1', f1_score(y_test, clf.predict(X_test)))


cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
score1 = cross_val_score(clf, X_train, y_train,cv=cv, scoring ='f1')
score2 = cross_val_score(clf, X_train, y_train,cv=cv, scoring ='average_precision')
f1_pd = pd.DataFrame(score1, columns = ['f1'])
ap_pd = pd.DataFrame(score2, columns = ['ap'])
maccs_df = pd.concat([f1_pd, ap_pd], axis =1)
display(maccs_df.describe())

Unnamed: 0,pChEMBL,0,1,2,3,4,5,6,7,8,...,157,158,159,160,161,162,163,164,165,166
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0


              precision    recall  f1-score   support

           0       0.89      0.90      0.89       296
           1       0.74      0.72      0.73       120

    accuracy                           0.85       416
   macro avg       0.81      0.81      0.81       416
weighted avg       0.85      0.85      0.85       416

AP 0.8402721617995635
F1 0.73109243697479


Unnamed: 0,f1,ap
count,30.0,30.0
mean,0.744632,0.824059
std,0.054949,0.042725
min,0.657534,0.728728
25%,0.709341,0.793497
50%,0.736878,0.822761
75%,0.78349,0.85227
max,0.891892,0.913469


In [14]:
ecfp=  df0.copy()

ecfp["ecfp0"] = ecfp.Molecule.apply(mol2fp)
X = np.stack(ecfp.ecfp0.values)
X_df = pd.DataFrame(X)
ecfp4 = pd.concat([ecfp, X_df], axis = 1).drop(["ecfp0",'Canomicalsmiles','Molecule'], axis =1)
display(ecfp4.head(2))

train, test = train_test_split(rdk_df, random_state = 42, stratify = maccs['pChEMBL'])
X_train = train.drop(['pChEMBL'], axis =1)
X_test = test.drop(['pChEMBL'], axis =1)
y_train = train['pChEMBL']
y_test = test['pChEMBL']
clf = RandomForestClassifier(random_state=42)

clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))
print('AP', average_precision_score(y_test, clf.predict_proba(X_test)[:,1]))
print('F1', f1_score(y_test, clf.predict(X_test)))


cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
score1 = cross_val_score(clf, X_train, y_train,cv=cv, scoring ='f1')
score2 = cross_val_score(clf, X_train, y_train,cv=cv, scoring ='average_precision')
f1_pd = pd.DataFrame(score1, columns = ['f1'])
ap_pd = pd.DataFrame(score2, columns = ['ap'])
ecfp4_df = pd.concat([f1_pd, ap_pd], axis =1)
display(ecfp4_df.describe())

Unnamed: 0,pChEMBL,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


              precision    recall  f1-score   support

           0       0.91      0.91      0.91       296
           1       0.78      0.78      0.78       120

    accuracy                           0.87       416
   macro avg       0.84      0.84      0.84       416
weighted avg       0.87      0.87      0.87       416

AP 0.8508719186094893
F1 0.775


Unnamed: 0,f1,ap
count,30.0,30.0
mean,0.748052,0.830297
std,0.046742,0.047014
min,0.645161,0.75311
25%,0.718473,0.787986
50%,0.73606,0.834842
75%,0.788519,0.861262
max,0.823529,0.913678
