In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold
import time
import pickle
global_random_state = 42
k_fold_splits = 2

In [2]:
df = pd.read_csv("AID_1032_datatable_all.csv")
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activation
0,RESULT_TYPE,,,,,,,FLOAT
1,RESULT_DESCR,,,,,,,Normalized percent activation of the primary s...
2,RESULT_UNIT,,,,,,,PERCENT
3,RESULT_ATTR_CONC_MICROMOL,,,,,,,8
4,1,842121.0,6603008.0,Inactive,6.0,,,6.82


In [3]:
# Merge in the SMILES info for all compound IDs
cs = pd.read_csv("1032_CID_SMILES_Mapping.csv",sep='\t',header=0)
df = df.merge(cs,on="PUBCHEM_CID")
df.head()

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activation,SMILES
0,1,842121.0,6603010.0,Inactive,6.0,,,6.82,CCOCCCNCC(=O)NC1=CC=C(C=C1)OC(F)(F)F.Cl
1,2,842122.0,6602570.0,Inactive,2.0,,,2.45,COCCN1C(=NN=N1)CN2CCC(CC2)CC3=CC=CC=C3.Cl
2,3,842123.0,6602620.0,Inactive,1.0,,,1.41,COCCN1C(=NN=N1)CN2CCC(CC2)(C3=CC(=CC=C3)C(F)(F...
3,4,842124.0,644371.0,Inactive,0.0,,,-5.09,C1CCCN(CC1)CC(=O)NCCC2=CC=C(C=C2)F.C(=O)(C(=O)O)O
4,5,842125.0,6603130.0,Inactive,0.0,,,-2.24,COC1=CC=C(C=C1)C(=O)C(C2=CC=CC=C2)N3CCOCC3.Cl


In [4]:
# Binarize the inactive/active column

df["IS_ACTIVE"] = df["PUBCHEM_ACTIVITY_OUTCOME"] == "Active"
df["IS_ACTIVE"].astype(bool)

df_active = df[df["IS_ACTIVE"] == True]
df_inactive = df[df["IS_ACTIVE"] == False]

df["IS_ACTIVE"].head()

0    False
1    False
2    False
3    False
4    False
Name: IS_ACTIVE, dtype: bool

In [5]:
# Plot histogram of active
%matplotlib inline

print("Active are: {}, Inactive are: {}".format(df_active.count()["PUBCHEM_CID"], df_inactive.count()["PUBCHEM_CID"]))

Active are: 670, Inactive are: 195584


In [7]:
# Now let's calculate fixed-length features which we need to train an ML algorithm

fingerprints = list()
activities = list()

num_parsed = 0
num_active = 0
num_inactive = 0

for index, row in df.iterrows() :
    cid = row["PUBCHEM_CID"]
    smiles_string = row["SMILES"]
    mol = Chem.MolFromSmiles(smiles_string)
    is_active = row["PUBCHEM_ACTIVITY_OUTCOME"] == "Active"
    
    if num_active == 670 and num_inactive >= 670 :
        break
    
    if mol is None:
        print("Molecule failed featurization")
        print(index)
    else: 
        fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,2,nBits=2048,useChirality=False,
                                                                     useBondTypes=False,useFeatures=False)
        # From RDKit documentation
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fingerprint, arr)
        fingerprint = arr
        
        
        
        if(is_active) :
            num_active = num_active + 1
        else :
            num_inactive = num_inactive + 1
            

        fingerprints.append(fingerprint)
        activities.append(is_active)
        
        num_parsed = num_parsed + 1
    
    if index % 10000 == 0:
        print("Processed index: {0}".format(index))

fingerprints = np.array(fingerprints)
activities = np.array(activities,dtype=int)

rus = RandomUnderSampler(random_state=global_random_state)
X, y = rus.fit_sample(fingerprints, activities)

# Pickle the data to save time in the future
with open('data.classification.undersampled.1032.pickle', 'wb') as f:
    pickle.dump((X,y), f, pickle.HIGHEST_PROTOCOL)

Processed index: 0
Processed index: 10000
Processed index: 20000
Processed index: 30000
Processed index: 40000
Processed index: 50000
Processed index: 60000
Processed index: 70000
Processed index: 80000


KeyboardInterrupt: 

In [4]:
with open('data.classification.undersampled.1032.pickle','rb') as f:
    (X,y) = pickle.load(f)

In [10]:
# Note - minor code used from https://github.com/LRParser/pubchem-sklearn/blob/master/pubchem_bioassay_sklearn.ipynb
skf = StratifiedKFold(n_splits=k_fold_splits,shuffle=True,random_state=global_random_state)

roc_auc_avg = 0
avg_predict_time = 0
for train_index, test_index in skf.split(X,y) :

    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    
    print("Number of training samples is: {}".format(len(X_train)))
    print("Number of test samples is: {}".format(len(X_test)))

    classifier = DummyClassifier(random_state=global_random_state)
    classifier.fit(X_train,y_train)
    start = time.time()
    y_pred = classifier.predict(X_test)
    elapsed = time.time() - start
    avg_predict_time = avg_predict_time + elapsed
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    roc_auc_avg = roc_auc_avg + auc
    
avg_predict_time = avg_predict_time / k_fold_splits
roc_auc_avg = roc_auc_avg / k_fold_splits
print("Average roc_auc score of {} folds is: {}".format(k_fold_splits, roc_auc_avg))
print("Average elapsed prediction time over {} folds in s is: {}".format(k_fold_splits, avg_predict_time))

Number of training samples is: 670
Number of test samples is: 670
Number of training samples is: 670
Number of test samples is: 670
Average roc_auc score of 2 folds is: 0.48507462686567165
Average elapsed prediction time over 2 folds in s is: 0.0021752119064331055


In [11]:
roc_auc_avg = 0
avg_predict_time = 0
for train_index, test_index in skf.split(X,y) :

    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    
    print("Number of training samples is: {}".format(len(X_train)))
    print("Number of test samples is: {}".format(len(X_test)))

    classifier = RandomForestClassifier(random_state=global_random_state)
    classifier.fit(X_train,y_train)
    start = time.time()
    y_pred = classifier.predict(X_test)
    elapsed = time.time() - start
    avg_predict_time = avg_predict_time + elapsed
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    roc_auc_avg = roc_auc_avg + auc
    
avg_predict_time = avg_predict_time / k_fold_splits
roc_auc_avg = roc_auc_avg / k_fold_splits
print("Average roc_auc score of {} folds is: {}".format(k_fold_splits, roc_auc_avg))
print("Average elapsed prediction time over {} folds in s is: {}".format(k_fold_splits, avg_predict_time))

Number of training samples is: 670
Number of test samples is: 670
Number of training samples is: 670
Number of test samples is: 670
Average roc_auc score of 2 folds is: 0.6970149253731344
Average elapsed prediction time over 2 folds in s is: 0.01775991916656494
