## Overview ##

PubChem is a site run by the NIH which hosts raw data associated with chemical experiments; here we analyze the data hosted at PubChem for assay 1030, which looks for inhibitors of the protein encoding gene ALDH1A1. You can access the page for this assay [here](https://pubchem.ncbi.nlm.nih.gov/bioassay/1030)

## Results ##

We use the SMILES string, a common representation for a molecule amongst chemists, to begin the featurization process. Because the length of this string varies, it is normalized in the form of a Morgan Fingerprint; these are then used to train various binary classifiers

In [1]:
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score
import sys
import time

import warnings
warnings.filterwarnings('ignore')

global_random_state = 42
k_fold_splits = 2

np.random.seed(global_random_state)

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

fh = logging.FileHandler('log_sklearn.txt')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger.addHandler(ch)

class_weights = None


In [2]:
# Load assay info. Note: This CSV was obtained from PubChem bioassay aka PCBA, via searching for AID 1030 
# and downloading the datatable

ba_df = pd.read_csv("AID_1030_datatable_all.csv")

# Load compound info
cs_df = pd.read_csv("AID_1030_compound_smiles.csv",sep='\t',header=0)

# Merge the two
full_df = ba_df.merge(cs_df,on='PUBCHEM_CID')

# Cleanup the compound ID column
full_df["PUBCHEM_CID"] = full_df["PUBCHEM_CID"].astype(int)

# Delete CID 3246048, which fails featurization
full_df = full_df[full_df["PUBCHEM_CID"] != 3246048]

# Delete all inconclusive results
full_df = full_df[full_df["PUBCHEM_ACTIVITY_OUTCOME"] != "Inconclusive"]

# Delete CID 3246048, which fails featurization

compound_ids = list()
smiles_list = list()
fingerprints = list()
activities = list()

#fingerprint_df = 

for index, row in full_df.iterrows() :
    cid = row["PUBCHEM_CID"]
    smiles_string = row["Smiles"]
    mol = Chem.MolFromSmiles(smiles_string)
    is_active = row["PUBCHEM_ACTIVITY_OUTCOME"] == "Active"
    if mol is None:
        logger.info("Molecule failed featurization")
        logger.info(index)
    else: 
        fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,2,nBits=2048,useChirality=False,
                                                                     useBondTypes=False,useFeatures=False)
        
        # From RDKit documentation
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fingerprint, arr)
        fingerprint = arr
        
        compound_ids.append(cid)
        smiles_list.append(smiles_string)
        fingerprints.append(fingerprint)
        activities.append(is_active)
    
    if index % 10000 == 0:
        logger.info("Processed index: {0}".format(index))

fingerprints = np.array(fingerprints)
activities = np.array(activities,dtype=int)
        
logger.info("Undersampling")

rus = RandomUnderSampler(random_state=global_random_state)
X, y = rus.fit_sample(fingerprints, activities)

# Pickle the data to save time in the future
with open('data.classification.undersampled.pickle', 'wb') as f:
    pickle.dump((X,y), f, pickle.HIGHEST_PROTOCOL)

2017-09-28 16:14:30,370 - INFO - Processed index: 10000
2017-09-28 16:14:36,023 - INFO - Processed index: 30000
2017-09-28 16:14:38,928 - INFO - Processed index: 40000
2017-09-28 16:14:41,862 - INFO - Processed index: 50000
2017-09-28 16:14:44,916 - INFO - Processed index: 60000
2017-09-28 16:14:50,920 - INFO - Processed index: 80000
2017-09-28 16:14:53,836 - INFO - Processed index: 90000
2017-09-28 16:14:56,839 - INFO - Processed index: 100000
2017-09-28 16:14:59,765 - INFO - Processed index: 110000
2017-09-28 16:15:02,805 - INFO - Processed index: 120000
2017-09-28 16:15:05,925 - INFO - Processed index: 130000
2017-09-28 16:15:09,084 - INFO - Processed index: 140000
2017-09-28 16:15:12,216 - INFO - Processed index: 150000
2017-09-28 16:15:18,478 - INFO - Processed index: 170000
2017-09-28 16:15:30,787 - INFO - Processed index: 210000
2017-09-28 16:15:34,424 - INFO - Processed index: 220000
2017-09-28 16:15:34,908 - INFO - Undersampling
2017-09-28 16:15:35,011 - INFO - Compute classes

In [3]:
# Setup imports

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import cross_val_predict
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from collections import Counter

k_fold_splits = 2
global_random_state = 42

with open('data.classification.undersampled.pickle', 'rb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    (X, y) = pickle.load(f)


In [4]:
# First, let's look at the performance of a Dummy Classifier

# What is the performance of a dummy classifier on the test set?

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

import pickle

with open('data.classification.undersampled.pickle', 'rb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    (X, y) = pickle.load(f)

print("Number of samples is: {}".format(len(X)))
    
skf = StratifiedKFold(n_splits=k_fold_splits,shuffle=True,random_state=global_random_state)

roc_auc_avg = 0
avg_predict_time = 0
for train_index, test_index in skf.split(X,y) :

    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    
    print("Number of training samples is: {}".format(len(X_train)))
    print("Number of test samples is: {}".format(len(X_test)))

    classifier = DummyClassifier(random_state=global_random_state)
    classifier.fit(X_train,y_train)
    start = time.time()
    y_pred = classifier.predict(X_test)
    elapsed = time.time() - start
    avg_predict_time = avg_predict_time + elapsed
    #logger.info(classification_report(y_test, y_pred))
    # What is the AUC-ROC score?
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    roc_auc_avg = roc_auc_avg + auc
    
avg_predict_time = avg_predict_time / k_fold_splits
roc_auc_avg = roc_auc_avg / k_fold_splits
logger.info("Average roc_auc score of {} folds is: {}".format(k_fold_splits, roc_auc_avg))
logger.info("Average elapsed prediction time over {} folds in s is: {}".format(k_fold_splits, avg_predict_time))

Number of samples is: 32222
Number of training samples is: 16110
Number of test samples is: 16112
Number of training samples is: 16112
Number of test samples is: 16110
2017-09-28 16:15:41,859 - INFO - Average roc_auc score of 2 folds is: 0.4997517224247154
2017-09-28 16:15:41,860 - INFO - Average elapsed prediction time over 2 folds in s is: 0.02162623405456543


In [5]:
skf = StratifiedKFold(n_splits=k_fold_splits,shuffle=True, random_state=global_random_state)

roc_auc_avg = 0
avg_predict_time = 0

for train_index, test_index in skf.split(X,y) :

    
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    classifier = DecisionTreeClassifier(random_state=global_random_state,class_weight=class_weights)
    classifier.fit(X_train,y_train)
    start = time.time()
    y_pred = classifier.predict(X_test)
    elapsed = time.time() - start
    avg_predict_time = avg_predict_time + elapsed
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    #logger.info("Computed roc_auc score of: {}".format(auc))
    #logger.info(classification_report(y_test, y_pred))
    roc_auc_avg = roc_auc_avg + auc
    
roc_auc_avg = roc_auc_avg / k_fold_splits
logger.info("Average roc_auc score of {} folds is: {}".format(k_fold_splits, roc_auc_avg))
logger.info("Average elapsed prediction time over {} folds in s is: {}".format(k_fold_splits, avg_predict_time))

# Note: As comparison, there is a mean test ROC_AUC calculated in MoleculeNet for PCBA-128 dataset
# of .781 using a logistic regression model
# That however applies to a different task (multiclass prediction across 128 bioassays simultaneously vs binary classification here)

2017-09-28 16:16:04,165 - INFO - Average roc_auc score of 2 folds is: 0.6647324100631397
2017-09-28 16:16:07,042 - INFO - Average elapsed prediction time over 2 folds in s is: 3.502115488052368


In [6]:
# Let's try using a Random forest

skf = StratifiedKFold(n_splits=k_fold_splits,shuffle=True,random_state=global_random_state)

roc_auc_avg = 0
avg_predict_time = 0

for train_index, test_index in skf.split(X,y) :

    
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    classifier = RandomForestClassifier(n_estimators=100, random_state=global_random_state, n_jobs=-1)
    classifier.fit(X_train,y_train)
    start = time.time()
    y_pred = classifier.predict(X_test)
    elapsed = time.time() - start
    avg_predict_time = avg_predict_time + elapsed
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    #logger.info("Computed roc_auc score of: {}".format(auc))
    #logger.info(classification_report(y_test, y_pred))
    roc_auc_avg = roc_auc_avg + auc
    
roc_auc_avg = roc_auc_avg / k_fold_splits
logger.info("Average roc_auc score is: {}".format(roc_auc_avg))
logger.info("Average elapsed prediction time over {} folds in s is: {}".format(k_fold_splits, avg_predict_time))


2017-09-28 16:17:51,513 - INFO - Average roc_auc score is: 0.7425673798309413
2017-09-28 16:17:51,625 - INFO - Average elapsed prediction time over 2 folds in s is: 4.380889177322388


In [7]:
# Let's try using a Logistic Regression
from sklearn.linear_model import LogisticRegression

skf = StratifiedKFold(n_splits=k_fold_splits,shuffle=True,random_state=global_random_state)

roc_auc_avg = 0
avg_predict_time = 0

for train_index, test_index in skf.split(X,y) :

    
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    classifier = LogisticRegression(random_state=global_random_state, n_jobs=-1)
    classifier.fit(X_train,y_train)
    start = time.time()
    y_pred = classifier.predict(X_test)
    elapsed = time.time() - start
    avg_predict_time = avg_predict_time + elapsed
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    roc_auc_avg = roc_auc_avg + auc
    
roc_auc_avg = roc_auc_avg / k_fold_splits
logger.info("Average roc_auc score is: {}".format(roc_auc_avg))
logger.info("Average elapsed prediction time over {} folds in s is: {}".format(k_fold_splits, avg_predict_time))


2017-09-28 16:17:58,592 - INFO - Average roc_auc score is: 0.697101308068844
2017-09-28 16:17:58,593 - INFO - Average elapsed prediction time over 2 folds in s is: 0.23391437530517578


In [12]:
from sklearn.model_selection import RandomizedSearchCV

estimator = RandomForestClassifier(n_estimators=100, random_state=global_random_state, n_jobs=-1)
param_dist = {"n_estimators": [10, 100, 1000],
              "min_samples_split": [1,5, 20, 50],
              "min_samples_leaf": [1,5, 20, 50],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
randomized_search = RandomizedSearchCV(estimator,param_dist, scoring="roc_auc",n_jobs=-1, random_state=global_random_state)
randomized_search.fit(X,y)


AttributeError: 'RandomizedSearchCV' object has no attribute '_best_score'

In [16]:
print("Best found score of: {}".format(randomized_search.best_score_))
print("Best found model: {}".format(randomized_search.best_estimator_))

Best found score of: 0.7287595195687173
Best found model: RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)


In [None]:
# Let's look at the predictions that the best-performing model makes on the 

In [None]:
# Let's try to improve the Random Forest via hyperparameter search