## Overview ##

PubChem is a site run by the NIH which hosts raw data associated with chemical experiments; here we analyze the data hosted at PubChem for assay 1030, which looks for inhibitors of the protein encoding gene ALDH1A1. You can access the page for this assay [here](https://pubchem.ncbi.nlm.nih.gov/bioassay/1030)

## Results ##

We use the SMILES string, a common representation for a molecule amongst chemists, to begin the featurization process. Because the length of this string varies, it is normalized in the form of a Morgan Fingerprint; these are then used to train various binary classifiers

In [1]:
# Exploratory data analysis and visualization

In [2]:
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score
import sys

import warnings
warnings.filterwarnings('ignore')

global_random_state = 42
k_fold_splits = 2

np.random.seed(global_random_state)

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

fh = logging.FileHandler('log_sklearn.txt')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger.addHandler(ch)

class_weights = None


In [3]:
# Load assay info. Note: This CSV was obtained from PubChem bioassay aka PCBA, via searching for AID 1030 
# and downloading the datatable

ba_df = pd.read_csv("AID_1030_datatable_all.csv")

# Load compound info
cs_df = pd.read_csv("AID_1030_compound_smiles.csv",sep='\t',header=0)

# Merge the two
full_df = ba_df.merge(cs_df,on='PUBCHEM_CID')

# Cleanup the compound ID column
full_df["PUBCHEM_CID"] = full_df["PUBCHEM_CID"].astype(int)

# Delete CID 3246048, which fails featurization

compound_ids = list()
smiles_list = list()
fingerprints = list()
activities = list()

#fingerprint_df = 

for index, row in full_df.iterrows() :
    cid = row["PUBCHEM_CID"]
    smiles_string = row["Smiles"]
    mol = Chem.MolFromSmiles(smiles_string)
    is_active = row["PUBCHEM_ACTIVITY_OUTCOME"] == "Active"
    if mol is None:
        logger.info("Molecule failed featurization")
        logger.info(index)
    else: 
        fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,2,nBits=2048,useChirality=False,
                                                                     useBondTypes=False,useFeatures=False)
        
        # From RDKit documentation
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fingerprint, arr)
        fingerprint = arr
        
        compound_ids.append(cid)
        smiles_list.append(smiles_string)
        fingerprints.append(fingerprint)
        activities.append(is_active)
    
    if index % 10000 == 0:
        logger.info("Processed index: {0}".format(index))

fingerprints = np.array(fingerprints)
activities = np.array(activities,dtype=int)
        
logger.info("Sampling")

rus = RandomUnderSampler(random_state=global_random_state)
X, y = rus.fit_sample(fingerprints, activities)

logger.info("Processed all, pickling")

#compound_ids_and_features = (compound_ids, smiles_list, fingerprints, activities)

# Pickle the data to save time in the future
with open('data.classification.undersampled.pickle', 'wb') as f:
    pickle.dump((X,y), f, pickle.HIGHEST_PROTOCOL)

2017-09-24 10:47:07,188 - INFO - Processed index: 0
2017-09-24 10:47:14,685 - INFO - Processed index: 10000
2017-09-24 10:47:21,884 - INFO - Processed index: 20000
2017-09-24 10:47:29,346 - INFO - Processed index: 30000
2017-09-24 10:47:36,966 - INFO - Processed index: 40000
2017-09-24 10:47:43,923 - INFO - Processed index: 50000
2017-09-24 10:47:50,960 - INFO - Processed index: 60000
2017-09-24 10:47:57,810 - INFO - Processed index: 70000
2017-09-24 10:48:05,306 - INFO - Processed index: 80000
2017-09-24 10:48:13,079 - INFO - Processed index: 90000
2017-09-24 10:48:20,616 - INFO - Processed index: 100000
2017-09-24 10:48:27,684 - INFO - Processed index: 110000
2017-09-24 10:48:35,354 - INFO - Processed index: 120000
2017-09-24 10:48:44,147 - INFO - Processed index: 130000
2017-09-24 10:48:52,683 - INFO - Processed index: 140000
2017-09-24 10:49:00,565 - INFO - Processed index: 150000
2017-09-24 10:49:08,363 - INFO - Processed index: 160000
2017-09-24 10:49:16,213 - INFO - Processed in

In [4]:
# Setup imports

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import cross_val_predict
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from collections import Counter

global_random_state = 42

with open('data.classification.undersampled.pickle', 'rb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    (X, y) = pickle.load(f)


In [5]:
skf = StratifiedKFold(n_splits=k_fold_splits,shuffle=True, random_state=global_random_state)

roc_auc_avg = 0

for train_index, test_index in skf.split(X,y) :

    
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    classifier = DecisionTreeClassifier(random_state=global_random_state,class_weight=class_weights)
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    logger.info("Computed roc_auc score of: {}".format(auc))
    logger.info(classification_report(y_test, y_pred))
    roc_auc_avg = roc_auc_avg + auc
    
roc_auc_avg = roc_auc_avg / k_fold_splits
logger.debug("Average roc_auc score is: {}".format(roc_auc_avg))


# Note: Unfortunately it's not directly comparable to ROC_AUC calculated in MoleculeNet at: https://arxiv.org/pdf/1703.00564.pdf 
# This is because MoleculeNet looks at a different metric (roc_auc) and also a different task (multiclass prediction across 128 bioassays simultaneously vs binary classification here)

2017-09-24 10:50:17,334 - INFO - Computed roc_auc score of: 0.6342477656405163
2017-09-24 10:50:17,340 - INFO -              precision    recall  f1-score   support

          0       0.63      0.63      0.63      8056
          1       0.63      0.64      0.63      8056

avg / total       0.63      0.63      0.63     16112

2017-09-24 10:50:30,245 - INFO - Computed roc_auc score of: 0.6350093109869646
2017-09-24 10:50:30,250 - INFO -              precision    recall  f1-score   support

          0       0.64      0.62      0.63      8055
          1       0.63      0.65      0.64      8055

avg / total       0.64      0.64      0.63     16110

2017-09-24 10:50:30,251 - DEBUG - Average roc_auc score is: 0.6346285383137404


In [6]:
# Does an MLP classifier help?

from sklearn.neural_network import MLPClassifier

skf = StratifiedKFold(n_splits=k_fold_splits,shuffle=True,random_state=global_random_state)

roc_auc_avg = 0

for train_index, test_index in skf.split(X,y) :

    
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    classifier = MLPClassifier(random_state=global_random_state)
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    logger.info("Computed roc_auc score of: {}".format(auc))
    logger.info(classification_report(y_test, y_pred))
    roc_auc_avg = roc_auc_avg + auc
    
roc_auc_avg = roc_auc_avg / k_fold_splits
logger.debug("Average roc_auc score is: {}".format(roc_auc_avg))



2017-09-24 11:02:00,506 - INFO - Computed roc_auc score of: 0.6899205561072493
2017-09-24 11:02:00,511 - INFO -              precision    recall  f1-score   support

          0       0.68      0.71      0.70      8056
          1       0.70      0.67      0.68      8056

avg / total       0.69      0.69      0.69     16112

2017-09-24 11:16:08,598 - INFO - Computed roc_auc score of: 0.6929857231533209
2017-09-24 11:16:08,602 - INFO -              precision    recall  f1-score   support

          0       0.70      0.67      0.69      8055
          1       0.69      0.71      0.70      8055

avg / total       0.69      0.69      0.69     16110

2017-09-24 11:16:08,603 - DEBUG - Average roc_auc score is: 0.6914531396302851


In [7]:
# Let's try using a Random forest

skf = StratifiedKFold(n_splits=k_fold_splits,shuffle=True,random_state=global_random_state)

roc_auc_avg = 0

for train_index, test_index in skf.split(X,y) :

    
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    classifier = RandomForestClassifier(n_estimators=100, random_state=global_random_state, n_jobs=-1)
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    logger.info("Computed roc_auc score of: {}".format(auc))
    logger.info(classification_report(y_test, y_pred))
    roc_auc_avg = roc_auc_avg + auc
    
roc_auc_avg = roc_auc_avg / k_fold_splits
logger.info("Average roc_auc score is: {}".format(roc_auc_avg))

2017-09-24 11:16:15,693 - INFO - Computed roc_auc score of: 0.7089126117179742
2017-09-24 11:16:15,697 - INFO -              precision    recall  f1-score   support

          0       0.70      0.74      0.72      8056
          1       0.72      0.68      0.70      8056

avg / total       0.71      0.71      0.71     16112

2017-09-24 11:16:23,796 - INFO - Computed roc_auc score of: 0.7093109869646183
2017-09-24 11:16:23,801 - INFO -              precision    recall  f1-score   support

          0       0.70      0.72      0.71      8055
          1       0.72      0.69      0.70      8055

avg / total       0.71      0.71      0.71     16110

2017-09-24 11:16:23,802 - INFO - Average roc_auc score is: 0.7091117993412962


In [8]:
# Let's try using a hyperparameter tuning package
from hpsklearn import HyperoptEstimator, any_classifier
from hyperopt import tpe


X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=global_random_state)

estim = HyperoptEstimator( classifier=any_classifier('clf'),  
                            algo=tpe.suggest, trial_timeout=300)

# Search the space of classifiers and preprocessing steps and their
# respective hyperparameters in sklearn to fit a model to the data
estim.fit( X_train, y_train )

# Make a prediction using the optimized model
test_label = estim.predict( X_test )

# Report the accuracy of the classifier on a given set of data
score = estim.score( X_test, test_label )

# Return instances of the classifier and preprocessing steps
model = estim.best_model()

estim.fit( X_train, y_train )

logger.info(estim.score(X_test, y_test))
logger.info(estim.best_model())

2017-09-24 11:16:24,199 - INFO - tpe_transform took 0.047793 seconds
2017-09-24 11:16:24,200 - INFO - TPE using 0 trials
2017-09-24 11:21:24,647 - INFO - tpe_transform took 0.045656 seconds
2017-09-24 11:21:24,648 - INFO - TPE using 1/1 trials with best loss inf
2017-09-24 11:26:25,001 - INFO - tpe_transform took 0.046115 seconds
2017-09-24 11:26:25,002 - INFO - TPE using 2/2 trials with best loss inf
2017-09-24 11:28:28,659 - INFO - tpe_transform took 0.045682 seconds
2017-09-24 11:28:28,661 - INFO - TPE using 3/3 trials with best loss 0.319129
2017-09-24 11:32:39,734 - INFO - tpe_transform took 0.129502 seconds
2017-09-24 11:32:39,735 - INFO - TPE using 4/4 trials with best loss 0.319129
2017-09-24 11:37:40,121 - INFO - tpe_transform took 0.043525 seconds
2017-09-24 11:37:40,123 - INFO - TPE using 5/5 trials with best loss 0.319129
2017-09-24 11:42:40,576 - INFO - tpe_transform took 0.046590 seconds
2017-09-24 11:42:40,577 - INFO - TPE using 6/6 trials with best loss 0.319129
2017-09

In [10]:
# Let's take a look at the found model

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(copy=True, feature_range=(-1.0, 1.0))

X = scaler.fit_transform(X)

for train_index, test_index in skf.split(X,y) :

    
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    classifier = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=72, n_jobs=1, oob_score=False, random_state=3,
            verbose=False, warm_start=False)    
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    logger.info("Computed roc_auc score of: {}".format(auc))
    logger.info(classification_report(y_test, y_pred))
    roc_auc_avg = roc_auc_avg + auc
    
roc_auc_avg = roc_auc_avg / k_fold_splits
logger.info("Average roc_auc score is: {}".format(roc_auc_avg))


2017-09-24 13:49:21,987 - INFO - Computed roc_auc score of: 0.7023957298907646
2017-09-24 13:49:21,992 - INFO -              precision    recall  f1-score   support

          0       0.69      0.74      0.71      8056
          1       0.72      0.66      0.69      8056

avg / total       0.70      0.70      0.70     16112

2017-09-24 13:49:35,778 - INFO - Computed roc_auc score of: 0.706455617628802
2017-09-24 13:49:35,782 - INFO -              precision    recall  f1-score   support

          0       0.69      0.74      0.72      8055
          1       0.72      0.67      0.70      8055

avg / total       0.71      0.71      0.71     16110

2017-09-24 13:49:35,783 - INFO - Average roc_auc score is: 1.0589815734304313
