## Overview ##

PubChem is a site run by the NIH that hosts raw data associated with chemical experiments. Here we analyze the data hosted at PubChem for assay 1030 (AID 1030), which looks for inhibitors of the protein-encoding gene ALDH1A1. You can access the page for this assay [here](https://pubchem.ncbi.nlm.nih.gov/bioassay/1030).

## Results ##

We use the SMILES string, a common representation for a molecule amongst chemists, to begin the featurization process. Because the length of this string varies from molecule to molecule, each string is converted into a fixed-length Morgan fingerprint; these fingerprints are then used to train various binary classifiers.

In [13]:
# Exploratory data analysis (regression)

In [14]:
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

# Shared settings used by the cells below: a single seed for every random
# component and the number of cross-validation folds.
global_random_state = 42
k_fold_splits = 2

# Seed NumPy's global random number generator.
np.random.seed(global_random_state)


# Hard-coded class proportions; presumably the fraction of active compounds in
# AID 1030 -- TODO confirm against the loaded data.
active_pct = 0.073125471
inactive_pct = 1 - active_pct

# Earlier approach, kept for reference: give each class the other class's
# frequency as its weight to account for the imbalance.
#class_weights = { 0: active_pct, 1: inactive_pct }

class_weights = 'balanced'


import logging

# Names used to tag the handlers this cell installs, so that a later run of
# the same cell can recognise and replace them.
_FILE_HANDLER_NAME = 'aid1030_file_handler'
_CONSOLE_HANDLER_NAME = 'aid1030_console_handler'

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# The root logger outlives a single cell execution. Without this cleanup every
# re-run stacks one more file handler and one more console handler on it and
# each message is then written once per run. Only handlers installed by this
# cell are removed; handlers added by anything else are left alone.
for stale_handler in list(logger.handlers):
    if stale_handler.get_name() in (_FILE_HANDLER_NAME, _CONSOLE_HANDLER_NAME):
        logger.removeHandler(stale_handler)
        stale_handler.close()

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Persist the log next to the notebook ...
fh = logging.FileHandler('log_regression.txt')
fh.set_name(_FILE_HANDLER_NAME)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)

# ... and echo it to the console stream as well.
ch = logging.StreamHandler()
ch.set_name(_CONSOLE_HANDLER_NAME)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)

logger.info('This is a test log message.')

2017-09-24 08:47:33,620 - INFO - This is a test log message.
2017-09-24 08:47:33,620 - INFO - This is a test log message.
2017-09-24 08:47:33,620 - INFO - This is a test log message.
2017-09-24 08:47:33,620 - INFO - This is a test log message.


In [15]:
# Build the (X, y) data set for the regression experiments: read the assay
# table and the compound SMILES, turn every molecule into a Morgan fingerprint,
# undersample, and pickle the result for the cells below.

# Load assay info. Note: this CSV was obtained from PubChem BioAssay (aka PCBA)
# by searching for AID 1030 and downloading the data table.

ba_df = pd.read_csv("AID_1030_datatable_all.csv")

# Load compound info (tab-separated, first row is the header)
cs_df = pd.read_csv("AID_1030_compound_smiles.csv",sep='\t',header=0)

# Join assay rows with their SMILES strings on the compound ID
full_df = ba_df.merge(cs_df,on='PUBCHEM_CID')

# Force the compound ID and the activity score columns to integers
full_df["PUBCHEM_CID"] = full_df["PUBCHEM_CID"].astype(int)
full_df["PUBCHEM_ACTIVITY_SCORE"] = full_df["PUBCHEM_ACTIVITY_SCORE"].astype(int)

# Normalize pubchem activity score to 0...1 (assumes the raw score is on a
# 0-100 scale -- TODO confirm)
full_df["PUBCHEM_ACTIVITY_SCORE"] = full_df["PUBCHEM_ACTIVITY_SCORE"] / 100
# NOTE(review): qcut replaces the numeric scores with quantile-bin labels
# (10 bins requested). The loop below stores those labels in `activities`,
# which is later converted with dtype=float -- that conversion looks
# incompatible with bin labels. qcut may also reject the column if many scores
# are identical. Confirm whether this line is intended.
full_df["PUBCHEM_ACTIVITY_SCORE"] = pd.qcut(full_df['PUBCHEM_ACTIVITY_SCORE'], 10)

print(full_df["PUBCHEM_ACTIVITY_SCORE"].head())

# Parallel lists, one entry per successfully featurized compound
compound_ids = list()
smiles_list = list()
fingerprints = list()
activities = list()

for index, row in full_df.iterrows() :
    cid = row["PUBCHEM_CID"]
    smiles_string = row["Smiles"]
    mol = Chem.MolFromSmiles(smiles_string)
    # NOTE(review): is_active is computed but not used anywhere in this cell.
    is_active = row["PUBCHEM_ACTIVITY_OUTCOME"] == "Active"
    activity_score = row["PUBCHEM_ACTIVITY_SCORE"]
    if mol is None:
        # The SMILES string could not be parsed; report the row and skip it
        print("Molecule failed featurization")
        print(index)
    else: 
        # Morgan fingerprint with radius 2 folded into 2048 bits
        fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,2,nBits=2048,useChirality=False,
                                                                     useBondTypes=False,useFeatures=False)

        # Pattern taken from the RDKit documentation: `arr` is only a
        # placeholder that ConvertToNumpyArray fills with the fingerprint bits.
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fingerprint, arr)
        fingerprint = arr

        compound_ids.append(cid)
        smiles_list.append(smiles_string)
        fingerprints.append(fingerprint)
        activities.append(activity_score)

    # Progress message every 10000 rows; runs for skipped molecules too
    if index % 10000 == 0:
        logger.info("Processed index: {0}".format(index))

# Stack the fingerprints into one array and convert the activities to a float
# array (the values are floats, not ints)

X = np.array(fingerprints)
y = np.array(activities,dtype=float)

# NOTE(review): RandomUnderSampler balances discrete class labels; confirm it
# is meaningful for the target built above. Later imbalanced-learn releases
# rename fit_sample to fit_resample -- check against the installed version.
rus = RandomUnderSampler(random_state=global_random_state)
X, y = rus.fit_sample(X, y)

# Pickle the data to save time in the future
with open('data.regression.undersampled.pickle', 'wb') as f:
    pickle.dump((X,y), f, pickle.HIGHEST_PROTOCOL)


2017-09-24 08:47:36,334 - INFO - Processed index: 0
2017-09-24 08:47:36,334 - INFO - Processed index: 0
2017-09-24 08:47:36,334 - INFO - Processed index: 0
2017-09-24 08:47:36,334 - INFO - Processed index: 0
2017-09-24 08:47:43,463 - INFO - Processed index: 10000
2017-09-24 08:47:43,463 - INFO - Processed index: 10000
2017-09-24 08:47:43,463 - INFO - Processed index: 10000
2017-09-24 08:47:43,463 - INFO - Processed index: 10000
2017-09-24 08:47:50,342 - INFO - Processed index: 20000
2017-09-24 08:47:50,342 - INFO - Processed index: 20000
2017-09-24 08:47:50,342 - INFO - Processed index: 20000
2017-09-24 08:47:50,342 - INFO - Processed index: 20000
2017-09-24 08:47:57,480 - INFO - Processed index: 30000
2017-09-24 08:47:57,480 - INFO - Processed index: 30000
2017-09-24 08:47:57,480 - INFO - Processed index: 30000
2017-09-24 08:47:57,480 - INFO - Processed index: 30000
2017-09-24 08:48:04,845 - INFO - Processed index: 40000
2017-09-24 08:48:04,845 - INFO - Processed index: 40000
2017-09-

Molecule failed featurization
218052


2017-09-24 08:50:16,370 - INFO - Processed index: 220000
2017-09-24 08:50:16,370 - INFO - Processed index: 220000
2017-09-24 08:50:16,370 - INFO - Processed index: 220000
2017-09-24 08:50:16,370 - INFO - Processed index: 220000


In [16]:
# First we look at using a DecisionTreeRegressor, but since the performance on active compounds is so poor, we don't pursue it further.

import pickle
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from collections import Counter
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.model_selection import KFold, train_test_split

# Cross-validate a DecisionTreeRegressor on the pickled fingerprints and also
# report how well its thresholded predictions work as an active/inactive
# classifier.

# Release the intermediate lists from the featurization cell; only the pickled
# (X, y) arrays are used from here on.
smiles_list = None
compound_ids = None
fingerprints = None
activities = None

global_random_state = 42

with open('data.regression.undersampled.pickle', 'rb') as f:
    # Load the (X, y) tuple written by the featurization cell.
    (X, y) = pickle.load(f)

# Print the number of compounds loaded
logger.info("Successfully loaded {0} compounds.".format(len(X)))

# KFold only generates index splits and accepts no class_weight argument;
# passing one raises a TypeError before any fold is run.
kf = KFold(n_splits=k_fold_splits, shuffle=True, random_state=global_random_state)

# Score at or above which a compound is treated as active when the regression
# output is reused as a classifier (value taken from the original cell).
activity_threshold = .4

fold_mses = []

for train_index, test_index in kf.split(X, y):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # DecisionTreeRegressor has no class_weight parameter (that option exists
    # on the classifier variants only), so it is not passed here.
    classifier = DecisionTreeRegressor(random_state=global_random_state)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    logger.info("Computed mse score of: {}".format(mse))
    fold_mses.append(mse)

    # Threshold both predictions and ground truth to get binary labels.
    y_pred_binary = y_pred > activity_threshold
    y_test_binary = y_test > activity_threshold

    logger.info("How good is it as a classifier?")
    logger.info(classification_report(y_test_binary, y_pred_binary))

# Average over the folds that were actually evaluated.
mse_avg = sum(fold_mses) / len(fold_mses)
logger.info("Average mse score is: {}".format(mse_avg))

# Note: Unfortunately this is not directly comparable to the ROC AUC reported by MoleculeNet at: https://arxiv.org/pdf/1703.00564.pdf
# MoleculeNet uses a different metric (roc_auc) and a different task (multiclass prediction across 128 bioassays simultaneously vs binary classification here).

2017-09-24 08:50:44,536 - INFO - Successfully loaded 220364 compounds.
2017-09-24 08:50:44,536 - INFO - Successfully loaded 220364 compounds.
2017-09-24 08:50:44,536 - INFO - Successfully loaded 220364 compounds.
2017-09-24 08:50:44,536 - INFO - Successfully loaded 220364 compounds.
2017-09-24 08:53:20,142 - INFO - Computed mse score of: 0.03150658092444017
2017-09-24 08:53:20,142 - INFO - Computed mse score of: 0.03150658092444017
2017-09-24 08:53:20,142 - INFO - Computed mse score of: 0.03150658092444017
2017-09-24 08:53:20,142 - INFO - Computed mse score of: 0.03150658092444017
2017-09-24 08:53:20,147 - INFO - How good is it as a classifier?
2017-09-24 08:53:20,147 - INFO - How good is it as a classifier?
2017-09-24 08:53:20,147 - INFO - How good is it as a classifier?
2017-09-24 08:53:20,147 - INFO - How good is it as a classifier?
2017-09-24 08:53:20,169 - INFO -              precision    recall  f1-score   support

      False       0.95      0.95      0.95    103101
       True 

In [17]:
# Sanity check: print the name of the backend Keras is running on.
import keras
print(keras.backend.backend())

Using TensorFlow backend.


tensorflow


In [18]:
# What about a deep neural network?
# Sample code from: https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/

from keras.models import Sequential
from keras.layers import Dense
from keras import metrics
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, classification_report

# Shuffled K-fold splitter shared by the cross-validation loop further down.
kf = KFold(
    random_state=global_random_state,
    shuffle=True,
    n_splits=k_fold_splits,
)


global_random_state = 42

logger.info("Trying a large DNN")

# Load the (X, y) tuple for the neural-network experiment.
# NOTE(review): this file is not written by any cell visible here -- confirm
# where 'data.regression.nonsampled.pickle' is produced.
with open('data.regression.nonsampled.pickle', mode='rb') as pickle_file:
    X, y = pickle.load(pickle_file)

def create_model() :
    """Build and compile the feed-forward network used as the regressor.

    The network takes 2048 inputs, passes them through two ReLU layers of
    1024 and 512 units, and ends in a single sigmoid unit. It is compiled
    with mean squared error as the loss, the Adam optimizer, and mean
    absolute error as an extra reported metric.

    Returns:
        The compiled Sequential model.
    """
    layer_stack = [
        Dense(1024, input_dim=2048, activation='relu'),
        Dense(512, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer='adam', loss='mean_squared_error', metrics=["mae"])
    return network

# Cross-validate the Keras regressor and judge it as an active/inactive
# classifier by thresholding both its output and the true scores.

# Running totals over the folds. roc_auc_avg must be initialised here: it is
# averaged and logged after the loop, and was previously never assigned.
roc_auc_avg = 0
f1_score_avg = 0

# Score at or above which a compound counts as active (value taken from the
# original cell).
activity_threshold = .4

for train_index, test_index in kf.split(X, y):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # No class_weight is passed: Keras documents it as a dict of per-class
    # weights rather than the scikit-learn 'balanced' shorthand, and class
    # weighting has no meaning for a continuous regression target.
    # NOTE(review): batch_size=1 makes one weight update per sample, which is
    # very slow on a data set of this size -- confirm this is intended.
    model = KerasRegressor(build_fn=create_model, epochs=1, batch_size=1, verbose=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    y_pred_binarized = y_pred >= activity_threshold
    y_test_binarized = y_test >= activity_threshold

    # scikit-learn metrics take the ground truth first, then the prediction.
    fscore = f1_score(y_test_binarized, y_pred_binarized)
    logger.info("When using regressor as an active/inactive classifier, f1 score of: {}".format(fscore))
    f1_score_avg = f1_score_avg + fscore

    # ROC AUC ranks the continuous predictions against the binarized truth.
    # It raises ValueError if a fold's truth contains only one class.
    roc_auc = roc_auc_score(y_test_binarized, y_pred)
    logger.info("When using regressor as an active/inactive classifier, roc_auc score of: {}".format(roc_auc))
    roc_auc_avg = roc_auc_avg + roc_auc

    # Emitted at DEBUG level, so it is hidden while the logger is set to INFO.
    logger.debug(classification_report(y_test_binarized, y_pred_binarized))


roc_auc_avg = roc_auc_avg / k_fold_splits
f1_score_avg = f1_score_avg / k_fold_splits
logger.info("Average roc_auc score is: {}".format(roc_auc_avg))
logger.info("Average f1_score is: {}".format(f1_score_avg))


2017-09-24 08:56:37,048 - INFO - Trying a large DNN
2017-09-24 08:56:37,048 - INFO - Trying a large DNN
2017-09-24 08:56:37,048 - INFO - Trying a large DNN
2017-09-24 08:56:37,048 - INFO - Trying a large DNN


Epoch 1/1


AttributeError: 'DecisionTreeRegressor' object has no attribute 'model'