## Overview ##

PubChem is a site run by the NIH that hosts raw data from chemical experiments. Here we analyze the data hosted at PubChem for assay 1030, which screens for inhibitors of the protein encoded by the ALDH1A1 gene. You can access the page for this assay [here](https://pubchem.ncbi.nlm.nih.gov/bioassay/1030).

## Results ##

We use the SMILES string, a common representation for a molecule amongst chemists, to begin the featurization process. Because the length of this string varies from molecule to molecule, each one is converted into a fixed-length Morgan fingerprint; these fingerprints are then used to train various regression models, whose outputs we threshold so that they act as binary classifiers. This is to see whether the continuous activity score has predictive value. More specifically, our goal here is to find the highest precision and recall in the 'True' label class: high recall so as not to miss any potentially useful compounds, and high precision so that the compounds we flag are worth following up.

In [1]:
# Regression analysis
import logging
import sys

# Configure the root logger so that every record at INFO level or above is
# written both to 'log_regression.txt' and to stdout (the notebook output).
logger = logging.getLogger()
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Each handler is tagged with a name and only attached when no handler with
# that name is already present. Without this guard, re-running the cell stacks
# another pair of handlers on the root logger and every message is emitted
# once per run of the cell.
attached_handler_names = {handler.get_name() for handler in logger.handlers}

if 'regression_file' not in attached_handler_names:
    fh = logging.FileHandler('log_regression.txt')
    fh.set_name('regression_file')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

if 'regression_stdout' not in attached_handler_names:
    ch = logging.StreamHandler(sys.stdout)
    ch.set_name('regression_stdout')
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

In [2]:
import pickle
import sys

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from keras.layers import Dense
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from sklearn.metrics import classification_report, mean_squared_error, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor

import warnings

# Settings shared by the cells below: number of cross-validation folds and the
# seed handed to every randomised component.
k_fold_splits = 2
global_random_state = 42

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random number generator.
np.random.seed(seed=global_random_state)

Using TensorFlow backend.


In [9]:
# Load assay info. Note: This CSV was obtained from PubChem bioassay aka PCBA, via searching for AID 1030
# and downloading the datatable
ba_df = pd.read_csv("AID_1030_datatable_all.csv")

# Load compound info (tab-separated, first line is the header)
cs_df = pd.read_csv("AID_1030_compound_smiles.csv", sep='\t', header=0)

# Merge the two on the compound ID. merge() defaults to an inner join, so only
# rows whose PUBCHEM_CID appears in both tables are kept.
full_df = ba_df.merge(cs_df, on='PUBCHEM_CID')

# Cleanup the compound ID column
full_df["PUBCHEM_CID"] = full_df["PUBCHEM_CID"].astype(int)

# Quantize the activity score to the nearest multiple of ten
full_df["PUBCHEM_ACTIVITY_SCORE"] = full_df["PUBCHEM_ACTIVITY_SCORE"].astype(int).round(-1)

# Parallel lists: entry i of each list belongs to the same compound.
compound_ids = list()
smiles_list = list()
fingerprints = list()
activities = list()

# Iterate the needed columns directly instead of DataFrame.iterrows(), which
# builds a full Series for every row. The frame's index is kept in the loop
# because it drives the failure report and the progress log.
rows = zip(full_df.index,
           full_df["PUBCHEM_CID"],
           full_df["Smiles"],
           full_df["PUBCHEM_ACTIVITY_SCORE"])

for index, cid, smiles_string, activity_score in rows:
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        # The SMILES string could not be parsed; the compound is skipped.
        print("Molecule failed featurization")
        print(index)
    else:
        # 2048-bit Morgan fingerprint of radius 2
        fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048, useChirality=False,
                                                                     useBondTypes=False, useFeatures=False)

        # From RDKit documentation: copy the bit vector into a numpy array
        # (ConvertToNumpyArray fills the array passed in)
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fingerprint, arr)

        compound_ids.append(cid)
        smiles_list.append(smiles_string)
        fingerprints.append(arr)
        activities.append(activity_score)

    if index % 10000 == 0:
        logger.info("Processed index: {0}".format(index))

# Stack the fingerprints into a feature matrix and the activity scores into a
# float target vector
X = np.array(fingerprints)
y = np.array(activities, dtype=float)

# Persist the (X, y) pair so later cells can reload it without re-featurizing
with open("data.regression.nonsampled.pickle", "wb") as f:
    pickle.dump((X, y), f)

2017-09-24 19:06:23,083 - INFO - Processed index: 0
2017-09-24 19:06:30,179 - INFO - Processed index: 10000
2017-09-24 19:06:36,906 - INFO - Processed index: 20000
2017-09-24 19:06:44,006 - INFO - Processed index: 30000
2017-09-24 19:06:51,013 - INFO - Processed index: 40000
2017-09-24 19:06:57,766 - INFO - Processed index: 50000
2017-09-24 19:07:04,766 - INFO - Processed index: 60000
2017-09-24 19:07:11,160 - INFO - Processed index: 70000
2017-09-24 19:07:18,021 - INFO - Processed index: 80000
2017-09-24 19:07:24,798 - INFO - Processed index: 90000
2017-09-24 19:07:31,806 - INFO - Processed index: 100000
2017-09-24 19:07:38,764 - INFO - Processed index: 110000
2017-09-24 19:07:45,924 - INFO - Processed index: 120000
2017-09-24 19:07:53,226 - INFO - Processed index: 130000
2017-09-24 19:08:00,306 - INFO - Processed index: 140000
2017-09-24 19:08:07,353 - INFO - Processed index: 150000
2017-09-24 19:08:14,262 - INFO - Processed index: 160000
2017-09-24 19:08:21,171 - INFO - Processed in

In [8]:
# Ensure even distribution

# Rebind X and y to the contents of the undersampled classification pickle.
# NOTE(review): this file is not written anywhere in the visible notebook;
# presumably it was produced by a companion classification notebook - confirm.
with open("data.classification.undersampled.pickle", mode="rb") as pickle_file:
    X, y = pickle.load(pickle_file)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from keras.layers.convolutional import Convolution1D
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Dense, Dropout, Activation

import math
import numpy as np

# Shuffled K-fold splitter used by the training loop in this cell.
kf = KFold(random_state=global_random_state, shuffle=True, n_splits=k_fold_splits)

global_random_state = 42

# Peek at a slice of the targets as a quick sanity check.
print(y[20:40])

# Hyper-parameters read by create_model(): number of convolution filters,
# convolution window length, and width of the hidden dense layer.
filters, kernel_size, hidden_dims = 250, 1, 5000

# Append a trailing axis to the feature matrix and report the resulting rank.
X_reshaped = X[:, :, np.newaxis]

print("Dimensions are: {}".format(X_reshaped.ndim))


def create_model():
    """Build and compile the 1-D convolutional network trained in this cell.

    The layers are, in order: Conv1D (``filters`` filters, window of
    ``kernel_size``, ReLU), global max pooling, Dense(``hidden_dims``),
    Dropout(0.2), ReLU, Dense(1) and a sigmoid activation. ``filters``,
    ``kernel_size`` and ``hidden_dims`` are read from the notebook namespace.

    Returns:
        The Sequential model, compiled with binary cross-entropy loss, the
        Adam optimizer and accuracy as the reported metric.
    """
    layer_stack = [
        Conv1D(filters, kernel_size, padding='valid', activation='relu',
               strides=1, input_shape=(None, 2048)),
        GlobalMaxPooling1D(),
        # Vanilla hidden layer, with dropout applied before the non-linearity
        Dense(hidden_dims),
        Dropout(0.2),
        Activation('relu'),
        # Single output unit squashed by a sigmoid
        Dense(1),
        Activation('sigmoid'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])
    return network

# Cross-validate the convolutional model and report how well its continuous
# output works as a binary classifier at two decision thresholds.
for train_index, test_index in kf.split(X, y):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = KerasRegressor(build_fn=create_model, epochs=10, batch_size=100, verbose=1)
    # create_model() declares input_shape=(None, 2048), so a length-1 middle
    # axis is inserted to turn each fingerprint row into a (1, 2048) sample.
    model.fit(X_train[:, None, :], y_train)
    y_pred = np.asarray(model.predict(X_test[:, None, :]))

    # Thresholds are applied with vectorised comparisons. The previous list
    # comprehensions used `y` as their loop variable, which reuses the name of
    # the dataset's target array (and overwrites it on Python 2).
    # The ground truth is always binarised at 0.4; only the cut-off applied to
    # the predictions changes between the two reports.
    y_test_binary = y_test > .4

    y_pred_binary = y_pred > .4
    logger.info("How good is it as a classifier at 0.4 threshold?")
    logger.info(classification_report(y_test_binary, y_pred_binary))

    y_pred_binary = y_pred > .2
    logger.info("How good is it as a classifier at 0.2 threshold?")
    logger.info(classification_report(y_test_binary, y_pred_binary))

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Dimensions are: 3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [None]:
# Cross-validate a decision tree regressor on the sampled regression data set.
with open('data.regression.sampled.pickle', 'rb') as f:
    # Load the previously pickled (X, y) pair.
    (X, y) = pickle.load(f)

# Print the number of compounds loaded
logger.info("Successfully loaded {0} compounds.".format(len(X)))

kf = KFold(n_splits=k_fold_splits, shuffle=True, random_state=global_random_state)

# Running sum of the per-fold MSE; divided by the fold count after the loop.
mse_avg = 0

for train_index, test_index in kf.split(X, y):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    classifier = DecisionTreeRegressor(random_state=global_random_state)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    logger.info("Computed mse score of: {}".format(mse))
    mse_avg = mse_avg + mse

    # Binarise both arrays at an activity score of 40 with vectorised
    # comparisons. The previous list comprehensions used `y` as their loop
    # variable, which reuses the name of the target array (and overwrites it
    # on Python 2).
    y_pred_binary = y_pred > 40
    y_test_binary = y_test > 40

    logger.info("How good is it as a classifier at 0.4 threshold?")
    logger.info(classification_report(y_test_binary, y_pred_binary))

mse_avg = mse_avg / k_fold_splits
logger.info("Average mse score is: {}".format(mse_avg))



In [None]:
%matplotlib inline
logger.info("Plot of test distribution histogram (activity score on X axis)")
pd.Series(y_test).plot.hist()

In [None]:
%matplotlib inline
logger.info("Plot of predicted distribution histogram (activity score on X axis)")
pd.Series(y_pred).plot.hist()

In [None]:
# Now looking at non-sampled data performance

# Shuffled K-fold splitter used by the training loop in this cell.
kf = KFold(random_state=global_random_state, shuffle=True, n_splits=k_fold_splits)

global_random_state = 42

logger.info("Trying a large DNN")

# Reload the (X, y) pair that the featurization cell wrote to disk.
with open('data.regression.nonsampled.pickle', mode='rb') as pickle_file:
    X, y = pickle.load(pickle_file)

def create_model():
    """Build and compile the fully connected network trained in this cell.

    The network takes 2048 input features and stacks ReLU dense layers of
    1024, 512, 256 and 128 units, followed by a single sigmoid output unit
    (so its predictions lie strictly between 0 and 1).

    Returns:
        The Sequential model, compiled with mean squared error loss, the Adam
        optimizer and mean absolute error as the reported metric.
    """
    network = Sequential()
    network.add(Dense(1024, input_dim=2048, activation='relu'))
    for width in (512, 256, 128):
        network.add(Dense(width, activation='relu'))
    network.add(Dense(1, activation='sigmoid'))
    network.compile(optimizer='adam', loss='mean_squared_error', metrics=["mae"])
    return network

# Cross-validate the dense network and report how well its output works as a
# binary classifier at two decision thresholds.
#
# create_model() ends in a sigmoid unit, so raw predictions are confined to
# (0, 1), while the targets and the thresholds below (40 and 20) are on the
# activity-score scale. Training on the raw scores therefore could never
# produce a prediction above either threshold. The targets are scaled into the
# sigmoid's range for training and the predictions are mapped back afterwards.
# NOTE(review): assumes activity scores lie in [0, 100] - confirm against the
# assay data.
activity_score_scale = 100.0

for train_index, test_index in kf.split(X, y):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = KerasRegressor(build_fn=create_model, epochs=10, batch_size=1000, verbose=1)
    model.fit(X_train, y_train / activity_score_scale)
    # Back on the original score scale, so y_pred is comparable with y_test.
    y_pred = np.asarray(model.predict(X_test)) * activity_score_scale

    # Thresholds are applied with vectorised comparisons. The previous list
    # comprehensions used `y` as their loop variable, which reuses the name of
    # the dataset's target array (and overwrites it on Python 2).
    # The ground truth is always binarised at 40; only the cut-off applied to
    # the predictions changes between the two reports.
    y_test_binary = y_test > 40

    y_pred_binary = y_pred > 40
    logger.info("How good is it as a classifier at 0.4 threshold?")
    logger.info(classification_report(y_test_binary, y_pred_binary))

    y_pred_binary = y_pred > 20
    logger.info("How good is it as a classifier at 0.2 threshold?")
    logger.info(classification_report(y_test_binary, y_pred_binary))

In [None]:
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from collections import Counter
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.model_selection import KFold, train_test_split


# Cross-validate a decision tree regressor on the non-sampled regression data set.
with open('data.regression.nonsampled.pickle', 'rb') as f:
    # Load the previously pickled (X, y) pair.
    (X, y) = pickle.load(f)

# Print the number of compounds loaded
logger.info("Successfully loaded {0} compounds.".format(len(X)))

kf = KFold(n_splits=k_fold_splits, shuffle=True, random_state=global_random_state)

# Running sum of the per-fold MSE; divided by the fold count after the loop.
mse_avg = 0

for train_index, test_index in kf.split(X, y):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    classifier = DecisionTreeRegressor(random_state=global_random_state)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    logger.info("Computed mse score of: {}".format(mse))
    mse_avg = mse_avg + mse

    # Binarise both arrays at an activity score of 40 with vectorised
    # comparisons. The previous list comprehensions used `y` as their loop
    # variable, which reuses the name of the target array (and overwrites it
    # on Python 2).
    y_pred_binary = y_pred > 40
    y_test_binary = y_test > 40

    logger.info("How good is it as a classifier at 0.4 threshold?")
    logger.info(classification_report(y_test_binary, y_pred_binary))

mse_avg = mse_avg / k_fold_splits
logger.info("Average mse score is: {}".format(mse_avg))

