## Overview ##

PubChem is a site run by the NIH that hosts raw data associated with chemical experiments. Here we analyze the data hosted at PubChem for assay 1030, which looks for inhibitors of the protein-encoding gene ALDH1A1. You can access the page for this assay [here](https://pubchem.ncbi.nlm.nih.gov/bioassay/1030).

## Results ##

We use the SMILES string, a common representation for a molecule among chemists, to begin the featurization process. Because the length of this string varies, it is converted into a fixed-length Morgan fingerprint; these fingerprints are then used to train various binary classifiers.

In [None]:
# Exploratory data analysis and visualization

In [3]:
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

# Shared settings: the seed handed to NumPy's RNG and the number of
# cross-validation folds.
global_random_state, k_fold_splits = 42, 5
np.random.seed(global_random_state)

# Hard-coded class proportions.
# NOTE(review): presumably the share of active compounds in the assay data --
# confirm where this figure was measured.
active_pct = 0.073125471
inactive_pct = 1 - active_pct

# Inverse weighting to offset the class imbalance: label 0 is weighted by the
# active share and label 1 by the inactive share.
class_weights = {0: active_pct, 1: inactive_pct}

In [None]:
import keras

# Report which backend Keras is running on.
print(keras.backend.backend())

# Sanity check: number of samples and number of labels, one per line.
# NOTE: X and y are loaded from the pickle in the cross-validation cell further
# down, so that cell has to be run before this one.
print(len(X), len(y), sep="\n")

In [None]:
# What about a deep neural network?
# Sample code from: https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/

from keras.models import Sequential
from keras.layers import Dense
from keras import metrics
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import pickle

# Settings for this cell. k_fold_splits is also assigned in the setup cell; the
# value assigned here is the one the cross-validation below picks up.
global_random_state = 42
k_fold_splits = 2

# Load the previously pickled (X, y) pair from disk.
# NOTE(review): judging by the file name this is an undersampled dataset -- confirm.
# pickle.load executes arbitrary code, so only point this at trusted files.
with open('data.classification.undersampled.pickle', 'rb') as pickle_file:
    X, y = pickle.load(pickle_file)

def create_model():
    """Build and compile a small fully connected binary classifier.

    The network takes 2048 input features, passes them through ReLU layers
    of 12 and 8 units, and ends in a single sigmoid unit. It is compiled
    with binary cross-entropy loss, the Adam optimizer, and accuracy as the
    reported metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(12, input_dim=2048, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
    return network

# cross_val_predict and classification_report are used below but are not
# imported by the cells above, so bring them into scope here.
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Wrap the Keras builder so scikit-learn's cross-validation helpers can drive it.
model = KerasClassifier(build_fn=create_model, epochs=3, batch_size=1, verbose=1)

# Mean of the per-fold scores.
results = cross_val_score(model, X, y, cv=k_fold_splits)
print(results.mean())

# Out-of-fold predictions for every sample, produced with the same wrapped
# estimator that was scored above.
y_pred = cross_val_predict(model, X, y, cv=k_fold_splits)

print(classification_report(y, y_pred))

In [None]:
# classification_report is not imported by any earlier cell of this notebook.
from sklearn.metrics import classification_report

# NOTE(review): X_test / y_test are not defined in the cells visible here, and
# `model` was last bound to the KerasClassifier wrapper in the cross-validation
# cell -- confirm which fitted Keras model and which data split this cell expects.
y_pred = model.predict_on_batch(X_test)
# Threshold the predicted scores at 0.5 to obtain boolean class labels.
y_pred_binarized = y_pred > .5
print(classification_report(y_test, y_pred_binarized))


In [None]:
# What about a larger network size?
from keras.models import Sequential
from keras.layers import Dense
from keras import metrics

# Wider fully connected network: 2048 inputs -> 1024 -> 512 -> 1 sigmoid unit.
model = Sequential()
model.add(Dense(1024, input_dim=2048, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Request accuracy by its string name, exactly as create_model() does, so Keras
# selects the accuracy variant that matches the binary cross-entropy loss and
# both networks report the same metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])

# NOTE(review): y_train_ and y_test_binary are not defined in the cells visible
# here -- confirm they are the label arrays that go with X_train / X_test.
model.fit(X_train, y_train_, epochs=10, batch_size=1)

# With a metric compiled in, evaluate() returns [loss, accuracy] rather than a
# bare loss value, so unpack it before reporting.
score = model.evaluate(X_test, y_test_binary)
test_loss, test_accuracy = score

print("\n Loss on test set is: {0}".format(test_loss))
print(" Accuracy on test set is: {0}".format(test_accuracy))

# let's save it for future experimentation
model.save("pcba_1030_large_nn.h5")

In [None]:
from keras.models import load_model
# classification_report is not imported by any earlier cell of this notebook.
from sklearn.metrics import classification_report

# Reload the network from the file written by the larger-network cell.
model_large = load_model("pcba_1030_large_nn.h5")

# NOTE(review): X_test / y_test are not defined in the cells visible here --
# confirm they are the held-out features and labels.
y_pred = model_large.predict_on_batch(X_test)
# Threshold the sigmoid outputs at 0.5 to obtain boolean class labels.
y_pred_binarized = y_pred > .5
print(classification_report(y_test, y_pred_binarized))