In [27]:
!pip install deepchem
!pip install --pre deepchem[torch]
!conda create -c conda-forge -n my-rdkit-env rdkit    # when creating a new environment
!pip install -c conda-forge rdkit

Collecting dgllife
  Downloading dgllife-0.2.9.tar.gz (138 kB)
[K     |████████████████████████████████| 138 kB 5.1 MB/s 
Collecting dgl
  Downloading dgl-0.6.1-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 46.1 MB/s 
Building wheels for collected packages: dgllife
  Building wheel for dgllife (setup.py) ... [?25l[?25hdone
  Created wheel for dgllife: filename=dgllife-0.2.9-py3-none-any.whl size=219056 sha256=f5f49a4b5c21a7761edc75c12f3a0d09d362b740ab1234aa92a284819af64152
  Stored in directory: /root/.cache/pip/wheels/34/83/11/9772c74b559d9182c9083362e8ba8b0201c4963e41e03859fe
Successfully built dgllife
Installing collected packages: dgllife, dgl
Successfully installed dgl-0.6.1 dgllife-0.2.9
/bin/bash: conda: command not found
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'conda-forge'[0m


Summary for the dataset we used here:

The blood-brain barrier penetration (BBBP) dataset is designed for the modeling and prediction of barrier permeability. As a membrane separating circulating blood and brain extracellular fluid, the blood-brain barrier blocks most drugs, hormones and neurotransmitters. Thus penetration of the barrier forms a long-standing issue in development of drugs targeting central nervous system.

This dataset includes binary labels for over 2000 compounds on their permeability properties.

Scaffold splitting is recommended for this dataset.

The raw data csv file contains columns below:

“name” - Name of the compound

“smiles” - SMILES representation of the molecular structure

“p_np” - Binary labels for penetration/non-penetration


refrence: https://deepchem.readthedocs.io/en/latest/api_reference/moleculenet.html#bbbp-datasets

In [28]:
import deepchem
from deepchem import molnet
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from rdkit import Chem
from rdkit.Chem import RDKFingerprint

import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import average_precision_score


In this section, I get the smiles, convert them to fingerprints using rdkit and then convert them to numpy so I can use them as an input for my machine learning models.

In [29]:
lists, dataset, transformers = molnet.load_bbbp(featurizer = 'ECFP', splitter=None)

In [30]:
dataset[0]

<DiskDataset X.shape: (2039, 1024), y.shape: (2039, 1), w.shape: (2039, 1), task_names: ['p_np']>

In [31]:
smiles = dataset[0].ids
print(smiles)
print(len(smiles))

['[Cl].CC(C)NCC(O)COc1cccc2ccccc12'
 'C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl'
 'c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO3)=O' ...
 '[O+]1=N[N](C=C1[N-]C(NC2=CC=CC=C2)=O)C(CC3=CC=CC=C3)C'
 'C1=C(OC)C(=CC2=C1C(=[N+](C(=C2CC)C)[NH-])C3=CC=C(C(=C3)OC)OC)OC'
 '[N+](=NCC(=O)N[C@@H]([C@H](O)C1=CC=C([N+]([O-])=O)C=C1)CO)=[N-]']
2039


In [32]:
# Converting the smiles to mols
mols = []
for i in range(len(smiles)):
    mols.append(Chem.MolFromSmiles(smiles[i]))

In [35]:
mols[0]

<rdkit.Chem.rdchem.Mol at 0x7f09e4074530>

In [34]:
#Converting to Fingerprint
fingerPrints = []
for i in range(len(mols)):
    fingerPrints.append(RDKFingerprint(mols[i]))

In [39]:
#Converting Fingerprint to numpy
n_fingerPrints = []
for i in range(len(fingerPrints)):
    n_fingerPrints.append(np.array(fingerPrints[i]))

In [47]:
features = np.array(n_fingerPrints)
print(features)
len(features)

[[0 0 0 ... 0 0 1]
 [0 0 1 ... 0 0 1]
 [1 1 0 ... 1 0 1]
 ...
 [1 0 1 ... 0 0 1]
 [1 1 0 ... 1 1 1]
 [0 0 0 ... 0 0 1]]


2039

In [46]:
labels = dataset[0].y
print(labels)
len(labels)

[[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [1.]]


2039

In this section, I create the five_fold indexes for my dataset. Then writing a general function that get the model as an input and train the model and calculate the average AUPR for five_fold.

In [48]:
five_fold = KFold(n_splits=5, shuffle=True, random_state=1)

In [64]:
def model(model_name, modelfunc, five_fold):
    print(model_name)
    print('~'*50)
    model_performances = [0]*5
    i = 0
    for train_index, test_index in five_fold.split(features):
        X_train, X_test, y_train, y_test = features[train_index], features[test_index], labels[train_index], labels[test_index]
        model = modelfunc
        model.fit(X_train, y_train.ravel())
        y_pred =  model.predict(X_test)
        model_performances[i] = average_precision_score(y_test, y_pred)
        i += 1
        print("fold " + str(i))
        print("AUPR score: " + str(model_performances[i-1]))
        print("-"*50)

    model_performance = sum(model_performances)/5
    print("-"*50)
    print("Average AUPR: " + str(model_performance))

    return model_performances, model_performance

### 2.Linear model

In [65]:
logregr_performances, logregr_performance = model("Logistic regression model", linear_model.LogisticRegression(), five_fold)

Logistic regression model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fold 1
AUPR score: 0.896655862747919
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fold 2
AUPR score: 0.9144761413337266
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fold 3
AUPR score: 0.8878930221023236
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fold 4
AUPR score: 0.9160646714826282
--------------------------------------------------
fold 5
AUPR score: 0.9030774432643591
--------------------------------------------------
--------------------------------------------------
Average AUPR: 0.9036334281861912


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 3.Lasso

In [66]:
lasso_performances, lasso_performance = model("Lasso model", linear_model.LogisticRegression(solver = 'liblinear',penalty='l1'), five_fold)

Lasso model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
fold 1
AUPR score: 0.8997773137885011
--------------------------------------------------
fold 2
AUPR score: 0.8957782805429865
--------------------------------------------------
fold 3
AUPR score: 0.8965268583317572
--------------------------------------------------
fold 4
AUPR score: 0.9132404204610011
--------------------------------------------------
fold 5
AUPR score: 0.908993718517528
--------------------------------------------------
--------------------------------------------------
Average AUPR: 0.9028633183283548


### 4.Elastic net

In [68]:
Elasticnet_performances, Elasticnet_performance = model("Elastic Net model", linear_model.LogisticRegressionCV(cv=None, penalty='elasticnet', l1_ratios=[0.35], solver='saga'), five_fold)

Elastic Net model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




fold 1
AUPR score: 0.8992541663129898
--------------------------------------------------




fold 2
AUPR score: 0.9071444275977466
--------------------------------------------------




fold 3
AUPR score: 0.8728037615804644
--------------------------------------------------




fold 4
AUPR score: 0.8559926890407361
--------------------------------------------------




fold 5
AUPR score: 0.882131400881401
--------------------------------------------------
--------------------------------------------------
Average AUPR: 0.8834652890826675




### 5.SVM

In [69]:
svm_performances, svm_performance = model("SVM model", SVC(), five_fold)

SVM model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
fold 1
AUPR score: 0.890071516529317
--------------------------------------------------
fold 2
AUPR score: 0.9114937099099996
--------------------------------------------------
fold 3
AUPR score: 0.8735186963765
--------------------------------------------------
fold 4
AUPR score: 0.8757940774797155
--------------------------------------------------
fold 5
AUPR score: 0.8871159916874203
--------------------------------------------------
--------------------------------------------------
Average AUPR: 0.8875987983965905


### 6.Naive bayes

In [70]:
nb_performances, nb_performance = model("Naive Baysed - Gaussian model", GaussianNB(), five_fold)

Naive Baysed - Gaussian model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
fold 1
AUPR score: 0.8300090728115146
--------------------------------------------------
fold 2
AUPR score: 0.8613100157048771
--------------------------------------------------
fold 3
AUPR score: 0.8349568123069163
--------------------------------------------------
fold 4
AUPR score: 0.8194766168489358
--------------------------------------------------
fold 5
AUPR score: 0.8341673770245199
--------------------------------------------------
--------------------------------------------------
Average AUPR: 0.8359839789393527


### Performance Table 

In [73]:
cols = [['Model Name', 'fold1 AUPR score', 'fold1 AUPR score', 'fold1 AUPR score', 'fold1 AUPR score', 'fold1 AUPR score', 'Average AUPR score']]
cols.append(["Linearmodel"] + logregr_performances + [logregr_performance])
cols.append(["Lasso model"] + lasso_performances + [lasso_performance])
cols.append(["Elastic Net model"] + Elasticnet_performances + [Elasticnet_performance])
cols.append(["SVM model"] +svm_performances + [svm_performance])
cols.append(["Naive Bayes"] + nb_performances + [nb_performance])

results = pd.DataFrame(cols)
print(results)


                   0                 1                 2                 3  \
0         Model Name  fold1 AUPR score  fold1 AUPR score  fold1 AUPR score   
1        Linearmodel          0.896656          0.914476          0.887893   
2        Lasso model          0.899777          0.895778          0.896527   
3  Elastic Net model          0.899254          0.907144          0.872804   
4          SVM model          0.890072          0.911494          0.873519   
5        Naive Bayes          0.830009           0.86131          0.834957   

                  4                 5                   6  
0  fold1 AUPR score  fold1 AUPR score  Average AUPR score  
1          0.916065          0.903077            0.903633  
2           0.91324          0.908994            0.902863  
3          0.855993          0.882131            0.883465  
4          0.875794          0.887116            0.887599  
5          0.819477          0.834167            0.835984  


Based on the Average AUPR Score we understand that Logistic regression has the best performance and Lasso also has a very good and close performane to Logistic regression. Also bayes net has the worse performance. 

So for this dataset Linear models are better.