In [1]:
# import libraries needed
import pandas as pd
import numpy as np
import time

from spe_vectorizers import spe_featurizer, spe_featurizer2, atom_featurizer, kmer_featurizer
from kaggle_chem import ecfp_featurizer, oned_featurizer
from rdkit import Chem

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score

### Import Data

Import cleaned and standardized data for the Tox21 NR-AhR assay.  Use train and score datasets as train and test data, respectively.

In [2]:
# Load the standardized training set and separate the compound column from the labels.
train = pd.read_csv('../processed_data/nr_ahr_std_train.csv')

train_data = train['std_compounds']
train_labels = train['label']

print('Train data shape:', train_data.shape)
print('Train labels shape:', train_labels.shape)

# Split the compounds by label (1 is reported as active, 0 as inactive).
active_train = train_data[train_labels.eq(1)].reset_index(drop=True)
inactive_train = train_data[train_labels.eq(0)].reset_index(drop=True)

# Class balance: the two subsets above have exactly one row per matching label.
print('Active compounds:', len(active_train))
print('Inactive compounds:', len(inactive_train))
print('Inactive : Active ~', len(inactive_train) // len(active_train))

Train data shape: (6709,)
Train labels shape: (6709,)
Active compounds: 761
Inactive compounds: 5948
Inactive : Active ~ 7


In [3]:
# Load the standardized test set and separate the compound column from the labels.
test = pd.read_csv('../processed_data/nr_ahr_test_std.csv')

test_data = test['std_compounds']
test_labels = test['label']

print('Test data shape:', test_data.shape)
print('Test labels shape:', test_labels.shape)

# Count each class once and reuse the counts for the ratio.
n_active_test = len(test_labels[test_labels.eq(1)])
n_inactive_test = len(test_labels[test_labels.eq(0)])

print('Active compounds:', n_active_test)
print('Inactive compounds:', n_inactive_test)
print('Inactive : Active ~', n_inactive_test // n_active_test)

Test data shape: (607,)
Test labels shape: (607,)
Active compounds: 71
Inactive compounds: 536
Inactive : Active ~ 7


#### Classifier and Parameter Grid Lists

Create the list of models to run and, for each model, the parameter grid to use in the grid search.

In [4]:
# Baseline (untuned) estimators handed to the grid search.
# The random forest, the liblinear logistic regression and the SVC with
# probability estimates all draw random numbers while fitting; a fixed
# random_state (42, an arbitrary choice) makes the grid search give the same
# answer on every Restart & Run All.
bnb = BernoulliNB()
rf = RandomForestClassifier(random_state=42)
logreg = LogisticRegression(random_state=42)
knn = KNeighborsClassifier()
svm = SVC(random_state=42)

# Parallel lists: classifier_names[i] is the short label for classifier_list[i].
classifier_list = [bnb, rf, logreg, knn, svm]
classifier_names = ['bnb', 'rf', 'logreg', 'knn', 'svm']

In [5]:
# Hyper-parameter grids, one per classifier, in the same order as classifier_list.
bnb_params = dict(alpha=[1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0])

rf_params = dict(
    max_depth=[50, 100, None],
    n_estimators=[100, 200, 500],
)

logreg_params = dict(
    C=[0.5],
    solver=['liblinear'],
    multi_class=['auto'],
)

knn_params = dict(n_neighbors=[1, 2, 3, 4, 5, 10])

# probability=True is required so the tuned SVC exposes predict_proba later on.
svm_params = dict(
    C=[0.0001, 0.001, 0.01, 0.1, 0.5, 1],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    probability=[True],
)

param_list = [
    bnb_params,
    rf_params,
    logreg_params,
    knn_params,
    svm_params,
]

#### Helper functions for running model matrix loops

In [6]:
def grid_searcher(classifier, param_grid, x_train, y_train,
                  scoring='recall', cv=5, n_jobs=1, verbose=2):
    """Find the best hyper-parameters for a model with a cross-validated grid search.

    Candidates are ranked by ``scoring``. The default is recall (NOT ROC-AUC),
    which is what this notebook has always used; pass ``scoring='roc_auc'``
    to rank by ROC-AUC instead.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict
        Mapping of parameter name to the list of values to try.
    x_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    scoring : str, default 'recall'
        Metric used to pick the best candidate.
    cv : int, default 5
        Number of cross-validation folds.
    n_jobs : int, default 1
        Number of parallel jobs used by the search.
    verbose : int, default 2
        Verbosity of the search; 2 prints one line per fit.

    Returns
    -------
    dict
        The ``best_params_`` of the fitted search.
    """
    grid_search = GridSearchCV(estimator=classifier,
                               param_grid=param_grid,
                               cv=cv,
                               n_jobs=n_jobs,
                               verbose=verbose,
                               scoring=scoring)

    grid_search.fit(x_train, y_train)
    return grid_search.best_params_

In [7]:
def run_model(classifier, x_train, x_test, y_train=None, y_test=None):
    """Fit a classifier on the training features and evaluate it on the test features.

    Parameters
    ----------
    classifier : estimator
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    x_train, x_test
        Feature matrices for training and evaluation.
    y_train, y_test : optional
        Labels for ``x_train`` / ``x_test``. When omitted, the notebook-level
        ``train_labels`` / ``test_labels`` are used, which is the original
        behaviour; passing them explicitly removes the dependency on that
        hidden global state.

    Returns
    -------
    tuple
        ``(recall_0, recall_1, roc_auc)``: recall for label 0, recall for
        label 1, and the ROC-AUC computed from the predicted probability of
        the second class.
    """
    if y_train is None:
        y_train = train_labels
    if y_test is None:
        y_test = test_labels

    trained_classifier = classifier.fit(x_train, y_train)

    # calculate recall for 0 and 1
    # (the report is indexed by the string form of each label, so the labels
    # must be the integers 0 and 1 for these two lookups to succeed)
    report = classification_report(y_test, trained_classifier.predict(x_test), output_dict=True)
    recall_0 = report['0']['recall']
    recall_1 = report['1']['recall']

    # calculate roc-auc score for model from the second column of predict_proba
    roc_auc = roc_auc_score(y_test, trained_classifier.predict_proba(x_test)[:, 1])

    return recall_0, recall_1, roc_auc

#### Initialize list for constructing matrix results dataframe

In [8]:
# One dictionary per (featurizer, model) run gets appended here by the cells below.
results_list = list()

## Run Model Matrix Experiment

Run the list of classifiers chosen using different featurizers and collect the results in a dataframe.

For each featurizer, loop through the list of classifiers:
- Find the best parameters using grid search
- Refit the model with the best parameters on the training data
- Calculate recall for each label and ROC-AUC score for the model

### SPE Featurizer

In [9]:
# generate features
x_spe, x_spe_test, spe_vocab = spe_featurizer(train_data, test_data)

# show the train and test feature-matrix shapes, one per line
print(x_spe.shape, x_spe_test.shape, sep='\n')

(6709, 2378)
(607, 2378)


In [10]:
# find best parameters for each model
spe_best_params = []

start = time.time()

# classifier_list and param_list are parallel lists, so walk them together
for clf, param_grid in zip(classifier_list, param_list):
    # run grid search
    tuned_params = grid_searcher(clf, param_grid, x_spe, train_labels)
    spe_best_params.append(tuned_params)
    print(f'{clf} grid search done')

tot_time = time.time() - start
print(f'Total time = {tot_time}')

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................

[CV] END ......................................n_neighbors=1; total time=   0.1s
[CV] END ......................................n_neighbors=1; total time=   0.1s
[CV] END ......................................n_neighbors=2; total time=   0.1s
[CV] END ......................................n_neighbors=2; total time=   0.1s
[CV] END ......................................n_neighbors=2; total time=   0.1s
[CV] END ......................................n_neighbors=2; total time=   0.1s
[CV] END ......................................n_neighbors=2; total time=   0.1s
[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END ...................

[CV] END ................C=0.1, kernel=rbf, probability=True; total time=   5.1s
[CV] END ................C=0.1, kernel=rbf, probability=True; total time=   5.1s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   2.2s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   2.3s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   2.4s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   2.5s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   2.6s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   2.8s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   3.1s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   3.3s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   3.3s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   3.4s
[CV] END ...............C=0.

In [11]:
# best model list
# Rebuild every model from the hyper-parameters chosen by the grid search
# (spe_best_params is ordered bnb, rf, logreg, knn, svm).
# The random forest, the liblinear logistic regression and the SVC with
# probability estimates are stochastic; a fixed random_state (42, an arbitrary
# choice) keeps the reported test metrics identical from run to run.
bnb_spe = BernoulliNB(**spe_best_params[0])
rf_spe = RandomForestClassifier(random_state=42, **spe_best_params[1])
logreg_spe = LogisticRegression(random_state=42, **spe_best_params[2])
knn_spe = KNeighborsClassifier(**spe_best_params[3])
svm_spe = SVC(random_state=42, **spe_best_params[4])

spe_classifier_list = [bnb_spe, rf_spe, logreg_spe, knn_spe, svm_spe]

In [12]:
# Re-running this cell must not duplicate rows, so drop earlier 'spe' results first.
results_list[:] = [row for row in results_list if row['featurizer'] != 'spe']

# The three lists are parallel: model, its short name, and its tuned parameters.
for clf, clf_name, best_params in zip(spe_classifier_list, classifier_names, spe_best_params):
    # run best model
    recall_0, recall_1, roc_auc = run_model(clf, x_spe, x_spe_test)

    # make dictionary of metrics
    # (named result_row rather than `metrics`, which would overwrite the
    # sklearn `metrics` module imported at the top of the notebook)
    result_row = {'featurizer': 'spe', 'model': clf_name, 'best_params': best_params,
                  'recall_0': recall_0, 'recall_1': recall_1, 'roc_auc': roc_auc}

    # collect in results list
    results_list.append(result_row)

len(results_list)

5

### K-mer Featurizer

In [13]:
# generate features
x_kmer, x_kmer_test, kmer_vocab = kmer_featurizer(train_data, test_data)

# show the train and test feature-matrix shapes, one per line
print(x_kmer.shape, x_kmer_test.shape, sep='\n')

(6709, 7831)
(607, 7831)


In [14]:
# find best parameters for each model
kmer_best_params = []

start = time.time()

# classifier_list and param_list are parallel lists, so walk them together
for clf, param_grid in zip(classifier_list, param_list):
    # run grid search
    tuned_params = grid_searcher(clf, param_grid, x_kmer, train_labels)
    kmer_best_params.append(tuned_params)
    print(f'{clf} grid search done')

tot_time = time.time() - start
print(f'Total time = {tot_time}')

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................

[CV] END ......................................n_neighbors=1; total time=   0.2s
[CV] END ......................................n_neighbors=1; total time=   0.2s
[CV] END ......................................n_neighbors=1; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ...................

[CV] END ................C=0.1, kernel=rbf, probability=True; total time=  12.5s
[CV] END ................C=0.1, kernel=rbf, probability=True; total time=  13.9s
[CV] END ................C=0.1, kernel=rbf, probability=True; total time=  15.0s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   7.8s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   8.2s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   7.6s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   8.1s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   8.6s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   8.7s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   9.4s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   9.8s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=  10.7s
[CV] END .............C=0.5,

In [15]:
# best model list
# Rebuild every model from the hyper-parameters chosen by the grid search
# (kmer_best_params is ordered bnb, rf, logreg, knn, svm).
# The random forest, the liblinear logistic regression and the SVC with
# probability estimates are stochastic; a fixed random_state (42, an arbitrary
# choice) keeps the reported test metrics identical from run to run.
bnb_kmer = BernoulliNB(**kmer_best_params[0])
rf_kmer = RandomForestClassifier(random_state=42, **kmer_best_params[1])
logreg_kmer = LogisticRegression(random_state=42, **kmer_best_params[2])
knn_kmer = KNeighborsClassifier(**kmer_best_params[3])
svm_kmer = SVC(random_state=42, **kmer_best_params[4])

kmer_classifier_list = [bnb_kmer, rf_kmer, logreg_kmer, knn_kmer, svm_kmer]

In [16]:
# Re-running this cell must not duplicate rows, so drop earlier 'kmer' results first.
results_list[:] = [row for row in results_list if row['featurizer'] != 'kmer']

# The three lists are parallel: model, its short name, and its tuned parameters.
for clf, clf_name, best_params in zip(kmer_classifier_list, classifier_names, kmer_best_params):
    # run best model
    recall_0, recall_1, roc_auc = run_model(clf, x_kmer, x_kmer_test)

    # make dictionary of metrics
    # (named result_row rather than `metrics`, which would overwrite the
    # sklearn `metrics` module imported at the top of the notebook)
    result_row = {'featurizer': 'kmer', 'model': clf_name, 'best_params': best_params,
                  'recall_0': recall_0, 'recall_1': recall_1, 'roc_auc': roc_auc}

    # collect in results list
    results_list.append(result_row)

len(results_list)

10

### Atom-wise Featurizer

In [17]:
# generate features
x_atom, x_atom_test, atom_vocab = atom_featurizer(train_data, test_data)

# show the train and test feature-matrix shapes, one per line
print(x_atom.shape, x_atom_test.shape, sep='\n')

(6709, 131)
(607, 131)


In [18]:
# find best parameters for each model
atom_best_params = []

start = time.time()

# classifier_list and param_list are parallel lists, so walk them together
for clf, param_grid in zip(classifier_list, param_list):
    # run grid search
    tuned_params = grid_searcher(clf, param_grid, x_atom, train_labels)
    atom_best_params.append(tuned_params)
    print(f'{clf} grid search done')

tot_time = time.time() - start
print(f'Total time = {tot_time}')

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END ........................................alpha=1e-10; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................

[CV] END ......................................n_neighbors=1; total time=   0.2s
[CV] END ......................................n_neighbors=1; total time=   0.2s
[CV] END ......................................n_neighbors=1; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ...................

[CV] END ................C=0.1, kernel=rbf, probability=True; total time=   3.3s
[CV] END ................C=0.1, kernel=rbf, probability=True; total time=   3.6s
[CV] END ................C=0.1, kernel=rbf, probability=True; total time=   3.4s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   2.6s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   2.9s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   2.4s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   2.4s
[CV] END ............C=0.1, kernel=sigmoid, probability=True; total time=   2.3s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   5.1s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   6.4s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   7.5s
[CV] END .............C=0.5, kernel=linear, probability=True; total time=   7.1s
[CV] END .............C=0.5,

In [19]:
# best model list
# Rebuild every model from the hyper-parameters chosen by the grid search
# (atom_best_params is ordered bnb, rf, logreg, knn, svm).
# The random forest, the liblinear logistic regression and the SVC with
# probability estimates are stochastic; a fixed random_state (42, an arbitrary
# choice) keeps the reported test metrics identical from run to run.
bnb_atom = BernoulliNB(**atom_best_params[0])
rf_atom = RandomForestClassifier(random_state=42, **atom_best_params[1])
logreg_atom = LogisticRegression(random_state=42, **atom_best_params[2])
knn_atom = KNeighborsClassifier(**atom_best_params[3])
svm_atom = SVC(random_state=42, **atom_best_params[4])

atom_classifier_list = [bnb_atom, rf_atom, logreg_atom, knn_atom, svm_atom]

In [20]:
# Re-running this cell must not duplicate rows, so drop earlier 'atom' results first.
results_list[:] = [row for row in results_list if row['featurizer'] != 'atom']

# The three lists are parallel: model, its short name, and its tuned parameters.
for clf, clf_name, best_params in zip(atom_classifier_list, classifier_names, atom_best_params):
    # run best model
    recall_0, recall_1, roc_auc = run_model(clf, x_atom, x_atom_test)

    # make dictionary of metrics
    # (named result_row rather than `metrics`, which would overwrite the
    # sklearn `metrics` module imported at the top of the notebook)
    result_row = {'featurizer': 'atom', 'model': clf_name, 'best_params': best_params,
                  'recall_0': recall_0, 'recall_1': recall_1, 'roc_auc': roc_auc}

    # collect in results list
    results_list.append(result_row)

len(results_list)

15

## Export results

Make results available for exploration in another notebook

In [21]:
# build the results dataframe: one row per collected result dictionary
model_matrix = pd.DataFrame(data=results_list)

In [22]:
# preview the first five rows of the results
model_matrix.head(n=5)

Unnamed: 0,featurizer,model,best_params,recall_0,recall_1,roc_auc
0,spe,bnb,{'alpha': 0.1},0.863806,0.492958,0.820685
1,spe,rf,"{'max_depth': None, 'n_estimators': 100}",0.975746,0.098592,0.821185
2,spe,logreg,"{'C': 0.5, 'multi_class': 'auto', 'solver': 'l...",0.983209,0.197183,0.818951
3,spe,knn,{'n_neighbors': 1},0.902985,0.295775,0.59938
4,spe,svm,"{'C': 1, 'kernel': 'linear', 'probability': True}",0.934701,0.380282,0.786184


In [23]:
# generate csv file for use in further exploration
# NOTE: the export below is commented out, so running this notebook does not write the file
#model_matrix.to_csv('../processed_data/model_matrix2.csv')