In [1]:
# import libraries needed
import pandas as pd
import numpy as np
import ast

from spe_vectorizers import spe_featurizer, spe_featurizer2, atom_featurizer, kmer_featurizer
from kaggle_chem import ecfp_featurizer, oned_featurizer
from rdkit import Chem

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score

from imblearn.over_sampling import SMOTE, RandomOverSampler

### Import and Randomly Oversample Data

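Import the cleaned and standardized data for the Tox21 NR-AhR assay. The Tox21 train and score datasets serve as the train and test data, respectively. `RandomOverSampler` is applied to the training data only, duplicating minority-class rows until both classes are the same size; the test data keeps its original class balance.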

In [2]:
# Load the standardized training set.
train = pd.read_csv('../processed_data/nr_ahr_std_train.csv')

# Randomly duplicate minority-class rows until both classes have the same size.
# A fixed random_state makes the resampled rows -- and therefore every metric
# computed downstream -- reproducible under Restart Kernel -> Run All.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
train_over, train_labels = oversample.fit_resample(train, train.label)

# Column of standardized compounds that the text-based featurizers consume.
train_data = train_over.std_compounds

print('Train data shape:', train_data.shape)
print('Train labels shape:', train_labels.shape)

# Boolean masks are computed once and reused for the splits and the counts.
is_active = train_labels == 1
is_inactive = train_labels == 0

active_train = train_data[is_active].reset_index(drop=True)
inactive_train = train_data[is_inactive].reset_index(drop=True)

n_active = len(train_labels[is_active])
n_inactive = len(train_labels[is_inactive])

print('Active compounds:', n_active)
print('Inactive compounds:', n_inactive)
print('Inactive : Active ~', n_inactive // n_active)

Train data shape: (11896,)
Train labels shape: (11896,)
Active compounds: 5948
Inactive compounds: 5948
Inactive : Active ~ 1


In [3]:
# Load the standardized test set; it is not resampled.
test = pd.read_csv('../processed_data/nr_ahr_test_std.csv')

# Compounds column and activity labels used for evaluation.
test_data, test_labels = test.std_compounds, test.label

print('Test data shape:', test_data.shape)
print('Test labels shape:', test_labels.shape)

# Class balance of the test set.
n_test_active = len(test_labels[test_labels == 1])
n_test_inactive = len(test_labels[test_labels == 0])

print('Active compounds:', n_test_active)
print('Inactive compounds:', n_test_inactive)
print('Inactive : Active ~', n_test_inactive // n_test_active)

Test data shape: (607,)
Test labels shape: (607,)
Active compounds: 71
Inactive compounds: 536
Inactive : Active ~ 7


#### Load results datasets for extracting best model parameters

In [4]:
# Short model names, in the same order in which the classifiers are placed in
# every *_classifier_list below.
classifier_names = [
    'bnb',     # BernoulliNB
    'rf',      # RandomForestClassifier
    'logreg',  # LogisticRegression
    'knn',     # KNeighborsClassifier
    'svm',     # SVC
]

In [5]:
# upload results dataset
# Load the results of the earlier model matrix experiment and discard the
# index column that was written out when the CSV was saved.
matrix_model = (
    pd.read_csv('../processed_data/model_matrix2.csv')
    .drop(columns=['Unnamed: 0'])
)

# One group of result rows per featurizer.
by_featurizer = matrix_model.groupby('featurizer')

matrix_model.head()

Unnamed: 0,featurizer,model,best_params,recall_0,recall_1,roc_auc
0,spe,bnb,{'alpha': 0.1},0.863806,0.492958,0.820685
1,spe,rf,"{'max_depth': None, 'n_estimators': 100}",0.975746,0.098592,0.821185
2,spe,logreg,"{'C': 0.5, 'multi_class': 'auto', 'solver': 'l...",0.983209,0.197183,0.818951
3,spe,knn,{'n_neighbors': 1},0.902985,0.295775,0.59938
4,spe,svm,"{'C': 1, 'kernel': 'linear', 'probability': True}",0.934701,0.380282,0.786184


In [6]:
# upload results dataset
# Load the results of the earlier conventional-featurizer experiment and
# discard the index column that was written out when the CSV was saved.
conventional = (
    pd.read_csv('../processed_data/model_matrix_conventional.csv')
    .drop(columns=['Unnamed: 0'])
)

# One group of result rows per featurizer.
by_method = conventional.groupby('featurizer')

conventional.head()

Unnamed: 0,featurizer,model,best_params,recall_0,recall_1,roc_auc
0,ecfp,bnb,{'alpha': 1e-10},0.652985,0.887324,0.843835
1,ecfp,rf,"{'max_depth': 100, 'n_estimators': 100}",0.979478,0.253521,0.89074
2,ecfp,logreg,"{'C': 0.5, 'multi_class': 'auto', 'solver': 'l...",0.945896,0.422535,0.858682
3,ecfp,knn,{'n_neighbors': 1},0.912313,0.549296,0.730805
4,ecfp,svm,"{'C': 0.1, 'kernel': 'linear', 'probability': ...",0.977612,0.239437,0.850983


#### Helper functions for running model loops

In [7]:
def run_model(classifier, x_train, x_test, y_train=None, y_test=None):
    """Fit a classifier and evaluate it on the test features.

    Parameters
    ----------
    classifier : estimator exposing ``fit``, ``predict`` and ``predict_proba``
    x_train : training feature matrix
    x_test : test feature matrix
    y_train : training labels; defaults to the notebook-level ``train_labels``
    y_test : test labels; defaults to the notebook-level ``test_labels``

    Returns
    -------
    (recall_0, recall_1, roc_auc) : recall of class 0, recall of class 1, and
    the ROC-AUC computed from the second ``predict_proba`` column.

    Notes
    -----
    The report is indexed with the string keys '0' and '1', so the labels are
    assumed to be the integers 0 and 1 -- TODO confirm for new datasets.
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # working, while letting callers pass labels explicitly.
    if y_train is None:
        y_train = train_labels
    if y_test is None:
        y_test = test_labels

    trained_classifier = classifier.fit(x_train, y_train)

    # calculate recall for 0 and 1
    predictions = trained_classifier.predict(x_test)
    report = classification_report(y_test, predictions, output_dict=True)
    recall_0 = report['0']['recall']
    recall_1 = report['1']['recall']

    # calculate roc-auc score for model from the second probability column
    positive_scores = trained_classifier.predict_proba(x_test)[:, 1]
    roc_auc = roc_auc_score(y_test, positive_scores)

    return recall_0, recall_1, roc_auc

#### Initialize list for constructing results dataframe

In [8]:
# Accumulates one metrics dictionary per (featurizer, model) run.
results_list = list()

## Oversampled Model Matrix Experiment

Re-run best models from model matrix experiment using oversampled data.

### SPE Featurizer

In [9]:
# generate features
# Featurize the oversampled training compounds and the test compounds with spe_featurizer.
x_spe, x_spe_test, spe_vocab = spe_featurizer(train_data, test_data)

# Shapes of the train and test feature matrices, one per line.
print(x_spe.shape, x_spe_test.shape, sep='\n')

(11896, 2378)
(607, 2378)


In [10]:
# Result rows of the earlier model matrix experiment for the 'spe' featurizer.
spe = by_featurizer.get_group('spe')

# best_params was read from the CSV as text; parse each entry back into a dict.
spe_best_params = list(map(ast.literal_eval, spe.best_params))
spe_best_params

[{'alpha': 0.1},
 {'max_depth': None, 'n_estimators': 100},
 {'C': 0.5, 'multi_class': 'auto', 'solver': 'liblinear'},
 {'n_neighbors': 1},
 {'C': 1, 'kernel': 'linear', 'probability': True}]

In [11]:
# best model list
# Estimator classes in the same order as the parsed parameter dictionaries
# (and as classifier_names).
spe_classifier_list = [
    estimator(**params)
    for estimator, params in zip(
        (BernoulliNB, RandomForestClassifier, LogisticRegression, KNeighborsClassifier, SVC),
        spe_best_params,
    )
]

# Keep the individual models addressable by name as well.
bnb_spe, rf_spe, logreg_spe, knn_spe, svm_spe = spe_classifier_list

In [12]:
# Drop rows left by a previous execution of this cell so that re-running it
# cannot append duplicate 'spe' results.
results_list[:] = [row for row in results_list if row['featurizer'] != 'spe']

for clf, clf_name, best_params in zip(spe_classifier_list, classifier_names, spe_best_params):
    # run best model
    recall_0, recall_1, roc_auc = run_model(clf, x_spe, x_spe_test)

    # Named result_row rather than `metrics`, which would rebind the
    # `sklearn.metrics` module imported at the top of the notebook.
    result_row = {'featurizer': 'spe', 'model': clf_name, 'best_params': best_params,
                  'recall_0': recall_0, 'recall_1': recall_1, 'roc_auc': roc_auc}

    # collect in results list
    results_list.append(result_row)

len(results_list)

5

### K-mer Featurizer

In [13]:
# generate features
# Featurize the oversampled training compounds and the test compounds with kmer_featurizer.
x_kmer, x_kmer_test, kmer_vocab = kmer_featurizer(train_data, test_data)

# Shapes of the train and test feature matrices, one per line.
print(x_kmer.shape, x_kmer_test.shape, sep='\n')

(11896, 7831)
(607, 7831)


In [14]:
# Result rows of the earlier model matrix experiment for the 'kmer' featurizer.
kmer = by_featurizer.get_group('kmer')

# best_params was read from the CSV as text; parse each entry back into a dict.
kmer_best_params = list(map(ast.literal_eval, kmer.best_params))
kmer_best_params

[{'alpha': 0.1},
 {'max_depth': 100, 'n_estimators': 500},
 {'C': 0.5, 'multi_class': 'auto', 'solver': 'liblinear'},
 {'n_neighbors': 1},
 {'C': 1, 'kernel': 'linear', 'probability': True}]

In [15]:
# best model list
# Estimator classes in the same order as the parsed parameter dictionaries
# (and as classifier_names).
kmer_classifier_list = [
    estimator(**params)
    for estimator, params in zip(
        (BernoulliNB, RandomForestClassifier, LogisticRegression, KNeighborsClassifier, SVC),
        kmer_best_params,
    )
]

# Keep the individual models addressable by name as well.
bnb_kmer, rf_kmer, logreg_kmer, knn_kmer, svm_kmer = kmer_classifier_list

In [16]:
# Drop rows left by a previous execution of this cell so that re-running it
# cannot append duplicate 'kmer' results.
results_list[:] = [row for row in results_list if row['featurizer'] != 'kmer']

for clf, clf_name, best_params in zip(kmer_classifier_list, classifier_names, kmer_best_params):
    # run best model
    recall_0, recall_1, roc_auc = run_model(clf, x_kmer, x_kmer_test)

    # Named result_row rather than `metrics`, which would rebind the
    # `sklearn.metrics` module imported at the top of the notebook.
    result_row = {'featurizer': 'kmer', 'model': clf_name, 'best_params': best_params,
                  'recall_0': recall_0, 'recall_1': recall_1, 'roc_auc': roc_auc}

    # collect in results list
    results_list.append(result_row)

len(results_list)

10

### Atom-wise Featurizer

In [17]:
# generate features
# Featurize the oversampled training compounds and the test compounds with atom_featurizer.
x_atom, x_atom_test, atom_vocab = atom_featurizer(train_data, test_data)

# Shapes of the train and test feature matrices, one per line.
print(x_atom.shape, x_atom_test.shape, sep='\n')

(11896, 131)
(607, 131)


In [18]:
# Result rows of the earlier model matrix experiment for the 'atom' featurizer.
atom = by_featurizer.get_group('atom')

# best_params was read from the CSV as text; parse each entry back into a dict.
atom_best_params = list(map(ast.literal_eval, atom.best_params))
atom_best_params

[{'alpha': 1e-10},
 {'max_depth': 50, 'n_estimators': 200},
 {'C': 0.5, 'multi_class': 'auto', 'solver': 'liblinear'},
 {'n_neighbors': 1},
 {'C': 1, 'kernel': 'sigmoid', 'probability': True}]

In [19]:
# best model list
# Estimator classes in the same order as the parsed parameter dictionaries
# (and as classifier_names).
atom_classifier_list = [
    estimator(**params)
    for estimator, params in zip(
        (BernoulliNB, RandomForestClassifier, LogisticRegression, KNeighborsClassifier, SVC),
        atom_best_params,
    )
]

# Keep the individual models addressable by name as well.
bnb_atom, rf_atom, logreg_atom, knn_atom, svm_atom = atom_classifier_list

In [20]:
# Drop rows left by a previous execution of this cell so that re-running it
# cannot append duplicate 'atom' results.
results_list[:] = [row for row in results_list if row['featurizer'] != 'atom']

for clf, clf_name, best_params in zip(atom_classifier_list, classifier_names, atom_best_params):
    # run best model
    recall_0, recall_1, roc_auc = run_model(clf, x_atom, x_atom_test)

    # Named result_row rather than `metrics`, which would rebind the
    # `sklearn.metrics` module imported at the top of the notebook.
    result_row = {'featurizer': 'atom', 'model': clf_name, 'best_params': best_params,
                  'recall_0': recall_0, 'recall_1': recall_1, 'roc_auc': roc_auc}

    # collect in results list
    results_list.append(result_row)

len(results_list)

15

## Export results

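Make the SPE, k-mer and atom-wise results available for exploration in another notebook. The `to_csv` call below is commented out; uncomment it to write the file.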

In [21]:
# convert results list to a dataframe
# One row per metrics dictionary collected so far.
model_matrix = pd.DataFrame(data=results_list)

In [22]:
# Display the results table built from results_list.
model_matrix

Unnamed: 0,featurizer,model,best_params,recall_0,recall_1,roc_auc
0,spe,bnb,{'alpha': 0.1},0.785448,0.605634,0.766423
1,spe,rf,"{'max_depth': None, 'n_estimators': 100}",0.955224,0.253521,0.814668
2,spe,logreg,"{'C': 0.5, 'multi_class': 'auto', 'solver': 'l...",0.822761,0.661972,0.807599
3,spe,knn,{'n_neighbors': 1},0.902985,0.295775,0.59938
4,spe,svm,"{'C': 1, 'kernel': 'linear', 'probability': True}",0.843284,0.549296,0.777591
5,kmer,bnb,{'alpha': 0.1},0.735075,0.802817,0.832852
6,kmer,rf,"{'max_depth': 100, 'n_estimators': 500}",0.953358,0.450704,0.893644
7,kmer,logreg,"{'C': 0.5, 'multi_class': 'auto', 'solver': 'l...",0.845149,0.704225,0.872609
8,kmer,knn,{'n_neighbors': 1},0.932836,0.408451,0.670643
9,kmer,svm,"{'C': 1, 'kernel': 'linear', 'probability': True}",0.873134,0.605634,0.830198


In [23]:
# generate csv file for use in further exploration
#model_matrix.to_csv('../processed_data/model_matrix_over2.csv')

## Oversampled Conventional Matrix Experiment

Re-run best models from conventional RDKit matrix experiment using oversampled data.

### ECFP Featurizer

In [24]:
# generate features
# Featurize the oversampled training frame and the test frame with ecfp_featurizer.
x_ecfp, x_ecfp_test = ecfp_featurizer(train_over, test)

# Shapes of the train and test feature matrices, one per line.
print(x_ecfp.shape, x_ecfp_test.shape, sep='\n')

(11896, 100)
(607, 100)


In [25]:
# Result rows of the earlier conventional experiment for the 'ecfp' featurizer.
ecfp = by_method.get_group('ecfp')

# best_params was read from the CSV as text; parse each entry back into a dict.
ecfp_best_params = list(map(ast.literal_eval, ecfp.best_params))
ecfp_best_params

[{'alpha': 1e-10},
 {'max_depth': 100, 'n_estimators': 100},
 {'C': 0.5, 'multi_class': 'auto', 'solver': 'liblinear'},
 {'n_neighbors': 1},
 {'C': 0.1, 'kernel': 'linear', 'probability': True}]

In [26]:
# best model list
# Estimator classes in the same order as the parsed parameter dictionaries
# (and as classifier_names).
ecfp_classifier_list = [
    estimator(**params)
    for estimator, params in zip(
        (BernoulliNB, RandomForestClassifier, LogisticRegression, KNeighborsClassifier, SVC),
        ecfp_best_params,
    )
]

# Keep the individual models addressable by name as well.
bnb_ecfp, rf_ecfp, logreg_ecfp, knn_ecfp, svm_ecfp = ecfp_classifier_list

In [27]:
# Drop rows left by a previous execution of this cell so that re-running it
# cannot append duplicate 'ecfp' results.
results_list[:] = [row for row in results_list if row['featurizer'] != 'ecfp']

for clf, clf_name, best_params in zip(ecfp_classifier_list, classifier_names, ecfp_best_params):
    # run best model
    recall_0, recall_1, roc_auc = run_model(clf, x_ecfp, x_ecfp_test)

    # Named result_row rather than `metrics`, which would rebind the
    # `sklearn.metrics` module imported at the top of the notebook.
    result_row = {'featurizer': 'ecfp', 'model': clf_name, 'best_params': best_params,
                  'recall_0': recall_0, 'recall_1': recall_1, 'roc_auc': roc_auc}

    # collect in results list
    results_list.append(result_row)

len(results_list)

20

### 1D Featurizer (Molecular Descriptors)

In [28]:
# generate features
# Featurize the oversampled training frame and the test frame with oned_featurizer.
x_oned, x_oned_test = oned_featurizer(train_over, test)

# Shapes of the train and test feature matrices, one per line.
print(x_oned.shape, x_oned_test.shape, sep='\n')

(11896, 9)
(607, 9)


In [29]:
# Result rows of the earlier conventional experiment for the 'oned' featurizer.
oned = by_method.get_group('oned')

# best_params was read from the CSV as text; parse each entry back into a dict.
oned_best_params = list(map(ast.literal_eval, oned.best_params))
oned_best_params

[{'alpha': 1e-10},
 {'max_depth': 50, 'n_estimators': 100},
 {'C': 0.5, 'multi_class': 'auto', 'solver': 'liblinear'},
 {'n_neighbors': 1},
 {'C': 0.0001, 'kernel': 'linear', 'probability': True}]

In [30]:
# best model list
# Estimator classes in the same order as the parsed parameter dictionaries
# (and as classifier_names).
oned_classifier_list = [
    estimator(**params)
    for estimator, params in zip(
        (BernoulliNB, RandomForestClassifier, LogisticRegression, KNeighborsClassifier, SVC),
        oned_best_params,
    )
]

# Keep the individual models addressable by name as well.
bnb_oned, rf_oned, logreg_oned, knn_oned, svm_oned = oned_classifier_list

In [31]:
# Drop rows left by a previous execution of this cell so that re-running it
# cannot append duplicate 'oned' results.
results_list[:] = [row for row in results_list if row['featurizer'] != 'oned']

for clf, clf_name, best_params in zip(oned_classifier_list, classifier_names, oned_best_params):
    # run best model
    recall_0, recall_1, roc_auc = run_model(clf, x_oned, x_oned_test)

    # Named result_row rather than `metrics`, which would rebind the
    # `sklearn.metrics` module imported at the top of the notebook.
    result_row = {'featurizer': 'oned', 'model': clf_name, 'best_params': best_params,
                  'recall_0': recall_0, 'recall_1': recall_1, 'roc_auc': roc_auc}

    # collect in results list
    results_list.append(result_row)

len(results_list)

25

## Export results

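Make the ECFP and 1D descriptor results available for exploration in another notebook. The `to_csv` call below is commented out; uncomment it to write the file.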

In [32]:
# convert results list to a dataframe
# results_list also still holds the rows appended by the first experiment,
# so this frame contains every featurizer evaluated so far.
model_matrix_conventional = pd.DataFrame(data=results_list)

In [33]:
# Select the conventional featurizers by name. A positional slice would
# silently pick the wrong rows if the number of earlier results ever changed
# or a cell above was re-run.
conventional_rows = model_matrix_conventional['featurizer'].isin(['ecfp', 'oned'])
model_matrix_conventional2 = model_matrix_conventional[conventional_rows].reset_index(drop=True)
model_matrix_conventional2

Unnamed: 0,featurizer,model,best_params,recall_0,recall_1,roc_auc
0,ecfp,bnb,{'alpha': 1e-10},0.572761,0.971831,0.8426
1,ecfp,rf,"{'max_depth': 100, 'n_estimators': 100}",0.960821,0.450704,0.884736
2,ecfp,logreg,"{'C': 0.5, 'multi_class': 'auto', 'solver': 'l...",0.731343,0.830986,0.854189
3,ecfp,knn,{'n_neighbors': 1},0.912313,0.549296,0.730805
4,ecfp,svm,"{'C': 0.1, 'kernel': 'linear', 'probability': ...",0.718284,0.816901,0.838948
5,oned,bnb,{'alpha': 1e-10},0.36194,0.760563,0.554853
6,oned,rf,"{'max_depth': 50, 'n_estimators': 100}",0.934701,0.253521,0.697814
7,oned,logreg,"{'C': 0.5, 'multi_class': 'auto', 'solver': 'l...",0.307836,0.859155,0.618023
8,oned,knn,{'n_neighbors': 1},0.809701,0.43662,0.623161
9,oned,svm,"{'C': 0.0001, 'kernel': 'linear', 'probability...",0.298507,0.901408,0.642225


In [34]:
# generate csv file for use in further exploration
#model_matrix_conventional2.to_csv('../processed_data/model_matrix_conventional_over.csv')