In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import metrics


from collections import Counter
from sklearn.metrics import confusion_matrix
from classifier_SMOTEENN_local import SMOTEENNBaggingClassifier #modified module is in the local directory
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Read the train and test tables from CSV, using the 'smiles_parent' column
# as the row index of each frame.
Train, Test = (
    pd.read_csv(csv_name, index_col='smiles_parent')
    for csv_name in ('trainDF_rdkit.csv', 'testDF_rdkit.csv')
)

In [3]:
# Number of leading columns that hold the per-target labels; every column
# after them is treated as a feature.
N_LABEL_COLS = 12

y_train = Train.iloc[:, :N_LABEL_COLS].astype('float32')
X_train = Train.iloc[:, N_LABEL_COLS:]
print('Before dropping redundant features: {}'.format(X_train.shape))

y_test = Test.iloc[:, :N_LABEL_COLS].astype('float32')
X_test = Test.iloc[:, N_LABEL_COLS:]

# Remove columns where all values are the same (zero standard deviation).
# The per-column std is computed once and reused.
train_std = X_train.std()
X_train = X_train.drop(train_std[train_std == 0].index, axis=1)

# Align Test to exactly Train's columns, in Train's order. Taking the
# intersection of the two column sets does not guarantee either the order or
# the count, which would silently misalign the feature matrices once they are
# converted to bare arrays below. Fail early if Test lacks a Train column.
missing_in_test = X_train.columns.difference(X_test.columns)
if len(missing_in_test) > 0:
    raise KeyError(
        'Test is missing {} feature column(s) present in Train, e.g. {}'.format(
            len(missing_in_test), list(missing_in_test[:5])))
X_test = X_test[X_train.columns].values

X_train_cols = X_train.columns.values  # keep column names for feature importance
X_train = X_train.values
print('After dropping redundant features: {}'.format(X_train.shape))

Before dropping redundant features: (7683, 2378)
After dropping redundant features: (7683, 2082)


In [4]:
# Sanity check: X_train_cols holds the names of the feature columns that were
# kept in X_train, so its length should equal X_train's column count.
X_train_cols.shape

(2082,)

In [None]:
#scale = StandardScaler().fit(X_train)
#X_train = scale.transform(X_train)
#X_test = scale.transform(X_test)

In [5]:
from sklearn.metrics import fbeta_score, make_scorer

# Scorer used for model selection in the grid search. With beta > 1 the
# F-beta score weights recall more heavily than precision.
F_BETA = 1.5  # vary beta
fbeta_scorer = make_scorer(fbeta_score, beta=F_BETA)

In [None]:
from sklearn.model_selection import StratifiedKFold
from imblearn.ensemble import BalancedBaggingClassifier
#from imblearn.ensemble import BalancedRandomForestClassifier

#class SMNClassifier(BalancedBaggingClassifier):
#    sampling_strategy=SMOTEENN()

# Hyper-parameter grid for the search. Every key carries the
# "base_estimator__" prefix, i.e. it targets a parameter of the estimator
# passed as `base_estimator` to the bagging classifier.
num_est = range(5, 101, 5)
param_grid = {
    'base_estimator__n_estimators': num_est,
    "base_estimator__max_depth": [1, 3, 5, None],
    "base_estimator__max_features": ["auto", "log2", None, 0.5, 0.2],
    "base_estimator__min_samples_leaf": [0.5, 0.3, 0.1],
    "base_estimator__criterion": ["gini", "entropy"],
}

# Cross-validation splitter (5 stratified folds) shared by every search.
cv = StratifiedKFold(5)

# Collects one row of evaluation results per target.
res_table = []
# Fit one grid-searched model per target column and evaluate it on the test
# rows of that target. Each iteration appends one row to res_table:
# [target, TP, FN, TN, FP, precision, recall, F1, MCC, AUROC, AUPRC, Brier].
for target in y_train.columns:
    print(target)
    # Keep only the rows whose label for this target is finite (drops NaN/inf).
    rows_tr = np.isfinite(y_train[target]).values
    rows_te = np.isfinite(y_test[target]).values

    X_train_, y_train_ = X_train[rows_tr, :], y_train[target][rows_tr]
    X_test_, y_test_ = X_test[rows_te, :], y_test[target][rows_te]

    #--------------------------------------------------
    # NOTE(review): imbalanced-learn documents `sampling_strategy` as a
    # float/str/dict/callable forwarded to the internal under-sampler, so
    # passing a SMOTEENN instance here may not resample with SMOTEENN as
    # intended -- confirm against the installed version (newer releases expose
    # a separate `sampler=` argument for this).
    bbc = GridSearchCV(BalancedBaggingClassifier(base_estimator = RandomForestClassifier(), sampling_strategy=SMOTEENN()), 
                       param_grid, scoring = fbeta_scorer, n_jobs = -1, cv = cv)

    #bbc = RandomizedSearchCV(BalancedBaggingClassifier(base_estimator = RandomForestClassifier()), param_grid, scoring = 'f1', n_iter= 25)

    #Also try n_estimators = imb_ratio
    bbc.fit(X_train_, y_train_)
    #--------------------------------------------------------------------------
    y_pred = bbc.predict_proba(X_test_)   # class probabilities, one column per class
    y_pred_ = bbc.predict(X_test_)        # hard class predictions
    pos_proba = y_pred[:, 1]              # probability of the second class
    #------------------------------------------------------------------

    # Pin the label order to the classes seen during training so the matrix is
    # always 2x2. Without `labels`, a subset in which only one class occurs
    # (in both truth and prediction) yields a 1x1 matrix and the cell lookups
    # raise IndexError.
    cm = metrics.confusion_matrix(y_test_, y_pred_, labels=bbc.classes_); print(cm)
    TN, FP, FN, TP = cm.ravel()  # row-major order of a 2x2 matrix
    res = [target, TP, FN, TN, FP]

    prec = metrics.precision_score(y_test_, y_pred_)
    recall = metrics.recall_score(y_test_, y_pred_)
    F1 = metrics.f1_score(y_test_, y_pred_)
    MCC = metrics.matthews_corrcoef(y_test_, y_pred_)

    # roc_auc_score raises when y_true contains a single class; record NaN for
    # that target instead of aborting the whole sweep.
    if np.unique(y_test_).size > 1:
        auroc = metrics.roc_auc_score(y_test_, pos_proba)
    else:
        auroc = float('nan')
    auprc = metrics.average_precision_score(y_test_, pos_proba)
    brier = metrics.brier_score_loss(y_test_, pos_proba)

    named_scores = [("Precision", prec), ("Recall", recall), ("f1-Score", F1), ("MCC", MCC),
                    ("AUROC Score", auroc), ("AUPRC Score", auprc), ("Brier Score", brier)]
    for metric_name, metric_value in named_scores:
        print("{0}: {1:.4f}".format(metric_name, metric_value))
        res.append(metric_value)

    res_table.append(res)
    print('-----------------------------------------------------')

# One row per target, columns in the same order as the entries of `res`.
resDF = pd.DataFrame(res_table, columns = ['Target', 'TP', 'FN', 'TN', 'FP', 'Precision', 'Recall', 'F1-Score', 'MCC',
                                          'AUROC', 'AUPRC', 'brier'])
#resDF.to_csv('Results/SMOTEENN_recall.csv')

In [None]:

#QUICK CHECK: Effect of varying probability threshold on CM
# The code below is disabled by being wrapped in a string literal; nothing in
# it runs. If re-enabled it reads `y_pred` and `y_test_`, which after the
# per-target loop hold the values of the last target processed, and sweeps
# cut-offs 0.1 .. 0.9 on the second probability column, printing the confusion
# matrix and scores at each cut-off and plotting F1 against the cut-off.
'''
import seaborn as sns
from matplotlib import pyplot as plt
f1_scores = []
for cutoff in np.arange(0.1,1.0,0.1):
    print('Cut-Off: ', cutoff)
    y_pred_cut = (y_pred[:,1] > cutoff).astype(int)
    f1_cut = metrics.f1_score(y_test_, y_pred_cut)
    f1_scores.append(f1_cut)
    #-------------------------------------------------
    cm = metrics.confusion_matrix(y_test_, y_pred_cut); print(cm)
    prec = metrics.precision_score(y_test_, y_pred_cut); print("Precision: {0:.2f}".format(prec))
    recall = metrics.recall_score(y_test_, y_pred_cut); print("Recall: {0:.2f}".format(recall))
    av_prec = metrics.average_precision_score(y_test_, y_pred_cut); print("Av_Precision: {0:.2f}".format(av_prec))
    F1 = metrics.f1_score(y_test_, y_pred_cut); print("f1-Score: {0:.2f}".format(F1))
    print('----------------------------')
    #--------------------------
    #sns.scatterplot(f1_scores, names = np.arange(0.1,0.9,0.1))
plt.scatter(np.arange(0.1,1.0,0.1), f1_scores)
#plt.title('F-score')
#plt.title('Cut-off')
plt.show()
'''