In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
from scipy.stats import norm
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, cross_validate, cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold
from mlxtend.classifier import StackingCVClassifier, EnsembleVoteClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

In [3]:
params = {'n_estimators': [300, 400, 500, 600, 700],
              'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.07],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'max_depth': [3, 4, 5, 6],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 2, 3, 4, 5]}

seed = 42
st_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

def calc_scores(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    recall_0, recall_1 = recall_score(y_test, y_pred, pos_label=0), recall_score(y_test, y_pred, pos_label=1)
    precision_0, precision_1 =  precision_score(y_test, y_pred, pos_label=0), precision_score(y_test, y_pred, pos_label=1)
    acc = balanced_accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    arr = np.array([[acc, precision_0, recall_0, precision_1, recall_1,auc_score]])
    return pd.DataFrame(data=arr, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])

def recall_0(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label=0)

def precision_0(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label=0)

scoring = {"balanced_accuracy": make_scorer(balanced_accuracy_score),
           "recall_0": make_scorer(recall_0), "precision_0": make_scorer(precision_0),
           "recall_1": make_scorer(recall_score), "precision_1": make_scorer(precision_score), "auc": "roc_auc" }

#cross_validation

def print_score_comparison(raw_score, emb_score, target_feature="posOutcome",
                           header_1="Raw Score", header_2="Embedding Score"):
    print("\t\t{0}\n\t\t\t{1}\t\t{2}".format(target_feature, header_1, header_2))
    print("\t\t-----------------------------------------------")
    print("balanced_accuracy:\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["balanced_accuracy"].mean(), emb_score["balanced_accuracy"].mean()))
    print("precision_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_0"].mean(), emb_score["precision_0"].mean()))
    print("recall_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_0"].mean(), emb_score["recall_0"].mean()))
    print("precision_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_1"].mean(), emb_score["precision_1"].mean()))
    print("recall_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_1"].mean(), emb_score["recall_1"].mean()))
    print("auc:\t\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["auc"].mean(), emb_score["auc"].mean()))

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time

    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

def param_tuning(X, y, n_folds=5, param_comb=25, scoring='roc_auc', jobs=12):
    xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    rand_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring=scoring, n_jobs=jobs,
                                   cv=skf.split(X, y), verbose=3, random_state=42)

    start_time = timer(None) # timing starts from this point for "start_time" variable
    rand_search.fit(X, y)
    timer(start_time)
    print("Best Score: {:.3%}".format(rand_search.best_score_))
    print(rand_search.best_params_)
    return rand_search

score_cols = ["test_balanced_accuracy","test_precision_0", "test_recall_0",
               "test_precision_1","test_recall_1", "test_auc"]

def get_scores(cv_results, score_keys=None, df_cols=None):
    if score_keys is None:
        score_keys = score_cols
    if df_cols is None:
        score_keys = score_cols
    scores = np.empty([1, len(score_keys)])
    for i, s in enumerate(score_keys):
        scores[0][i] = np.mean(cv_results[s])
    scores_df = pd.DataFrame(data=scores, columns=df_cols)
    return scores_df

def evaluate_embedding(path, outcome_df, target="posOutcome", merge_col="patient_ID", n_jobs=-1):
    emb_df = pd.read_csv(path, sep="\t")
    emb_outcome_df = pd.merge(outcome_df, emb_df, on=merge_col)
    X_emb, y_emb = emb_outcome_df[emb_outcome_df.columns.difference([merge_col, target])], emb_outcome_df[target]
    X_train_emb, X_test_emb, y_train_emb, y_test_emb = train_test_split(X_emb, y_emb, test_size=0.3, random_state=seed)
    rand_search_emb = param_tuning(X_train_emb, y_train_emb, jobs=n_jobs)
    params = rand_search_emb.best_params_
    clf_emb = rand_search_emb.best_estimator_
    cv_res = cross_validate(clf_emb, X_train_emb, y_train_emb, scoring=scoring, n_jobs=n_jobs, verbose=1, return_train_score=True,
                            cv=st_cv)
    cv_res_df = get_scores(cv_res)
    clf_emb.fit(X_train_emb, y_train_emb)
    test_scores_df = calc_scores(clf_emb, X_test_emb, y_test_emb)

    return params, cv_res_df, test_scores_df

In [4]:
ge_outcome_df = pd.read_csv("datasets/train.csv")
ge_outcome_test_df = pd.read_csv("datasets/test.csv")
X_train, y_train = ge_outcome_df[ge_outcome_df.columns.difference(["patient_ID", "posOutcome"])], ge_outcome_df["posOutcome"]

X_test, y_test = ge_outcome_test_df[ge_outcome_df.columns.difference(["patient_ID", "posOutcome"])], ge_outcome_test_df["posOutcome"]

print("Train shape: {0}\nTest shape:{1}".format(X_train.shape, X_test.shape))

Train shape: (1565, 8832)
Test shape:(672, 8832)


In [5]:
#Load the models
clf_mrmr50 = XGBClassifier()
clf_mrmr50.load_model("datasets/models/clf_mrmr50_acc.json")
clf_moses83 = XGBClassifier()
clf_moses83.load_model("datasets/models/clf_moses83.json")
clf_mrmr500 = XGBClassifier()
clf_mrmr500.load_model("datasets/models/clf_mrmr500.json")
clf_moses500 = XGBClassifier()
clf_moses500.load_model("datasets/models/clf_moses500.json")

In [18]:
fts_moses83 = []
fts_moses500 = []
with open("datasets/moses_ft83.txt", "r") as fp:
    for line in fp.readlines():
        fts_moses83.append(line.strip())

with open("datasets/moses_ft500_bmc.txt", "r") as fp:
    for line in fp.readlines():
        fts_moses500.append(line.strip())

fts_50_df = pd.read_csv("datasets/mrmr_top50.tsv", sep="\t")
fts_50_df.columns = ["Order", "Feat_Index", "Name", "Score"]
fts_50_df["Name"] = fts_50_df["Name"].str.strip()
feats_50 = fts_50_df["Name"].to_list()
fts_500_df = pd.read_csv("datasets/mrmr_top500.tsv", sep="\t")
fts_500_df.columns = ["Order", "Feat_Index", "Name", "Score"]
fts_500_df["Name"] = fts_500_df["Name"].str.strip()
feats_500 = fts_500_df["Name"].to_list()

In [19]:
moses83_pipe = make_pipeline(ColumnSelector(cols=fts_moses83),
                             clf_moses83)

mrmr83_pipe = make_pipeline(ColumnSelector(cols=feats_50),
                           clf_mrmr50)

eclf_1 = EnsembleVoteClassifier(clfs=[moses83_pipe, mrmr83_pipe],
                                voting="hard")

cv_results_eclf_1 = cross_validate(eclf_1, X_train, y_train,
                                   n_jobs=-1, scoring=scoring, cv=st_cv)

cv_results_eclf_1

In [11]:
scores_eclf_1_df = get_scores(cv_results_eclf_1, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_eclf_1_df.mean()

balanced_accuracy    0.751339
recall_0             0.733227
precision_0          0.847273
recall_1             0.794432
precision_1          0.655405
auc                  0.838436
dtype: float64

In [12]:
eclf_2 = EnsembleVoteClassifier(clfs=[moses83_pipe, mrmr83_pipe],
                                voting="soft")

cv_results_eclf_2 = cross_validate(eclf_2, X_train, y_train,
                                   n_jobs=-1, scoring=scoring, cv=st_cv)

cv_results_eclf_2

{'fit_time': array([85.03790474, 87.78969002, 87.96304607, 87.30136609, 86.71395373]),
 'score_time': array([0.08550072, 0.04847836, 0.01700473, 0.05619431, 0.09144735]),
 'test_balanced_accuracy': array([0.7522932 , 0.78443898, 0.76625717, 0.78816544, 0.78071253]),
 'test_recall_0': array([0.84242424, 0.81212121, 0.77575758, 0.80606061, 0.81818182]),
 'test_precision_0': array([0.73544974, 0.78823529, 0.7804878 , 0.79640719, 0.78034682]),
 'test_recall_1': array([0.66216216, 0.75675676, 0.75675676, 0.77027027, 0.74324324]),
 'test_precision_1': array([0.79032258, 0.78321678, 0.75167785, 0.78082192, 0.78571429]),
 'test_auc': array([0.84127764, 0.83046683, 0.84627355, 0.84058149, 0.83357903])}

In [13]:
scores_eclf_2_df = get_scores(cv_results_eclf_2, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_eclf_2_df.mean()

balanced_accuracy    0.774373
recall_0             0.776185
precision_0          0.810909
recall_1             0.778351
precision_1          0.737838
auc                  0.838436
dtype: float64

In [20]:
sclf_1 = StackingCVClassifier(classifiers=[moses83_pipe, mrmr83_pipe], meta_classifier=LogisticRegression(),
                              cv=st_cv, verbose=True, use_probas=False,
                              random_state=seed)

cv_results_sclf_1 = cross_validate(sclf_1, X_train, y_train,
                                   n_jobs=-1, scoring=scoring, cv=st_cv)

cv_results_sclf_1

In [15]:
scores_sclf_1_df = get_scores(cv_results_sclf_1, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_sclf_1_df.mean()

balanced_accuracy    0.780495
recall_0             0.796724
precision_0          0.786667
recall_1             0.766126
precision_1          0.774324
auc                  0.804054
dtype: float64

In [16]:
eclf_1.fit(X_train, y_train)
eclf_2.fit(X_train, y_train)
sclf_1.fit(X_train, y_train)

Fitting 2 classifiers...
Fitting classifier1: pipeline (1/2)
Fitting classifier2: pipeline (2/2)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.0s finished


StackingCVClassifier(classifiers=[Pipeline(steps=[('columnselector',
                                                   ColumnSelector(cols=['PCOLCE2',
                                                                        'SGCA',
                                                                        'SFTPA2',
                                                                        'INSL6',
                                                                        'RHAG',
                                                                        'OMP',
                                                                        'NPY5R',
                                                                        'STMN2',
                                                                        'SCARF1',
                                                                        'KIF13A',
                                                                        'DUOX1',
                                       

In [9]:
moses500_pipe = make_pipeline(ColumnSelector(cols=fts_moses500),
                             clf_moses500)

mrmr500_pipe = make_pipeline(ColumnSelector(cols=feats_500),
                           clf_mrmr500)

eclf_3 = EnsembleVoteClassifier(clfs=[moses500_pipe, mrmr500_pipe],
                                voting="hard")

cv_results_eclf_3 = cross_validate(eclf_3, X_train, y_train,
                                   n_jobs=-1, scoring=scoring, cv=st_cv)

cv_results_eclf_3_df = get_scores(cv_results_eclf_3, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
cv_results_eclf_3_df.mean()

balanced_accuracy    0.772682
recall_0             0.756444
precision_0          0.852121
recall_1             0.807634
precision_1          0.693243
auc                  0.853464
dtype: float64

In [10]:
eclf_4 = EnsembleVoteClassifier(clfs=[moses500_pipe, mrmr500_pipe],
                                voting="soft")

cv_results_eclf_4 = cross_validate(eclf_4, X_train, y_train,
                                   n_jobs=-1, scoring=scoring, cv=st_cv)

cv_results_eclf_4_df = get_scores(cv_results_eclf_4, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
cv_results_eclf_3_df.mean()

balanced_accuracy    0.772682
recall_0             0.756444
precision_0          0.852121
recall_1             0.807634
precision_1          0.693243
auc                  0.853464
dtype: float64

In [11]:
sclf_2 = StackingCVClassifier(classifiers=[moses500_pipe, mrmr500_pipe], meta_classifier=LogisticRegression(),
                              cv=st_cv, verbose=True, use_probas=False,
                              random_state=seed)

cv_results_sclf_2 = cross_validate(sclf_2, X_train, y_train,
                                   n_jobs=-1, scoring=scoring, cv=st_cv)

cv_results_sclf_2_df = get_scores(cv_results_sclf_2, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
cv_results_sclf_2_df.mean()

balanced_accuracy    0.791314
recall_0             0.794133
precision_0          0.821818
recall_1             0.793073
precision_1          0.760811
auc                  0.810500
dtype: float64

In [12]:
sclf_3 = StackingCVClassifier(classifiers=[moses500_pipe, mrmr500_pipe], meta_classifier=LogisticRegression(),
                              cv=st_cv, verbose=True, use_probas=True,
                              random_state=seed)

cv_results_sclf_3 = cross_validate(sclf_3, X_train, y_train,
                                   n_jobs=-1, scoring=scoring, cv=st_cv)

cv_results_sclf_3_df = get_scores(cv_results_sclf_3, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
cv_results_sclf_3_df.mean()

balanced_accuracy    0.790708
recall_0             0.793444
precision_0          0.820606
recall_1             0.792170
precision_1          0.760811
auc                  0.859214
dtype: float64

In [14]:
clf_xgb500 = XGBClassifier()
clf_xgb500.load_model("datasets/models/clf_xgb500.json")

fts_xgb500 = []

with open("datasets/xgb500_genes.txt", "r") as fp:
    for line in fp.readlines():
        fts_xgb500.append(line.strip())

In [15]:
xgb500_pipe = make_pipeline(ColumnSelector(cols=fts_xgb500), clf_xgb500)
sclf_4 = StackingCVClassifier(classifiers=[moses500_pipe, mrmr500_pipe, xgb500_pipe], meta_classifier=LogisticRegression(),
                              cv=st_cv, verbose=True,
                              random_state=seed)

cv_results_sclf_4 = cross_validate(sclf_4, X_train, y_train,
                                   n_jobs=-1, scoring=scoring, cv=st_cv)

cv_results_sclf_4_df = get_scores(cv_results_sclf_4, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
cv_results_sclf_4_df.mean()


balanced_accuracy    0.828247
recall_0             0.838241
precision_0          0.837576
recall_1             0.820057
precision_1          0.818919
auc                  0.848468
dtype: float64

In [21]:
sclf_4.fit(X_train, y_train)
sclf_3.fit(X_train, y_train)
sclf_2.fit(X_train, y_train)
sclf_1.fit(X_train, y_train)

Fitting 2 classifiers...
Fitting classifier1: pipeline (1/2)
Fitting classifier2: pipeline (2/2)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.2s finished


StackingCVClassifier(classifiers=[Pipeline(steps=[('columnselector',
                                                   ColumnSelector(cols=['PCOLCE2',
                                                                        'SGCA',
                                                                        'SFTPA2',
                                                                        'INSL6',
                                                                        'RHAG',
                                                                        'OMP',
                                                                        'NPY5R',
                                                                        'STMN2',
                                                                        'SCARF1',
                                                                        'KIF13A',
                                                                        'DUOX1',
                                       

In [22]:
test_scores_sclf_1_df = calc_scores(sclf_1, X_test, y_test)
test_scores_sclf_1_df.mean()

balanced_accuracy    0.755410
recall_0             0.769886
precision_0          0.765537
recall_1             0.740625
precision_1          0.745283
auc                  0.777209
dtype: float64

In [23]:
test_scores_sclf_2_df = calc_scores(sclf_2, X_test, y_test)
test_scores_sclf_2_df.mean()

balanced_accuracy    0.756343
recall_0             0.766017
precision_0          0.776836
recall_1             0.747604
precision_1          0.735849
auc                  0.788313
dtype: float64

In [24]:
test_scores_sclf_3_df = calc_scores(sclf_3, X_test, y_test)
test_scores_sclf_3_df.mean()

balanced_accuracy    0.772066
recall_0             0.787966
precision_0          0.776836
recall_1             0.755418
precision_1          0.767296
auc                  0.830144
dtype: float64

In [25]:
test_scores_sclf_4_df = calc_scores(sclf_4, X_test, y_test)
test_scores_sclf_4_df.mean()

balanced_accuracy    0.761060
recall_0             0.772472
precision_0          0.776836
recall_1             0.750000
precision_1          0.745283
auc                  0.805369
dtype: float64