In [2]:
import warnings

import numpy as np
import pandas as pd
# Suppress every warning for the rest of the session: no category, module or
# message filter is given, so this also hides pandas/sklearn/xgboost warnings.
warnings.filterwarnings('ignore')
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold,  cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer

In [3]:
def calc_scores(clf, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Parameters
    ----------
    clf : classifier exposing ``predict`` and ``predict_proba``.
    X_test : features passed unchanged to ``clf``.
    y_test : true labels (0/1) for ``X_test``.

    Returns
    -------
    numpy.ndarray of shape (1, 6) holding, in order: balanced accuracy,
    recall for class 0, precision for class 0, recall for class 1,
    precision for class 1, and ROC AUC.
    """
    predicted = clf.predict(X_test)

    # (recall, precision) per class, treating each label in turn as "positive".
    per_class = {
        label: (
            recall_score(y_test, predicted, pos_label=label),
            precision_score(y_test, predicted, pos_label=label),
        )
        for label in (0, 1)
    }
    balanced_acc = balanced_accuracy_score(y_test, predicted)

    # AUC is computed from the probability of class 1, not from hard labels.
    positive_proba = clf.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, positive_proba)

    row = [
        balanced_acc,
        per_class[0][0],
        per_class[0][1],
        per_class[1][0],
        per_class[1][1],
        auc_score,
    ]
    return np.array([row])

def prepare_stack_input(estimators, X, y, target_col="posOutcome"):
    """Build the input frame for a stacked model from base-model predictions.

    Parameters
    ----------
    estimators : dict
        Maps a name to a ``(clf, feats)`` pair. ``clf`` must expose
        ``predict``; ``feats`` is either a list of column names to select from
        ``X`` before predicting, or ``None`` to predict on all of ``X``.
    X : pandas.DataFrame
        Feature frame; one prediction per row is collected from every
        estimator.
    y : array-like
        Labels, one per row of ``X``; stored as the last column.
    target_col : str, optional
        Name given to the label column (default ``"posOutcome"``).

    Returns
    -------
    pandas.DataFrame
        One float column per estimator (named after its key, in dict order)
        followed by ``target_col``. The frame gets a fresh default index; the
        index of ``X`` is not carried over.
    """
    names = list(estimators)
    n_samples = X.shape[0]
    # One column per estimator plus a trailing column for the labels.
    matrix = np.empty(shape=[n_samples, len(names) + 1])

    for col_idx, (clf, feats) in enumerate(estimators.values()):
        # Restrict to the estimator's own feature subset when one is given.
        inputs = X if feats is None else X[feats]
        matrix[:, col_idx] = clf.predict(inputs)

    matrix[:, len(names)] = y
    return pd.DataFrame(data=matrix, columns=names + [target_col])

def recall_0(y_true, y_pred):
    """Recall computed with label 0 treated as the positive class."""
    negative_label = 0
    return recall_score(y_true, y_pred, pos_label=negative_label)

def precision_0(y_true, y_pred):
    """Precision computed with label 0 treated as the positive class."""
    negative_label = 0
    return precision_score(y_true, y_pred, pos_label=negative_label)

# Metric name -> scorer, passed as ``scoring=`` to ``cross_validate``.
# The *_1 entries rely on the metric's default positive label; the *_0
# entries use the wrappers that pin pos_label=0.
scoring = {
    "balanced_accuracy": make_scorer(balanced_accuracy_score),
    "recall_0": make_scorer(recall_0),
    "precision_0": make_scorer(precision_0),
    "recall_1": make_scorer(recall_score),
    "precision_1": make_scorer(precision_score),
    "auc": "roc_auc",
}

def run_exp(ests, X, y, target=None, folds=5, repeats=20, n_runs=3, random_state=None):
    """Evaluate a logistic-regression stacker over several random train/test splits.

    For every run the data is split (stratified on ``y``), the base estimators'
    predictions on the training part become the stacker's features, the stacker
    is cross-validated on those features with repeated stratified k-fold, and
    finally it is fitted on the whole training part and scored on the held-out
    part.

    Parameters
    ----------
    ests : dict
        ``name -> (classifier, feature list or None)``; forwarded unchanged to
        ``prepare_stack_input``.
    X : pandas.DataFrame
        Features consumed by the base estimators.
    y : array-like
        Binary labels aligned with ``X``.
    target : list of str, optional
        Column(s) removed from the stacked frame before fitting. Defaults to
        ``["posOutcome"]``.
    folds : int
        ``n_splits`` of the repeated stratified k-fold.
    repeats : int
        ``n_repeats`` of the repeated stratified k-fold.
    n_runs : int
        Number of independent train/test splits (default 3).
    random_state : int or None
        Base seed. Run ``i`` uses ``random_state + i`` for both the train/test
        split and the cross-validation splitter, so runs differ from one
        another but the experiment as a whole is repeatable. ``None`` (the
        default) leaves everything unseeded.

    Returns
    -------
    tuple of three dicts keyed by ``"run_<i>"``:
        the ``cross_validate`` result, the held-out scores from
        ``calc_scores``, and the fitted ``LogisticRegression``.
    """
    if target is None:
        target = ["posOutcome"]

    cross_val_res = {}
    test_scores = {}
    estimators = {}

    for i in range(n_runs):
        # Distinct seed per run; reusing one seed would repeat the same split.
        seed = None if random_state is None else random_state + i
        rst_cv = RepeatedStratifiedKFold(n_splits=folds, n_repeats=repeats, random_state=seed)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, random_state=seed
        )

        # Stacker features for the training part: base-model predictions.
        log_input_df = prepare_stack_input(ests, X_train, y_train)
        X_log = log_input_df[log_input_df.columns.difference(target)]

        clf_log = LogisticRegression()
        # cross_validate fits clones, so clf_log itself is still unfitted here.
        cv_results = cross_validate(clf_log, X_log, y_train, n_jobs=-1, cv=rst_cv,
                                    scoring=scoring, return_train_score=True)

        k = "run_%d" % i
        cross_val_res[k] = cv_results
        estimators[k] = clf_log

        # Fit on the full training part, then score on the held-out part.
        clf_log.fit(X_log, y_train)
        log_input_test_df = prepare_stack_input(ests, X_test, y_test)
        X_log_test = log_input_test_df[log_input_test_df.columns.difference(target)]
        test_scores[k] = calc_scores(clf_log, X_log_test, y_test)

    return cross_val_res, test_scores, estimators



In [4]:
# Load the models: create one empty XGBClassifier per saved model, then
# restore each from its JSON file.
# Note: clf_gan is loaded here but is not part of the `estimators` dict
# defined in a later cell.
clf_xg50, clf_moses50, clf_raw, clf_pam, clf_gan = (XGBClassifier() for _ in range(5))

clf_xg50.load_model("datasets/models/xgb50_raw.json")
clf_moses50.load_model("datasets/models/moses50_raw.json")
clf_raw.load_model("datasets/models/raw_model.json")
clf_pam.load_model("datasets/models/pam35_raw.json")
clf_gan.load_model("datasets/models/infogan_model.json")

In [5]:
# Column names used as per-model feature subsets: each list is paired with a
# classifier in `estimators`, and `prepare_stack_input` selects these columns
# from X before calling that classifier's `predict`.
# NOTE(review): the names look like gene symbols and the list names suggest
# how each subset was selected (MOSES, XGBoost, PAM) - confirm provenance.

# Feature subset for clf_moses50.
moses50_genes = ["PRND", "FRS3", "FCN3", "DSCR4", "BRCA2", "CXCL6", "LMX1B", "DLX5", "OMP", "ADH6", "PGAP1", "ART3", "BCHE", "FGB", "IL1RAPL1", "FSTL4", "ASGR1", "ZNF135", "DLL3", "NPHS2", "ANGPT2", "GLP2R", "GRIA3", "HOXB8", "MSC", "PLA2R1", "CYP2F1", "TAS2R7", "NKX6-1", "WNT11", "CHST11", "CLCA4", "ENPEP", "PAH", "WFDC1", "CHGA", "SEZ6L", "UGT2A3", "PRDM16", "GALR2", "GUCA1A", "CASQ1", "NOS1AP", "CACNA2D3", "FHOD3", "SRGAP3", "TMOD2", "ATOH1", "SLC6A1", "HAS1"]
# Feature subset for clf_xg50.
xgb50_genes = ['CDX4','GLRA1', 'OR12D3', 'DSCR4', 'HOXB8', 'C9', 'MTNR1B', 'MOS', 'HSD17B3', 'FGF20', 'KCNH4', 'ATP4B', 'CPB2', 'CRYBB1', 'ANGPTL3', 'MYH8', 'GYS2', 'SLC25A21', 'TAS2R7', 'F11', 'GABRA6', 'MYT1L', 'DEFB126', 'RPL18', 'GABRQ', 'ZFP37', 'PIP5K1B', 'MCM5', 'PRKAA1', 'WDR76', 'CHRM4', 'RPS6KC1', 'EIF1AY', 'WNT1', 'SCN3B', 'NLGN4Y', 'MAGEB1', 'NUDC', 'HIGD1A', 'OXCT2', 'GALR2', 'EEF1B2', 'RXRG', 'CALCA', 'TEX13A', 'CST3', 'IGFBP4', 'CRYGA', 'ESR1', 'ZNF750']
# Feature subset for clf_pam.
pam35_genes = ["BAG1", "BIRC5", "BLVRA", "CCNB1", "CCNE1", "CDC20", "CDC6", "CDH3", "CENPF", "CEP55", "EGFR", "ERBB2", "ESR1", "EXO1", "FOXA1", "FOXC1",  "GRB7", "KIF2C", "KRT14", "KRT17", "KRT5", "MAPT", "MDM2", "MELK", "MIA", "MKI67", "MMP11", "MYBL2", "MYC", "PGR", "RRM2", "SFRP1", "SLC39A6", "TYMS", "UBE2C"]

In [6]:
# Base models for the stack: name -> (classifier, feature subset).
# A subset of None means the classifier predicts on every column of X.
estimators = {
    "raw": (clf_raw, None),
    "xgb50": (clf_xg50, xgb50_genes),
    "moses50": (clf_moses50, moses50_genes),
    "pam35_genes": (clf_pam, pam35_genes),
}

In [7]:
# Expression matrix and outcome table; both carry a patient_ID column that
# is used as the join key below.
ge_df = pd.read_csv("datasets/merged-combat15.csv")
pos_outcome_df = pd.read_csv("datasets/combat15outcomes_latest.csv")

# Keep only patients with a known outcome and cast the label to int.
# Done as one chain so the cast produces a new frame instead of assigning an
# attribute on a frame derived from a slice (the SettingWithCopy pattern,
# which the global warnings filter would otherwise hide).
outcome_df = (
    pos_outcome_df[["patient_ID", "posOutcome"]]
    .dropna(axis=0, subset=["posOutcome"])
    .astype({"posOutcome": int})
)
ge_outcome_df = pd.merge(outcome_df, ge_df, on="patient_ID")

# Features are every column except the join key and the label.
X = ge_outcome_df[ge_outcome_df.columns.difference(["patient_ID", "posOutcome"])]
y = ge_outcome_df["posOutcome"]

X.shape

(2237, 8832)

In [8]:
# First experiment: run_exp with its default fold and repeat counts.
exp_scores, exp_test_scores, exp_ests = run_exp(
    estimators,
    X,
    y,
    target=["posOutcome"],
)

In [9]:
# Keys read from each cross_validate result, one test/train pair per metric.
# The order here is the column order of the summary tables.
score_cols = [
    "test_balanced_accuracy", "train_balanced_accuracy",
    "test_recall_0", "train_recall_0",
    "test_precision_0", "train_precision_0",
    "test_recall_1", "train_recall_1",
    "test_precision_1", "train_precision_1",
    "test_auc", "train_auc",
]

def get_score_df(scores, cols=None):
    """Average per-fold scores into one row per run.

    Parameters
    ----------
    scores : dict
        ``run key -> mapping`` where each mapping holds, for every name in
        ``cols``, an array-like of per-fold values.
    cols : sequence of str, optional
        Score names to average, in output column order. Defaults to the
        module-level ``score_cols``, looked up when the function is called.

    Returns
    -------
    numpy.ndarray of shape ``(len(scores), len(cols))``; row ``i`` belongs to
    the ``i``-th key of ``scores`` and column ``j`` is the mean of ``cols[j]``.
    """
    if cols is None:
        cols = score_cols

    # Size the result from the inputs rather than assuming 3 runs x 12 scores.
    scores_arr = np.empty([len(scores), len(cols)])

    for i, run_key in enumerate(scores):
        for j, score_name in enumerate(cols):
            scores_arr[i][j] = np.mean(scores[run_key][score_name])

    return scores_arr

# Mean cross-validation scores of the first experiment, one row per run.
log_scores_arr_1 = get_score_df(exp_scores)
log_cross_val_scores_df = pd.DataFrame(log_scores_arr_1, columns=score_cols)
log_cross_val_scores_df

Unnamed: 0,test_balanced_accuracy,train_balanced_accuracy,test_recall_0,train_recall_0,test_precision_0,train_precision_0,test_recall_1,train_recall_1,test_precision_1,train_precision_1,test_auc,train_auc
0,0.786328,0.787238,0.650459,0.650736,0.903441,0.904912,0.922198,0.92374,0.703505,0.703525,0.803607,0.805941
1,0.792211,0.793962,0.655778,0.656477,0.911999,0.914702,0.928645,0.931448,0.708023,0.708675,0.807237,0.808621
2,0.794122,0.794118,0.665171,0.665159,0.906446,0.906036,0.923072,0.923077,0.712505,0.71209,0.804118,0.811468


In [10]:
# Train-fold metrics of the first experiment, averaged over the runs.
(
    log_cross_val_scores_df
    .filter(like="train")
    .mean()
)

train_balanced_accuracy    0.791773
train_recall_0             0.657457
train_precision_0          0.908550
train_recall_1             0.926088
train_precision_1          0.708097
train_auc                  0.808676
dtype: float64

In [11]:
# Validation-fold metrics of the first experiment, averaged over the runs.
(
    log_cross_val_scores_df
    .filter(like="test")
    .mean()
)

test_balanced_accuracy    0.790887
test_recall_0             0.657136
test_precision_0          0.907296
test_recall_1             0.924638
test_precision_1          0.708011
test_auc                  0.804987
dtype: float64

In [32]:
# Second experiment: more, smaller folds (50) with fewer repeats (5).
exp_scores_2, exp_test_scores_2, exp_ests_2 = run_exp(
    estimators,
    X,
    y,
    target=["posOutcome"],
    folds=50,
    repeats=5,
)

In [33]:
# Mean cross-validation scores of the second experiment, then averaged
# across its runs.
log_scores_arr_2 = get_score_df(exp_scores_2)
log_scores_arr_2_df = pd.DataFrame(log_scores_arr_2, columns=score_cols)

log_scores_arr_2_df.mean()

test_balanced_accuracy     0.786813
train_balanced_accuracy    0.787764
test_recall_0              0.650044
train_recall_0             0.650148
test_precision_0           0.908679
train_precision_0          0.906631
test_recall_1              0.923583
train_recall_1             0.925381
test_precision_1           0.708426
train_precision_1          0.703535
test_auc                   0.799698
train_auc                  0.803856
dtype: float64

In [34]:
# Train-fold metrics of the second experiment, averaged over the runs.
(
    log_scores_arr_2_df
    .filter(like="train")
    .mean()
)

train_balanced_accuracy    0.787764
train_recall_0             0.650148
train_precision_0          0.906631
train_recall_1             0.925381
train_precision_1          0.703535
train_auc                  0.803856
dtype: float64

In [35]:
# Validation-fold metrics of the second experiment, averaged over the runs.
(
    log_scores_arr_2_df
    .filter(like="test")
    .mean()
)

test_balanced_accuracy    0.786813
test_recall_0             0.650044
test_precision_0          0.908679
test_recall_1             0.923583
test_precision_1          0.708426
test_auc                  0.799698
dtype: float64

In [36]:
# Column order matches the row layout returned by calc_scores.
score_cols_2 = ["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"]

# Every run stores a single-row 2-D array; stacking them gives one row per
# run without hard-coding the number of runs or metrics.
test_scores_arr = np.vstack(list(exp_test_scores_2.values()))

test_scores_df = pd.DataFrame(data=test_scores_arr, columns=score_cols_2)
test_scores_df

Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
0,0.764886,0.620339,0.884058,0.909434,0.68272,0.779053
1,0.780333,0.647458,0.892523,0.913208,0.699422,0.800294
2,0.797474,0.677966,0.900901,0.916981,0.718935,0.819309


In [37]:
# Held-out metrics of the second experiment, averaged over the runs.
test_scores_df.mean(axis=0)

balanced_accuracy    0.780898
recall_0             0.648588
precision_0          0.892494
recall_1             0.913208
precision_1          0.700359
auc                  0.799552
dtype: float64

In [38]:
# Raw held-out scores per run, as returned by run_exp (one single-row array
# per "run_<i>" key).
exp_test_scores_2

{'run_0': array([[0.76488647, 0.62033898, 0.88405797, 0.90943396, 0.68271955,
         0.77905341]]),
 'run_1': array([[0.78033259, 0.64745763, 0.89252336, 0.91320755, 0.69942197,
         0.80029421]]),
 'run_2': array([[0.79747362, 0.6779661 , 0.9009009 , 0.91698113, 0.71893491,
         0.81930924]])}