In [4]:
from bayes_opt import BayesianOptimization
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV, RFE
from sklearn.metrics import roc_auc_score, recall_score, precision_score, balanced_accuracy_score, make_scorer
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, cross_val_predict
warnings.filterwarnings('ignore')

In [122]:
train_df = pd.read_csv("datasets/train.csv")
X_train, y_train = train_df[train_df.columns.difference(["patient_ID", "posOutcome"])], train_df["posOutcome"]

ft_500 = []
with open("datasets/xgb500_genes.txt", "r") as fp:
    for line in fp.readlines():
        ft_500.append(line.strip())

X_train_500 = X_train[ft_500]
X_train_500.head()

Unnamed: 0,LITAF,TPBG,CCRL2,ZNF268,CCT6B,CD163,CD320,KIF17,KIF1A,CD44,...,WDR62,ADORA3,AP1B1,HGD,WHSC1L1,ADRB3,ADAMTSL2,ANXA7,ABI1,AHR
0,7.108456,7.46369,3.841445,4.848988,2.944146,5.316088,6.127067,3.629676,6.434729,6.424752,...,3.4724,3.884389,6.602958,2.997926,6.027037,3.148324,3.284457,8.879693,7.774435,8.3991
1,8.271182,9.319451,4.015521,3.670656,2.949599,7.182012,5.874283,3.501298,2.533096,7.274651,...,3.302293,4.162791,6.752108,4.526801,2.932985,3.351544,3.711498,8.241469,6.846877,7.129267
2,7.894494,8.052706,5.259943,5.5527,3.145896,7.906607,4.067954,3.641523,3.108423,8.087042,...,2.748002,5.146482,7.613484,2.615693,3.983403,3.288953,3.839644,8.194948,8.402202,8.11188
3,8.651722,9.1587,3.778845,5.629598,3.09684,7.099068,5.485791,3.600242,3.262309,6.773035,...,3.646378,3.597345,6.584821,5.801371,3.740626,3.062545,3.242531,8.907759,7.684565,6.964725
4,8.440454,6.642578,5.889477,4.645318,3.188979,10.478267,5.70505,3.607868,3.293778,6.724959,...,4.108428,7.332183,8.042622,3.921174,3.752512,3.593116,3.241774,8.678762,7.63701,6.882944


In [116]:
rand_seed = 42
params = {'n_estimators': [300, 400, 500, 600, 700],
              'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.07],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'max_depth': [3, 4, 5, 6],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 2, 3, 4, 5]}

init_points = {
              'max_depth': [3, 8, 3, 8, 8, 3, 8, 3],
              'gamma':    [0.5, 8, 0.2, 9, 0.5, 8, 0.2, 9],
              'min_child_weight': [0.2, 0.2, 0.2, 0.2, 12, 12, 12, 12],
              'subsample':  [0.6, 0.8, 0.6, 0.8, 0.6, 0.8, 0.6, 0.8],
              'colsample_bytree': [0.6, 0.8, 0.6, 0.8, 0.6, 0.8, 0.6, 0.8],
              'learning_rate': [0.01, 0.02, 0.01, 0.02, 0.01, 0.02, 0.01, 0.02],
              'n_   estimators': [400, 600, 400, 600, 400, 600, 700, 500]
              }

pbounds = {
             'n_estimators': (100, 800),
             'learning_rate': (0.01, 0.09),
             'max_depth': (2, 12),
             'gamma': (0.001, 10.0),
             'min_child_weight': (0, 20),
             'subsample': (0.4, 1.0),
             'colsample_bytree' :(0.4, 1.0)
            }

def bayesian_opt(train, target, n_jobs=-1):
    def xgb_opt(n_estimators, learning_rate, gamma, max_depth,
                subsample, colsample_bytree, min_child_weight):

        received_params = {
            'n_estimators': int(n_estimators),
            'learning_rate': learning_rate,
            'gamma': gamma,
            'max_depth': int(max_depth),
            'subsample': max(min(subsample, 1), 0),
            'colsample_bytree': max(min(colsample_bytree, 1), 0),
            'min_child_weight': min_child_weight,
            'random_state': rand_seed,
            'objective' : 'binary:logistic'
        }
        st_cv = StratifiedKFold(shuffle=True, random_state=rand_seed)
        clf = XGBClassifier(**received_params, n_jobs=4)
        cv_results = cross_validate(clf, train, target, cv=st_cv,
                                    n_jobs=14, scoring='roc_auc',
                                    return_train_score=True)
        val_score = np.mean(cv_results["test_score"])
        train_score = np.mean(cv_results["train_score"])
        print("Mean test auc: {0:.2%},  Mean train auc: {1:.2%}, Diff: {2:.2%}".format(
              val_score, train_score, (train_score-val_score)))

        return (val_score * 2) - 1

    return xgb_opt

In [118]:
xgb_500_bo = bayesian_opt(X_train, y_train)
xgb_bo_500 = BayesianOptimization(f=xgb_500_bo, pbounds=pbounds,
                              random_state=rand_seed,
                              verbose=2,)
# xgb_bo_500.probe(params=init_points, lazy=True)
xgb_bo_500.maximize(init_points=4, n_iter=10, acq='ei', xi=0.0)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------
Mean test auc: 83.73%,  Mean train auc: 99.13%, Diff: 15.40%
| [0m 1       [0m | [0m 0.6746  [0m | [0m 0.6247  [0m | [0m 9.507   [0m | [0m 0.06856 [0m | [0m 7.987   [0m | [0m 3.12    [0m | [0m 209.2   [0m | [0m 0.4349  [0m |
Mean test auc: 84.30%,  Mean train auc: 99.91%, Diff: 15.61%
| [95m 2       [0m | [95m 0.686   [0m | [95m 0.9197  [0m | [95m 6.012   [0m | [95m 0.06665 [0m | [95m 2.206   [0m | [95m 19.4    [0m | [95m 682.7   [0m | [95m 0.5274  [0m |
Mean test auc: 85.08%,  Mean train auc: 100.00%, Diff: 14.92%
| [95m 3       [0m | [95m 0.7016  [0m | [95m 0.5091  [0m | [95m 1.835   [0m | [95m 0.03434 [0m | [95m 7.248   [0m | [95m 8.639   [0m | [95m 303.9   [0m | [95m 0.7671  [0m |
Mean test auc: 84.71%,  Mean tra

In [78]:
print("Best score: {0}\nParams: {1}".format(xgb_bo_500.max["target"], xgb_bo_500.max["params"]))
for i, res in enumerate(xgb_bo_500.res):
    print("Iteration {}: \n\t{}".format(i, res))

Best score: 0.8096969696969694
Params: {'colsample_bytree': 0.9991337809826765, 'gamma': 0.5126040322793047, 'learning_rate': 0.07260519432257369, 'max_depth': 9.423245502822606, 'min_child_weight': 15.847405956328881, 'n_estimators': 344.4967382859727, 'subsample': 0.6786490570865282}
Iteration 0: 
	{'target': 0.7634889434889438, 'params': {'colsample_bytree': 0.6247240713084175, 'gamma': 9.507192349792751, 'learning_rate': 0.0685595153449124, 'max_depth': 7.986584841970366, 'min_child_weight': 3.1203728088487304, 'n_estimators': 209.19616423534185, 'subsample': 0.4348501673009197}}
Iteration 1: 
	{'target': 0.7854873054873057, 'params': {'colsample_bytree': 0.9197056874649611, 'gamma': 6.011549002420345, 'learning_rate': 0.06664580622368364, 'max_depth': 2.2058449429580245, 'min_child_weight': 19.398197043239886, 'n_estimators': 682.7098485602952, 'subsample': 0.5274034664069657}}
Iteration 2: 
	{'target': 0.7578542178542178, 'params': {'colsample_bytree': 0.8056204439897416, 'gamma'

In [119]:
opt_params_500 = xgb_bo_500.max["params"]
params_xgb500 = {'colsample_bytree': opt_params_500["colsample_bytree"],
                 'gamma': opt_params_500["gamma"],
                 'learning_rate': opt_params_500["learning_rate"],
                 'max_depth': int(opt_params_500["max_depth"]),
                 'min_child_weight': opt_params_500["min_child_weight"],
                 'n_estimators': int(opt_params_500["n_estimators"]),
                 'subsample': opt_params_500["subsample"]}

clf_500 = XGBClassifier(**params_xgb500, n_jobs=16)

clf_500.fit(X_train_500, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.461702448981613,
              gamma=1.161953091628986, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.05224857007800442,
              max_delta_step=0, max_depth=8, min_child_weight=8.07481593002973,
              missing=nan, monotone_constraints='()', n_estimators=302,
              n_jobs=16, num_parallel_tree=1, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.8855904253753344,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [51]:
test_df = pd.read_csv("datasets/test.csv")
X_test, y_test = test_df[test_df.columns.difference(["patient_ID", "posOutcome"])], test_df["posOutcome"]

In [91]:
def calc_scores(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    recall_0, recall_1 = recall_score(y_test, y_pred, pos_label=0), recall_score(y_test, y_pred, pos_label=1)
    precision_0, precision_1 =  precision_score(y_test, y_pred, pos_label=0), precision_score(y_test, y_pred, pos_label=1)
    acc = balanced_accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    return np.array([[acc, precision_0, recall_0, precision_1, recall_1,auc_score]])

In [80]:
X_test_500 = X_test[ft_500]

test_scores = calc_scores(clf_500, X_test_500, y_test)
scores_test_500_df = pd.DataFrame(data=test_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_test_500_df.mean()

balanced_accuracy    0.765297
recall_0             0.785311
precision_0          0.774373
recall_1             0.745283
precision_1          0.757188
auc                  0.845859
dtype: float64

In [83]:
pam35_genes = ["BAG1", "BIRC5", "BLVRA", "CCNB1", "CCNE1", "CDC20", "CDC6", "CDH3", "CENPF", "CEP55", "EGFR", "ERBB2", "ESR1", "EXO1", "FOXA1", "FOXC1",  "GRB7", "KIF2C", "KRT14", "KRT17", "KRT5", "MAPT", "MDM2", "MELK", "MIA", "MKI67", "MMP11", "MYBL2", "MYC", "PGR", "RRM2", "SFRP1", "SLC39A6", "TYMS", "UBE2C"]

X_pam35 = X_train[pam35_genes]

In [88]:
xgb_pam_bo = bayesian_opt(X_pam35, y_train)
opt_params_pam = xgb_pam_bo.max["params"]
params_pam35 = {'colsample_bytree': opt_params_pam["colsample_bytree"],
                 'gamma': opt_params_pam["gamma"],
                 'learning_rate': opt_params_pam["learning_rate"],
                 'max_depth': int(opt_params_pam["max_depth"]),
                 'min_child_weight': opt_params_pam["min_child_weight"],
                 'n_estimators': int(opt_params_pam["n_estimators"]),
                 'subsample': opt_params_pam["subsample"]}

clf_pam = XGBClassifier(**params_pam35, n_jobs=16)

clf_pam.fit(X_pam35, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5285030615978858,
              gamma=2.4910681761867757, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=7, min_child_weight=6.401699642327911, missing=nan,
              monotone_constraints='()', n_estimators=308, n_jobs=16,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.9705148953040089,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [94]:
X_pam35 = X_test[pam35_genes]

test_scores_pam35 = calc_scores(clf_pam, X_pam35, y_test)
scores_test_pam35_df = pd.DataFrame(data=test_scores_pam35, columns=["balanced_accuracy", "precision_0", "recall_0", "precision_1","recall_1", "auc"])
scores_test_pam35_df.mean() * 100

balanced_accuracy    73.121202
precision_0          74.438202
recall_0             74.858757
precision_1          71.835443
recall_1             71.383648
auc                  79.729418
dtype: float64

In [120]:
from sklearn.model_selection import RandomizedSearchCV, cross_validate, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score

def recall_0(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label=0)

def precision_0(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label=0)

scoring = {"balanced_accuracy": make_scorer(balanced_accuracy_score),
           "recall_0": make_scorer(recall_0), "precision_0": make_scorer(precision_0),
           "recall_1": make_scorer(recall_score), "precision_1": make_scorer(precision_score), "auc": "roc_auc" }

#cross_validation

def print_score_comparison(raw_score, emb_score, target_feature="posOutcome",
                           header_1="Raw Score", header_2="Embedding Score"):
    print("\t\t{0}\n\t\t\t{1}\t\t{2}".format(target_feature, header_1, header_2))
    print("\t\t-----------------------------------------------")
    print("balanced_accuracy:\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["balanced_accuracy"].mean(), emb_score["balanced_accuracy"].mean()))
    print("precision_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_0"].mean(), emb_score["precision_0"].mean()))
    print("recall_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_0"].mean(), emb_score["recall_0"].mean()))
    print("precision_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_1"].mean(), emb_score["precision_1"].mean()))
    print("recall_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_1"].mean(), emb_score["recall_1"].mean()))
    print("auc:\t\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["auc"].mean(), emb_score["auc"].mean()))

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time

    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

def param_tuning(X, y, n_folds=5, param_comb=25, scoring='roc_auc', jobs=12):
    xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    rand_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring=scoring, n_jobs=jobs,
                                   cv=skf.split(X, y), verbose=3, random_state=42)

    start_time = timer(None) # timing starts from this point for "start_time" variable
    rand_search.fit(X, y)
    timer(start_time)
    print("Best Score: {:.3%}".format(rand_search.best_score_))
    print(rand_search.best_params_)
    return rand_search

In [96]:
rand_params_500 = param_tuning(X_train_500, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 5 minutes and 22.97 seconds.
Best Score: 91.218%
{'subsample': 0.6, 'n_estimators': 700, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 0.5, 'colsample_bytree': 0.8}


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:   21.9s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:  4.9min finished


In [99]:
params_500_rand = {'subsample': 0.6,
 'n_estimators': 700,
 'min_child_weight': 5,
 'max_depth': 5,
 'learning_rate': 0.03,
 'gamma': 0.5,
 'colsample_bytree': 0.8}

clf_500_rand = XGBClassifier(**params_500_rand)

s_v = StratifiedKFold(n_splits=5, shuffle=True, random_state=rand_seed)
cv_results = cross_validate(clf_500_rand, X_train_500, y_train, scoring=scoring,
                            verbose=2, n_jobs=-1)
cv_results

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.6min finished


{'fit_time': array([97.65030217, 96.3224237 , 97.5036397 , 96.75780582, 97.38839674]),
 'score_time': array([0.04465437, 0.10535979, 0.06776261, 0.07968521, 0.0639739 ]),
 'test_balanced_accuracy': array([0.81683047, 0.84176904, 0.82264537, 0.81891892, 0.84316134]),
 'test_recall_0': array([0.83636364, 0.87272727, 0.79393939, 0.8       , 0.84848485]),
 'test_precision_0': array([0.82142857, 0.8372093 , 0.85620915, 0.84615385, 0.85365854]),
 'test_recall_1': array([0.7972973 , 0.81081081, 0.85135135, 0.83783784, 0.83783784]),
 'test_precision_1': array([0.8137931 , 0.85106383, 0.7875    , 0.78980892, 0.83221477]),
 'test_auc': array([0.91502867, 0.92674038, 0.91216216, 0.90266175, 0.91511057])}

In [101]:
score_cols = ["test_balanced_accuracy","test_precision_0", "test_recall_0",
               "test_precision_1","test_recall_1", "test_auc"]

def get_scores(cv_results, score_keys=score_cols):
    scores = np.empty([1, len(score_keys)])
    for i, s in enumerate(score_keys):
        scores[0][i] = np.mean(cv_results[s])

    return scores

In [102]:
scores_rand_500 = get_scores(cv_results)
scores_rand_500

array([[0.82866503, 0.84293188, 0.83030303, 0.81487612, 0.82702703,
        0.9143407 ]])

In [104]:
clf_500_rand.fit(X_train_500, y_train)
test_scores_rand_500 = calc_scores(clf_500_rand, X_test_500, y_test)
test_scores_rand_500_df = pd.DataFrame(data=test_scores_rand_500, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])



array([[0.76105959, 0.77247191, 0.77683616, 0.75      , 0.74528302,
        0.85118857]])

In [108]:
from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

pam35_pipe = make_pipeline(ColumnSelector(cols=pam35_genes),
                           clf_pam)

clf_500_pipe = make_pipeline(ColumnSelector(cols=ft_500),
                             clf_500_rand)

sclf_1 = StackingCVClassifier(classifiers=[clf_500_pipe, pam35_pipe], meta_classifier=LogisticRegression(),
                              cv=s_v, use_clones=False, verbose=True,
                              n_jobs=14,
                              random_state=rand_seed)

# sclf_1.fit(X_train, y_train)

In [109]:
from sklearn.model_selection import cross_val_score
scores_sclf_1 = cross_validate(sclf_1, X_train, y_train, n_jobs=-1,
                             cv=s_v, scoring=scoring,
                             verbose=2)
scores_sclf_1 = get_scores(scores_sclf_1)
scores_sclf_1

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min finished


array([[0.82824734, 0.83824082, 0.83757576, 0.82005687, 0.81891892,
        0.84077805]])

In [110]:
moses50_genes = ["PRND", "FRS3", "FCN3", "DSCR4", "BRCA2", "CXCL6", "LMX1B", "DLX5", "OMP", "ADH6", "PGAP1", "ART3", "BCHE", "FGB", "IL1RAPL1", "FSTL4", "ASGR1", "ZNF135", "DLL3", "NPHS2", "ANGPT2", "GLP2R", "GRIA3", "HOXB8", "MSC", "PLA2R1", "CYP2F1", "TAS2R7", "NKX6-1", "WNT11", "CHST11", "CLCA4", "ENPEP", "PAH", "WFDC1", "CHGA", "SEZ6L", "UGT2A3", "PRDM16", "GALR2", "GUCA1A", "CASQ1", "NOS1AP", "CACNA2D3", "FHOD3", "SRGAP3", "TMOD2", "ATOH1", "SLC6A1", "HAS1"]
clf_moses50 = XGBClassifier()
clf_moses50.load_model("datasets/models/moses50_raw.json")
moses50_pipe = make_pipeline(ColumnSelector(cols=moses50_genes),
                           clf_moses50)

sclf_2 = StackingCVClassifier(classifiers=[clf_500_pipe, pam35_pipe, moses50_pipe],
                  meta_classifier=LogisticRegression(),
                  cv=s_v, use_clones=False, verbose=True,
                  n_jobs=14, use_probas=True,
                  random_state=rand_seed)
scores_sclf_2 = cross_validate(sclf_2, X_train, y_train, n_jobs=-1,
                             cv=s_v, scoring=scoring,
                             verbose=2)
scores_sclf_2 = get_scores(scores_sclf_2)
scores_sclf_2

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.6min finished


array([[0.83195332, 0.84001719, 0.84363636, 0.82591047, 0.82027027,
        0.91352989]])

In [115]:
sclf_1.fit(X_train, y_train)
test_scors_sclf_1 = calc_scores(sclf_1, X_test, y_test)
test_scors_sclf_1_df = pd.DataFrame(data=test_scors_sclf_1, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
test_scors_sclf_1_df.mean()

Fitting 2 classifiers...
Fitting classifier1: pipeline (1/2)
Fitting classifier2: pipeline (2/2)


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   2 out of   5 | elapsed:  2.0min remaining:  3.0min
[Parallel(n_jobs=14)]: Done   5 out of   5 | elapsed:  2.0min finished
[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   2 out of   5 | elapsed:  1.4min remaining:  2.1min
[Parallel(n_jobs=14)]: Done   5 out of   5 | elapsed:  1.5min finished


balanced_accuracy    0.761060
recall_0             0.772472
precision_0          0.776836
recall_1             0.750000
precision_1          0.745283
auc                  0.794443
dtype: float64

In [113]:
# sclf_2.fit(X_train, y_train)
test_scors_sclf_2 = calc_scores(sclf_2, X_test, y_test)
test_scors_sclf_2_df = pd.DataFrame(data=test_scors_sclf_2, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
test_scors_sclf_2_df.mean()

balanced_accuracy    0.762632
recall_0             0.774648
precision_0          0.776836
recall_1             0.750789
precision_1          0.748428
auc                  0.848977
dtype: float64

In [126]:
mrmr_df = train_df[train_df.columns.difference(["patient_ID"])]
pos_outcome = train_df["posOutcome"]
mrmr_df.drop(labels=["posOutcome"], inplace=True, axis=1)
mrmr_df.insert(0, "posOutcome", pos_outcome)
mrmr_df.to_csv("datasets/train_mrmr.csv", index=False)