In [2]:
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, recall_score, balanced_accuracy_score, precision_score, make_scorer
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, train_test_split
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [28]:
params = {'n_estimators': [300, 400, 500, 600, 700],
              'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.07],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'max_depth': [3, 4, 5, 6],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 2, 3, 4, 5]}

seed = 42
st_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

def calc_scores(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    recall_0, recall_1 = recall_score(y_test, y_pred, pos_label=0), recall_score(y_test, y_pred, pos_label=1)
    precision_0, precision_1 =  precision_score(y_test, y_pred, pos_label=0), precision_score(y_test, y_pred, pos_label=1)
    acc = balanced_accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    arr = np.array([[acc, precision_0, recall_0, precision_1, recall_1,auc_score]])
    return pd.DataFrame(data=arr, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])

def recall_0(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label=0)

def precision_0(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label=0)

scoring = {"balanced_accuracy": make_scorer(balanced_accuracy_score),
           "recall_0": make_scorer(recall_0), "precision_0": make_scorer(precision_0),
           "recall_1": make_scorer(recall_score), "precision_1": make_scorer(precision_score), "auc": "roc_auc" }

#cross_validation

def print_score_comparison(raw_score, emb_score, target_feature="posOutcome",
                           header_1="Raw Score", header_2="Embedding Score"):
    print("\t\t{0}\n\t\t\t{1}\t\t{2}".format(target_feature, header_1, header_2))
    print("\t\t-----------------------------------------------")
    print("balanced_accuracy:\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["balanced_accuracy"].mean(), emb_score["balanced_accuracy"].mean()))
    print("precision_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_0"].mean(), emb_score["precision_0"].mean()))
    print("recall_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_0"].mean(), emb_score["recall_0"].mean()))
    print("precision_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_1"].mean(), emb_score["precision_1"].mean()))
    print("recall_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_1"].mean(), emb_score["recall_1"].mean()))
    print("auc:\t\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["auc"].mean(), emb_score["auc"].mean()))

def print_score_comparison_cv(raw_score, emb_score, target_feature="posOutcome",header_1="Raw Score", header_2="Embedding Score"):
    print("\t\t{0}\n\t\t\t{1}\t\t{2}".format(target_feature, header_1, header_2))
    print("\t\t-----------------------------------------------")
    print("balanced_accuracy:\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["test_balanced_accuracy"].mean(), emb_score["test_balanced_accuracy"].mean()))
    print("precision_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["test_precision_0"].mean(), emb_score["test_precision_0"].mean()))
    print("recall_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["test_recall_0"].mean(), emb_score["test_recall_0"].mean()))
    print("precision_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["test_precision_1"].mean(), emb_score["test_precision_1"].mean()))
    print("recall_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["test_recall_1"].mean(), emb_score["test_recall_1"].mean()))
    print("auc:\t\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["test_auc"].mean(), emb_score["test_auc"].mean()))

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time

    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

def param_tuning(X, y, n_folds=5, param_comb=25, scoring='roc_auc', jobs=12):
    xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    rand_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring=scoring, n_jobs=jobs,
                                   cv=skf.split(X, y), verbose=3, random_state=42)

    start_time = timer(None) # timing starts from this point for "start_time" variable
    rand_search.fit(X, y)
    timer(start_time)
    print("Best Score: {:.3%}".format(rand_search.best_score_))
    print(rand_search.best_params_)
    return rand_search

score_cols = ["test_balanced_accuracy","test_precision_0", "test_recall_0",
               "test_precision_1","test_recall_1", "test_auc"]

def get_scores(cv_results, score_keys=None, df_cols=None):
    if score_keys is None:
        score_keys = score_cols
    if df_cols is None:
        df_cols = score_cols
    scores = np.empty([1, len(score_keys)])
    for i, s in enumerate(score_keys):
        scores[0][i] = np.mean(cv_results[s])
    scores_df = pd.DataFrame(data=scores, columns=df_cols)
    return scores_df

def evaluate_embedding(path, outcome_df, target="posOutcome", merge_col="patient_ID", params=None
                       ,n_jobs=-1):
    emb_df = pd.read_csv(path, sep="\t")
    emb_outcome_df = pd.merge(outcome_df, emb_df, on=merge_col)
    X_emb, y_emb = emb_outcome_df[emb_outcome_df.columns.difference([merge_col, target])], emb_outcome_df[target]
    X_train_emb, X_test_emb, y_train_emb, y_test_emb = train_test_split(X_emb, y_emb, test_size=0.3, random_state=seed)
    if params is None:
        rand_search_emb = param_tuning(X_train_emb, y_train_emb, jobs=n_jobs)
        params = rand_search_emb.best_params_
        clf_emb = rand_search_emb.best_estimator_
    else:
        clf_emb = XGBClassifier(**params)
    cv_res = cross_validate(clf_emb, X_train_emb, y_train_emb, scoring=scoring, n_jobs=n_jobs, verbose=1, return_train_score=True,
                            cv=st_cv)
    cv_res_df = get_scores(cv_res)
    clf_emb.fit(X_train_emb, y_train_emb)
    test_scores_df = calc_scores(clf_emb, X_test_emb, y_test_emb)

    return params, clf_emb, cv_res_df, test_scores_df

In [4]:
pos_outcome_df = pd.read_csv("datasets/combat15outcomes_latest.csv")
pos_outcome_df = pos_outcome_df[["patient_ID", "posOutcome"]].dropna(axis=0, subset=["posOutcome"])
moses500_emb_df = pd.read_csv("datasets/embedding-vectors/genexpr_clinicaldata/property_vector_moses500_wopln_2021-01-28.csv", sep="\t")
moses500_emb_df = moses500_emb_df.loc[:,~moses500_emb_df.columns.str.contains("^Unnamed")]
moses500_emb_outcome_df = pd.merge(pos_outcome_df, moses500_emb_df, on="patient_ID")

X_moses500_emb, y_500_outcome = moses500_emb_outcome_df[moses500_emb_outcome_df.columns.difference(["patient_ID",
                                                                                                    "posOutcome"])], \
                                moses500_emb_outcome_df["posOutcome"]

X_train_moses500, X_test_moses500, y_train_moses500, y_test_moses500 = train_test_split(X_moses500_emb, y_500_outcome, test_size=0.3, random_state=seed)

X_train_moses500.head()

Unnamed: 0,0,1,10,100,1000,1001,1002,1003,1004,1005,...,990,991,992,993,994,995,996,997,998,999
1803,-0.141096,0.025587,-0.052658,-0.003273,0.023713,-0.000369,-0.017225,0.010393,0.010978,-0.002139,...,-0.013333,0.001245,-0.007404,0.005843,-0.000109,-0.006222,-0.027249,0.004551,0.006404,-0.022996
2021,0.521562,-0.068304,-0.062999,0.01445,0.019302,0.030928,-0.028432,0.016746,0.011054,0.00332,...,-0.004796,-9.4e-05,0.021299,-0.000849,0.021312,0.011025,-0.002,-0.000835,0.003087,0.009384
1949,0.444903,-0.047255,-0.009275,0.002878,-0.011309,-0.003531,-0.015948,-0.013611,-0.012096,0.001568,...,-0.002538,0.024432,-0.000844,0.005191,-0.006366,0.01731,0.001961,0.006773,0.006698,0.0162
1591,-0.144351,-0.115069,0.142886,-0.050261,-0.003571,0.009988,-0.003947,0.004796,0.001001,0.020687,...,-0.002948,0.005899,-0.00029,0.008037,0.011734,-0.00761,0.007086,-0.003716,-0.005912,0.001909
1746,-0.190454,0.018854,0.00717,-0.040255,-0.030323,-0.001451,0.00609,0.008806,0.001865,0.007181,...,0.003491,0.025046,0.012089,-0.030416,-0.001304,-0.001814,0.012315,0.010432,0.003401,0.00957


In [12]:
rand_search_moses500 = param_tuning(X_train_moses500, y_500_outcome)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 21 minutes and 46.7 seconds.
Best Score: 82.559%
{'subsample': 1.0, 'n_estimators': 400, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  1.6min
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed: 20.5min finished


In [5]:
params_moses_500 = {'subsample': 1.0,
 'n_estimators': 400,
 'min_child_weight': 3,
 'max_depth': 5,
 'learning_rate': 0.01,
 'gamma': 1,
 'colsample_bytree': 0.6}

clf_moses500 = XGBClassifier(**params_moses_500, n_jobs=4)

In [6]:
cv_res_moses500 = cross_validate(clf_moses500, X_train_moses500, y_train_moses500, scoring=scoring,
                                 n_jobs=-1, verbose=1, return_train_score=True, cv=st_cv)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   52.3s finished


In [8]:
cv_scores_moses500_df = get_scores(cv_res_moses500, score_cols)
cv_scores_moses500_df.mean()

0    0.748339
1    0.769034
2    0.763307
3    0.730289
4    0.733372
5    0.819359
dtype: float64

In [40]:
clf_moses500.fit(X_train_moses500, y_train_moses500)
test_scores_moses500 = calc_scores(clf_moses500,X_test_moses500, y_test_moses500)

test_scores_moses500_df = pd.DataFrame(data=test_scores_moses500, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
test_scores_moses500_df.mean()



balanced_accuracy    0.755837
recall_0             0.748571
precision_0          0.775148
recall_1             0.763975
precision_1          0.736527
auc                  0.826276
dtype: float64

In [9]:
# rand_search_mrmr50 = param_tuning(X_train_mrmr50, y_train_mrmr50, jobs=16)

Unnamed: 0,0,1,10,100,1000,1001,1002,1003,1004,1005,...,990,991,992,993,994,995,996,997,998,999
1803,-0.074989,-0.034986,0.016726,-0.029518,0.008582,0.005779,-0.000579,-0.002969,0.001462,0.015135,...,0.007104,0.002937,0.002284,0.003757,-0.003861,-0.002634,0.004086,-0.001413,0.000586,-0.002029
2021,-0.461253,0.310014,-0.150956,-0.041953,-0.006985,0.007958,0.004016,0.003632,0.005419,-0.001957,...,-0.001212,-0.012179,0.00141,-0.006575,-0.007483,-0.004757,0.004736,-0.005549,0.008913,-0.00159
1949,-0.229845,-0.02211,0.042118,0.005673,-0.007579,0.002075,-0.005237,0.006646,0.007098,0.001961,...,-0.007089,-0.005877,0.002784,0.000884,0.007778,0.012091,0.009944,0.016688,0.002074,0.011915
1591,-0.305333,-0.097317,-0.05269,0.012518,-0.008363,-0.003001,0.008997,-0.002156,0.002506,0.000815,...,-0.004456,0.001293,-0.00383,0.000903,0.011116,-0.007334,-0.004435,0.005998,-0.006971,-0.006417
1746,0.029845,-0.185375,0.141574,0.055656,-0.003347,-0.017563,0.016647,0.014553,0.010957,-0.000302,...,-0.004145,9e-05,-0.004986,-0.010449,0.014984,-0.010606,0.002446,-0.002113,0.035375,-0.005707


In [13]:
mrmr50_params = {'subsample': 0.8,
 'n_estimators': 400,
 'min_child_weight': 3,
 'max_depth': 5,
 'learning_rate': 0.03,
 'gamma': 1.5,
 'colsample_bytree': 0.8}

mrmr50_params, clf_mrmr50, cv_scores_mrmr50_df, test_scores_mrmr50_df = evaluate_embedding("datasets/embedding-vectors/genexpr_only/property_vector_mrmr_ft50_2021-01-29.csv", pos_outcome_df, params=mrmr50_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.2min finished




In [14]:
cv_scores_mrmr50_df.mean()

0    0.745660
1    0.749200
2    0.804931
3    0.753668
4    0.686389
5    0.797917
dtype: float64

In [15]:
test_scores_mrmr50_df.mean()

balanced_accuracy    0.718465
recall_0             0.701897
precision_0          0.766272
recall_1             0.739274
precision_1          0.670659
auc                  0.792226
dtype: float64

In [32]:
moses83_params, clf_moses83, cv_moses83, test_scores_moses83 = evaluate_embedding("datasets/embedding-vectors/genexpr_only/property_vector_moses_ft83_2021-01-30.csv", pos_outcome_df)
print(moses83_params)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 26 minutes and 41.99 seconds.
Best Score: 78.738%
{'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 2, 'colsample_bytree': 0.6}
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


{'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 2, 'colsample_bytree': 0.6}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed: 25.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   52.8s finished


In [56]:
cv_moses83.mean()

test_balanced_accuracy    0.718882
test_precision_0          0.724353
test_recall_0             0.785926
test_precision_1          0.724941
test_recall_1             0.651839
test_auc                  0.787383
dtype: float64

In [57]:
test_scores_moses83.mean()

balanced_accuracy    0.716880
recall_0             0.694737
precision_0          0.781065
recall_1             0.746575
precision_1          0.652695
auc                  0.766609
dtype: float64

In [58]:
#Test score comparison
print_score_comparison(test_scores_mrmr50_df, test_scores_moses83,
                       header_1="MRMR 50", header_2="MOSES 83")

		posOutcome
			MRMR 50		MOSES 83
		-----------------------------------------------
balanced_accuracy:	71.847%			71.688%

precision_0:		76.627%			78.107%

recall_0:		70.190%			69.474%

precision_1:		67.066%			65.269%

recall_1:		73.927%			74.658%

auc:			79.223%			76.661%



In [None]:
mrmr50_cl_params, cv_mrmr50_cl, test_scores_mrmr50_cl_df = evaluate_embedding("datasets/embedding-vectors/genexpr_clinicaldata/property_vector_mrmr_ft50_wopln_2021-01-29.csv", pos_outcome_df)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


In [60]:
cv_mrmr50_cl.mean()

test_balanced_accuracy    0.726911
test_precision_0          0.730322
test_recall_0             0.797816
test_precision_1          0.738372
test_recall_1             0.656006
test_auc                  0.798681
dtype: float64

In [61]:
test_scores_mrmr50_cl_df.mean()

balanced_accuracy    0.736128
recall_0             0.705882
precision_0          0.816568
recall_1             0.779359
precision_1          0.655689
auc                  0.813831
dtype: float64

In [62]:
print_score_comparison(test_scores_mrmr50_df, test_scores_mrmr50_cl_df,
                       header_1="MRMR 50 Emb w/o CL",
                       header_2="MRMR 50 Emb w CL")

		posOutcome
			MRMR 50 Emb w/o CL		MRMR 50 Emb w CL
		-----------------------------------------------
balanced_accuracy:	71.847%			73.613%

precision_0:		76.627%			81.657%

recall_0:		70.190%			70.588%

precision_1:		67.066%			65.569%

recall_1:		73.927%			77.936%

auc:			79.223%			81.383%



In [None]:

moses83_cl_params, cv_moses83_cl, test_scores_moses83_cl_df = evaluate_embedding("datasets/embedding-vectors/genexpr_clinicaldata/property_vector_moses_ft83-wopln_2021-01-30.csv", pos_outcome_df)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


In [64]:
cv_moses83_cl.mean()

test_balanced_accuracy    0.732873
test_precision_0          0.732185
test_recall_0             0.809721
test_precision_1          0.749274
test_recall_1             0.656025
test_auc                  0.809083
dtype: float64

In [65]:
test_scores_moses83_cl_df.mean()

balanced_accuracy    0.733152
recall_0             0.703325
precision_0          0.813609
recall_1             0.775801
precision_1          0.652695
auc                  0.815124
dtype: float64

In [66]:
print_score_comparison(test_scores_moses83, test_scores_moses83_cl_df,
                       header_1="MOSES 83 w/o CL",
                       header_2="MOSES 83 w CL")

		posOutcome
			MOSES 83 w/o CL		MOSES 83 w CL
		-----------------------------------------------
balanced_accuracy:	71.688%			73.315%

precision_0:		78.107%			81.361%

recall_0:		69.474%			70.332%

precision_1:		65.269%			65.269%

recall_1:		74.658%			77.580%

auc:			76.661%			81.512%



In [69]:
mrmr100_params, cv_mrmr100, test_scores_mrmr100_df = evaluate_embedding("datasets/embedding-vectors/genexpr_only/property_vector_mrmr_ft100_2021-01-29.csv", pos_outcome_df)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 25 minutes and 33.28 seconds.
Best Score: 79.597%
{'subsample': 1.0, 'n_estimators': 400, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed: 23.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished


In [70]:
cv_mrmr100.mean()

test_balanced_accuracy    0.721821
test_precision_0          0.736879
test_recall_0             0.758594
test_precision_1          0.710231
test_recall_1             0.685048
test_auc                  0.795970
dtype: float64

In [71]:
test_scores_mrmr100_df.mean()

balanced_accuracy    0.736376
recall_0             0.721763
precision_0          0.775148
recall_1             0.754045
precision_1          0.697605
auc                  0.799344
dtype: float64

In [None]:
mrmr100_cl_params, cv_mrmr100_cl, test_scores_mrmr100_cl_df = evaluate_embedding("datasets/embedding-vectors/genexpr_clinicaldata/property_vector_mrmr_ft100_wopln_2021-01-29.csv", pos_outcome_df)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


In [73]:
cv_mrmr100_cl.mean()

test_balanced_accuracy    0.714536
test_precision_0          0.721492
test_recall_0             0.779973
test_precision_1          0.719288
test_recall_1             0.649100
test_auc                  0.799172
dtype: float64

In [74]:
test_scores_mrmr100_cl_df.mean()

balanced_accuracy    0.725826
recall_0             0.703704
precision_0          0.786982
recall_1             0.755102
precision_1          0.664671
auc                  0.806984
dtype: float64

In [75]:
print_score_comparison(test_scores_mrmr100_df, test_scores_mrmr100_cl_df,
                       header_1="MRMR 100 Emb w/o CL",
                       header_2="MRMR 100 Emb w CL")

		posOutcome
			MRMR 100 Emb w/o CL		MRMR 100 Emb w CL
		-----------------------------------------------
balanced_accuracy:	73.638%			72.583%

precision_0:		77.515%			78.698%

recall_0:		72.176%			70.370%

precision_1:		69.760%			66.467%

recall_1:		75.405%			75.510%

auc:			79.934%			80.698%



In [17]:
params_mrmr50_nn, clf_mrmr50_nn, cv_scores_mrmr50_nn_df, test_scores_mrmr50_nn_df = evaluate_embedding("datasets/embedding-vectors/genexpr_only/property_vector_mrmr_ft50_notnormalized_2021-02-04.csv", pos_outcome_df, n_jobs=-1)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 26 minutes and 35.45 seconds.
Best Score: 79.358%
{'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 2, 'colsample_bytree': 0.6}
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed: 25.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   52.7s finished


In [18]:
cv_scores_mrmr50_nn_df.mean()

0    0.748330
1    0.754550
2    0.797809
3    0.750667
4    0.698851
5    0.793583
dtype: float64

In [19]:
test_scores_mrmr50_nn_df.mean()

balanced_accuracy    0.728803
recall_0             0.706349
precision_0          0.789941
recall_1             0.758503
precision_1          0.667665
auc                  0.779958
dtype: float64

In [31]:
print_score_comparison_cv(cv_scores_mrmr50_df, cv_scores_mrmr50_nn_df, header_1="MRMR 50 Norm", header_2="MRMR 50 Not Norm")

		posOutcome
			MRMR 50 Norm		MRMR 50 Not Norm
		-----------------------------------------------
balanced_accuracy:	74.566%			74.833%

precision_0:		74.920%			75.455%

recall_0:		80.493%			79.781%

precision_1:		75.367%			75.067%

recall_1:		68.639%			69.885%

auc:			79.792%			79.358%



In [22]:
print_score_comparison(test_scores_mrmr50_df, test_scores_mrmr50_nn_df, header_1="MRMR 50 Norm", header_2="MRMR 50 Not Norm")

		posOutcome
			MRMR 50 Norm		MRMR 50 Not Norm
		-----------------------------------------------
balanced_accuracy:	71.847%			72.880%

precision_0:		76.627%			78.994%

recall_0:		70.190%			70.635%

precision_1:		67.066%			66.766%

recall_1:		73.927%			75.850%

auc:			79.223%			77.996%



In [34]:
test_scores_moses83.mean()

balanced_accuracy    0.716880
recall_0             0.694737
precision_0          0.781065
recall_1             0.746575
precision_1          0.652695
auc                  0.766609
dtype: float64