In [42]:
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, recall_score, balanced_accuracy_score, precision_score, make_scorer
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [43]:
params = {'n_estimators': [300, 400, 500, 600, 700],
              'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.07],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'max_depth': [3, 4, 5, 6],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 2, 3, 4, 5]}

seed = 42
st_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

def calc_scores(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    recall_0, recall_1 = recall_score(y_test, y_pred, pos_label=0), recall_score(y_test, y_pred, pos_label=1)
    precision_0, precision_1 =  precision_score(y_test, y_pred, pos_label=0), precision_score(y_test, y_pred, pos_label=1)
    acc = balanced_accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    arr = np.array([[acc, precision_0, recall_0, precision_1, recall_1,auc_score]])
    return pd.DataFrame(data=arr, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])

def recall_0(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label=0)

def precision_0(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label=0)

scoring = {"balanced_accuracy": make_scorer(balanced_accuracy_score),
           "recall_0": make_scorer(recall_0), "precision_0": make_scorer(precision_0),
           "recall_1": make_scorer(recall_score), "precision_1": make_scorer(precision_score), "auc": "roc_auc" }

#cross_validation

def print_score_comparison(raw_score, emb_score, target_feature="posOutcome",
                           header_1="Raw Score", header_2="Embedding Score"):
    print("\t\t{0}\n\t\t\t{1}\t\t{2}".format(target_feature, header_1, header_2))
    print("\t\t-----------------------------------------------")
    print("balanced_accuracy:\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["balanced_accuracy"].mean(), emb_score["balanced_accuracy"].mean()))
    print("precision_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_0"].mean(), emb_score["precision_0"].mean()))
    print("recall_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_0"].mean(), emb_score["recall_0"].mean()))
    print("precision_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_1"].mean(), emb_score["precision_1"].mean()))
    print("recall_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_1"].mean(), emb_score["recall_1"].mean()))
    print("auc:\t\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["auc"].mean(), emb_score["auc"].mean()))

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time

    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

def param_tuning(X, y, n_folds=5, param_comb=25, scoring='roc_auc', jobs=12):
    xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    rand_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring=scoring, n_jobs=jobs,
                                   cv=skf.split(X, y), verbose=3, random_state=42)

    start_time = timer(None) # timing starts from this point for "start_time" variable
    rand_search.fit(X, y)
    timer(start_time)
    print("Best Score: {:.3%}".format(rand_search.best_score_))
    print(rand_search.best_params_)
    return rand_search

score_cols = ["test_balanced_accuracy","test_precision_0", "test_recall_0",
               "test_precision_1","test_recall_1", "test_auc"]

def get_scores(cv_results, score_keys=score_cols, df_cols=score_cols):
    scores = np.empty([1, len(score_keys)])
    for i, s in enumerate(score_keys):
        scores[0][i] = np.mean(cv_results[s])
    scores_df = pd.DataFrame(data=scores, columns=df_cols)
    return scores_df


def evaluate_ge(x_train, y_train, x_test, y_test, feats=None, jobs=-1,
                scoring=scoring, rand_scoring="roc_auc", target="posOutcome"):
    if feats is not None:
        x_train = x_train[feats]
        x_test = x_test[feats]
    rand_search = param_tuning(x_train, y_train, scoring=rand_scoring)
    params = rand_search.best_params_
    clf = XGBClassifier(**params)
    cv_res = cross_validate(clf, x_train, y_train,scoring=scoring, cv=st_cv, n_jobs=-1)

    cv_res_df = get_scores(cv_res, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
    clf.fit(x_train, y_train)
    test_scores_df = calc_scores(clf, x_test, y_test)

    return params, cv_res_df, test_scores_df

In [44]:
train_df = pd.read_csv("datasets/train.csv")
X_train, y_train = train_df[train_df.columns.difference(["patient_ID", "posOutcome"])], train_df["posOutcome"]
X_train.head()

Unnamed: 0,A4GALT,AAAS,AACS,AADAC,AAK1,AAMP,AANAT,AARS,AARSD1,AASDHPPT,...,ZNHIT2,ZP2,ZPBP,ZSCAN2,ZW10,ZWINT,ZXDC,ZYX,ZZEF1,ZZZ3
0,3.490594,4.705177,7.388903,3.146066,5.324219,7.010299,3.20422,7.62326,4.908548,7.920498,...,3.616936,3.177763,3.120909,3.626377,5.573573,7.840314,5.720305,7.49144,7.049239,6.979166
1,3.493298,6.025729,6.501462,3.015961,4.639765,7.399345,3.801613,8.326222,5.075999,6.63509,...,4.002873,3.182145,3.414617,3.933382,3.717363,9.053191,6.370379,7.888914,5.422555,5.951768
2,3.426142,5.449551,5.632613,3.685224,5.643874,6.737401,3.596668,7.431818,5.591313,6.596328,...,2.695141,3.324802,3.251439,2.909459,4.385828,6.415808,5.480143,7.64496,6.797248,6.80828
3,3.426381,5.595401,6.882855,3.240755,6.07566,6.943799,3.20297,7.477471,4.90407,6.518033,...,3.3847,3.144302,3.158701,3.521218,3.968905,6.774039,6.299851,7.620011,5.797529,5.871506
4,3.479792,5.565861,4.662279,3.176784,6.033194,7.274996,3.204731,7.105333,6.663767,6.667291,...,3.414956,3.139913,3.185299,3.572568,3.874406,6.490379,6.589065,6.327172,6.770991,6.890959


In [45]:
fts_50_df = pd.read_csv("datasets/mrmr_top50.tsv", sep="\t")
fts_50_df.columns = ["Order", "Feat_Index", "Name", "Score"]
fts_50_df["Name"] = fts_50_df["Name"].str.strip()
feats_50 = fts_50_df["Name"].to_list()

In [6]:
X_train_50 = X_train[feats_50]
X_train_50.head()

Unnamed: 0,VNN2,ALAS2,E2F8,KIF21B,PPBP,SLITRK3,AQP9,DCT,IFNA8,EEF1A2,...,TGM3,PLS1,GABRB2,SLCO1A2,CHRNB3,BTC,SEMG1,P2RX7,P2RX3,LPO
0,3.189026,2.844616,4.046337,3.891334,2.993143,3.159996,3.711596,2.496415,3.465665,8.246389,...,3.799822,2.87701,3.268373,3.291948,3.290748,4.300402,3.263049,3.2035,3.165366,3.495641
1,4.874852,2.845986,5.026899,2.517917,5.31281,3.382119,4.410487,2.003576,4.518373,6.361501,...,4.32512,5.43193,3.444966,2.184971,3.482821,3.64882,2.790352,2.290437,2.604582,3.845958
2,6.907824,4.942284,3.5125,3.724512,5.608529,2.916534,6.535881,2.392424,3.45674,4.133781,...,3.999087,2.300098,3.597382,3.212796,3.276235,2.3823,2.845405,4.577597,4.208842,3.275248
3,3.95782,4.426584,3.69572,3.31921,4.085435,3.150267,4.255222,2.535919,3.466438,4.651417,...,3.595345,3.578949,3.315006,3.357198,3.283405,3.52531,3.230333,3.168525,3.182496,3.524188
4,4.846836,3.986994,3.560645,3.260197,4.244365,3.164787,4.170049,2.559143,3.468037,5.927124,...,3.700837,4.838044,3.540718,3.655555,3.332204,3.494355,3.215563,3.825358,3.197231,3.534038


In [7]:
rand_search_50 = param_tuning(X_train_50, y_train, jobs=14, scoring="balanced_accuracy")

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 26.44 seconds.
Best Score: 78.050%
{'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.03, 'gamma': 1.5, 'colsample_bytree': 0.8}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    2.7s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   23.7s finished


In [48]:
params_50 = {'subsample': 0.8,
 'n_estimators': 600,
 'min_child_weight': 3,
 'max_depth': 6,
 'learning_rate': 0.01,
 'gamma': 5,
 'colsample_bytree': 0.6}

params_50_acc = {'subsample': 1.0,
 'n_estimators': 500,
 'min_child_weight': 3,
 'max_depth': 6,
 'learning_rate': 0.03,
 'gamma': 1.5,
 'colsample_bytree': 0.8}

clf_50 = XGBClassifier(**params_50, n_jobs=4)

In [9]:
clf_50 = XGBClassifier(**params_50, n_jobs=4)
cv_results_50 = cross_validate(clf_50, X_train_50, y_train,
                               scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_50

{'fit_time': array([1.0860393 , 1.18201399, 1.78926873, 1.39332485, 1.69145489]),
 'score_time': array([0.03048277, 0.02456141, 0.01260757, 0.01266479, 0.01258349]),
 'test_balanced_accuracy': array([0.74494267, 0.77907453, 0.76124079, 0.7963145 , 0.78513514]),
 'test_recall_0': array([0.79393939, 0.78787879, 0.74545455, 0.78181818, 0.8       ]),
 'test_precision_0': array([0.74431818, 0.79268293, 0.78846154, 0.82165605, 0.79518072]),
 'test_recall_1': array([0.69594595, 0.77027027, 0.77702703, 0.81081081, 0.77027027]),
 'test_precision_1': array([0.75182482, 0.76510067, 0.73248408, 0.76923077, 0.7755102 ]),
 'test_auc': array([0.83099918, 0.82395577, 0.8496724 , 0.8495086 , 0.83988534])}

In [10]:
scores_50_df = get_scores(cv_results_50, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_50_df.mean()

balanced_accuracy    0.773342
recall_0             0.788460
precision_0          0.781818
recall_1             0.758830
precision_1          0.764865
auc                  0.838804
dtype: float64

In [49]:
test_df = pd.read_csv("datasets/test.csv")
X_test, y_test = test_df[test_df.columns.difference(["patient_ID", "posOutcome"])], test_df["posOutcome"]

In [50]:
clf_50.fit(X_train_50, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=600, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [52]:
X_test_50 = X_test[feats_50]

test_scores_50 = calc_scores(clf_50, X_test_50, y_test)
scores_test_50_df = pd.DataFrame(data=test_scores_50, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_test_50_df.mean() * 100

balanced_accuracy    74.944036
recall_0             76.420455
precision_0          75.988701
recall_1             73.437500
precision_1          73.899371
auc                  80.808727
dtype: float64

In [59]:
fts_100_df = pd.read_csv("datasets/mrmr_top100.tsv", sep="\t")
fts_100_df.columns = ["Order", "Feat_Index", "Name", "Score"]
fts_100_df["Name"] = fts_100_df["Name"].str.strip()
feats_100 = fts_100_df["Name"].to_list()

with open("datasets/mrmr_ft100.txt", "w") as fp:
    for i in feats_100:
        fp.write("%s\n" % i)

In [15]:
X_train_100 = X_train[feats_100]
X_train_100.head()

Unnamed: 0,VNN2,SLC9A7,ADD2,FNDC3B,EEF1A2,E2F8,FUT3,PLS1,KIF1A,ZFPM2,...,CASQ1,ATP6V1D,CHRNB3,TGM3,RAD54L,TRIM17,HOXB1,CCR4,SGCG,SLC16A10
0,3.189026,3.442448,3.283981,6.704807,8.246389,4.046337,4.469728,2.87701,6.434729,4.089961,...,3.296052,6.800505,3.290748,3.799822,4.188891,3.448275,3.226325,3.345735,2.774195,3.181459
1,4.874852,3.548765,3.662784,7.195187,6.361501,5.026899,5.951502,5.43193,2.533096,4.317466,...,3.218257,6.136464,3.482821,4.32512,4.204713,4.13196,3.524856,3.786295,1.80432,3.99923
2,6.907824,3.208379,3.682653,7.386021,4.133781,3.5125,3.481282,2.300098,3.108423,5.881096,...,3.055024,8.064784,3.276235,3.999087,3.344914,2.777815,3.495302,3.334185,3.203982,4.547595
3,3.95782,3.432061,2.821736,7.049656,4.651417,3.69572,3.622156,3.578949,3.262309,5.77424,...,3.308403,7.500499,3.283405,3.595345,4.098439,3.436705,3.141711,3.375781,2.6639,3.165514
4,4.846836,3.429549,3.562138,6.093097,5.927124,3.560645,4.199089,4.838044,3.293778,2.471808,...,3.285745,7.870956,3.332204,3.700837,4.086651,3.482446,3.236798,3.423178,2.669285,3.260832


In [16]:
rand_search_100 = param_tuning(X_train_100, y_train, jobs=14)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 1 minutes and 2.77 seconds.
Best Score: 86.505%
{'subsample': 1.0, 'n_estimators': 700, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 2, 'colsample_bytree': 1.0}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    4.0s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   51.2s finished


In [17]:
params_100 = {'subsample': 1.0,
 'n_estimators': 700,
 'min_child_weight': 3,
 'max_depth': 6,
 'learning_rate': 0.01,
 'gamma': 2,
 'colsample_bytree': 1.0}

clf_100 = XGBClassifier(**params_100, n_jobs=4)

In [18]:
cv_results_100 = cross_validate(clf_100, X_train_100, y_train,
                               scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_100

{'fit_time': array([5.71115232, 5.39548516, 5.37466288, 6.26632833, 5.48115301]),
 'score_time': array([0.01630974, 0.0211966 , 0.02494478, 0.01715851, 0.01657534]),
 'test_balanced_accuracy': array([0.7522932 , 0.79830057, 0.78920966, 0.80773956, 0.78583129]),
 'test_recall_0': array([0.84242424, 0.80606061, 0.78787879, 0.81818182, 0.78787879]),
 'test_precision_0': array([0.73544974, 0.81097561, 0.80745342, 0.81818182, 0.80246914]),
 'test_recall_1': array([0.66216216, 0.79054054, 0.79054054, 0.7972973 , 0.78378378]),
 'test_precision_1': array([0.79032258, 0.7852349 , 0.76973684, 0.7972973 , 0.76821192]),
 'test_auc': array([0.83665029, 0.86855037, 0.87223587, 0.87858313, 0.86920557])}

In [19]:
scores_100_df = get_scores(cv_results_100, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_100_df.mean()

balanced_accuracy    0.786675
recall_0             0.794906
precision_0          0.808485
recall_1             0.782161
precision_1          0.764865
auc                  0.865045
dtype: float64

In [20]:
clf_100.fit(X_train_100, y_train)
X_test_100 = X_test[feats_100]

test_scores_100 = calc_scores(clf_100, X_test_100, y_test)
scores_test_100_df = pd.DataFrame(data=test_scores_100, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_test_100_df.mean()



balanced_accuracy    0.756689
recall_0             0.782738
precision_0          0.742938
recall_1             0.729167
precision_1          0.770440
auc                  0.815043
dtype: float64

In [21]:
with open("mrmr_ft50.txt", "w") as fp:
    for f in feats_50:
        fp.write(f + "\n")

In [22]:
clf_50_acc = XGBClassifier(**params_50_acc)
cv_results_50_acc = cross_validate(clf_50_acc, X_train_50, y_train,
                               scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_50_acc

{'fit_time': array([84.99123073, 85.22930431, 84.73753142, 83.90528011, 85.05328965]),
 'score_time': array([0.03603697, 0.01239944, 0.05494833, 0.06586123, 0.04502559]),
 'test_balanced_accuracy': array([0.76347256, 0.78781736, 0.77848075, 0.78349713, 0.78920966]),
 'test_recall_0': array([0.82424242, 0.81212121, 0.73939394, 0.76969697, 0.78787879]),
 'test_precision_0': array([0.75555556, 0.79289941, 0.81879195, 0.8089172 , 0.80745342]),
 'test_recall_1': array([0.7027027 , 0.76351351, 0.81756757, 0.7972973 , 0.79054054]),
 'test_precision_1': array([0.78195489, 0.78472222, 0.73780488, 0.75641026, 0.76973684]),
 'test_auc': array([0.82624898, 0.82461097, 0.83181818, 0.83992629, 0.83718264])}

In [23]:
scores_50_acc_df = get_scores(cv_results_50_acc, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_50_acc_df.mean()

balanced_accuracy    0.780495
recall_0             0.796724
precision_0          0.786667
recall_1             0.766126
precision_1          0.774324
auc                  0.831957
dtype: float64

In [39]:
test_df = pd.read_csv("datasets/test.csv")
X_test, y_test = test_df[test_df.columns.difference(["patient_ID", "posOutcome"])], test_df["posOutcome"]

In [25]:
clf_50_acc.fit(X_train_50, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.03, max_delta_step=0, max_depth=6,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=16, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [26]:
X_test_50_acc = X_test[feats_50]

test_scores_50_acc = calc_scores(clf_50_acc, X_test_50_acc, y_test)
scores_test_50_acc_df = pd.DataFrame(data=test_scores_50_acc, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_test_50_df.mean()

balanced_accuracy    0.749440
recall_0             0.764205
precision_0          0.759887
recall_1             0.734375
precision_1          0.738994
auc                  0.808087
dtype: float64

In [27]:
rand_search_100_acc = param_tuning(X_train_100, y_train, jobs=14,
                                   scoring="balanced_accuracy")

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 57.05 seconds.
Best Score: 79.031%
{'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.03, 'gamma': 1.5, 'colsample_bytree': 0.8}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    4.0s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   50.5s finished


In [28]:
params_100_acc = {'subsample': 1.0,
 'n_estimators': 500,
 'min_child_weight': 3,
 'max_depth': 6,
 'learning_rate': 0.03,
 'gamma': 1.5,
 'colsample_bytree': 0.8}

clf_100_acc = XGBClassifier(**params_100_acc, n_jobs=4)

In [29]:
cv_results_100_acc = cross_validate(clf_100_acc, X_train_100, y_train,
                               scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_100_acc

{'fit_time': array([2.21092725, 3.06532526, 2.77823281, 3.44965267, 3.39590836]),
 'score_time': array([0.04098296, 0.01549959, 0.01548004, 0.01407075, 0.01474571]),
 'test_balanced_accuracy': array([0.7492629 , 0.79154382, 0.79154382, 0.81449631, 0.80470925]),
 'test_recall_0': array([0.83636364, 0.80606061, 0.80606061, 0.81818182, 0.81212121]),
 'test_precision_0': array([0.73404255, 0.80120482, 0.80120482, 0.82822086, 0.81707317]),
 'test_recall_1': array([0.66216216, 0.77702703, 0.77702703, 0.81081081, 0.7972973 ]),
 'test_precision_1': array([0.784     , 0.78231293, 0.78231293, 0.8       , 0.79194631]),
 'test_auc': array([0.84025389, 0.85851761, 0.87162162, 0.87858313, 0.86879607])}

In [30]:
scores_100_acc_df = get_scores(cv_results_100_acc, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_100_acc_df.mean()

balanced_accuracy    0.790311
recall_0             0.796349
precision_0          0.815758
recall_1             0.788114
precision_1          0.764865
auc                  0.863554
dtype: float64

In [31]:
clf_100_acc.fit(X_train_100, y_train)
X_test_100 = X_test[feats_100]

test_scores_100_acc = calc_scores(clf_100_acc, X_test_100, y_test)
scores_test_100_acc_df = pd.DataFrame(data=test_scores_100_acc, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_test_100_df.mean()



balanced_accuracy    0.756689
recall_0             0.782738
precision_0          0.742938
recall_1             0.729167
precision_1          0.770440
auc                  0.815043
dtype: float64

In [32]:
fts_250_df = pd.read_csv("datasets/mrmr_top250.tsv", sep="\t")
fts_250_df.columns = ["Order", "Feat_Index", "Name", "Score"]
fts_250_df["Name"] = fts_250_df["Name"].str.strip()
feats_250 = fts_250_df["Name"].to_list()

In [33]:
X_train_250 = X_train[feats_250]
X_train_250.head()

Unnamed: 0,VNN2,SLC9A7,ADD2,FNDC3B,EEF1A2,E2F8,FUT3,PLS1,KIF1A,ZFPM2,...,EPHA5,ZNF132,PNLIPRP2,FOLR3,GRIA3,F11,FETUB,MCM10,GNAT1,MYOM2
0,3.189026,3.442448,3.283981,6.704807,8.246389,4.046337,4.469728,2.87701,6.434729,4.089961,...,3.057387,3.065054,3.191119,2.912554,3.110144,3.054204,3.461555,5.718198,3.160783,3.652451
1,4.874852,3.548765,3.662784,7.195187,6.361501,5.026899,5.951502,5.43193,2.533096,4.317466,...,3.23149,2.947054,2.612252,3.81694,2.784161,3.189547,2.898729,3.676932,3.608013,3.791195
2,6.907824,3.208379,3.682653,7.386021,4.133781,3.5125,3.481282,2.300098,3.108423,5.881096,...,3.104999,3.610148,2.49398,3.102581,3.800372,2.337907,3.223666,2.091747,3.111369,3.391864
3,3.95782,3.432061,2.821736,7.049656,4.651417,3.69572,3.622156,3.578949,3.262309,5.77424,...,3.058671,2.982013,3.20615,2.992058,3.114048,3.04889,3.473469,3.328275,3.201443,3.706868
4,4.846836,3.429549,3.562138,6.093097,5.927124,3.560645,4.199089,4.838044,3.293778,2.471808,...,3.108481,3.06866,3.168892,3.013119,3.149837,3.054605,3.493431,3.246946,3.248864,3.627807


In [34]:
rand_search_250 = param_tuning(X_train_250, y_train, jobs=14)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 1 minutes and 55.21 seconds.
Best Score: 86.037%
{'subsample': 0.8, 'n_estimators': 400, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 1.5, 'colsample_bytree': 0.8}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    8.1s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:  1.8min finished


In [35]:
params_250 = {'subsample': 1.0,
 'n_estimators': 700,
 'min_child_weight': 3,
 'max_depth': 6,
 'learning_rate': 0.01,
 'gamma': 2,
 'colsample_bytree': 1.0}

clf_250 = XGBClassifier(**params_250, n_jobs=4)

In [36]:
cv_results_250 = cross_validate(clf_250, X_train_250, y_train,
                               scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_250

{'fit_time': array([11.09212136, 10.92821479, 10.87433219, 12.678509  , 10.35476208]),
 'score_time': array([0.02493453, 0.02382922, 0.02383351, 0.02303648, 0.06923652]),
 'test_balanced_accuracy': array([0.76312449, 0.78314906, 0.77569615, 0.78548321, 0.79527027]),
 'test_recall_0': array([0.83030303, 0.77575758, 0.78787879, 0.79393939, 0.8       ]),
 'test_precision_0': array([0.75274725, 0.80503145, 0.78787879, 0.79878049, 0.80981595]),
 'test_recall_1': array([0.69594595, 0.79054054, 0.76351351, 0.77702703, 0.79054054]),
 'test_precision_1': array([0.78625954, 0.75974026, 0.76351351, 0.77181208, 0.78      ]),
 'test_auc': array([0.8476249 , 0.8490991 , 0.8549959 , 0.86285831, 0.85876331])}

In [37]:
scores_250_df = get_scores(cv_results_250, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_250_df.mean()

balanced_accuracy    0.780545
recall_0             0.790851
precision_0          0.797576
recall_1             0.772265
precision_1          0.763514
auc                  0.854668
dtype: float64

In [38]:
clf_250.fit(X_train_250, y_train)
X_test_250 = X_test[feats_250]

test_scores_250 = calc_scores(clf_250, X_test_250, y_test)
scores_test_250_df = pd.DataFrame(data=test_scores_250, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_test_250_df.mean()



balanced_accuracy    0.757622
recall_0             0.778426
precision_0          0.754237
recall_1             0.735562
precision_1          0.761006
auc                  0.825525
dtype: float64

In [39]:
rand_search_250_acc = param_tuning(X_train_250, y_train, jobs=14, scoring="balanced_accuracy")

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 1 minutes and 53.7 seconds.
Best Score: 79.166%
{'subsample': 0.6, 'n_estimators': 400, 'min_child_weight': 4, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.5, 'colsample_bytree': 1.0}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    8.3s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:  1.8min finished


In [40]:
params_250_acc = {'subsample': 0.6,
 'n_estimators': 400,
 'min_child_weight': 4,
 'max_depth': 4,
 'learning_rate': 0.05,
 'gamma': 0.5,
 'colsample_bytree': 1.0}

clf_250_acc = XGBClassifier(**params_250_acc, n_jobs=4)

In [41]:
cv_results_250_acc = cross_validate(clf_250_acc, X_train_250, y_train,
                               scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_250_acc

{'fit_time': array([2.43479872, 3.91175222, 3.35607243, 2.6762898 , 3.34731841]),
 'score_time': array([0.05500865, 0.01890111, 0.02124858, 0.05094147, 0.01976895]),
 'test_balanced_accuracy': array([0.77862408, 0.78583129, 0.78513514, 0.81648239, 0.79223997]),
 'test_recall_0': array([0.85454545, 0.78787879, 0.8       , 0.84242424, 0.79393939]),
 'test_precision_0': array([0.76216216, 0.80246914, 0.79518072, 0.81764706, 0.80864198]),
 'test_recall_1': array([0.7027027 , 0.78378378, 0.77027027, 0.79054054, 0.79054054]),
 'test_precision_1': array([0.8125    , 0.76821192, 0.7755102 , 0.81818182, 0.77483444]),
 'test_auc': array([0.85462735, 0.84471744, 0.85573301, 0.87747748, 0.85036855])}

In [42]:
scores_250_acc_df = get_scores(cv_results_250_acc, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_250_acc_df.mean()

balanced_accuracy    0.791663
recall_0             0.797220
precision_0          0.815758
recall_1             0.789848
precision_1          0.767568
auc                  0.856585
dtype: float64

In [43]:
clf_250_acc.fit(X_train_250, y_train)
X_test_250_acc = X_test[feats_250]

test_scores_250_acc = calc_scores(clf_250_acc, X_test_250_acc, y_test)
scores_test_250_acc_df = pd.DataFrame(data=test_scores_250_acc, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_test_250_acc_df.mean()



balanced_accuracy    0.751972
recall_0             0.775811
precision_0          0.742938
recall_1             0.726727
precision_1          0.761006
auc                  0.823553
dtype: float64

In [27]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
def discretize_dataset(X, features, bins_labels = None):
    if bins_labels is None:
	    bins_labels = [-1, 0, 1]
    X_disc = X[features]
    bin_dict = {}

    for ft in features:
        r1 = X_disc[ft].mean() - X_disc[ft].std() / 2
        r2 = X_disc[ft].mean() + X_disc[ft].std() / 2
        bin_dict[ft]= [-np.inf, r1, r2, np.inf]
    le = LabelEncoder()

    le.fit(bins_labels)

    for ft in bin_dict:
        X_disc[ft] = le.transform(pd.cut(X_disc[ft], bins=bin_dict[ft], labels=bins_labels))

    ohe = OneHotEncoder(handle_unknown="ignore")
    transformed = ohe.fit_transform(X_disc).toarray()
    X_disc = pd.DataFrame(transformed, columns=ohe.get_feature_names(features))
    return X_disc

In [28]:
X_train_50_disc = discretize_dataset(X_train_50, feats_50)
X_train_50_disc.head()

Unnamed: 0,VNN2_0,VNN2_1,VNN2_2,ALAS2_0,ALAS2_1,ALAS2_2,E2F8_0,E2F8_1,E2F8_2,KIF21B_0,...,SEMG1_2,P2RX7_0,P2RX7_1,P2RX7_2,P2RX3_0,P2RX3_1,P2RX3_2,LPO_0,LPO_1,LPO_2
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [29]:
rand_search_50_disc = param_tuning(X_train_50_disc, y_train, jobs=14)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 29.15 seconds.
Best Score: 81.724%
{'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 2, 'colsample_bytree': 0.6}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    3.0s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   27.7s finished


In [30]:
params_50_disc = {'subsample': 0.8,
 'n_estimators': 300,
 'min_child_weight': 5,
 'max_depth': 5,
 'learning_rate': 0.03,
 'gamma': 2,
 'colsample_bytree': 0.6}

In [31]:
clf_50_disc = XGBClassifier(**params_50_disc)
cv_results_50_disc = cross_validate(clf_50_disc, X_train_50_disc, y_train,
                               scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_50_disc

{'fit_time': array([43.05165124, 44.55136251, 44.39183044, 44.17680407, 44.62382197]),
 'score_time': array([0.06611395, 0.03911924, 0.04112458, 0.04994822, 0.01732683]),
 'test_balanced_accuracy': array([0.71791564, 0.76427109, 0.75890663, 0.77882883, 0.76019656]),
 'test_recall_0': array([0.79393939, 0.75151515, 0.72727273, 0.73333333, 0.76363636]),
 'test_precision_0': array([0.71195652, 0.78980892, 0.79470199, 0.82312925, 0.77777778]),
 'test_recall_1': array([0.64189189, 0.77702703, 0.79054054, 0.82432432, 0.75675676]),
 'test_precision_1': array([0.73643411, 0.73717949, 0.72222222, 0.73493976, 0.74172185]),
 'test_auc': array([0.78820639, 0.80687961, 0.82448812, 0.83918919, 0.82743653])}

In [32]:
scores_50_disc_df = get_scores(cv_results_50_disc, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_50_disc_df.mean()

balanced_accuracy    0.756024
recall_0             0.779475
precision_0          0.753939
recall_1             0.734499
precision_1          0.758108
auc                  0.817240
dtype: float64

In [51]:
print_score_comparison(scores_50_df, scores_50_disc_df, header_1="Contin 50 Feats", header_2="Disc 50 Feats")

		posOutcome
			Contin 50 Feats		Disc 50 Feats
		-----------------------------------------------
balanced_accuracy:	77.334%			75.670%

precision_0:		78.182%			75.394%

recall_0:		78.846%			78.031%

precision_1:		76.486%			75.946%

recall_1:		75.883%			73.386%

auc:			83.880%			81.801%



In [52]:
print_score_comparison(scores_50_df, scores_50_acc_df, header_1="ROC_AUC 50 Feats", header_2="Acc 50 Feats")

		posOutcome
			ROC_AUC 50 Feats		Acc 50 Feats
		-----------------------------------------------
balanced_accuracy:	77.334%			78.050%

precision_0:		78.182%			78.667%

recall_0:		78.846%			79.672%

precision_1:		76.486%			77.432%

recall_1:		75.883%			76.613%

auc:			83.880%			83.196%



In [53]:
rand_search_50_disc_acc = param_tuning(X_train_50_disc, y_train, jobs=14, scoring="balanced_accuracy")

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 12.66 seconds.
Best Score: 76.245%
{'subsample': 1.0, 'n_estimators': 600, 'min_child_weight': 2, 'max_depth': 6, 'learning_rate': 0.02, 'gamma': 2, 'colsample_bytree': 0.6}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    1.1s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   11.4s finished


In [55]:
params_50_disc_acc = {'subsample': 1.0,
 'n_estimators': 600,
 'min_child_weight': 2,
 'max_depth': 6,
 'learning_rate': 0.02,
 'gamma': 2,
 'colsample_bytree': 0.6}

In [56]:
clf_50_disc_acc = XGBClassifier(**params_50_disc_acc)
cv_results_50_disc_acc = cross_validate(clf_50_disc_acc, X_train_50_disc, y_train,
                               scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_50_disc_acc

{'fit_time': array([120.20607376, 124.14437604, 123.56309533, 123.47662973,
        124.14804626]),
 'score_time': array([0.06935263, 0.02463698, 0.05627537, 0.04311252, 0.02216101]),
 'test_balanced_accuracy': array([0.70952088, 0.76496724, 0.77510238, 0.7791769 , 0.78349713]),
 'test_recall_0': array([0.76363636, 0.73939394, 0.73939394, 0.72727273, 0.76969697]),
 'test_precision_0': array([0.71186441, 0.79738562, 0.81333333, 0.82758621, 0.8089172 ]),
 'test_recall_1': array([0.65540541, 0.79054054, 0.81081081, 0.83108108, 0.7972973 ]),
 'test_precision_1': array([0.71323529, 0.73125   , 0.73619632, 0.73214286, 0.75641026]),
 'test_auc': array([0.79144144, 0.8030303 , 0.82567568, 0.83046683, 0.83157248])}

In [57]:
scores_50_disc_acc_df = get_scores(cv_results_50_disc_acc, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_50_disc_acc_df.mean()

balanced_accuracy    0.762453
recall_0             0.791817
precision_0          0.747879
recall_1             0.733847
precision_1          0.777027
auc                  0.816437
dtype: float64

In [58]:
clf_50_acc.save_model("datasets/models/clf_mrmr50_acc.json")

In [33]:
fts_500_df = pd.read_csv("datasets/mrmr_top500.tsv", sep="\t")
fts_500_df.columns = ["Order", "Feat_Index", "Name", "Score"]
fts_500_df["Name"] = fts_500_df["Name"].str.strip()
feats_500 = fts_500_df["Name"].to_list()

In [34]:
X_train_500 = X_train[feats_500]
X_train_500.head()

Unnamed: 0,VNN2,SLC9A7,ADD2,FNDC3B,EEF1A2,E2F8,FUT3,PLS1,KIF1A,ZFPM2,...,PHOX2B,GCM2,CDH20,PDE6H,MTNR1B,CFHR5,GLP2R,DRD4,SLC2A2,C20orf195
0,3.189026,3.442448,3.283981,6.704807,8.246389,4.046337,4.469728,2.87701,6.434729,4.089961,...,2.991383,3.518356,3.418152,3.06605,3.346713,3.180736,3.226912,3.155459,3.094382,3.271942
1,4.874852,3.548765,3.662784,7.195187,6.361501,5.026899,5.951502,5.43193,2.533096,4.317466,...,2.410918,2.919491,3.381002,2.671217,3.10049,2.762882,3.239015,3.547748,3.365475,3.768267
2,6.907824,3.208379,3.682653,7.386021,4.133781,3.5125,3.481282,2.300098,3.108423,5.881096,...,2.73688,2.662786,3.083944,2.852804,3.566948,3.349166,3.623747,2.926309,2.964687,3.057714
3,3.95782,3.432061,2.821736,7.049656,4.651417,3.69572,3.622156,3.578949,3.262309,5.77424,...,2.97006,3.495451,3.4195,3.066539,3.337807,3.183639,3.227968,3.155875,3.094691,3.272536
4,4.846836,3.429549,3.562138,6.093097,5.927124,3.560645,4.199089,4.838044,3.293778,2.471808,...,2.989385,3.521955,3.419794,3.066645,3.347787,3.183792,3.228055,3.13684,3.094815,3.263791


In [6]:
rand_search_500 = param_tuning(X_train_500, y_train, jobs=-1)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 3 minutes and 11.61 seconds.
Best Score: 85.836%
{'subsample': 1.0, 'n_estimators': 400, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.03, 'gamma': 0.5, 'colsample_bytree': 0.6}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:  3.0min finished


In [35]:
params_500 = {'subsample': 1.0,
 'n_estimators': 400,
 'min_child_weight': 3,
 'max_depth': 6,
 'learning_rate': 0.03,
 'gamma': 0.5,
 'colsample_bytree': 0.6}
clf_500 = XGBClassifier(**params_500)

In [36]:
cv_results_500 = cross_validate(clf_500, X_train_500, y_train,scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_500_df = get_scores(cv_results_500, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
cv_results_500_df.mean()

balanced_accuracy    0.791314
recall_0             0.794133
precision_0          0.821818
recall_1             0.793073
precision_1          0.760811
auc                  0.858362
dtype: float64

In [40]:
clf_500.fit(X_train_500, y_train)
X_test_500 = X_test[feats_500]

test_scores_500 = calc_scores(clf_500, X_test_500, y_test)
test_scores_500_df = pd.DataFrame(data=test_scores_500, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
test_scores_500_df.mean()



balanced_accuracy    0.756343
recall_0             0.766017
precision_0          0.776836
recall_1             0.747604
precision_1          0.735849
auc                  0.824175
dtype: float64

In [41]:
clf_500.save_model("datasets/models/clf_mrmr500.json")

In [None]:
mrmr_ft50 = load_fea