In [2]:
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, recall_score, balanced_accuracy_score, precision_score, make_scorer
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [3]:
params = {'n_estimators': [300, 400, 500, 600, 700],
              'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.07],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'max_depth': [3, 4, 5, 6],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 2, 3, 4, 5]}

seed = 42
st_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

def calc_scores(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    recall_0, recall_1 = recall_score(y_test, y_pred, pos_label=0), recall_score(y_test, y_pred, pos_label=1)
    precision_0, precision_1 =  precision_score(y_test, y_pred, pos_label=0), precision_score(y_test, y_pred, pos_label=1)
    acc = balanced_accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    return np.array([[acc, precision_0, recall_0, precision_1, recall_1,auc_score]])

def recall_0(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label=0)

def precision_0(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label=0)

scoring = {"balanced_accuracy": make_scorer(balanced_accuracy_score),
           "recall_0": make_scorer(recall_0), "precision_0": make_scorer(precision_0),
           "recall_1": make_scorer(recall_score), "precision_1": make_scorer(precision_score), "auc": "roc_auc" }

#cross_validation

def print_score_comparison(raw_score, emb_score, target_feature="posOutcome",
                           header_1="Raw Score", header_2="Embedding Score"):
    print("\t\t{0}\n\t\t\t{1}\t\t{2}".format(target_feature, header_1, header_2))
    print("\t\t-----------------------------------------------")
    print("balanced_accuracy:\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["balanced_accuracy"].mean(), emb_score["balanced_accuracy"].mean()))
    print("precision_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_0"].mean(), emb_score["precision_0"].mean()))
    print("recall_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_0"].mean(), emb_score["recall_0"].mean()))
    print("precision_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_1"].mean(), emb_score["precision_1"].mean()))
    print("recall_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_1"].mean(), emb_score["recall_1"].mean()))
    print("auc:\t\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["auc"].mean(), emb_score["auc"].mean()))

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time

    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

def param_tuning(X, y, n_folds=5, param_comb=25, scoring='roc_auc', jobs=12):
    xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    rand_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring=scoring, n_jobs=jobs,
                                   cv=skf.split(X, y), verbose=3, random_state=42)

    start_time = timer(None) # timing starts from this point for "start_time" variable
    rand_search.fit(X, y)
    timer(start_time)
    print("Best Score: {:.3%}".format(rand_search.best_score_))
    print(rand_search.best_params_)
    return rand_search

score_cols = ["test_balanced_accuracy","test_precision_0", "test_recall_0",
               "test_precision_1","test_recall_1", "test_auc"]

def get_scores(cv_results, score_keys=score_cols, df_cols=score_cols):
    scores = np.empty([1, len(score_keys)])
    for i, s in enumerate(score_keys):
        scores[0][i] = np.mean(cv_results[s])
    scores_df = pd.DataFrame(data=scores, columns=df_cols)
    return scores_df

In [4]:
train_df = pd.read_csv("datasets/train.csv")
X_train, y_train = train_df[train_df.columns.difference(["patient_ID", "posOutcome"])], train_df["posOutcome"]
X_train.head()

Unnamed: 0,A4GALT,AAAS,AACS,AADAC,AAK1,AAMP,AANAT,AARS,AARSD1,AASDHPPT,...,ZNHIT2,ZP2,ZPBP,ZSCAN2,ZW10,ZWINT,ZXDC,ZYX,ZZEF1,ZZZ3
0,3.490594,4.705177,7.388903,3.146066,5.324219,7.010299,3.20422,7.62326,4.908548,7.920498,...,3.616936,3.177763,3.120909,3.626377,5.573573,7.840314,5.720305,7.49144,7.049239,6.979166
1,3.493298,6.025729,6.501462,3.015961,4.639765,7.399345,3.801613,8.326222,5.075999,6.63509,...,4.002873,3.182145,3.414617,3.933382,3.717363,9.053191,6.370379,7.888914,5.422555,5.951768
2,3.426142,5.449551,5.632613,3.685224,5.643874,6.737401,3.596668,7.431818,5.591313,6.596328,...,2.695141,3.324802,3.251439,2.909459,4.385828,6.415808,5.480143,7.64496,6.797248,6.80828
3,3.426381,5.595401,6.882855,3.240755,6.07566,6.943799,3.20297,7.477471,4.90407,6.518033,...,3.3847,3.144302,3.158701,3.521218,3.968905,6.774039,6.299851,7.620011,5.797529,5.871506
4,3.479792,5.565861,4.662279,3.176784,6.033194,7.274996,3.204731,7.105333,6.663767,6.667291,...,3.414956,3.139913,3.185299,3.572568,3.874406,6.490379,6.589065,6.327172,6.770991,6.890959


In [5]:
fts_moses50 = []

with open("datasets/moses_ft50.txt", "r") as fp:
    for line in fp.readlines():
        fts_moses50.append(line.strip())

X_moses50 = X_train[fts_moses50]
X_moses50.head()

Unnamed: 0,PCOLCE2,SGCA,SFTPA2,INSL6,RHAG,OMP,NPY5R,STMN2,SCARF1,KIF13A,...,MPO,PPY,UBQLN3,CYP7A1,HOXB8,HSD11B2,RRAD,PAH,T,PON1
0,3.14843,3.592749,3.90313,3.34936,2.492889,3.232596,3.372126,3.698328,3.380497,3.232026,...,3.191073,3.634918,3.45282,3.084802,3.127523,3.343292,3.164438,2.840337,3.189698,3.108159
1,3.189867,3.18745,4.298787,3.315854,2.163489,3.093923,4.943535,4.298039,2.957697,4.002984,...,3.304428,4.022183,2.987215,2.970891,2.77955,2.572359,1.580958,2.691363,3.177675,3.213405
2,4.722449,3.113225,3.547999,3.763481,3.277243,3.421224,2.487729,2.386824,4.093911,3.254019,...,3.904301,3.710354,3.44463,2.917371,3.051644,3.241207,3.744355,2.109861,4.05939,2.632372
3,4.268493,3.594347,3.911735,3.35079,2.456152,3.233919,3.315442,3.589309,5.563479,3.702511,...,3.198184,3.640382,3.453832,3.093994,3.069023,3.238938,3.065845,2.738419,3.229419,3.149284
4,2.877485,3.58907,3.878472,3.350356,2.478397,3.233986,3.395737,3.352484,3.360377,3.169454,...,3.2163,3.63838,3.45428,3.094129,3.112289,3.308742,3.044857,2.764971,3.207398,3.088351


In [8]:
rand_search_moses50 = param_tuning(X_moses50, y_train, jobs=14)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 24.1 seconds.
Best Score: 80.412%
{'subsample': 0.8, 'n_estimators': 600, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.6}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    1.6s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   21.7s finished


In [6]:
params_moses50 = {'subsample': 0.8,
 'n_estimators': 600,
 'min_child_weight': 3,
 'max_depth': 6,
 'learning_rate': 0.01,
 'gamma': 5,
 'colsample_bytree': 0.6}

clf_moses50 = XGBClassifier(**params_moses50, n_jobs=4)

In [7]:
cv_results_moses50 = cross_validate(clf_moses50, X_moses50, y_train,
                               scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_moses50

{'fit_time': array([1.11466408, 1.02984762, 1.42984104, 1.44669271, 1.33950138]),
 'score_time': array([0.02916408, 0.03314805, 0.0126574 , 0.0124681 , 0.01332188]),
 'test_balanced_accuracy': array([0.74215807, 0.75472973, 0.729095  , 0.76928747, 0.7246724 ]),
 'test_recall_0': array([0.84242424, 0.8       , 0.77575758, 0.78181818, 0.79393939]),
 'test_precision_0': array([0.72395833, 0.75428571, 0.73142857, 0.78181818, 0.71978022]),
 'test_recall_1': array([0.64189189, 0.70945946, 0.68243243, 0.75675676, 0.65540541]),
 'test_precision_1': array([0.78512397, 0.76086957, 0.73188406, 0.75675676, 0.74045802]),
 'test_auc': array([0.80679771, 0.79178952, 0.81322686, 0.81355446, 0.79520885])}

In [8]:
scores_moses50_df = get_scores(cv_results_moses50, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_moses50_df.mean()

balanced_accuracy    0.743989
recall_0             0.742254
precision_0          0.798788
recall_1             0.755018
precision_1          0.689189
auc                  0.804115
dtype: float64

In [9]:
test_df = pd.read_csv("datasets/test.csv")
X_test, y_test = test_df[test_df.columns.difference(["patient_ID", "posOutcome"])], test_df["posOutcome"]

In [10]:
clf_moses50.fit(X_moses50, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=600, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
X_test_moses50 = X_test[fts_moses50]

test_scores_moses50 = calc_scores(clf_moses50, X_test_moses50, y_test)
scores_test_50_df = pd.DataFrame(data=test_scores_moses50, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_test_50_df.mean()

balanced_accuracy    0.749094
recall_0             0.749333
precision_0          0.793785
recall_1             0.754209
precision_1          0.704403
auc                  0.799568
dtype: float64

In [12]:
from sklearn.preprocessing import LabelEncoder

def discretize_dataset(X, features, bins_labels = None):
    if bins_labels is None:
	    bins_labels = [-1, 0, 1]
    X_disc = X[features]
    bin_dict = {}

    for ft in fts_moses50:
        r1 = X_disc[ft].mean() - X_disc[ft].std() / 2
        r2 = X_disc[ft].mean() + X_disc[ft].std() / 2
        bin_dict[ft]= [-np.inf, r1, r2, np.inf]
    le = LabelEncoder()

    le.fit(bins_labels)

    for ft in bin_dict:
        X_disc[ft] = le.transform(pd.cut(X_disc[ft], bins=bin_dict[ft], labels=bins_labels))
        # X_disc[ft] = X_disc[ft].astype(dtype=np.int64)
    return X_disc

In [13]:
X_moses50_disc = discretize_dataset(X_moses50, fts_moses50)
X_moses50_disc.head()

Unnamed: 0,PCOLCE2,SGCA,SFTPA2,INSL6,RHAG,OMP,NPY5R,STMN2,SCARF1,KIF13A,...,MPO,PPY,UBQLN3,CYP7A1,HOXB8,HSD11B2,RRAD,PAH,T,PON1
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,0,1,1,0,0,2,2,0,2,...,1,2,0,0,0,0,0,1,1,1
2,2,0,0,2,2,2,0,0,2,1,...,2,1,1,0,1,1,2,0,2,0
3,1,1,1,1,1,1,1,1,2,2,...,1,1,1,1,1,1,1,1,1,1
4,0,1,1,1,1,1,1,0,1,1,...,1,1,1,1,1,1,1,1,1,1


In [43]:
rand_search_moses50_disc = param_tuning(X_moses50_disc, y_train, jobs=14)

array([1, 0, 2])

In [14]:
fts_moses83 = []

with open("datasets/moses_ft83.txt", "r") as fp:
    for line in fp.readlines():
        fts_moses83.append(line.strip())

X_moses83 = X_train[fts_moses83]
X_moses83.head()

Unnamed: 0,PCOLCE2,SGCA,SFTPA2,INSL6,RHAG,OMP,NPY5R,STMN2,SCARF1,KIF13A,...,PGC,PRDM16,GRIA2,PDE6H,SLC7A4,ZFP2,ZNF10,PRL,SLC5A4,GALR2
0,3.14843,3.592749,3.90313,3.34936,2.492889,3.232596,3.372126,3.698328,3.380497,3.232026,...,3.066523,2.98156,3.722755,3.06605,3.190915,3.331363,3.647163,3.258032,3.371849,3.437293
1,3.189867,3.18745,4.298787,3.315854,2.163489,3.093923,4.943535,4.298039,2.957697,4.002984,...,2.185713,2.600702,4.108615,2.671217,3.577205,3.113802,4.302391,4.042981,3.63203,3.644259
2,4.722449,3.113225,3.547999,3.763481,3.277243,3.421224,2.487729,2.386824,4.093911,3.254019,...,3.002238,2.867571,3.165894,2.852804,3.03243,3.334047,3.056203,3.079016,3.055022,3.336115
3,4.268493,3.594347,3.911735,3.35079,2.456152,3.233919,3.315442,3.589309,5.563479,3.702511,...,3.024053,2.976027,7.697695,3.066539,3.147233,3.280103,4.32222,3.196184,3.373126,3.439084
4,2.877485,3.58907,3.878472,3.350356,2.478397,3.233986,3.395737,3.352484,3.360377,3.169454,...,3.045817,2.981492,3.660311,3.066645,3.178023,3.224418,3.929318,3.263807,3.373217,3.439146


In [56]:
rand_search_moses83 = param_tuning(X_moses83, y_train, jobs=14, scoring="balanced_accuracy")

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 35.99 seconds.
Best Score: 74.718%
{'subsample': 1.0, 'n_estimators': 400, 'min_child_weight': 4, 'max_depth': 6, 'learning_rate': 0.02, 'gamma': 5, 'colsample_bytree': 1.0}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    2.5s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   31.9s finished


In [15]:
params_moses83 = {'subsample': 1.0,
                  'n_estimators': 400,
                  'min_child_weight': 4,
                  'max_depth': 6,
                  'learning_rate': 0.02,
                  'gamma': 5,
                  'colsample_bytree': 1.0}

clf_moses83 = XGBClassifier(**params_moses83, n_jobs=4)

In [16]:
cv_results_moses83 = cross_validate(clf_moses83, X_moses83, y_train,
                               scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_moses83

{'fit_time': array([2.07262707, 1.96360373, 1.33789372, 1.35235715, 2.11978889]),
 'score_time': array([0.01216316, 0.01650262, 0.01989007, 0.02052927, 0.01205277]),
 'test_balanced_accuracy': array([0.74250614, 0.76287879, 0.75169943, 0.74330467, 0.73550369]),
 'test_recall_0': array([0.83636364, 0.77575758, 0.79393939, 0.76363636, 0.78181818]),
 'test_precision_0': array([0.72631579, 0.77575758, 0.75287356, 0.75449102, 0.73714286]),
 'test_recall_1': array([0.64864865, 0.75      , 0.70945946, 0.72297297, 0.68918919]),
 'test_precision_1': array([0.7804878 , 0.75      , 0.75539568, 0.73287671, 0.73913043]),
 'test_auc': array([0.80937756, 0.8018018 , 0.81398444, 0.8031122 , 0.80126945])}

In [17]:
scores_moses83_df = get_scores(cv_results_moses83, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_moses83_df.mean()

balanced_accuracy    0.747179
recall_0             0.749316
precision_0          0.790303
recall_1             0.751578
precision_1          0.704054
auc                  0.805909
dtype: float64

In [18]:
clf_moses83.fit(X_moses83, y_train)
X_test_moses83 = X_test[fts_moses83]

test_scores_moses83 = calc_scores(clf_moses83, X_test_moses83, y_test)
scores_test_83_df = pd.DataFrame(data=test_scores_moses83, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
scores_test_83_df.mean()



balanced_accuracy    0.726628
recall_0             0.728723
precision_0          0.774011
recall_1             0.729730
precision_1          0.679245
auc                  0.795193
dtype: float64

In [61]:
clf_moses83.save_model("datasets/models/clf_moses83.json")

In [19]:
fts_moses500 = []

with open("datasets/moses_ft500_bmc.txt", "r") as fp:
    for line in fp.readlines():
        fts_moses500.append(line.strip())

X_moses500 = X_train[fts_moses500]
X_moses500.head()


Unnamed: 0,PPY,RGN,CDH20,CPZ,HESX1,TBL1Y,PRG2,CRY2,MFAP4,PVRL3,...,WISP1,WISP3,WNT2B,ZBTB3,ZFHX4,ZFYVE9,ZNF10,ZNF214,ZNF215,ZNF80
0,3.634918,6.678143,3.418152,3.698522,3.572909,3.571194,6.950399,6.055297,4.347075,3.330911,...,3.685007,3.302871,3.308621,3.552972,3.946286,3.761024,3.647163,3.125024,3.016956,3.547274
1,4.022183,4.527047,3.381002,3.925558,3.715131,3.460038,4.056473,5.584762,5.62921,4.045242,...,3.685147,3.697984,2.467508,4.411691,4.910276,4.296368,4.302391,3.821208,3.928156,4.072608
2,3.710354,3.087923,3.083944,4.947658,5.083375,3.309398,2.735483,5.931768,6.306764,4.481756,...,5.280646,3.242144,3.546346,4.125019,5.797823,3.8077,3.056203,3.409324,3.458087,3.87868
3,3.640382,3.325559,3.4195,6.530958,3.543959,3.571256,3.402627,6.579714,7.918224,3.474763,...,5.593913,3.347911,3.251575,3.527236,6.548565,2.997048,4.32222,3.075615,2.999787,3.46642
4,3.63838,3.373206,3.419794,3.603261,6.968873,3.573625,3.428841,5.690064,2.977116,3.338315,...,3.350563,3.359097,3.266639,3.571667,3.462968,3.849481,3.929318,3.135227,2.959384,3.549222


In [20]:
rand_search_moses500 = param_tuning(X_moses500, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 3 minutes and 27.02 seconds.
Best Score: 82.742%
{'subsample': 0.6, 'n_estimators': 400, 'min_child_weight': 2, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 1, 'colsample_bytree': 1.0}


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:   14.1s
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed:  3.2min finished


In [22]:
params_moses500 = {'subsample': 0.6,
 'n_estimators': 400,
 'min_child_weight': 2,
 'max_depth': 5,
 'learning_rate': 0.05,
 'gamma': 1,
 'colsample_bytree': 1.0}

clf_moses500 = XGBClassifier(**params_moses500)

In [23]:
cv_results_moses500 = cross_validate(clf_moses500, X_moses500, y_train,
                               scoring=scoring, cv=st_cv, n_jobs=-1)
cv_results_moses500 = get_scores(cv_results_moses500, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
cv_results_moses500.mean()

balanced_accuracy    0.767310
recall_0             0.774468
precision_0          0.792727
recall_1             0.763548
precision_1          0.741892
auc                  0.827420
dtype: float64

In [24]:
clf_moses500.fit(X_moses500, y_train)
X_test_moses500 = X_test[fts_moses500]

test_scores_moses500 = calc_scores(clf_moses500, X_test_moses500, y_test)
test_scores_moses500_df = pd.DataFrame(data=test_scores_moses500, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
test_scores_moses500_df.mean()



balanced_accuracy    0.761992
recall_0             0.768595
precision_0          0.788136
recall_1             0.757282
precision_1          0.735849
auc                  0.820861
dtype: float64

In [25]:
clf_moses500.save_model("datasets/models/clf_moses500.json")