In [1]:
### Experiment comparing PAM50 features (the 35-gene subset used below) with MOSES- and XGBoost-selected gene features

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
from scipy.stats import norm
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import roc_curve, auc
import warnings
from sklearn.metrics import roc_auc_score
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [2]:
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score
def calc_results_simple(X, y, train_index, test_index, clf):
    """Fit ``clf`` on one train split and score it on the matching test split.

    Parameters
    ----------
    X, y : objects exposing ``to_numpy`` (features and labels); labels are
        cast to ``np.int64``.
    train_index, test_index : positional row indices for the two splits.
    clf : estimator with ``fit``, ``predict`` and ``predict_proba``; it is
        fitted in place.

    Returns
    -------
    np.ndarray of shape (1, 6) ordered as
    ``[balanced_accuracy, recall_0, precision_0, recall_1, precision_1, auc]``.
    """
    features = X.to_numpy()
    labels = y.to_numpy(dtype=np.int64)

    held_out_X = features[test_index]
    held_out_y = labels[test_index]

    clf.fit(features[train_index], labels[train_index])
    predicted = clf.predict(held_out_X)
    # Column 1 of predict_proba is used as the score for label 1.
    positive_prob = clf.predict_proba(held_out_X)[:, 1]

    fold_metrics = [
        balanced_accuracy_score(held_out_y, predicted),
        recall_score(held_out_y, predicted, pos_label=0),
        precision_score(held_out_y, predicted, pos_label=0),
        recall_score(held_out_y, predicted, pos_label=1),
        precision_score(held_out_y, predicted, pos_label=1),
        roc_auc_score(held_out_y, positive_prob),
    ]
    return np.array([fold_metrics])

#cross_validation
def run_cross_val(X, y, params, n_folds=5, random_seed=42):
    """Score an XGBClassifier built from ``params`` with stratified K-fold CV.

    Returns ``(res, clf)``: ``res`` has one row of six metrics per fold (see
    ``calc_results_simple`` for the column order) and ``clf`` is the single
    estimator object that was refitted in every fold, so on return it holds
    the fit from the last fold only.
    """
    clf = XGBClassifier(**params, n_jobs=8)
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    fold_rows = [
        calc_results_simple(X, y, train_index, test_index, clf)
        for train_index, test_index in splitter.split(X, y)
    ]
    # Seed with an empty (0, 6) array so the result shape/dtype matches even
    # when there are no fold rows.
    res = np.concatenate([np.empty(shape=[0, 6]), *fold_rows], axis=0)
    return res, clf

def print_score_comparison(raw_score, emb_score, target_feature="RFS",
                           header_1="Raw Score", header_2="Embedding Score"):
    """Print the per-metric column means of two score tables side by side.

    ``raw_score`` and ``emb_score`` must both provide the columns
    ``balanced_accuracy``, ``precision_0``, ``recall_0``, ``precision_1``,
    ``recall_1`` and ``auc``; each mean is printed as a percentage with three
    decimals. ``target_feature`` is the title, ``header_1``/``header_2`` label
    the two value columns.
    """
    print("\t\t{0}\n\t\t\t{1}\t\t{2}".format(target_feature, header_1, header_2))
    print("\t\t-----------------------------------------------")

    # (row label including its tab padding, column to average), in print order.
    rows = (
        ("balanced_accuracy:\t", "balanced_accuracy"),
        ("precision_0:\t\t", "precision_0"),
        ("recall_0:\t\t", "recall_0"),
        ("precision_1:\t\t", "precision_1"),
        ("recall_1:\t\t", "recall_1"),
        ("auc:\t\t\t", "auc"),
    )
    for label, column in rows:
        left_mean = raw_score[column].mean()
        right_mean = emb_score[column].mean()
        print(label + "{0:.3%}\t\t\t{1:.3%}\n".format(left_mean, right_mean))


In [3]:
# Candidate values sampled by the randomized search in param_tuning.
params = dict(
    n_estimators=[300, 400, 500, 600, 700],
    learning_rate=[0.01, 0.02, 0.03, 0.05, 0.07],
    gamma=[0.5, 1, 1.5, 2, 5],
    max_depth=[3, 4, 5, 6],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3, 4, 5],
)
def timer(start_time=None):
    """Two-mode stopwatch helper.

    Called without a (truthy) ``start_time`` it returns the current
    ``datetime``. Called with a start time it prints the elapsed time since
    then as hours, minutes and seconds, and returns ``None``.
    """
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))

def param_tuning(X, y, n_folds=5, param_comb=25, scoring='roc_auc', jobs=12):
    """Run a randomized hyper-parameter search for an XGBClassifier.

    Parameters
    ----------
    X, y : training features and labels passed to ``RandomizedSearchCV.fit``.
    n_folds : number of stratified, shuffled CV folds (fixed seed 42).
    param_comb : number of parameter combinations sampled from the
        module-level ``params`` grid.
    scoring : scikit-learn scoring name used to rank candidates.
    jobs : number of parallel workers used by the search.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object. The elapsed time, best score and
    best parameters are printed as a side effect.
    """
    # `verbosity=0` replaces the unrecognised `silent=True` (which only
    # produced a "Parameters: { silent } might not be used" warning), and
    # `n_jobs` replaces the deprecated `nthread` alias. Each estimator stays
    # single-threaded because parallelism is handled by the search itself.
    xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                        verbosity=0, n_jobs=1)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    # Pass the splitter itself instead of a one-shot `skf.split(X, y)`
    # generator; the folds are the same because the splitter is seeded.
    rand_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb,
                                     scoring=scoring, n_jobs=jobs, cv=skf,
                                     verbose=3, random_state=42)

    start_time = timer(None)  # timing starts here
    rand_search.fit(X, y)
    timer(start_time)
    print("Best Score: {:.3%}".format(rand_search.best_score_))
    print(rand_search.best_params_)
    return rand_search

In [5]:
# Load the gene-expression table; every column other than the patient id and
# the label is treated as a feature.
ge_outcome_df = pd.read_csv("datasets/train.csv")

X_outcome = ge_outcome_df[ge_outcome_df.columns.difference(["patient_ID", "posOutcome"])]
y_outcome = ge_outcome_df["posOutcome"]

X_outcome.head()

Unnamed: 0,A4GALT,AAAS,AACS,AADAC,AAK1,AAMP,AANAT,AARS,AARSD1,AASDHPPT,...,ZNHIT2,ZP2,ZPBP,ZSCAN2,ZW10,ZWINT,ZXDC,ZYX,ZZEF1,ZZZ3
0,3.827808,5.564794,7.204352,3.240755,5.534022,7.163646,3.20297,7.593308,5.117947,5.592944,...,3.42008,3.144302,3.158701,3.521218,3.782229,6.718352,5.430783,7.427696,5.050724,6.400798
1,3.248177,5.122207,7.140361,3.791904,5.152121,7.238353,2.340434,6.965726,5.491372,6.226471,...,3.736214,2.89956,2.76686,3.199441,3.437693,7.240525,5.941886,5.710997,6.326005,6.232045
2,5.745078,6.354522,6.655388,3.140749,5.691268,6.06531,3.187345,6.901199,5.121201,6.575723,...,3.631283,3.14982,3.110608,3.619549,3.46482,8.683671,7.158653,6.021711,5.124773,5.641704
3,4.071474,6.892231,8.27845,3.194189,5.52863,7.411861,3.202969,9.046099,5.61174,5.402383,...,3.471157,3.154758,3.147681,3.547679,4.097028,7.096574,6.851807,6.750012,6.241452,3.214936
4,3.479792,5.565916,7.56636,4.322661,5.116781,6.579106,3.204731,7.106134,5.149881,6.688667,...,3.363895,3.139913,3.185299,3.572568,4.252681,6.376437,5.788609,6.497776,5.837898,5.666425


In [7]:
# The 35 PAM genes used as the baseline feature set.
pam35_genes = [
    "BAG1", "BIRC5", "BLVRA", "CCNB1", "CCNE1", "CDC20", "CDC6",
    "CDH3", "CENPF", "CEP55", "EGFR", "ERBB2", "ESR1", "EXO1",
    "FOXA1", "FOXC1", "GRB7", "KIF2C", "KRT14", "KRT17", "KRT5",
    "MAPT", "MDM2", "MELK", "MIA", "MKI67", "MMP11", "MYBL2",
    "MYC", "PGR", "RRM2", "SFRP1", "SLC39A6", "TYMS", "UBE2C",
]

X_pam35_outcome = ge_outcome_df[pam35_genes]
# Same label series as the full-feature experiment.
y_pam35_outcome = y_outcome

In [12]:
# Parameter tuning: randomized hyper-parameter search on the PAM35 feature
# set, run with 14 parallel workers. The best parameters it prints are copied
# by hand into outcome_pam_params.
rand_search_pam = param_tuning(X_pam35_outcome, y_outcome, jobs=14)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 39.96 seconds.
Best Score: 71.466%
{'subsample': 0.8, 'n_estimators': 400, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 1.5, 'colsample_bytree': 0.8}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    3.7s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   37.1s finished


In [8]:
# Tuned XGBoost settings for the PAM35 feature set (copied from the
# randomized-search output).
outcome_pam_params = dict(
    subsample=0.8,
    n_estimators=400,
    min_child_weight=3,
    max_depth=5,
    learning_rate=0.03,
    gamma=1.5,
    colsample_bytree=0.8,
)

In [9]:
# 5-fold CV metrics for the PAM35 feature set; one row per fold.
outcome_pam_scores, clf_pam = run_cross_val(X_pam35_outcome, y_outcome, outcome_pam_params)
outcome_pam_scores_df = pd.DataFrame(
    data=outcome_pam_scores,
    columns=[
        "balanced_accuracy", "recall_0", "precision_0",
        "recall_1", "precision_1", "auc",
    ],
)
outcome_pam_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.608707,0.367246,0.595679,0.850167,0.687747,0.684849
std,0.03384,0.069243,0.050204,0.007418,0.022331,0.031989
min,0.552684,0.25641,0.508475,0.839378,0.652,0.649706
25%,0.609287,0.364407,0.605634,0.848958,0.686192,0.662341
50%,0.611891,0.364407,0.6125,0.848958,0.6875,0.686794
75%,0.629091,0.418803,0.614286,0.854167,0.704348,0.692751
max,0.640581,0.432203,0.6375,0.859375,0.708696,0.732654


In [12]:
# 50 genes selected with XGBoost.
xgb50_genes = [
    'CDX4', 'GLRA1', 'OR12D3', 'DSCR4', 'HOXB8', 'C9', 'MTNR1B', 'MOS',
    'HSD17B3', 'FGF20', 'KCNH4', 'ATP4B', 'CPB2', 'CRYBB1', 'ANGPTL3',
    'MYH8', 'GYS2', 'SLC25A21', 'TAS2R7', 'F11', 'GABRA6', 'MYT1L',
    'DEFB126', 'RPL18', 'GABRQ', 'ZFP37', 'PIP5K1B', 'MCM5', 'PRKAA1',
    'WDR76', 'CHRM4', 'RPS6KC1', 'EIF1AY', 'WNT1', 'SCN3B', 'NLGN4Y',
    'MAGEB1', 'NUDC', 'HIGD1A', 'OXCT2', 'GALR2', 'EEF1B2', 'RXRG',
    'CALCA', 'TEX13A', 'CST3', 'IGFBP4', 'CRYGA', 'ESR1', 'ZNF750',
]

X_xgb50_outcome = ge_outcome_df[xgb50_genes]
X_xgb50_outcome.head()

Unnamed: 0,CDX4,GLRA1,OR12D3,DSCR4,HOXB8,C9,MTNR1B,MOS,HSD17B3,FGF20,...,GALR2,EEF1B2,RXRG,CALCA,TEX13A,CST3,IGFBP4,CRYGA,ESR1,ZNF750
0,3.441946,3.649732,3.478564,3.108122,3.069023,3.446326,3.337807,3.308069,3.180385,3.472109,...,3.439084,10.292747,3.286798,3.359011,3.134921,9.606278,8.655444,3.419224,6.224477,3.474853
1,3.624959,3.683105,3.249003,2.974811,2.767113,2.911851,3.262293,2.916112,2.833526,3.541518,...,3.268099,9.880033,3.175353,3.262555,2.837685,8.690348,7.177881,3.194742,11.332193,4.393775
2,3.422609,3.65022,3.46769,3.109336,3.127845,3.440665,3.348068,3.300216,3.173102,3.47289,...,3.439532,8.824005,3.262668,3.38407,3.154594,9.314709,9.44899,3.405249,9.788834,3.282551
3,3.444202,3.650176,3.493488,3.107932,3.127844,3.447169,3.348173,3.336874,3.213852,3.47644,...,3.439732,9.026309,3.313969,3.541921,3.173018,8.084809,9.028642,3.403133,9.794439,3.44493
4,3.439717,3.649347,3.494068,3.109121,3.112289,3.446726,3.347787,3.339719,3.251569,3.476081,...,3.439146,10.912199,3.313402,3.310447,3.165812,8.275915,9.124577,3.419337,4.190633,3.871


In [23]:
# Randomized hyper-parameter search on the XGBoost-selected 50-gene feature
# set (14 parallel workers); the printed best parameters are copied by hand
# into outcome_xg50_params.
rand_search_xg50 = param_tuning(X_xgb50_outcome, y_outcome, jobs=14)


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 33.64 seconds.
Best Score: 77.627%
{'subsample': 0.6, 'n_estimators': 300, 'min_child_weight': 4, 'max_depth': 4, 'learning_rate': 0.02, 'gamma': 5, 'colsample_bytree': 0.6}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    3.4s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   32.5s finished


In [13]:
# Tuned XGBoost settings for the XGBoost-selected 50-gene feature set (copied
# from the randomized-search output).
outcome_xg50_params = dict(
    subsample=0.6,
    n_estimators=300,
    min_child_weight=4,
    max_depth=4,
    learning_rate=0.02,
    gamma=5,
    colsample_bytree=0.6,
)


In [15]:
# 5-fold CV metrics for the XGBoost-selected 50-gene feature set; one row per fold.
outcome_xg50_scores, clf_xg50 = run_cross_val(X_xgb50_outcome, y_outcome, outcome_xg50_params)
outcome_xg50_scores_df = pd.DataFrame(
    data=outcome_xg50_scores,
    columns=[
        "balanced_accuracy", "recall_0", "precision_0",
        "recall_1", "precision_1", "auc",
    ],
)
outcome_xg50_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.686042,0.505244,0.699509,0.866839,0.741859,0.765369
std,0.023444,0.061686,0.022456,0.022265,0.021116,0.01819
min,0.657839,0.440678,0.676768,0.834197,0.717949,0.750662
25%,0.663047,0.440678,0.684211,0.854167,0.720339,0.750794
50%,0.702507,0.521368,0.698925,0.875,0.752212,0.758146
75%,0.703392,0.550847,0.702703,0.885417,0.75576,0.774232
max,0.703423,0.57265,0.73494,0.885417,0.763033,0.793012


In [16]:
# Mean CV metrics: PAM35 genes vs. XGBoost-selected 50 genes.
# NOTE(review): the saved output of this cell does not match the describe()
# tables of the two score frames (e.g. 62.923% here vs. a 0.608707 mean
# balanced accuracy for PAM35), so it appears to come from an earlier run --
# re-run this cell after the CV cells.
print_score_comparison(outcome_pam_scores_df, outcome_xg50_scores_df, target_feature="posOutcome",
                       header_1="PAM 35 Genes", header_2="Xgboost 50 genes")


		posOutcome
			PAM 35 Genes		Xgboost 50 genes
		-----------------------------------------------
balanced_accuracy:	62.923%			68.975%

precision_0:		63.061%			69.427%

recall_0:		39.952%			51.628%

precision_1:		70.413%			74.813%

recall_1:		85.893%			86.321%

auc:			71.466%			77.627%



In [16]:
# 50 genes selected with MOSES.
moses50_genes = [
    "PRND", "FRS3", "FCN3", "DSCR4", "BRCA2", "CXCL6", "LMX1B", "DLX5",
    "OMP", "ADH6", "PGAP1", "ART3", "BCHE", "FGB", "IL1RAPL1", "FSTL4",
    "ASGR1", "ZNF135", "DLL3", "NPHS2", "ANGPT2", "GLP2R", "GRIA3",
    "HOXB8", "MSC", "PLA2R1", "CYP2F1", "TAS2R7", "NKX6-1", "WNT11",
    "CHST11", "CLCA4", "ENPEP", "PAH", "WFDC1", "CHGA", "SEZ6L",
    "UGT2A3", "PRDM16", "GALR2", "GUCA1A", "CASQ1", "NOS1AP", "CACNA2D3",
    "FHOD3", "SRGAP3", "TMOD2", "ATOH1", "SLC6A1", "HAS1",
]

X_moses50_outcome = ge_outcome_df[moses50_genes]
X_moses50_outcome.head()

Unnamed: 0,PRND,FRS3,FCN3,DSCR4,BRCA2,CXCL6,LMX1B,DLX5,OMP,ADH6,...,GUCA1A,CASQ1,NOS1AP,CACNA2D3,FHOD3,SRGAP3,TMOD2,ATOH1,SLC6A1,HAS1
0,4.064458,3.665316,3.317154,3.108122,3.188253,3.103419,3.256618,3.338217,3.233919,2.752916,...,3.287726,3.308403,3.801125,3.238214,3.473143,3.943095,3.408503,3.071108,3.177592,3.197234
1,4.411729,3.565053,3.8761,2.974811,2.364823,2.653616,3.981706,3.196761,3.193351,3.227007,...,3.377446,3.233431,4.26138,3.208233,2.790065,4.773244,3.379353,3.21321,3.827159,2.938968
2,3.838553,3.820966,3.334539,3.109336,3.183788,3.076428,3.304853,3.218209,3.234288,2.793834,...,3.263957,3.313104,4.042518,3.292422,3.440582,4.01083,3.383331,3.1109,3.840973,3.239986
3,3.667958,3.678612,3.364912,3.107932,3.205506,3.110146,3.298162,3.233099,3.234435,2.859151,...,3.303262,3.293444,3.990541,3.204114,3.281651,3.911317,3.396894,3.104531,3.172796,3.165924
4,4.304432,3.642962,3.322351,3.109121,3.063962,3.183318,3.284932,3.369782,3.233986,2.807474,...,3.307657,3.285745,3.927508,3.260717,4.174131,3.932029,3.418361,3.08699,3.25848,3.192801


In [39]:
# Randomized hyper-parameter search on the MOSES-selected 50-gene feature set
# (14 parallel workers); the printed best parameters are copied by hand into
# outcome_moses50_params.
rand_search_moses50 = param_tuning(X_moses50_outcome, y_outcome, jobs=14)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 31.46 seconds.
Best Score: 76.013%
{'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 2, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 5, 'colsample_bytree': 0.8}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    3.2s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   29.6s finished


In [17]:
# Tuned XGBoost settings for the MOSES-selected 50-gene feature set (copied
# from the randomized-search output).
outcome_moses50_params = dict(
    subsample=1.0,
    n_estimators=500,
    min_child_weight=2,
    max_depth=3,
    learning_rate=0.05,
    gamma=5,
    colsample_bytree=0.8,
)


In [18]:
# 5-fold CV metrics for the MOSES-selected 50-gene feature set; one row per fold.
outcome_moses50_scores, clf_moses50 = run_cross_val(X_moses50_outcome, y_outcome, outcome_moses50_params)
outcome_moses50_scores_df = pd.DataFrame(
    data=outcome_moses50_scores,
    columns=[
        "balanced_accuracy", "recall_0", "precision_0",
        "recall_1", "precision_1", "auc",
    ],
)
outcome_moses50_scores_df.describe()




Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.675363,0.493278,0.679022,0.857448,0.734792,0.738903
std,0.021009,0.046904,0.020962,0.01396,0.017963,0.0248
min,0.649055,0.449153,0.646341,0.84375,0.714912,0.711622
25%,0.669939,0.466102,0.67033,0.848958,0.727273,0.730734
50%,0.670551,0.470085,0.6875,0.849741,0.729258,0.731882
75%,0.68035,0.516949,0.694737,0.869792,0.739726,0.741459
max,0.706922,0.564103,0.696203,0.875,0.762791,0.778818


In [20]:
# Mean CV metrics: PAM35 genes vs. MOSES-selected 50 genes.
print_score_comparison(outcome_pam_scores_df, outcome_moses50_scores_df, target_feature="posOutcome",
                       header_1="Pam 35 genes", header_2="MOSES 50 genes")

		posOutcome
			Pam 35 genes		MOSES 50 genes
		-----------------------------------------------
balanced_accuracy:	60.871%			67.536%

precision_0:		59.568%			67.902%

recall_0:		36.725%			49.328%

precision_1:		68.775%			73.479%

recall_1:		85.017%			85.745%

auc:			68.485%			73.890%



In [21]:
# Mean CV metrics: MOSES-selected 50 genes vs. XGBoost-selected 50 genes.
print_score_comparison(outcome_moses50_scores_df, outcome_xg50_scores_df, target_feature="posOutcome",
                       header_1="MOSES 50 genes", header_2="Xgboost 50 genes")


		posOutcome
			MOSES 50 genes		Xgboost 50 genes
		-----------------------------------------------
balanced_accuracy:	67.536%			68.604%

precision_0:		67.902%			69.951%

recall_0:		49.328%			50.524%

precision_1:		73.479%			74.186%

recall_1:		85.745%			86.684%

auc:			73.890%			76.537%



In [31]:
# Refit one model per feature set on ALL rows, rebinding the names that
# previously held the classifiers returned by run_cross_val.
# NOTE(review): the misclassification cells below predict on these same rows,
# so they measure in-sample (training) errors rather than held-out errors.
# NOTE(review): only the last fit casts the labels to int64; the first two
# pass y_outcome with its native dtype.
clf_moses50 = XGBClassifier(**outcome_moses50_params, n_jobs=4)
clf_moses50.fit(X_moses50_outcome.to_numpy(), y_outcome.to_numpy())
clf_pam = XGBClassifier(**outcome_pam_params, n_jobs=4)
clf_pam.fit(X_pam35_outcome.to_numpy(), y_outcome.to_numpy())
clf_xg50 = XGBClassifier(**outcome_xg50_params, n_jobs=4)
clf_xg50.fit(X_xgb50_outcome.to_numpy(), y_outcome.to_numpy(dtype=np.int64))




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=4,
              min_child_weight=4, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.6,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
def find_misclassified_patients(df, clf, X, y):
    """Return the ``patient_ID`` values of rows that ``clf`` predicts wrongly.

    Parameters
    ----------
    df : DataFrame with a ``patient_ID`` column, row-aligned with ``X``/``y``
        (row ``i`` of ``df`` must describe the same patient as row ``i`` of
        ``X`` and ``y``).
    clf : fitted estimator exposing ``predict``.
    X, y : features and true labels (objects exposing ``to_numpy``).

    Returns
    -------
    np.ndarray of int64 patient ids, in row order.

    Raises
    ------
    ValueError
        If ``df``, ``X`` and ``y`` do not have the same number of rows. The
        lookup is positional, so a length mismatch would otherwise attribute
        errors to the wrong patients without any visible failure.
    """
    if not (len(df) == len(X) == len(y)):
        raise ValueError(
            "df, X and y must be row-aligned; got lengths {0}, {1}, {2}".format(
                len(df), len(X), len(y)))

    y_true = y.to_numpy()
    y_pred = clf.predict(X.to_numpy())
    # Positions (not index labels) of the wrongly predicted rows.
    miss = np.flatnonzero(y_true != y_pred)
    return df.iloc[miss]["patient_ID"].to_numpy(dtype=np.int64)

def calc_overlap(a, b):
    """Return the shared elements of ``a`` and ``b`` and their overlap ratio.

    Returns ``(intr, ratio)`` where ``intr`` is the sorted unique intersection
    and ``ratio`` is ``len(intersection) / len(union)``. When both inputs are
    empty the union is empty too; the ratio is then reported as ``0.0``
    instead of raising ``ZeroDivisionError``.
    """
    intr = np.intersect1d(a, b)
    union = np.union1d(a, b)
    if len(union) == 0:
        return intr, 0.0
    return intr, (len(intr) / len(union))

def print_overlap(model1, model2, intr, perc):
    """Print how many patients both models misclassify and the overlap ratio.

    ``intr`` is the collection of shared patients (only its length is
    printed); ``perc`` is a fraction rendered as a percentage with one decimal.
    """
    template = "{0} patients misclassified by {1} and {2} - {3:.1%} overlap\n"
    shared_count = len(intr)
    print(template.format(shared_count, model1, model2, perc))

In [43]:
# Patient ids each model gets wrong when predicting on the full dataset.
# y_pam35_outcome is the same series as y_outcome.
# NOTE(review): if the classifiers were fitted on these same rows, these are
# in-sample errors -- confirm that is the intended analysis.
xg50_miss = find_misclassified_patients(ge_outcome_df, clf_xg50, X_xgb50_outcome, y_outcome)
moses50_miss = find_misclassified_patients(ge_outcome_df, clf_moses50, X_moses50_outcome, y_outcome)
pam_miss = find_misclassified_patients(ge_outcome_df, clf_pam, X_pam35_outcome, y_pam35_outcome)

In [44]:
# Patients misclassified by both the XGBoost-50 and MOSES-50 models.
# `perc` is rebound by every overlap cell, so it is only valid within a cell.
xg_moses_intr, perc = calc_overlap(xg50_miss, moses50_miss)
print_overlap("Xg50", "Moses50", xg_moses_intr, perc)

203 patients misclassified by Xg50 and Moses50 - 60.4% overlap



In [45]:
# Patients misclassified by both the XGBoost-50 and PAM35 models.
xg_pam_intr, perc = calc_overlap(xg50_miss, pam_miss)
print_overlap("Xg50", "PAM35", xg_pam_intr, perc)

18 patients misclassified by Xg50 and PAM35 - 6.8% overlap



In [46]:
# Patients misclassified by both the MOSES-50 and PAM35 models.
moses_pam_intr, perc = calc_overlap(moses50_miss, pam_miss)
print_overlap("Moses50", "PAM35", moses_pam_intr, perc)

19 patients misclassified by Moses50 and PAM35 - 6.8% overlap



In [36]:
# XGBoost settings used for the model trained on all gene features.
outcome_raw_params = dict(
    subsample=0.6,
    n_estimators=700,
    min_child_weight=5,
    max_depth=5,
    learning_rate=0.03,
    gamma=0.5,
    colsample_bytree=0.8,
)

In [37]:
# 5-fold CV metrics for the model trained on all gene features; one row per fold.
outcome_scores, clf_outcome = run_cross_val(X_outcome, y_outcome, outcome_raw_params)
outcome_scores_df = pd.DataFrame(
    data=outcome_scores,
    columns=[
        "balanced_accuracy", "recall_0", "precision_0",
        "recall_1", "precision_1", "auc",
    ],
)
outcome_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.707667,0.539157,0.726433,0.876177,0.757377,0.786875
std,0.029621,0.067676,0.015388,0.012353,0.026166,0.029915
min,0.680355,0.470085,0.707317,0.859375,0.733906,0.766375
25%,0.683263,0.491525,0.71875,0.870466,0.736842,0.766728
50%,0.701183,0.516949,0.723684,0.875,0.748899,0.777911
75%,0.72206,0.584746,0.73494,0.885417,0.771028,0.784825
max,0.751472,0.632479,0.747475,0.890625,0.796209,0.838537


In [47]:
# Refit the all-genes model on every row, then collect the patients it gets
# wrong on those same rows.
# NOTE(review): this is an in-sample evaluation; a model that fits its
# training data perfectly yields an empty raw_miss, which would make every
# overlap with "Raw" below 0 -- consider out-of-fold predictions instead.
clf_outcome = XGBClassifier(**outcome_raw_params, n_jobs=4)
clf_outcome.fit(X_outcome.to_numpy(), y_outcome.to_numpy(dtype=np.int64))
raw_miss = find_misclassified_patients(ge_outcome_df, clf_outcome, X_outcome, y_outcome)



In [48]:
# Patients misclassified by both the XGBoost-50 and all-genes ("Raw") models.
xg50_raw_intr, perc = calc_overlap(xg50_miss, raw_miss)
print_overlap("Xgb50", "Raw", xg50_raw_intr, perc)

0 patients misclassified by Xgb50 and Raw - 0.0% overlap



In [49]:
# Patients misclassified by both the MOSES-50 and all-genes ("Raw") models.
moses50_raw_intr, perc = calc_overlap(moses50_miss, raw_miss)
print_overlap("Moses50", "Raw", moses50_raw_intr, perc)


0 patients misclassified by Moses50 and Raw - 0.0% overlap



In [50]:
# Patients misclassified by both the all-genes ("Raw") and PAM35 models.
raw_pam_intr, perc = calc_overlap(raw_miss, pam_miss)
print_overlap("Raw", "Pam35", raw_pam_intr, perc)

0 patients misclassified by Raw and Pam35 - 0.0% overlap



In [52]:
# Patients misclassified by all three models: intersect (MOSES & PAM) with
# (XGBoost & PAM). The printed percentage is the intersection-over-union of
# those two pairwise sets, not of the three original error sets.
overlap_xg_moses_pam, perc = calc_overlap(moses_pam_intr, xg_pam_intr)
print_overlap("Pam", "Xgboost and MOSES", overlap_xg_moses_pam, perc)

17 patients misclassified by Pam and Xgboost and MOSES - 85.0% overlap



In [53]:
def write_misclassified(file_name, ls):
    """Write each item of ``ls`` on its own line to ``datasets/<file_name>.txt``.

    The file is opened in write mode, so an existing file is overwritten.
    """
    target_path = "datasets/" + file_name + ".txt"
    with open(target_path, "w") as f:
        f.writelines(str(p) + "\n" for p in ls)

In [54]:
# Persist each model's misclassified patient ids, one id per line, as
# datasets/<name>.txt.
write_misclassified("xg50_misclassified", xg50_miss)
write_misclassified("moses50_misclassified", moses50_miss)
write_misclassified("pam35_misclassified", pam_miss)
write_misclassified("raw_genes_misclassified", raw_miss)



In [55]:
from sklearn.metrics import roc_auc_score
def calc_scores(clf, X_test, y_test):
    """Score an already-fitted classifier on a test set.

    Returns an np.ndarray of shape (1, 6) ordered as
    ``[balanced_accuracy, recall_0, precision_0, recall_1, precision_1, auc]``.
    AUC is computed from column 1 of ``clf.predict_proba``.
    """
    predicted = clf.predict(X_test)

    per_class = []
    for label in (0, 1):
        per_class.append(recall_score(y_test, predicted, pos_label=label))
        per_class.append(precision_score(y_test, predicted, pos_label=label))

    balanced_acc = balanced_accuracy_score(y_test, predicted)
    positive_prob = clf.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, positive_prob)

    return np.array([[balanced_acc, *per_class, auc_score]])


In [57]:
# InfoGAN embedding (48 code columns): attach the outcome label to each
# patient's embedding via patient_ID.
gan_df = pd.read_csv("datasets/codes_48.csv")
pos_outcome_df = ge_outcome_df[["patient_ID", "posOutcome"]]
gan_outcome_df = pos_outcome_df.merge(gan_df, on="patient_ID")

X_gan_outcome = gan_outcome_df[gan_outcome_df.columns.difference(["patient_ID", "posOutcome"])]
y_gan_outcome = gan_outcome_df["posOutcome"]

gan_outcome_df.head()

Unnamed: 0,patient_ID,posOutcome,0,1,2,3,4,5,6,7,...,38,39,40,41,42,43,44,45,46,47
0,305219,1,0.047016,0.043061,-0.022339,0.051153,0.02759,-0.065559,0.085302,0.032765,...,-0.108024,-0.033934,0.185561,-0.072791,-0.066151,-0.085863,-0.086953,0.056512,-0.124921,-0.132547
1,508677,0,0.047391,0.023785,-0.031051,-0.081737,-0.029602,-0.117081,0.106415,0.06455,...,-0.07234,0.011308,-0.024829,0.062021,0.06354,0.031951,-0.029503,-0.175566,0.053623,-0.029845
2,615188,1,-0.034273,0.005418,0.014059,0.017087,0.01725,-0.066139,0.015178,-0.012907,...,-0.012421,0.013563,0.013321,0.021066,-0.040252,-0.05044,-0.088349,-0.043951,0.082986,-0.017349
3,37010,1,-0.044358,0.053781,0.020593,0.019112,-0.008456,-0.138595,0.014994,-0.054167,...,-0.076462,0.028969,-0.062609,0.038502,-0.051895,0.045812,0.034085,-0.104103,0.092417,0.085315
4,441701,1,0.019478,0.030197,-0.017105,0.031387,0.003052,-0.047903,0.056778,0.015543,...,-0.075613,-0.01122,0.032653,0.00133,-0.023895,-0.029395,-0.005134,-0.019635,-0.020527,-0.066561


In [58]:
# XGBoost settings for the InfoGAN-embedding model.
outcome_gan_params = dict(
    subsample=0.6,
    n_estimators=300,
    min_child_weight=4,
    max_depth=4,
    learning_rate=0.02,
    gamma=5,
    colsample_bytree=0.6,
)

In [59]:
# 5-fold CV metrics for the InfoGAN-embedding model; the per-fold table is
# also written to disk.
outcome_gan_scores, clf_outcome_gan = run_cross_val(X_gan_outcome, y_gan_outcome, outcome_gan_params)
outcome_gan_scores_df = pd.DataFrame(
    data=outcome_gan_scores,
    columns=[
        "balanced_accuracy", "recall_0", "precision_0",
        "recall_1", "precision_1", "auc",
    ],
)
outcome_gan_scores_df.to_csv("datasets/results/outcome_scores_gan_48.csv")
outcome_gan_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.559182,0.261973,0.526107,0.85639,0.654932,0.630236
std,0.019509,0.044378,0.037248,0.019772,0.013038,0.016824
min,0.537562,0.194915,0.483871,0.833333,0.640152,0.602092
25%,0.543785,0.254237,0.5,0.838542,0.645161,0.62836
50%,0.560497,0.25641,0.530303,0.864583,0.656126,0.635417
75%,0.567576,0.29661,0.535714,0.865285,0.659836,0.641861
max,0.586489,0.307692,0.580645,0.880208,0.673387,0.64345


In [60]:
# X_gan_outcome / y_gan_outcome are rows of gan_outcome_df (the merged frame),
# so the positions of misclassified rows must be resolved against that same
# frame. Looking them up in ge_outcome_df returns the wrong patient ids
# whenever the merge drops, duplicates or reorders patients.
# NOTE(review): clf_outcome_gan is the estimator returned by run_cross_val,
# i.e. fitted on the last fold's training split only, unlike the other models
# that are refitted on all rows before this analysis.
gan_miss = find_misclassified_patients(gan_outcome_df, clf_outcome_gan, X_gan_outcome, y_gan_outcome)
raw_gan_intr, perc = calc_overlap(raw_miss, gan_miss)
print_overlap("Raw", "Infogan", raw_gan_intr, perc)

0 patients misclassified by Raw and Infogan - 0.0% overlap



In [61]:
# Patients misclassified by both the MOSES-50 and InfoGAN-embedding models.
moses_gan_intr, perc = calc_overlap(moses50_miss, gan_miss)
print_overlap("Moses", "Infogan", moses_gan_intr, perc)

122 patients misclassified by Moses and Infogan - 25.3% overlap



In [62]:
# Patients misclassified by both the XGBoost-50 and InfoGAN-embedding models.
xgb_gan_intr, perc = calc_overlap(xg50_miss, gan_miss)
print_overlap("Xgboost", "Infogan", xgb_gan_intr, perc)

122 patients misclassified by Xgboost and Infogan - 26.2% overlap



In [63]:
# Patients misclassified by both the PAM35 and InfoGAN-embedding models.
pam_gan_intr, perc = calc_overlap(pam_miss, gan_miss)
print_overlap("Pam35", "Infogan", pam_gan_intr, perc)

10 patients misclassified by Pam35 and Infogan - 3.0% overlap



In [64]:
# Persist the InfoGAN model's misclassified patient ids to
# datasets/infogan_misclassified.txt (one id per line).
write_misclassified("infogan_misclassified", gan_miss)

In [65]:
# Save the fitted models as JSON under datasets/models/.
# NOTE(review): clf_outcome_gan is the estimator returned by run_cross_val and
# therefore holds the fit from the last CV fold only; the other four are
# refitted on the full dataset in earlier cells. Refit it on all rows before
# saving if the saved models are meant to be comparable.
clf_moses50.save_model("datasets/models/moses50_raw.json")
clf_xg50.save_model("datasets/models/xgb50_raw.json")
clf_pam.save_model("datasets/models/pam35_raw.json")
clf_outcome.save_model("datasets/models/raw_model.json")
clf_outcome_gan.save_model("datasets/models/infogan_model.json")