In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
from scipy.stats import norm
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold
from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

In [4]:

# Read the train and test tables from disk.
ge_outcome_df = pd.read_csv("datasets/train.csv")
ge_outcome_test_df = pd.read_csv("datasets/test.csv")

# Feature columns are every training column except the patient id and the
# label. The same column index (taken from the training frame) is applied to
# both frames so the train and test matrices share identical columns.
feature_cols = ge_outcome_df.columns.difference(["patient_ID", "posOutcome"])

X_outcome = ge_outcome_df[feature_cols]
y_outcome = ge_outcome_df["posOutcome"]

X_test = ge_outcome_test_df[feature_cols]
y_test = ge_outcome_test_df["posOutcome"]

print("Train shape: {0}\nTest shape:{1}".format(X_outcome.shape, X_test.shape))

Train shape: (1549, 8833)
Test shape:(664, 8833)


In [5]:
# Load the models
def _load_xgb_model(model_path):
    """Return a fresh XGBClassifier populated from the model file at `model_path`."""
    model = XGBClassifier()
    model.load_model(model_path)
    return model


# One classifier per saved model file; names follow the file they come from.
clf_xg50 = _load_xgb_model("datasets/models/xgb50_raw.json")
clf_moses50 = _load_xgb_model("datasets/models/moses50_raw.json")
clf_raw = _load_xgb_model("datasets/models/raw_model.json")
clf_pam = _load_xgb_model("datasets/models/pam35_raw.json")
clf_gan = _load_xgb_model("datasets/models/infogan_model.json")

In [6]:
def calc_scores(clf, X_test, y_test):
    """Evaluate a fitted binary classifier on a held-out set.

    Parameters
    ----------
    clf : estimator exposing ``predict`` and ``predict_proba``.
    X_test : feature matrix passed to the estimator.
    y_test : true labels; classes 0 and 1 are scored individually.

    Returns
    -------
    numpy.ndarray of shape (1, 6) laid out as
    ``[balanced accuracy, recall(0), precision(0), recall(1), precision(1), ROC AUC]``.
    """
    predicted = clf.predict(X_test)

    # Recall and precision for each class, treating that class as the positive label.
    per_class = {}
    for label in (0, 1):
        per_class[label] = (
            recall_score(y_test, predicted, pos_label=label),
            precision_score(y_test, predicted, pos_label=label),
        )

    balanced_acc = balanced_accuracy_score(y_test, predicted)

    # AUC is computed from the probability column at index 1.
    positive_proba = clf.predict_proba(X_test)[:, 1]
    auc_value = roc_auc_score(y_test, positive_proba)

    row = [balanced_acc, *per_class[0], *per_class[1], auc_value]
    return np.array([row])

In [7]:
# Gene-name panels used to pick feature columns for the per-panel models.
# Each list is laid out five names per row; order is preserved.

# 50 gene names.
moses50_genes = [
    "PRND", "FRS3", "FCN3", "DSCR4", "BRCA2",
    "CXCL6", "LMX1B", "DLX5", "OMP", "ADH6",
    "PGAP1", "ART3", "BCHE", "FGB", "IL1RAPL1",
    "FSTL4", "ASGR1", "ZNF135", "DLL3", "NPHS2",
    "ANGPT2", "GLP2R", "GRIA3", "HOXB8", "MSC",
    "PLA2R1", "CYP2F1", "TAS2R7", "NKX6-1", "WNT11",
    "CHST11", "CLCA4", "ENPEP", "PAH", "WFDC1",
    "CHGA", "SEZ6L", "UGT2A3", "PRDM16", "GALR2",
    "GUCA1A", "CASQ1", "NOS1AP", "CACNA2D3", "FHOD3",
    "SRGAP3", "TMOD2", "ATOH1", "SLC6A1", "HAS1",
]

# 50 gene names.
xgb50_genes = [
    "CDX4", "GLRA1", "OR12D3", "DSCR4", "HOXB8",
    "C9", "MTNR1B", "MOS", "HSD17B3", "FGF20",
    "KCNH4", "ATP4B", "CPB2", "CRYBB1", "ANGPTL3",
    "MYH8", "GYS2", "SLC25A21", "TAS2R7", "F11",
    "GABRA6", "MYT1L", "DEFB126", "RPL18", "GABRQ",
    "ZFP37", "PIP5K1B", "MCM5", "PRKAA1", "WDR76",
    "CHRM4", "RPS6KC1", "EIF1AY", "WNT1", "SCN3B",
    "NLGN4Y", "MAGEB1", "NUDC", "HIGD1A", "OXCT2",
    "GALR2", "EEF1B2", "RXRG", "CALCA", "TEX13A",
    "CST3", "IGFBP4", "CRYGA", "ESR1", "ZNF750",
]

# 35 gene names.
pam35_genes = [
    "BAG1", "BIRC5", "BLVRA", "CCNB1", "CCNE1",
    "CDC20", "CDC6", "CDH3", "CENPF", "CEP55",
    "EGFR", "ERBB2", "ESR1", "EXO1", "FOXA1",
    "FOXC1", "GRB7", "KIF2C", "KRT14", "KRT17",
    "KRT5", "MAPT", "MDM2", "MELK", "MIA",
    "MKI67", "MMP11", "MYBL2", "MYC", "PGR",
    "RRM2", "SFRP1", "SLC39A6", "TYMS", "UBE2C",
]

In [14]:
seed = 42
# Shuffled, seeded 5-fold stratified splitter handed to the stacking classifier.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Each panel model sits behind a ColumnSelector that keeps only the columns
# named in its gene list before the classifier runs.
moses50_pipe = make_pipeline(ColumnSelector(cols=moses50_genes), clf_moses50)
xgb50_pipe = make_pipeline(ColumnSelector(cols=xgb50_genes), clf_xg50)
pam35_pipe = make_pipeline(ColumnSelector(cols=pam35_genes), clf_pam)

# clf_raw is passed without a selector, so it receives every column.
base_classifiers = [moses50_pipe, xgb50_pipe, pam35_pipe, clf_raw]
meta_clf = LogisticRegression()

# NOTE(review): with use_clones=False mlxtend is documented to fit the
# estimator objects passed in rather than copies, which would overwrite the
# state of the loaded models — confirm this is intended.
sclf_1 = StackingCVClassifier(
    classifiers=base_classifiers,
    meta_classifier=meta_clf,
    cv=cv,
    use_clones=False,
    verbose=True,
    n_jobs=14,
    random_state=seed,
)

# Left as the final expression so the fitted estimator's repr is displayed.
sclf_1.fit(X_outcome, y_outcome)

Fitting 4 classifiers...
Fitting classifier1: pipeline (1/4)
Fitting classifier2: pipeline (2/4)
Fitting classifier3: pipeline (3/4)
Fitting classifier4: xgbclassifier (4/4)


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   2 out of   5 | elapsed:    1.7s remaining:    2.5s
[Parallel(n_jobs=14)]: Done   5 out of   5 | elapsed:    2.0s finished
[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   2 out of   5 | elapsed:    1.0s remaining:    1.5s
[Parallel(n_jobs=14)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   2 out of   5 | elapsed:    1.4s remaining:    2.1s
[Parallel(n_jobs=14)]: Done   5 out of   5 | elapsed:    1.9s finished
[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   2 out of   5 | elapsed:  3.2min remaining:  4.8min
[Parallel(n_jobs=14)]: Done   5 out of   5 | elapsed:  3.3min finished


StackingCVClassifier(classifiers=[Pipeline(steps=[('columnselector',
                                                   ColumnSelector(cols=['PRND',
                                                                        'FRS3',
                                                                        'FCN3',
                                                                        'DSCR4',
                                                                        'BRCA2',
                                                                        'CXCL6',
                                                                        'LMX1B',
                                                                        'DLX5',
                                                                        'OMP',
                                                                        'ADH6',
                                                                        'PGAP1',
                                               

In [20]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of the stacked classifier on the training data,
# scored with balanced accuracy.
scores = cross_val_score(
    sclf_1,
    X_outcome,
    y_outcome,
    scoring='balanced_accuracy',
    cv=3,
    n_jobs=10,
)

# Report the mean fold score and its standard deviation.
mean_score, score_spread = scores.mean(), scores.std()
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, score_spread))

Accuracy: 0.72 (+/- 0.01)


0.7217961418602318