In [2]:
# Work from the project's notebooks directory (presumably where the local
# `utils` module imported by the next cell lives — confirm).
%cd "~/pln-brca-xp/notebooks"
# Load IPython's autoreload extension and set mode 2, so imported modules are
# reloaded before each cell executes and edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

/home/xabush/pln-brca-xp/notebooks


In [3]:
from utils import *
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import median_abs_deviation
from pymrmre import mrmr

In [4]:
def preprocess_data(df, scaler=None, top_fraction=0.2):
    """
    Keep the most variable genes of an expression matrix and rescale them.

    1. Computes the median absolute deviation (MAD) of every column.
    2. Keeps the ``top_fraction`` of columns with the largest MAD
       (20% by default), ordered from highest to lowest MAD.
    3. Rescales the kept columns with ``scaler`` (``MinMaxScaler`` by
       default, which maps each column to [0, 1]).

    :param df: pandas DataFrame with one column per gene; MAD is computed
        down each column (axis 0)
    :param scaler: transformer exposing ``fit_transform``; a fresh
        ``MinMaxScaler`` is created when ``None``
    :param top_fraction: fraction of columns to keep, in (0, 1]; the number
        of kept columns is ``int(n_columns * top_fraction)``
    :return: DataFrame holding the selected, rescaled columns, with the same
        index as ``df``
    :raises ValueError: if ``top_fraction`` is outside (0, 1]
    """
    if not 0 < top_fraction <= 1:
        raise ValueError(f"top_fraction must be in (0, 1], got {top_fraction!r}")

    mad_values = median_abs_deviation(df, axis=0)
    # mergesort is a stable sort, so the ranking (and therefore the selected
    # gene set) is deterministic when several genes share the same MAD.
    mad_by_gene = pd.Series(mad_values, index=df.columns).sort_values(
        ascending=False, kind="mergesort"
    )
    num_cols = int(len(mad_by_gene) * top_fraction)
    high_var_genes = mad_by_gene.index[:num_cols].to_list()
    df_selected = df.loc[:, high_var_genes]

    norm = MinMaxScaler() if scaler is None else scaler
    # NOTE: the scaler is (re-)fitted on whatever data is passed in. Calling
    # this on a full dataset before a train/test split lets test rows
    # influence the scaling statistics.
    scaled = norm.fit_transform(df_selected)
    return pd.DataFrame(scaled, columns=df_selected.columns, index=df_selected.index)

In [5]:
# Input files: the gene-expression matrix and the per-patient state/outcome
# table. Both are indexed by patient_ID when loaded.
ge_df_path = "/var/www/datasets/merged-combat15.csv.xz"
state_df_path = "/var/www/datasets/embedding_vector_state_and_outcome.csv"

# Series identifiers of the studies to keep (the tamoxifen cohort).
tamoxifen_studies = ["GSE12093",  "GSE1379", "GSE17705", "GSE6577",  "GSE9893"]

ge_df = pd.read_csv(ge_df_path, index_col="patient_ID")
state_df = pd.read_csv(state_df_path, index_col="patient_ID")

# Restrict the state table to the selected studies, pull the matching
# expression rows, and attach the outcome label column.
tax_trt_df = state_df.loc[state_df["series_id"].isin(tamoxifen_studies)]
ge_tamx_df = ge_df.loc[tax_trt_df.index, :].join(state_df["posOutcome"])

# Split into the feature matrix and the target vector.
X_tam_df = ge_tamx_df.drop(columns=["posOutcome"])
y_tamx_df = ge_tamx_df["posOutcome"]

# MAD-based gene filtering followed by scaling (see preprocess_data).
X_scaled = preprocess_data(X_tam_df)
print(X_scaled.shape)
X_scaled

(649, 1766)


Unnamed: 0_level_0,TFAP2B,SCGB1D2,DHRS2,SCGB2A2,CPB1,LTF,S100P,PIP,SCGB2A1,CYP2B6,...,DLG5,SERPINB1,SNAPC1,JOSD1,ALMS1,FUS,STAU2,GLG1,ZNF268,CTSO
patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
249296,0.312098,0.324304,0.385159,0.551446,0.107906,0.385246,0.373817,0.502808,0.272663,0.768693,...,0.582445,0.768956,0.144002,0.659206,0.502714,0.802432,0.059775,0.797122,0.130086,0.176577
249524,0.136203,0.208200,0.463417,0.525977,1.000000,0.207136,0.591409,0.439236,0.289637,1.000000,...,0.595341,0.711269,0.146812,0.791981,0.524508,0.974547,0.434257,0.816539,0.171427,0.124036
249527,0.126140,0.898438,0.141661,0.928443,0.476218,0.695349,0.740925,0.408495,0.893788,0.894466,...,0.737108,0.862702,0.167977,0.834964,0.426567,0.925897,0.209239,0.828620,0.140970,0.199764
249529,0.256343,0.763848,0.184840,0.799118,0.798501,0.425631,0.528707,0.434600,0.923083,0.842613,...,0.626282,0.769154,0.140055,0.752708,0.393765,0.867743,0.306340,0.714705,0.112959,0.174203
249530,0.471651,0.588570,0.474179,0.625999,0.000000,0.624659,0.439110,0.381255,0.314408,0.634108,...,0.655757,0.879676,0.117141,0.752675,0.521737,0.941098,0.360808,0.852167,0.162523,0.220044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305260,0.057840,0.221925,0.229260,0.124437,0.323815,0.186135,0.082081,0.480966,0.147167,0.760505,...,0.563893,0.658029,0.193752,0.529967,0.427364,0.639396,0.374160,0.657968,0.492918,0.832546
305261,0.196474,0.614002,0.238328,0.580901,0.314771,0.445792,0.404507,0.570273,0.546146,0.589412,...,0.383422,0.095244,0.479955,0.345655,0.462728,0.492059,0.486635,0.463083,0.605367,0.531432
305262,0.414299,0.682621,0.193703,0.636208,0.318074,0.296531,0.442509,0.572413,0.624450,0.519011,...,0.387024,0.595301,0.443945,0.634357,0.475175,0.369180,0.589576,0.000000,0.822313,0.296249
305263,0.468658,0.283043,0.201599,0.345914,0.318074,0.189903,0.183957,0.655451,0.199281,0.368795,...,0.367569,0.387289,0.541773,0.595527,0.364533,0.336226,0.711637,0.022291,0.851226,0.359374


In [6]:
# Hold out 30% of the patients, stratified on the outcome label so both
# partitions keep the same class balance. `seed` is not defined in this
# notebook; presumably it comes from `from utils import *` — confirm.
X_mad_train, X_mad_test, y_tamx_train, y_tamx_test = train_test_split(
    X_scaled,
    y_tamx_df,
    test_size=0.3,
    stratify=y_tamx_df,
    random_state=seed,
)

In [7]:
# Model search on the MAD-filtered features, selecting candidates by balanced
# accuracy. The data is passed already split; `split=False` presumably tells
# evaluate_ge not to split it again — confirm in utils.
params_acc_all, clf_acc_all, cv_scores_acc_all, test_scores_acc_all = evaluate_ge(
    (X_mad_train, X_mad_test, y_tamx_train, y_tamx_test),
    rand_scoring="balanced_accuracy",
    split=False,
)
print(params_acc_all)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 52.99 seconds.
Best Score: 65.342%
{'subsample': 0.6, 'scale_pos_weight': 0.1, 'n_estimators': 120, 'min_child_weight': 1, 'max_depth': 5, 'max_delta_step': 3, 'learning_rate': 0.07, 'gamma': 1, 'colsample_bytree': 0.8}
CV Score: 
balanced_accuracy      0.653420
recall_0               0.675000
precision_0            0.398865
recall_1               0.631841
precision_1            0.846086
auc                    0.741060
specificity            0.675000
average_precision_0    0.230410
dtype: float64

Test Score:
balanced_accuracy      0.704861
recall_0               0.666667
precision_0            0.478873
recall_1 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   47.5s finished


In [8]:
# Same search as for balanced accuracy, but candidates are selected with the
# `average_precision_0` scorer. That name is not defined in this notebook;
# presumably it is provided by `from utils import *` — confirm.
params_ap_all, clf_ap_all, cv_scores_ap_all, test_scores_ap_all = evaluate_ge(
    (X_mad_train, X_mad_test, y_tamx_train, y_tamx_test),
    rand_scoring=average_precision_0,
    split=False,
)
print(params_ap_all)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 50.47 seconds.
Best Score: 25.178%
{'subsample': 0.6, 'scale_pos_weight': 0.1, 'n_estimators': 80, 'min_child_weight': 3, 'max_depth': 5, 'max_delta_step': 3, 'learning_rate': 0.07, 'gamma': 2, 'colsample_bytree': 1.0}
CV Score: 
balanced_accuracy      0.602448
recall_0               0.908333
precision_0            0.316917
recall_1               0.296563
precision_1            0.905667
auc                    0.695969
specificity            0.908333
average_precision_0    0.251778
dtype: float64

Test Score:
balanced_accuracy      0.616013
recall_0               0.843137
precision_0            0.328244
recall_1  

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   47.6s finished


In [9]:
# Same search again, this time without `rand_scoring`, so evaluate_ge falls
# back to its default selection metric (the variable names and the recorded
# best score suggest ROC AUC — confirm in utils).
params_auc_all, clf_auc_all, cv_scores_auc_all, test_scores_auc_all = evaluate_ge(
    (X_mad_train, X_mad_test, y_tamx_train, y_tamx_test),
    split=False,
)
print(params_auc_all)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 54.35 seconds.
Best Score: 74.515%
{'subsample': 0.6, 'scale_pos_weight': 0.5, 'n_estimators': 120, 'min_child_weight': 1, 'max_depth': 6, 'max_delta_step': 3, 'learning_rate': 0.05, 'gamma': 0.5, 'colsample_bytree': 0.8}
CV Score: 
balanced_accuracy      0.604953
recall_0               0.266667
precision_0            0.639814
recall_1               0.943238
precision_1            0.782287
auc                    0.745153
specificity            0.266667
average_precision_0    0.231714
dtype: float64

Test Score:
balanced_accuracy      0.544935
recall_0               0.117647
precision_0            0.600000
recall_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   47.2s finished


In [None]:
# Ask pymrmre for a single mRMR solution of 100 features, computed on the
# training split only. The target Series is wrapped into a one-column
# DataFrame for the `targets` argument.
solutions = mrmr.mrmr_ensemble(
    features=X_mad_train,
    targets=y_tamx_train.to_frame(),
    solution_length=100,
    solution_count=1,
)
# Take the first solution for the first (only) target.
# NOTE(review): this cell has not been executed. Confirm the return type of
# mrmr_ensemble: if it is a pandas Series labelled by target name, the
# positional lookup `solutions[0]` should be `solutions.iloc[0]`.
feats_100 = solutions[0][0]