In [1]:
%cd /home/xabush/code/snet/moses-incons-pen-xp/notebooks/variable_selection/cancer/nn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import jax
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from deap import tools, algorithms, creator, base, gp
import ea_utils
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import itertools
%load_ext autoreload

/home/xabush/code/snet/moses-incons-pen-xp/notebooks/variable_selection/cancer/nn


In [2]:
# Use cpu for PRNG generation for speed-up.
# NOTE: if you change the platform below, you have to restart the notebook
# for the new setting to take effect.
jax.config.update("jax_platform_name", "cpu")
# Display the active backend so the setting above can be checked in the cell output.
jax.default_backend()

'cpu'

In [3]:
# Root directory of the experiment data. Every path read later in the notebook
# is built from this value. The location can be overridden without editing the
# notebook by setting the MOSES_DATA_DIR environment variable; when it is not
# set, the original hard-coded location is used.
import os

data_dir = os.environ.get("MOSES_DATA_DIR", "/home/xabush/code/snet/moses-incons-pen-xp/data")

In [4]:
# Load the tamoxifen data set (binary columns keyed by Entrez gene id plus the
# "posOutcome" label, as shown in the preview below) and display the first rows.
tamox_csv_path = f"{data_dir}/tamoxBinaryEntrez.csv"
tamox_df = pd.read_csv(tamox_csv_path)
tamox_df.head()

Unnamed: 0,posOutcome,4111,4110,10661,131,4438,330,1109,2637,2642,...,7634,55769,7637,7644,741,54993,79364,7791,23140,26009
0,0,0,0,0,1,0,1,1,1,1,...,0,0,0,1,1,0,0,1,0,0
1,1,1,0,0,0,0,0,1,0,1,...,0,0,0,1,1,0,0,1,0,1
2,0,0,0,0,1,0,0,1,1,1,...,0,0,0,1,1,0,0,1,0,0
3,0,0,0,0,0,0,0,1,1,1,...,0,0,0,1,0,0,0,1,0,0
4,1,0,0,0,0,0,1,1,1,1,...,0,0,0,1,1,0,0,1,0,0


In [5]:
# Split the frame into the feature matrix (every column except the label) and
# the label vector.
# NOTE: Index.difference returns the remaining labels sorted by default, so the
# positional column indices applied later with .iloc refer to this sorted
# column order, not to the column order of the CSV file.
feature_columns = tamox_df.columns.difference(["posOutcome"])
X_df = tamox_df[feature_columns]
y_df = tamox_df["posOutcome"]

In [7]:
# Directory holding the per-seed artefacts (.npy index files, result CSVs) of
# this experiment.
exp_dir = f"{data_dir}/exp_data_4/cancer/fisher"

# Read the experiment seeds, one integer per line. Blank lines (for example a
# trailing empty line at the end of the file) are skipped, because int() raises
# ValueError on an empty string. int() itself tolerates surrounding whitespace,
# so the newline does not need to be stripped before conversion.
with open(f"{exp_dir}/deap/seeds_main.txt", "r") as fp:
    exp_seeds = [int(line) for line in fp if line.strip()]

In [23]:
%autoreload
from ea_train_cosmic import run_deap, run_logistc_regression
from nn_util import setup_logger
import datetime
import warnings
import time
from tqdm import tqdm
warnings.filterwarnings("ignore")

bnn_deap_dict = {"seed": [], "classifier": [], "num_feats": [], "top_5_feats": [] ,"cv_score": [], "test_score": []}

cxpb, mutpb = 0.5, 0.3
n_gen = 1000

num_feats = 70

for seed in tqdm(exp_seeds):
    # print(f"Running seed {seed}")
    logger = setup_logger(None, seed)
    start_time = time.time()

    rng_key = jax.random.PRNGKey(seed)
    cv = StratifiedKFold(random_state=seed, shuffle=True, n_splits=5)

    idx_sig = np.load(f"{exp_dir}/fisher_idx_sig_{seed}.npy")
    selected_idx = np.load(f"{exp_dir}/bnn_sel_idx_s_{seed}_n_{num_feats}.npy")

    # X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=seed, shuffle=True, stratify=y_df)
    X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=seed, shuffle=True,
                                                        stratify=y_df, test_size=0.3)

    X_train_sig, X_test_sig = X_train.iloc[:, idx_sig], X_test.iloc[:,idx_sig]
    X_train_sel, X_test_sel = X_train_sig.iloc[:,selected_idx].to_numpy(), X_test_sig.iloc[:,selected_idx].to_numpy()
    y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

    X_train_sel, X_val_sel, y_train, y_val = train_test_split(X_train_sel, y_train, random_state=seed, shuffle=True,
                                                              stratify=y_train, test_size=0.2)


    hof, val_score, test_score, train_preds, val_preds, test_preds = run_deap(rng_key, X_train_sel, X_val_sel, X_test_sel,
                                                                              y_train, y_val, y_test, cxpb, mutpb, n_gen, logger)

    X_train_sel_ea = np.concatenate([X_train_sel, train_preds], axis=1)
    X_val_sel_ea = np.concatenate([X_val_sel, val_preds], axis=1)
    X_test_sel_ea = np.concatenate([X_test_sel, test_preds], axis=1)

    clf_log, log_best_params, log_cv_score, log_test_score = run_logistc_regression(X_train_sel_ea, X_val_sel_ea, X_test_sel_ea,
                                                                                    y_train, y_val ,y_test, cv, logger)

    # Check the top 5 feats acc. Logistic Regression classifier
    top_5_feats = ",".join([str(idx) for idx in np.argsort(np.abs(clf_log.coef_[0]))[::-1][:5]])

    bnn_deap_dict["classifier"].append("DEAP")
    bnn_deap_dict["seed"].append(seed)
    bnn_deap_dict["num_feats"].append(num_feats)
    bnn_deap_dict["cv_score"].append(val_score)
    bnn_deap_dict["test_score"].append(test_score)
    bnn_deap_dict["top_5_feats"].append("-")

    bnn_deap_dict["classifier"].append("DEAP + LR")
    bnn_deap_dict["seed"].append(seed)
    bnn_deap_dict["num_feats"].append(num_feats + len(hof))
    bnn_deap_dict["cv_score"].append(log_cv_score)
    bnn_deap_dict["test_score"].append(log_test_score)
    bnn_deap_dict["top_5_feats"].append(top_5_feats)

    end_time = time.time()

    elapsed = datetime.timedelta(seconds=(end_time - start_time))
    # logger.info(f"Done for seed {seed}. Time elapsed - {elapsed}")

bnn_deap_df = pd.DataFrame(bnn_deap_dict)
bnn_deap_df

  0%|                                                                                            | 0/20 [00:00<?, ?it/s]

2022-10-25 22:01:28,531 [INFO], Num models: 3, EA Validation Score: 0.5707070707070707, Test Score: 0.509596796465065
Fitting 5 folds for each of 10 candidates, totalling 50 fits
2022-10-25 22:01:29,895 [INFO], LR best params {'C': 0.046415888336127774}
2022-10-25 22:01:29,910 [INFO], LR scores - cv score:  0.8845, test_score:  0.6722


  5%|████▏                                                                               | 1/20 [00:50<15:53, 50.19s/it]


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/xabush/miniconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_10067/2558079440.py", line 40, in <cell line: 17>
    hof, val_score, test_score, train_preds, val_preds, test_preds = run_deap(rng_key, X_train_sel, X_val_sel, X_test_sel,
  File "/home/xabush/code/snet/moses-incons-pen-xp/notebooks/variable_selection/cancer/nn/ea_train_cosmic.py", line 154, in run_deap
    _, logbook = ea_utils.eaSimple(rng_key ,pop, toolbox, cxpb, mutbp, num_gen, stats=mstats, halloffame=hof,
  File "/home/xabush/code/snet/moses-incons-pen-xp/notebooks/variable_selection/cancer/nn/ea_utils.py", line 168, in eaSimple
    offspring = varAnd(gen_key, offspring, toolbox, cxpb, mutpb)
  File "/home/xabush/code/snet/moses-incons-pen-xp/notebooks/variable_selection/cancer/nn/ea_utils.py", line 70, in varAnd
    if random.uniform(mut_key) < mutpb:


In [20]:
# Mean feature count and scores per classifier over all seeds.
summary_cols = ["num_feats", "cv_score", "test_score"]
bnn_deap_df.groupby(["classifier"])[summary_cols].mean()

Unnamed: 0_level_0,num_feats,cv_score,test_score
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DEAP,70.0,0.627825,0.582743
DEAP + LR,74.35,0.845297,0.692495


In [21]:
# Standard deviation of feature count and scores per classifier over all seeds.
summary_cols = ["num_feats", "cv_score", "test_score"]
bnn_deap_df.groupby(["classifier"])[summary_cols].std()

Unnamed: 0_level_0,num_feats,cv_score,test_score
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DEAP,0.0,0.072141,0.049923
DEAP + LR,1.136708,0.040808,0.034661


In [17]:
# Per-seed table read from an earlier result file; the "seed", "num_feats" and
# "kernel" columns are looked up for each seed further down.
feat_len_csv_path = f"{exp_dir}/moses/bnn_fisher_feat_len_svm.csv"
bnn_feat_len_svm_df = pd.read_csv(feat_len_csv_path)

In [18]:
%autoreload
from ea_train_cosmic import run_deap, run_logistc_regression
from nn_util import setup_logger
import datetime
import warnings

warnings.filterwarnings("ignore")

bnn_feat_len_deap_dict = {"seed": [], "classifier": [], "num_feats": [], "top_5_feats": [] ,"cv_score": [], "test_score": [], "kernel": []}

cxpb, mutpb = 0.5, 0.2
n_gen = 1000

for seed in exp_seeds:
    print(f"Running seed {seed}")
    logger = setup_logger(None, seed)
    start_time = time.time()

    rng_key = jax.random.PRNGKey(seed)
    cv = StratifiedKFold(random_state=seed, shuffle=True, n_splits=5)
    gamma_means = np.load(f"{exp_dir}/bnn_disc_mean_s_{seed}.npy")
    gamma_means_idx_s = np.argsort(gamma_means)[::-1]
    X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.3, random_state=seed, shuffle=True,
                                                        stratify=y_df)
    # X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
    # y_train, y_test = y_train.to_numpy(), y_test.to_numpy()
    idx_sig = np.load(f"{exp_dir}/fisher_idx_sig_{seed}.npy")
    X_train_sig, X_test_sig = X_train.iloc[:, idx_sig], X_test.iloc[:,idx_sig]

    feat_len = bnn_feat_len_svm_df[bnn_feat_len_svm_df["seed"] == seed]["num_feats"].iloc[0]
    kernel = bnn_feat_len_svm_df[bnn_feat_len_svm_df["seed"] == seed]["kernel"].iloc[0]
    gamma_idx = gamma_means_idx_s[:feat_len]
    X_train_sel, X_test_sel = X_train_sig.iloc[:,gamma_idx].to_numpy(), X_test_sig.iloc[:,gamma_idx].to_numpy()
    y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

    X_train_sel, X_val_sel, y_train, y_val = train_test_split(X_train_sel, y_train, random_state=seed, shuffle=True,
                                                              stratify=y_train, test_size=0.2)


    hof, val_score, test_score, train_preds, val_preds, test_preds = run_deap(rng_key, X_train_sel, X_val_sel, X_test_sel,
                                                                              y_train, y_val, y_test, cxpb, mutpb, n_gen, logger)

    X_train_sel_ea = np.concatenate([X_train_sel, train_preds], axis=1)
    X_val_sel_ea = np.concatenate([X_val_sel, val_preds], axis=1)
    X_test_sel_ea = np.concatenate([X_test_sel, test_preds], axis=1)

    clf_log, log_best_params, log_cv_score, log_test_score = run_logistc_regression(X_train_sel_ea, X_val_sel_ea, X_test_sel_ea,
                                                                                    y_train, y_val ,y_test, cv, logger)

    # Check the top 5 feats acc. Logistic Regression classifier
    top_5_feats = ",".join([str(idx) for idx in np.argsort(np.abs(clf_log.coef_[0]))[::-1][:5]])

    bnn_feat_len_deap_dict["classifier"].append("DEAP")
    bnn_feat_len_deap_dict["seed"].append(seed)
    bnn_feat_len_deap_dict["num_feats"].append(feat_len)
    bnn_feat_len_deap_dict["cv_score"].append(val_score)
    bnn_feat_len_deap_dict["test_score"].append(test_score)
    bnn_feat_len_deap_dict["kernel"].append(kernel)
    bnn_feat_len_deap_dict["top_5_feats"].append("-")

    bnn_feat_len_deap_dict["classifier"].append("DEAP + LR")
    bnn_feat_len_deap_dict["seed"].append(seed)
    bnn_feat_len_deap_dict["num_feats"].append(feat_len + len(hof))
    bnn_feat_len_deap_dict["cv_score"].append(log_cv_score)
    bnn_feat_len_deap_dict["test_score"].append(log_test_score)
    bnn_feat_len_deap_dict["kernel"].append(kernel)
    bnn_feat_len_deap_dict["top_5_feats"].append(top_5_feats)

    end_time = time.time()

    elapsed = datetime.timedelta(seconds=(end_time - start_time))
    logger.info(f"Done for seed {seed}. Time elapsed - {elapsed}")

bnn_feat_len_deap_df = pd.DataFrame(bnn_feat_len_deap_dict)
bnn_feat_len_deap_df

Running seed 422
2022-10-25 18:32:30,558 [INFO], Num models: 7, EA Validation Score: 0.6571969696969697, Test Score: 0.5424606462303232
Fitting 5 folds for each of 10 candidates, totalling 50 fits
2022-10-25 18:32:31,334 [INFO], LR best params {'C': 0.1}
2022-10-25 18:32:31,343 [INFO], LR scores - cv score:  0.8592, test_score:  0.6613
2022-10-25 18:32:31,345 [INFO], Done for seed 422. Time elapsed - 0:00:17.515286
Running seed 261
2022-10-25 18:33:20,876 [INFO], Num models: 3, EA Validation Score: 0.5804924242424242, Test Score: 0.5808478320905827
Fitting 5 folds for each of 10 candidates, totalling 50 fits
2022-10-25 18:33:21,806 [INFO], LR best params {'C': 0.021544346900318832}
2022-10-25 18:33:21,814 [INFO], LR scores - cv score:  0.9072, test_score:  0.7117
2022-10-25 18:33:21,816 [INFO], Done for seed 261. Time elapsed - 0:00:50.466828
Running seed 968
2022-10-25 18:33:40,779 [INFO], Num models: 6, EA Validation Score: 0.7111742424242424, Test Score: 0.5796741231703949
Fitting 5

Unnamed: 0,seed,classifier,num_feats,top_5_feats,cv_score,test_score,kernel
0,422,DEAP,40,-,0.657197,0.542461,linear
1,422,DEAP + LR,47,63732191,0.859217,0.661281,linear
2,261,DEAP,90,-,0.580492,0.580848,poly
3,261,DEAP + LR,93,881548744,0.907197,0.711682,poly
4,968,DEAP,100,-,0.711174,0.579674,linear
5,968,DEAP + LR,106,31012210058,0.861742,0.697735,linear
6,282,DEAP,80,-,0.694444,0.658036,poly
7,282,DEAP + LR,84,2978771459,0.890783,0.708506,poly
8,739,DEAP,100,-,0.556818,0.545706,rbf
9,739,DEAP + LR,103,7223616102,0.891414,0.694007,rbf


In [22]:
# Mean feature count and scores per classifier over all seeds.
summary_cols = ["num_feats", "cv_score", "test_score"]
bnn_feat_len_deap_df.groupby(["classifier"])[summary_cols].mean()

Unnamed: 0_level_0,num_feats,cv_score,test_score
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DEAP,87.0,0.625142,0.574845
DEAP + LR,91.05,0.859722,0.697121
