In [3]:
import os
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "False"
os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
SERVER = 1

if not SERVER:
    %cd /home/xabush/code/snet/moses-incons-pen-xp/notebooks/variable_selection/cancer/nn

else:
    %cd /home/abdu/bio_ai/moses-incons-pen-xp/notebooks/variable_selection/cancer/nn

import pandas as pd
import matplotlib.pyplot as plt
import tensorflow_probability.substrates.jax as tfp
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
tfd = tfp.distributions
import jax
import haiku as hk
import numpy as np
import optax
from nn_util import *
from optim_util import *
from bnn_models import *
from train_utils import *
from data_utils import *
from hpo_util import *
plt.style.use('ggplot')
%load_ext autoreload

/home/abdu/bio_ai/moses-incons-pen-xp/notebooks/variable_selection/cancer/nn


In [4]:
if SERVER:
    data_dir = "/home/abdu/bio_ai/moses-incons-pen-xp/data"
else:
    data_dir = "/home/xabush/code/snet/moses-incons-pen-xp/data"

In [5]:
gdsc_dir = f"{data_dir}/cell_line/gdsc2"
gdsc_exp_tamox_data = pd.read_csv(f"{gdsc_dir}/tamoxifen_response_gene_expr.csv")
gdsc_exp_tamox_data.shape

(406, 37265)

In [6]:
X_df, target_df = gdsc_exp_tamox_data.iloc[:,:-1], gdsc_exp_tamox_data.iloc[:,-1]
# change to -log10(IC_50) to make it comparable
target_df = -np.log10(np.exp(target_df)) # exp b/c the values are natural logs of raw IC_50
cancer_driver_genes_df = pd.read_csv(f"{data_dir}/cell_line/driver_genes_20221018.csv")
cols = X_df.columns.to_list()
driver_syms = cancer_driver_genes_df["symbol"].to_list()
driver_sym_list = [sym.strip() for sym in cols if sym in driver_syms]
X_selected_df = X_df[driver_sym_list]
X_selected_df.shape

(406, 768)

In [10]:
%autoreload
from sklearn.preprocessing import QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler, Normalizer, StandardScaler
seed = 739
transformer = QuantileTransformer(random_state=seed, output_distribution="normal")
X_train_outer, X_train, X_val, X_test, \
    y_train_outer, y_train, yval, y_test, _ = preprocess_data(seed, X_selected_df, target_df,
                                                              transformer, 0.2, 0.2)



In [11]:
from scipy.sparse import csgraph
J = np.load(f"{data_dir}/cell_line/cancer_genes_net.npy")
L = csgraph.laplacian(J, normed=True)

### Elasticnet Regression

In [20]:
from scipy import stats
def get_rank_test(seed, betas, G, p, n=1000, alternative="two-sided"):
    q = betas.T @ G @ betas
    # vals = np.full(n, q)
    rand_vals = np.zeros(n)
    key = jax.random.PRNGKey(seed)
    for i in range(n):
        _, key = jax.random.split(key, 2)
        idxs = jax.random.permutation(key, p)
        betas_shuffled = betas[idxs]
        rand_vals[i] = betas_shuffled.T @ G @ betas_shuffled

    # return stats.wilcoxon(q, rand_vals, alternative=alternative)
    return np.searchsorted(rand_vals, q)

In [8]:
from sklearn.linear_model import ElasticNet
def train_elastic_net_model(seed, X_train, X_test, y_train, y_test):
    cv = KFold(n_splits=5, random_state=seed, shuffle=True)
    param_grid = {"alpha": np.logspace(-3, 2, 20)}
    grid_cv = GridSearchCV(estimator=ElasticNet(random_state=seed ,max_iter=10000), param_grid=param_grid,
                           verbose=0, scoring="r2", cv=cv).fit(X_train, y_train)
    lin_model = ElasticNet(random_state=seed, max_iter=10000, **grid_cv.best_params_)
    lin_model.fit(X_train, y_train)
    y_test_pred = lin_model.predict(X_test)

    test_rmse_score = np.sqrt(np.mean((y_test - y_test_pred)**2))
    test_r2_score = r2_score(y_test, y_test_pred)

    return lin_model, test_rmse_score, test_r2_score

In [17]:
esnet_model, test_rmse_score, test_r2_score = train_elastic_net_model(seed, X_train_outer, X_test, y_train_outer, y_test)
print(f"Test RMSE: {test_rmse_score}, r2_score: {test_r2_score}, alpha: {esnet_model.alpha}")

Test RMSE: 0.4512041661016546, r2_score: 0.2859926031437129, alpha: 0.23357214690901212


In [433]:
nonzero_idxs = np.nonzero(esnet_model.coef_)[0]
esnet_selected_fts = np.argsort(np.abs(esnet_model.coef_))[::-1][:len(nonzero_idxs)]
esnet_selected_fts

array([576,  59, 531, 534, 229, 376, 367, 596, 457, 578, 277, 483, 284,
       365,  87, 580, 218, 619, 178, 239, 530, 733, 644, 324, 150, 587,
       142, 448, 317])

In [439]:
sorted(X_selected_df.iloc[:, esnet_selected_fts].columns.to_list())

['BCL11A',
 'CBFA2T3',
 'CSF3R',
 'CYSLTR2',
 'EGFR',
 'FAM135B',
 'FAT2',
 'FEV',
 'GPC3',
 'HGF',
 'IL7R',
 'IRS4',
 'LMO1',
 'LOX',
 'MAF',
 'NKX2-1',
 'NRG1',
 'PAX5',
 'PREX2',
 'PRF1',
 'PRKCB',
 'RGL3',
 'RGS7',
 'RHOH',
 'ROS1',
 'RSPH10B2',
 'SFRP4',
 'SOX17',
 'WIF1']

In [21]:
get_rank_test(seed, esnet_model.coef_, J, X_test.shape[-1], alternative="greater")

997

In [22]:
%autoreload
import warnings
warnings.filterwarnings("ignore")
seeds = [422,261,968,282,739,573,220,413,745,775,482,442,210,423,760,57,769,920,226,196]

res_dict = {"seed":[], "test_rmse": [], "test_r2_score": [], "num_sel_feats": [] ,"adj_p_value": [], "lap_p_value" : []}

for seed in tqdm(seeds):
    transformer = QuantileTransformer(random_state=seed, output_distribution="normal")
    X_train_outer, X_train, X_val, X_test,\
    y_train_outer, y_train, yval, y_test, _ = preprocess_data(seed, X_selected_df, target_df,
                                                              transformer, val_size=0.2, test_size=0.2)

    p = X_train_outer.shape[-1]

    esnet_model, test_rmse_score, test_r2_score = train_elastic_net_model(seed, X_train_outer, X_test, y_train_outer, y_test)
    nonzero_idxs = np.nonzero(esnet_model.coef_)[0]
    rank_adj = get_rank_test(seed, esnet_model.coef_, J, p, alternative="greater")
    rank_lap = get_rank_test(seed, esnet_model.coef_, L, p, alternative="less")
    res_dict["seed"].append(seed)
    res_dict["test_rmse"].append(test_rmse_score)
    res_dict["test_r2_score"].append(test_r2_score)
    res_dict["num_sel_feats"].append(len(nonzero_idxs))
    res_dict["adj_p_value"].append(rank_adj)
    res_dict["lap_p_value"].append(rank_lap)

100%|███████████████████████████████████████████████████████████████████████████████████████| 20/20 [04:10<00:00, 12.54s/it]


In [23]:
res_df = pd.DataFrame(res_dict)
res_df

Unnamed: 0,seed,test_rmse,test_r2_score,num_sel_feats,adj_p_value,lap_p_value
0,422,0.447172,0.131775,19,1000,0
1,261,0.484312,0.1733,20,1000,126
2,968,0.504478,0.205721,22,1000,0
3,282,0.52339,0.214603,24,741,142
4,739,0.452662,0.164775,29,750,0
5,573,0.489139,0.262072,24,991,32
6,220,0.442798,0.25519,21,1000,0
7,413,0.451204,0.285993,23,997,95
8,745,0.563548,0.124755,26,1000,501
9,775,0.492953,0.250564,22,469,1


In [428]:
res_df[["test_rmse", "test_r2_score"]].mean()

test_rmse        0.490435
test_r2_score    0.209777
dtype: float64