In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import shared_memory
from multiprocessing.dummy import Pool
from sklearn.ensemble import RandomForestRegressor
import multiprocessing as mp
from itertools import chain, combinations
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from scipy.stats import wilcoxon
from scipy.stats import ttest_rel
import sys
import os
from scipy.stats import norm
from numpy import mean
from numpy import std
from sklearn.linear_model import BayesianRidge

import mp_run

import concurrent.futures
os.environ["OMP_NUM_THREADS"] = "1" # export OMP_NUM_THREADS=1
os.environ["OPENBLAS_NUM_THREADS"] = "1" # export OPENBLAS_NUM_THREADS=1
os.environ["MKL_NUM_THREADS"] = "1" # export MKL_NUM_THREADS=1
os.environ["VECLIB_MAXIMUM_THREADS"] = "1" # export VECLIB_MAXIMUM_THREADS=1
os.environ["NUMEXPR_NUM_THREADS"] = "1" # export NUMEXPR_NUM_THREADS=1

In [2]:
nof_iters = 10000

def pairedtest(n1, n2, nof_iters):
    import random
   
    ndiff = sum([ n1[i]-n2[i] for i in range(len(n1)) ])
    if ndiff < 0:
         nbig = n2
         nsmall = n1
         ndiff = -ndiff
    else:
         nbig = n1
         nsmall = n2
    
    bcount = 0
    for niter in range(nof_iters):
        tdiff = 0
        for i in range(len(nbig)):
            a = random.random()
            if a <0.5:
                tdiff += nsmall[i]-nbig[i]
            else:
                tdiff += nbig[i]-nsmall[i]
        if tdiff > ndiff:
            bcount += 1
    return bcount / nof_iters

In [3]:
data1 = 5 * np.random.randn(100) + 51
data2 = 5 * np.random.randn(100) + 511231
# compare samples
stat, p = ttest_rel(data1, data2)
print(pairedtest(data1, data2, 10000))

0.0


In [4]:
perturbation_factor = 3
num_rf_predictors = 500

target_tf = 'AT2G46680'

induction_flag = 0
mp_threads = 20
# if (len(sys.argv)>=3):
#     induction_flag = bool(sys.argv[1])
#     mp_threads = int(sys.argv[2])

In [5]:
tf_df = pd.read_csv('data/wrky_regulators.csv')
tf_list = tf_df['Gene']
scaler = StandardScaler()

In [6]:
def choose_2_3(iterable):
    "powerset([1,2,3]) -->  (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(2,4))

In [7]:
target_df = pd.read_csv('data/wrky_targets_neg.csv')
if (induction_flag):
    target_df = pd.read_csv('data/wrky_targets_pos.csv')
deg_genes = target_df['Gene']

In [8]:
ts_df_raw = pd.read_csv('data/GSE97500/expression.tsv', sep='\t', index_col=0)
ts_df = pd.DataFrame(scaler.fit_transform(ts_df_raw), columns=ts_df_raw.columns)
ts_df.index = ts_df_raw.index

c = pd.Series(list(set(deg_genes).intersection(set(ts_df.index))))
tf_list = pd.Series(list(set(tf_list).intersection(set(ts_df.index))))
meta_df = pd.read_csv('data/GSE97500/meta_data.tsv', sep='\t')
ts_exp_index = meta_df[meta_df['isTs']]
ts_exp_index_target =  ts_exp_index[ts_exp_index['is1stLast'] == 'm'].condName
ts_exp_index_source =  ts_exp_index[ts_exp_index['is1stLast'] == 'm'].prevCol

In [9]:
ts_exp_index_target_val = ts_exp_index[ts_exp_index['is1stLast'] == 'l'].condName
ts_exp_index_source_val =  ts_exp_index[ts_exp_index['is1stLast'] == 'l'].prevCol

In [10]:
target_genes = set(deg_genes).intersection(set(ts_df.index))

In [12]:
non_trivial_targets = []
for target in target_genes:
    if ts_df.loc[target].mean() != 0.0:
        non_trivial_targets.append(target)


In [13]:
target_genes = pd.Series(non_trivial_targets)

In [14]:
from sklearn.linear_model import BayesianRidge
from sklearn.naive_bayes import GaussianNB
brr = BayesianRidge()

In [15]:
tf_df = ts_df[ts_exp_index_source].T[tf_list]

In [16]:
y_train =tf_df['AT2G37260'].values

In [17]:
X_train = tf_df.loc[:, tf_df.columns != 'AT2G37260'].values


In [18]:
brr = brr.fit(X_train, y_train)

In [19]:
y_train[2]

-0.047080746728569864

In [20]:
res = brr.predict(X_train[2].reshape(1,-1), return_std=True)
res

(array([-0.04717058]), array([0.00635785]))

In [22]:
test_dist = norm(-0.04717058, 0.00635785)

In [68]:
np.random.normal(-0.04717058, 0.00635785, size=(1,10))

array([[-0.05458319, -0.04464968, -0.04472361, -0.04248731, -0.05128317,
        -0.04088759, -0.05082357, -0.04362429, -0.04826484, -0.04084617]])

In [None]:
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

regr = RandomForestRegressor(random_state=0)
# regr = xgb.XGBRegressor()

ts_train_y_list = ts_df[ts_exp_index_target]
# set training target to expression level change
ts_train_y_list.iloc[:, :] = ts_df[ts_exp_index_target].values - ts_df[ts_exp_index_source].values
ts_val_y_list = ts_df[ts_exp_index_target_val]
# set training target to expression level change
ts_val_y_list.iloc[:, :] = ts_df[ts_exp_index_target_val].values - ts_df[ts_exp_index_source_val].values

ts_val_naive_y_list = ts_df[ts_exp_index_source_val]
ts_val_naive_y_list.loc[:,:] = 0


result_list = []
result_measure_list = []
mse_list = []
naive_mse_list = []
bottom_mse_list = []
top_feature_size_list = []


for target_gene in tqdm(target_genes):
    train_gene_index = tf_list[tf_list != target_gene]
    ts_val_y = ts_val_y_list.loc[target_gene]
    old_mse = sys.maxsize
    old_mse = np.random.randn(len(ts_val_y)) + sys.maxsize
    top_influence_genes = train_gene_index
    data_mean = ts_df.T[top_influence_genes].mean()
    data_std = ts_df.T[top_influence_genes].std()
    ts_train_X = ts_df[ts_exp_index_source].T[top_influence_genes]
    ts_val_X = ts_df[ts_exp_index_source_val].T[top_influence_genes]
    ts_train_y = ts_train_y_list.loc[target_gene]
    ts_val_y = ts_val_y_list.loc[target_gene]
    ts_val_naive_y = ts_val_naive_y_list.loc[target_gene]
    regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
    regr = regr.fit(ts_train_X, ts_train_y)
    # cur_mse = mean_squared_error(ts_val_y,regr.predict(ts_val_X))
    cur_mse = np.square(ts_val_y - regr.predict(ts_val_X))
    old_mse = cur_mse
    first_mse = cur_mse
    top_half = np.argsort(regr.feature_importances_)[int(top_influence_genes.size/2):]
    old_top_influence_genes = top_influence_genes
    top_influence_genes = top_influence_genes.iloc[top_half]
    if (target_tf not in top_influence_genes.values):
        top_influence_genes = top_influence_genes.append(tf_list[tf_list == target_tf])
    old_data_mean = data_mean
    old_data_std = data_std
    old_regr = regr
    old_ts_val_X = ts_val_X


    while(True):
        data_mean = ts_df.T[top_influence_genes].mean()
        data_std = ts_df.T[top_influence_genes].std()
        regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
        ts_train_X = ts_df[ts_exp_index_source].T[top_influence_genes]
        ts_val_X = ts_df[ts_exp_index_source_val].T[top_influence_genes]
        ts_train_y = ts_train_y_list.loc[target_gene]
        ts_val_y = ts_val_y_list.loc[target_gene]
        ts_val_naive_y = ts_val_naive_y_list.loc[target_gene]
        regr = regr.fit(ts_train_X, ts_train_y)
        # cur_mse = mean_squared_error(ts_val_y,regr.predict(ts_val_X))
        cur_mse = np.square(ts_val_y - regr.predict(ts_val_X))
        p = pairedtest(cur_mse, first_mse, 10000)
        if (p < 0.05 and np.mean(cur_mse) > np.mean(first_mse)) or len(top_influence_genes) < 3:
            data_mean = old_data_mean
            data_std = old_data_std
            regr = old_regr
            ts_val_X = old_ts_val_X
            top_influence_genes = old_top_influence_genes
            break
        else:
            old_mse = cur_mse
            top_half = np.argsort(regr.feature_importances_)[int(top_influence_genes.size/2):]
            old_top_influence_genes = top_influence_genes
            top_influence_genes = top_influence_genes.iloc[top_half]
            if (target_tf not in top_influence_genes.values):
                top_influence_genes = top_influence_genes.append(tf_list[tf_list == target_tf])
            if (len(old_top_influence_genes) == len(top_influence_genes)):
                data_mean = old_data_mean
                data_std = old_data_std
                regr = old_regr
                ts_val_X = old_ts_val_X
                top_influence_genes = old_top_influence_genes
                break

            old_data_mean = data_mean
            old_data_std = data_std
            old_regr = regr
            old_ts_val_X = ts_val_X

            
    top_feature_size_list.append(top_influence_genes.size)
    if (top_influence_genes.size < tf_list.size):
        bottom_influence_genes = list(set(tf_list) - (set(top_influence_genes)))
        bottom_regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
        ts_train_X_comp = ts_df[ts_exp_index_source].T[bottom_influence_genes]
        ts_val_X_comp = ts_df[ts_exp_index_source_val].T[bottom_influence_genes]
        ts_train_y = ts_train_y_list.loc[target_gene]
        ts_val_y = ts_val_y_list.loc[target_gene]
        bottom_regr = bottom_regr.fit(ts_train_X_comp, ts_train_y)
        bottom_mse_list.append(mean_squared_error(ts_val_y,bottom_regr.predict(ts_val_X_comp)))
    else:
        bottom_mse_list.append(0)

    

    input_mean = data_mean
    # input_std = ts_train_X.std()
    naive_mse_list.append(mean_squared_error(ts_val_y,ts_val_naive_y))
    mse_list.append(mean_squared_error(ts_val_y,regr.predict(ts_val_X)))

    base_prediction = regr.predict(np.array(input_mean).reshape(1,-1))[0]
    y_std = ts_df.T.std()[target_gene]
    perturbation_list = list(choose_2_3(top_influence_genes))
    # perturbation_list = [set(perturbation).union(set([target_tf])) for perturbation in perturbation_list]
    # perturbation_list.insert(0, set([target_tff]))
    
    perturbation_input_list = []
    for perturbation_genes in perturbation_list:
        perturbation_input = data_mean.copy()
        for gene in perturbation_genes:
            perturbation_input[gene] += data_std[gene] * perturbation_factor
        perturbation_input_list.append(perturbation_input)
    
    perturbation_list_names = ['; '.join(perturbation_genes) for perturbation_genes in perturbation_list]
    perturbation_input = np.array(perturbation_input_list)
    perturbation_result_list = (regr.predict(perturbation_input) - base_prediction)/y_std
    if (perturbation_result_list[0] > 0):
        result_list.append(np.array(perturbation_list_names)[np.argsort(perturbation_result_list)[::-1][:5]])
        result_measure_list.append(np.array(perturbation_result_list)[np.argsort(perturbation_result_list)[::-1][:5]])
    else:
        result_list.append(np.array(perturbation_list_names)[np.argsort(perturbation_result_list)[:5]])
        result_measure_list.append(np.array(perturbation_result_list)[np.argsort(perturbation_result_list)[:5]])
   
    result_list.append([target_tf])
    result_measure_list.append(perturbation_result_list[0])


In [33]:
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

regr = RandomForestRegressor(random_state=0)
# regr = xgb.XGBRegressor()

ts_train_y_list = ts_df[ts_exp_index_target]
# set training target to expression level change
ts_train_y_list.iloc[:, :] = ts_df[ts_exp_index_target].values - ts_df[ts_exp_index_source].values
ts_val_y_list = ts_df[ts_exp_index_target_val]
# set training target to expression level change
ts_val_y_list.iloc[:, :] = ts_df[ts_exp_index_target_val].values - ts_df[ts_exp_index_source_val].values

ts_val_naive_y_list = ts_df[ts_exp_index_source_val]
ts_val_naive_y_list.loc[:,:] = 0


result_list = []
result_measure_list = []
mse_list = []
naive_mse_list = []
bottom_mse_list = []
top_feature_size_list = []


for target_gene in tqdm(target_genes[1:2]):
    train_gene_index = tf_list[tf_list != target_gene]
    ts_val_y = ts_val_y_list.loc[target_gene]
    old_mse = sys.maxsize
    old_mse = np.random.randn(len(ts_val_y)) + sys.maxsize
    top_influence_genes = train_gene_index
    data_mean = ts_df.T[top_influence_genes].mean()
    data_std = ts_df.T[top_influence_genes].std()
    ts_train_X = ts_df[ts_exp_index_source].T[top_influence_genes]
    ts_val_X = ts_df[ts_exp_index_source_val].T[top_influence_genes]
    ts_train_y = ts_train_y_list.loc[target_gene]
    ts_val_y = ts_val_y_list.loc[target_gene]
    ts_val_naive_y = ts_val_naive_y_list.loc[target_gene]
    regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
    regr = regr.fit(ts_train_X, ts_train_y)
    # cur_mse = mean_squared_error(ts_val_y,regr.predict(ts_val_X))
    cur_mse = np.square(ts_val_y - regr.predict(ts_val_X))
    old_mse = cur_mse
    first_mse = cur_mse
    top_half = np.argsort(regr.feature_importances_)[int(top_influence_genes.size/2):]
    old_top_influence_genes = top_influence_genes
    top_influence_genes = top_influence_genes.iloc[top_half]
    if (target_tf not in top_influence_genes.values):
        top_influence_genes = top_influence_genes.append(tf_list[tf_list == target_tf])
    old_data_mean = data_mean
    old_data_std = data_std
    old_regr = regr
    old_ts_val_X = ts_val_X


    while(True):
        data_mean = ts_df.T[top_influence_genes].mean()
        data_std = ts_df.T[top_influence_genes].std()
        regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
        ts_train_X = ts_df[ts_exp_index_source].T[top_influence_genes]
        ts_val_X = ts_df[ts_exp_index_source_val].T[top_influence_genes]
        ts_train_y = ts_train_y_list.loc[target_gene]
        ts_val_y = ts_val_y_list.loc[target_gene]
        ts_val_naive_y = ts_val_naive_y_list.loc[target_gene]
        regr = regr.fit(ts_train_X, ts_train_y)
        # cur_mse = mean_squared_error(ts_val_y,regr.predict(ts_val_X))
        cur_mse = np.square(ts_val_y - regr.predict(ts_val_X))
        p = pairedtest(cur_mse, first_mse, 10000)
        if (p < 0.05 and np.mean(cur_mse) > np.mean(first_mse)) or len(top_influence_genes) < 3:
            data_mean = old_data_mean
            data_std = old_data_std
            regr = old_regr
            ts_val_X = old_ts_val_X
            top_influence_genes = old_top_influence_genes
            break
        else:
            old_mse = cur_mse
            top_half = np.argsort(regr.feature_importances_)[int(top_influence_genes.size/2):]
            old_top_influence_genes = top_influence_genes
            top_influence_genes = top_influence_genes.iloc[top_half]
            if (target_tf not in top_influence_genes.values):
                top_influence_genes = top_influence_genes.append(tf_list[tf_list == target_tf])
            if (len(old_top_influence_genes) == len(top_influence_genes)):
                data_mean = old_data_mean
                data_std = old_data_std
                regr = old_regr
                ts_val_X = old_ts_val_X
                top_influence_genes = old_top_influence_genes
                break

            old_data_mean = data_mean
            old_data_std = data_std
            old_regr = regr
            old_ts_val_X = ts_val_X

            
    top_feature_size_list.append(top_influence_genes.size)
    if (top_influence_genes.size < tf_list.size):
        bottom_influence_genes = list(set(tf_list) - (set(top_influence_genes)))
        bottom_regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
        ts_train_X_comp = ts_df[ts_exp_index_source].T[bottom_influence_genes]
        ts_val_X_comp = ts_df[ts_exp_index_source_val].T[bottom_influence_genes]
        ts_train_y = ts_train_y_list.loc[target_gene]
        ts_val_y = ts_val_y_list.loc[target_gene]
        bottom_regr = bottom_regr.fit(ts_train_X_comp, ts_train_y)
        bottom_mse_list.append(mean_squared_error(ts_val_y,bottom_regr.predict(ts_val_X_comp)))
    else:
        bottom_mse_list.append(0)

    

    input_mean = data_mean
    # input_std = ts_train_X.std()
    naive_mse_list.append(mean_squared_error(ts_val_y,ts_val_naive_y))
    mse_list.append(mean_squared_error(ts_val_y,regr.predict(ts_val_X)))

    base_prediction = regr.predict(np.array(input_mean).reshape(1,-1))[0]
    y_std = ts_df.T.std()[target_gene]
    perturbation_list = list(choose_2_3(top_influence_genes))
    # perturbation_list = [set(perturbation).union(set([target_tf])) for perturbation in perturbation_list]
    # perturbation_list.insert(0, set([target_tff]))
    
    perturbation_input_list = []
    for perturbation_genes in perturbation_list:
        perturbation_input = data_mean.copy()
        for gene in perturbation_genes:
            perturbation_input[gene] += data_std[gene] * perturbation_factor
        perturbation_input_list.append(perturbation_input)
    
    perturbation_list_names = ['; '.join(perturbation_genes) for perturbation_genes in perturbation_list]
    perturbation_input = np.array(perturbation_input_list)
    perturbation_result_list = (regr.predict(perturbation_input) - base_prediction)/y_std
    if (perturbation_result_list[0] > 0):
        result_list.append(np.array(perturbation_list_names)[np.argsort(perturbation_result_list)[::-1][:5]])
        result_measure_list.append(np.array(perturbation_result_list)[np.argsort(perturbation_result_list)[::-1][:5]])
    else:
        result_list.append(np.array(perturbation_list_names)[np.argsort(perturbation_result_list)[:5]])
        result_measure_list.append(np.array(perturbation_result_list)[np.argsort(perturbation_result_list)[:5]])
   
    result_list.append([target_tf])
    result_measure_list.append(perturbation_result_list[0])

    if (top_influence_genes.size <=10 and top_influence_genes.size>=3):
        print('small enough for testing')



100%|██████████| 1/1 [00:05<00:00,  5.20s/it]

small enough for testing





In [36]:
ts_train_X = ts_df[ts_exp_index_source].T[top_influence_genes]
ts_train_y = ts_train_y_list.loc[target_gene]
regr = regr.fit(ts_train_X, ts_train_y)

In [63]:
ts_train_X

Unnamed: 0,AT4G30935,AT2G46680,AT2G30590
R5C-1,-0.003761,0.008375,0.029359
R10C-1,0.010456,0.031066,-0.017023
R15C-1,0.003102,0.021848,-0.021699
R20C-1,-0.016350,0.031911,0.007918
R30C-1,-0.009277,-0.007815,-0.002415
...,...,...,...
S15N-3,0.010778,0.064185,0.429609
S20N-3,0.034535,0.007062,0.437467
S30N-3,0.009021,0.022948,0.480769
S45N-3,0.018972,0.027370,0.394831


In [82]:
max_iter = 10000
for col, i in zip(ts_train_X.columns, range(len(ts_train_X.columns))):
    brr = BayesianRidge()
    p_y_train =ts_train_X[col].values
    p_X_train = ts_train_X.loc[:, ts_train_X.columns != col].values
    brr = brr.fit(p_X_train, p_y_train)
    res = brr.predict(p_X_train, return_std=True)
    feature_dist = np.array([np.random.normal(i,j,max_iter)  for i, j in zip(res[0],res[1])])
    null_count = 0
    orig_importance = regr.feature_importances_[i]
    new_X_train = ts_train_X.copy()
    for j in tqdm(range(max_iter)):
        new_X_train[col] = feature_dist[:,j]
        resampled_regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=3, n_jobs=20)
        resampled_regr.fit(new_X_train, ts_train_y)
        if (resampled_regr.feature_importances_[i] >= orig_importance):
            null_count+=1
    print(null_count)
    break

  3%|▎         | 268/10000 [00:03<02:08, 75.57it/s]Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x7fa1c55eb670>
Traceback (most recent call last):
  File "/home/icy/miniconda3/envs/inf/lib/python3.9/weakref.py", line 367, in remove
    def remove(k, selfref=ref(self)):
KeyboardInterrupt: 
100%|██████████| 10000/10000 [02:27<00:00, 67.96it/s]

736





In [74]:
max_iter = 10000
feature_dist = np.array([np.random.normal(i,j,max_iter)  for i, j in zip(res[0],res[1])])

In [72]:
for j in range(max_iter):
    new_X_train = ts_train_X.copy()
    new_X_train[col] = feature_dist[:,i+1]
    print(new_X_train)
    resampled_regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
    break

        AT4G30935  AT2G46680  AT2G30590
R5C-1   -0.037049   0.008375   0.029359
R10C-1  -0.046722   0.031066  -0.017023
R15C-1  -0.043512   0.021848  -0.021699
R20C-1  -0.053570   0.031911   0.007918
R30C-1  -0.065404  -0.007815  -0.002415
...           ...        ...        ...
S15N-3  -0.090331   0.064185   0.429609
S20N-3  -0.080047   0.007062   0.437467
S30N-3  -0.065179   0.022948   0.480769
S45N-3  -0.033383   0.027370   0.394831
S60N-3  -0.047801  -0.057821   0.418113

[84 rows x 3 columns]


In [69]:
new_X_train

array([[-0.0021623 , -0.01382536,  0.02771586, ...,  0.00713663,
        -0.01299228, -0.03613828],
       [-0.00663374, -0.02857204,  0.02651704, ...,  0.04954102,
        -0.03005081, -0.03233449],
       [ 0.00145041, -0.03286084,  0.06689255, ...,  0.05833003,
        -0.03894635, -0.0438698 ],
       ...,
       [ 0.08428695,  0.11747527, -0.12343633, ..., -0.06357884,
        -0.12580692, -0.1266959 ],
       [ 0.03044204, -0.01462008, -0.09778018, ..., -0.036127  ,
        -0.06889945, -0.08897258],
       [ 0.08429589, -0.0191119 , -0.08141463, ..., -0.05837368,
        -0.04565507, -0.09874142]])

In [62]:
feature_dist[:,1].shape

(84,)

In [52]:
a = norm(res[0], res[1])
a.pdf(1)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [41]:
fixed_result_list = []
for i in range(0, len(result_list), 2):
    current_list = result_list[i]
    if (len(current_list) == 4):
        current_list = np.append(current_list, 'NA')
    fixed_result_list.append(np.append(current_list, target_tf))

In [42]:
fixed_result_measure_list = []
for i in range(0, len(result_measure_list), 2):
    current_list = result_measure_list[i]
    if (len(current_list) == 4):
        current_list = np.append(current_list, 0)
    fixed_result_measure_list.append(np.append(current_list, result_measure_list[i+1]))

In [43]:
fixed_result_measure_list = np.array(fixed_result_measure_list)
fixed_result_list = np.array(fixed_result_list)

In [48]:
result_measure_list = np.array(result_measure_list)
result_list = np.array(result_list)
out_df = pd.DataFrame()
out_df.index = target_genes
for i in range(6):
    comb_name = 'top_{}_combination'.format(i+1)
    score_name = 'top_{}_score'.format(i+1)
    out_df[comb_name] = fixed_result_list[:,i]
    out_df[score_name] = fixed_result_measure_list[:,i]
out_df['mse'] = mse_list
out_df['naive_mse'] = naive_mse_list
out_df['mse_diff'] = np.array(mse_list) - np.array(naive_mse_list)
out_df['bottom_mse'] = bottom_mse_list

# out_df['top_comb'] = fixed_result_list
# out_df['top_comb_score'] = fixed_result_measure_list


out_df['feature_size'] = top_feature_size_list

In [49]:
out_df

Unnamed: 0,top_1_combination,top_1_score,top_2_combination,top_2_score,top_3_combination,top_3_score,top_4_combination,top_4_score,top_5_combination,top_5_score,top_6_combination,top_6_score,mse,naive_mse,mse_diff,bottom_mse,feature_size
AT2G35710,AT2G24570; AT4G30935; AT2G23320,-0.722132,AT2G24570; AT5G22570; AT2G23320,-0.653672,AT2G24570; AT4G30935; AT5G22570,-0.643000,AT4G30935; AT5G22570; AT2G23320,-0.635794,AT4G30935; AT3G01970; AT2G23320,-0.620209,AT2G46680,-0.031325,0.000396,0.000361,0.000034,0.000000,72
AT4G03340,AT4G31800; AT1G13960; AT2G46680,1.581495,AT1G13960; AT2G46680,1.577658,AT4G31800; AT1G13960,1.541560,AT5G43290; AT1G13960; AT2G46680,1.496013,AT4G31800; AT5G43290; AT1G13960,1.479501,AT2G46680,0.357111,0.001426,0.000802,0.000624,0.000807,4
AT5G07620,AT5G13080; AT1G69310; AT1G29860,-0.246690,AT4G18170; AT1G69310; AT1G29860,-0.245287,AT2G37260; AT1G69310; AT1G29860,-0.235850,AT1G64000; AT1G69310; AT1G29860,-0.231660,AT1G69310; AT5G41570; AT1G29860,-0.230540,AT2G46680,-0.002698,0.000585,0.000418,0.000167,0.000431,36
AT3G15520,AT1G13960; AT2G46680,1.637894,AT5G43290; AT1G13960; AT2G46680,1.409111,AT5G43290; AT1G13960,1.288104,AT5G43290; AT2G46680,-0.254938,,0.000000,AT2G46680,1.288104,0.001612,0.000858,0.000754,0.001041,3
AT4G32070,AT1G13960; AT2G46680,-2.809682,AT4G01720; AT1G13960; AT2G46680,-1.523918,AT4G01720; AT1G13960,-0.957326,AT4G01720; AT2G46680,0.594601,,0.000000,AT2G46680,-0.957326,0.021226,0.025846,-0.004620,0.021182,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AT2G46160,AT4G31800; AT3G04670,0.981340,AT4G31800; AT3G04670; AT2G46680,0.858149,AT4G31800; AT2G46680,0.732387,AT3G04670; AT2G46680,0.573989,,0.000000,AT2G46680,0.981340,0.000253,0.000351,-0.000098,0.000206,3
AT5G42390,AT1G13960; AT3G58710,-1.428322,AT1G13960; AT3G58710; AT2G46680,-1.136673,AT1G13960; AT2G46680,-1.002058,AT3G58710; AT2G46680,0.144349,,0.000000,AT2G46680,-1.428322,0.026037,0.025676,0.000361,0.026615,3
AT1G03060,AT2G04880; AT4G12020; AT1G13960,-1.558661,AT2G03340; AT4G12020; AT1G13960,-1.545492,AT2G04880; AT2G03340; AT1G13960,-1.455437,AT2G04880; AT2G03340; AT4G12020,-1.351602,AT4G12020; AT1G13960; AT5G43290,-1.210138,AT2G46680,-0.841420,0.073028,0.080940,-0.007912,0.074305,7
AT3G48450,AT2G23320; AT2G46680,0.376609,AT5G49520; AT2G23320,0.302086,AT5G49520; AT2G23320; AT2G46680,0.274805,AT5G49520; AT2G46680,-0.113292,AT2G23320; AT5G52830; AT2G46680,-1.577914,AT2G46680,0.302086,0.005783,0.001086,0.004697,0.001561,4


In [41]:
out_df.to_csv('output/wrky_presentation_neg_rf_new.csv')

In [4]:
out_df = pd.read_csv('output/wrky_presentation_neg_rf_new.csv', index_col=0)

In [30]:
import re
regex = re.compile(r'[-+]?\d*\.\d+|\d+')

res_list = []
for i in out_df['top_comb_score'].values:
    numbers = regex.findall(i)
    value = float(numbers[-1])
    res_list.append(value)

In [32]:
out_df['top_6_score'] = res_list

In [50]:
(out_df['top_6_score'] < 0).sum()

333

In [52]:
(out_df['mse_diff'] > 0).sum()

478

In [20]:
out_df

Unnamed: 0,top_1_combination,top_1_score,top_2_combination,top_2_score,top_3_combination,top_3_score,top_4_combination,top_4_score,top_5_combination,top_5_score,top_6_combination,top_6_score,mse,naive_mse,mse_diff,bottom_mse,feature_size
AT5G02400,AT2G04880; AT1G69810; AT3G58710,0.477039,AT1G69810; AT5G15130; AT3G58710,0.449460,AT1G69810; AT2G21900; AT3G58710,0.440417,AT1G69810; AT3G58710,0.426258,AT2G04880; AT5G15130; AT3G58710,0.382458,AT2G46680,0.242310,0.000295,0.000664,-0.000369,0.000890,7
AT5G57010,AT2G03340; AT1G13960; AT4G23550,0.306276,AT1G13960; AT4G23550; AT5G24110,0.295107,AT1G13960; AT4G23550; AT3G58710,0.291633,AT4G12020; AT1G13960; AT4G23550,0.284343,AT1G13960; AT4G39410; AT4G23550,0.280021,AT2G46680,0.068326,0.000605,0.001169,-0.000564,0.001222,37
AT4G23010,AT4G01250; AT4G31550; AT4G01720,0.410815,AT4G31550; AT4G01720; AT3G04670,0.403659,AT1G80840; AT4G31550; AT4G01720,0.395708,AT2G46680; AT4G31550; AT4G01720,0.392128,AT4G01250; AT2G46680; AT4G31550,0.391550,AT2G46680,0.002761,0.021592,0.043545,-0.021953,0.000000,72
AT2G40730,AT2G03340; AT2G04880; AT1G13960,-1.201697,AT4G12020; AT2G03340; AT1G13960,-1.147590,AT4G12020; AT2G04880; AT1G13960,-1.136804,AT2G30250; AT2G03340; AT1G13960,-0.987785,AT2G30250; AT2G04880; AT1G13960,-0.983205,AT2G46680,-0.016949,0.001022,0.002144,-0.001122,0.000801,36
AT2G07180,AT2G21900; AT1G69810; AT1G66600,-0.053371,AT2G21900; AT1G30650; AT1G66600,-0.049369,AT2G21900; AT1G66600; AT5G01900,-0.048483,AT2G21900; AT1G66600; AT5G52830,-0.044495,AT2G21900; AT1G30650; AT5G01900,-0.043959,AT2G46680,-0.001911,0.002137,0.002837,-0.000700,0.001309,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AT1G07000,AT4G01250; AT5G56270; AT4G31550,0.776311,AT1G13960; AT4G01250; AT4G31550,0.775026,AT2G23320; AT4G01250; AT4G31550,0.761206,AT1G62300; AT4G01250; AT4G31550,0.760608,AT2G46680; AT4G01250; AT4G31550,0.758343,AT2G46680,0.002904,0.026028,0.041213,-0.015185,0.029775,36
AT5G19290,AT2G03340; AT1G13960; AT1G30650,-0.149591,AT2G03340; AT1G30650; AT4G18170,-0.148271,AT2G03340; AT2G04880; AT1G30650,-0.146021,AT2G03340; AT2G44745; AT1G30650,-0.142120,AT2G03340; AT1G30650; AT5G43290,-0.140904,AT2G46680,-0.018791,0.005652,0.014436,-0.008785,0.000000,72
AT4G03480,AT1G30650; AT2G46680,0.987003,AT2G38470; AT1G30650; AT2G46680,0.640048,AT2G38470; AT1G30650,0.624371,AT2G38470; AT2G46680,-0.539617,,0.000000,AT2G46680,0.624371,0.000441,0.001059,-0.000618,0.001018,3
AT1G16130,AT2G03340; AT1G13960; AT5G45050,-0.372897,AT2G03340; AT5G22570; AT5G45050,-0.348711,AT2G03340; AT5G45050; AT2G40750,-0.347527,AT2G03340; AT3G01970; AT5G45050,-0.347074,AT1G13960; AT5G45050; AT2G40750,-0.341321,AT2G46680,-0.009542,0.000258,0.000082,0.000176,0.000000,72


In [5]:
A = np.random.rand(5)
B = np.random.rand(5)


In [7]:
A-B

array([ 0.56661186, -0.18662621, -0.36603361,  0.48380576,  0.91346389])