In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import shared_memory
from multiprocessing.dummy import Pool
from sklearn.ensemble import RandomForestRegressor
import multiprocessing as mp
from itertools import chain, combinations
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from scipy.stats import wilcoxon
from scipy.stats import ttest_rel
import sys
import os

import mp_run

import concurrent.futures
os.environ["OMP_NUM_THREADS"] = "1" # export OMP_NUM_THREADS=1
os.environ["OPENBLAS_NUM_THREADS"] = "1" # export OPENBLAS_NUM_THREADS=1
os.environ["MKL_NUM_THREADS"] = "1" # export MKL_NUM_THREADS=1
os.environ["VECLIB_MAXIMUM_THREADS"] = "1" # export VECLIB_MAXIMUM_THREADS=1
os.environ["NUMEXPR_NUM_THREADS"] = "1" # export NUMEXPR_NUM_THREADS=1

In [5]:
nof_iters = 10000

def pairedtest(n1, n2, nof_iters):
    import random
   
    ndiff = sum([ n1[i]-n2[i] for i in range(len(n1)) ])
    if ndiff < 0:
         nbig = n2
         nsmall = n1
         ndiff = -ndiff
    else:
         nbig = n1
         nsmall = n2
    
    bcount = 0
    for niter in range(nof_iters):
        tdiff = 0
        for i in range(len(nbig)):
            a = random.random()
            if a <0.5:
                tdiff += nsmall[i]-nbig[i]
            else:
                tdiff += nbig[i]-nsmall[i]
        if tdiff > ndiff:
            bcount += 1
    return bcount / nof_iters

In [6]:
data1 = 5 * np.random.randn(100) + 51
data2 = 5 * np.random.randn(100) + 511231
# compare samples
stat, p = ttest_rel(data1, data2)
print(pairedtest(data1, data2, 10000))

0.0


In [7]:
perturbation_factor = 3
num_rf_predictors = 500

target_tf = 'AT2G46680'

induction_flag = 0
mp_threads = 20
# if (len(sys.argv)>=3):
#     induction_flag = bool(sys.argv[1])
#     mp_threads = int(sys.argv[2])

In [8]:
tf_df = pd.read_csv('data/wrky_regulators.csv')
tf_list = tf_df['Gene']
scaler = StandardScaler()

In [9]:
def choose_2_3(iterable):
    "powerset([1,2,3]) -->  (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(2,4))

In [10]:
target_df = pd.read_csv('data/wrky_targets_neg.csv')
if (induction_flag):
    target_df = pd.read_csv('data/wrky_targets_pos.csv')
deg_genes = target_df['Gene']

In [11]:
ts_df_raw = pd.read_csv('data/GSE97500/expression.tsv', sep='\t', index_col=0)
ts_df = pd.DataFrame(scaler.fit_transform(ts_df_raw), columns=ts_df_raw.columns)
ts_df.index = ts_df_raw.index

c = pd.Series(list(set(deg_genes).intersection(set(ts_df.index))))
tf_list = pd.Series(list(set(tf_list).intersection(set(ts_df.index))))
meta_df = pd.read_csv('data/GSE97500/meta_data.tsv', sep='\t')
ts_exp_index = meta_df[meta_df['isTs']]
ts_exp_index_target =  ts_exp_index[ts_exp_index['is1stLast'] == 'm'].condName
ts_exp_index_source =  ts_exp_index[ts_exp_index['is1stLast'] == 'm'].prevCol

In [12]:
ts_exp_index_target_val = ts_exp_index[ts_exp_index['is1stLast'] == 'l'].condName
ts_exp_index_source_val =  ts_exp_index[ts_exp_index['is1stLast'] == 'l'].prevCol

In [13]:
target_genes = set(deg_genes).intersection(set(ts_df.index))

In [14]:
non_trivial_targets = []
for target in target_genes:
    if ts_df.loc[target].mean() != 0.0:
        non_trivial_targets.append(target)


In [15]:
target_genes = pd.Series(non_trivial_targets)

In [16]:
ts_df[ts_exp_index_target]

Unnamed: 0,R10C-1,R15C-1,R20C-1,R30C-1,R45C-1,R60C-1,R90C-1,R10N-1,R15N-1,R20N-1,...,S45C-3,S60C-3,S90C-3,S10N-3,S15N-3,S20N-3,S30N-3,S45N-3,S60N-3,S90N-3
AT1G01010,-0.014758,0.000533,0.051134,0.049920,0.040884,0.023393,0.061801,-0.000818,-0.012641,0.026244,...,0.003995,-0.005847,-0.034883,-0.046047,-0.018330,-0.040172,-0.058541,-0.052308,-0.054134,-0.059992
AT1G01020,-0.029620,-0.041485,-0.034851,-0.035058,-0.036636,-0.024253,-0.045806,-0.026437,-0.032644,-0.022295,...,-0.127578,-0.115245,-0.120858,-0.124956,-0.149922,-0.153918,-0.144475,-0.088973,-0.102612,-0.094729
AT1G01030,-0.032971,-0.046255,-0.054931,-0.055603,-0.054696,-0.040881,-0.061897,-0.029271,-0.034380,-0.048646,...,-0.119270,-0.103980,-0.117943,-0.114565,-0.130676,-0.133193,-0.142105,-0.092455,-0.109801,-0.098074
AT1G01040,0.019460,0.037902,0.043548,0.028604,0.049269,0.019120,0.026949,0.032965,0.023565,0.045120,...,0.280808,0.283787,0.343744,0.299788,0.389194,0.350711,0.319272,0.203522,0.247794,0.195515
AT1G01046,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.064528,-0.031363,-0.036653,-0.050065,...,-0.148747,-0.133875,-0.144173,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547,-0.109138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
no_feature,14.071499,13.087187,22.008419,19.675190,18.590256,13.985658,18.200588,13.284370,13.912443,21.800591,...,120.673816,131.131423,124.569796,103.005479,99.511401,93.729293,97.766088,141.187133,133.233340,141.051041
ambiguous,17.669398,24.501397,30.046224,29.674856,27.086674,22.376220,30.862158,13.986869,16.938505,26.799417,...,96.991219,87.496987,91.405577,112.041366,113.577964,114.259076,113.628324,77.281873,80.272149,67.558997
too_low_aQual,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.064528,-0.031363,-0.036653,-0.050065,...,-0.148747,-0.133875,-0.144173,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547,-0.109138
not_aligned,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.064528,-0.031363,-0.036653,-0.050065,...,-0.148747,-0.133875,-0.144173,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547,-0.109138


In [17]:
ts_df[ts_exp_index_target_val]


Unnamed: 0,R120C-1,R120N-1,S120C-1,S120N-1,R120C-2,R120N-2,S120C-2,S120N-2,R120C-3,R120N-3,S120C-3,S120N-3
AT1G01010,0.032329,0.018678,-0.015495,-0.061103,0.042197,0.014170,0.006366,-0.060024,0.018088,0.009175,-0.031350,-0.053659
AT1G01020,-0.050573,-0.025491,-0.122617,-0.103106,-0.043273,-0.029883,-0.131012,-0.114978,-0.037643,-0.020144,-0.126331,-0.125983
AT1G01030,-0.062557,-0.058268,-0.109685,-0.100823,-0.060848,-0.062742,-0.119772,-0.104941,-0.066388,-0.056973,-0.119999,-0.130077
AT1G01040,0.038592,0.020746,0.293800,0.191144,0.049674,0.021380,0.372706,0.248119,0.050390,0.015578,0.305040,0.280864
AT1G01046,-0.063864,-0.060852,-0.140076,-0.118857,-0.062180,-0.064564,-0.147247,-0.130285,-0.068187,-0.059806,-0.145327,-0.151716
...,...,...,...,...,...,...,...,...,...,...,...,...
no_feature,16.730360,22.870386,124.143038,137.902308,17.818821,23.711983,124.198333,129.696637,21.475550,25.994190,122.467421,116.612229
ambiguous,29.063975,36.378000,92.552870,73.472921,28.121311,36.472855,93.405582,81.932176,34.011477,37.741025,91.002413,91.638212
too_low_aQual,-0.063864,-0.060852,-0.140076,-0.118857,-0.062180,-0.064564,-0.147247,-0.130285,-0.068187,-0.059806,-0.145327,-0.151716
not_aligned,-0.063864,-0.060852,-0.140076,-0.118857,-0.062180,-0.064564,-0.147247,-0.130285,-0.068187,-0.059806,-0.145327,-0.151716


In [18]:
ts_df

Unnamed: 0,R5C-1,R10C-1,R15C-1,R20C-1,R30C-1,R45C-1,R60C-1,R90C-1,R120C-1,R5N-1,...,S120C-3,S5N-3,S10N-3,S15N-3,S20N-3,S30N-3,S45N-3,S60N-3,S90N-3,S120N-3
AT1G01010,0.005741,-0.014758,0.000533,0.051134,0.049920,0.040884,0.023393,0.061801,0.032329,0.020875,...,-0.031350,-0.071282,-0.046047,-0.018330,-0.040172,-0.058541,-0.052308,-0.054134,-0.059992,-0.053659
AT1G01020,-0.003468,-0.029620,-0.041485,-0.034851,-0.035058,-0.036636,-0.024253,-0.045806,-0.050573,-0.014970,...,-0.126331,-0.150230,-0.124956,-0.149922,-0.153918,-0.144475,-0.088973,-0.102612,-0.094729,-0.125983
AT1G01030,-0.035530,-0.032971,-0.046255,-0.054931,-0.055603,-0.054696,-0.040881,-0.061897,-0.062557,-0.042973,...,-0.119999,-0.124216,-0.114565,-0.130676,-0.133193,-0.142105,-0.092455,-0.109801,-0.098074,-0.130077
AT1G01040,0.043139,0.019460,0.037902,0.043548,0.028604,0.049269,0.019120,0.026949,0.038592,0.040380,...,0.305040,0.374573,0.299788,0.389194,0.350711,0.319272,0.203522,0.247794,0.195515,0.280864
AT1G01046,-0.038818,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.064528,-0.063864,-0.044781,...,-0.145327,-0.166261,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547,-0.109138,-0.151716
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
no_feature,26.743196,14.071499,13.087187,22.008419,19.675190,18.590256,13.985658,18.200588,16.730360,23.713908,...,122.467421,96.538787,103.005479,99.511401,93.729293,97.766088,141.187133,133.233340,141.051041,116.612229
ambiguous,25.938918,17.669398,24.501397,30.046224,29.674856,27.086674,22.376220,30.862158,29.063975,27.637701,...,91.002413,115.647371,112.041366,113.577964,114.259076,113.628324,77.281873,80.272149,67.558997,91.638212
too_low_aQual,-0.038818,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.064528,-0.063864,-0.044781,...,-0.145327,-0.166261,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547,-0.109138,-0.151716
not_aligned,-0.038818,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.064528,-0.063864,-0.044781,...,-0.145327,-0.166261,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547,-0.109138,-0.151716


In [19]:
ts_df[ts_exp_index_target]

Unnamed: 0,R10C-1,R15C-1,R20C-1,R30C-1,R45C-1,R60C-1,R90C-1,R10N-1,R15N-1,R20N-1,...,S45C-3,S60C-3,S90C-3,S10N-3,S15N-3,S20N-3,S30N-3,S45N-3,S60N-3,S90N-3
AT1G01010,-0.014758,0.000533,0.051134,0.049920,0.040884,0.023393,0.061801,-0.000818,-0.012641,0.026244,...,0.003995,-0.005847,-0.034883,-0.046047,-0.018330,-0.040172,-0.058541,-0.052308,-0.054134,-0.059992
AT1G01020,-0.029620,-0.041485,-0.034851,-0.035058,-0.036636,-0.024253,-0.045806,-0.026437,-0.032644,-0.022295,...,-0.127578,-0.115245,-0.120858,-0.124956,-0.149922,-0.153918,-0.144475,-0.088973,-0.102612,-0.094729
AT1G01030,-0.032971,-0.046255,-0.054931,-0.055603,-0.054696,-0.040881,-0.061897,-0.029271,-0.034380,-0.048646,...,-0.119270,-0.103980,-0.117943,-0.114565,-0.130676,-0.133193,-0.142105,-0.092455,-0.109801,-0.098074
AT1G01040,0.019460,0.037902,0.043548,0.028604,0.049269,0.019120,0.026949,0.032965,0.023565,0.045120,...,0.280808,0.283787,0.343744,0.299788,0.389194,0.350711,0.319272,0.203522,0.247794,0.195515
AT1G01046,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.064528,-0.031363,-0.036653,-0.050065,...,-0.148747,-0.133875,-0.144173,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547,-0.109138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
no_feature,14.071499,13.087187,22.008419,19.675190,18.590256,13.985658,18.200588,13.284370,13.912443,21.800591,...,120.673816,131.131423,124.569796,103.005479,99.511401,93.729293,97.766088,141.187133,133.233340,141.051041
ambiguous,17.669398,24.501397,30.046224,29.674856,27.086674,22.376220,30.862158,13.986869,16.938505,26.799417,...,96.991219,87.496987,91.405577,112.041366,113.577964,114.259076,113.628324,77.281873,80.272149,67.558997
too_low_aQual,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.064528,-0.031363,-0.036653,-0.050065,...,-0.148747,-0.133875,-0.144173,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547,-0.109138
not_aligned,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.064528,-0.031363,-0.036653,-0.050065,...,-0.148747,-0.133875,-0.144173,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547,-0.109138


In [21]:
ts_df[ts_exp_index_source]

Unnamed: 0,R5C-1,R10C-1,R15C-1,R20C-1,R30C-1,R45C-1,R60C-1,R5N-1,R10N-1,R15N-1,...,S30C-3,S45C-3,S60C-3,S5N-3,S10N-3,S15N-3,S20N-3,S30N-3,S45N-3,S60N-3
AT1G01010,0.005741,-0.014758,0.000533,0.051134,0.049920,0.040884,0.023393,0.020875,-0.000818,-0.012641,...,-0.046909,0.003995,-0.005847,-0.071282,-0.046047,-0.018330,-0.040172,-0.058541,-0.052308,-0.054134
AT1G01020,-0.003468,-0.029620,-0.041485,-0.034851,-0.035058,-0.036636,-0.024253,-0.014970,-0.026437,-0.032644,...,-0.106084,-0.127578,-0.115245,-0.150230,-0.124956,-0.149922,-0.153918,-0.144475,-0.088973,-0.102612
AT1G01030,-0.035530,-0.032971,-0.046255,-0.054931,-0.055603,-0.054696,-0.040881,-0.042973,-0.029271,-0.034380,...,-0.087045,-0.119270,-0.103980,-0.124216,-0.114565,-0.130676,-0.133193,-0.142105,-0.092455,-0.109801
AT1G01040,0.043139,0.019460,0.037902,0.043548,0.028604,0.049269,0.019120,0.040380,0.032965,0.023565,...,0.297076,0.280808,0.283787,0.374573,0.299788,0.389194,0.350711,0.319272,0.203522,0.247794
AT1G01046,-0.038818,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.044781,-0.031363,-0.036653,...,-0.118176,-0.148747,-0.133875,-0.166261,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
no_feature,26.743196,14.071499,13.087187,22.008419,19.675190,18.590256,13.985658,23.713908,13.284370,13.912443,...,137.623616,120.673816,131.131423,96.538787,103.005479,99.511401,93.729293,97.766088,141.187133,133.233340
ambiguous,25.938918,17.669398,24.501397,30.046224,29.674856,27.086674,22.376220,27.637701,13.986869,16.938505,...,80.272496,96.991219,87.496987,115.647371,112.041366,113.577964,114.259076,113.628324,77.281873,80.272149
too_low_aQual,-0.038818,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.044781,-0.031363,-0.036653,...,-0.118176,-0.148747,-0.133875,-0.166261,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547
not_aligned,-0.038818,-0.033851,-0.048671,-0.057677,-0.057998,-0.055946,-0.042538,-0.044781,-0.031363,-0.036653,...,-0.118176,-0.148747,-0.133875,-0.166261,-0.152233,-0.168205,-0.170305,-0.167885,-0.106997,-0.124547


In [40]:
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

regr = RandomForestRegressor(random_state=0)
# regr = xgb.XGBRegressor()

ts_train_y_list = ts_df[ts_exp_index_target]
# set training target to expression level change
ts_train_y_list.iloc[:, :] = ts_df[ts_exp_index_target].values - ts_df[ts_exp_index_source].values
ts_val_y_list = ts_df[ts_exp_index_target_val]
# set training target to expression level change
ts_val_y_list.iloc[:, :] = ts_df[ts_exp_index_target_val].values - ts_df[ts_exp_index_source_val].values

ts_val_naive_y_list = ts_df[ts_exp_index_source_val]
ts_val_naive_y_list.loc[:,:] = 0


result_list = []
result_measure_list = []
mse_list = []
naive_mse_list = []
bottom_mse_list = []
top_feature_size_list = []


for target_gene in tqdm(target_genes):
    train_gene_index = tf_list[tf_list != target_gene]
    ts_val_y = ts_val_y_list.loc[target_gene]
    old_mse = sys.maxsize
    old_mse = np.random.randn(len(ts_val_y)) + sys.maxsize
    top_influence_genes = train_gene_index
    data_mean = ts_df.T[top_influence_genes].mean()
    data_std = ts_df.T[top_influence_genes].std()
    ts_train_X = ts_df[ts_exp_index_source].T[top_influence_genes]
    ts_val_X = ts_df[ts_exp_index_source_val].T[top_influence_genes]
    ts_train_y = ts_train_y_list.loc[target_gene]
    ts_val_y = ts_val_y_list.loc[target_gene]
    ts_val_naive_y = ts_val_naive_y_list.loc[target_gene]
    regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
    regr = regr.fit(ts_train_X, ts_train_y)
    # cur_mse = mean_squared_error(ts_val_y,regr.predict(ts_val_X))
    cur_mse = np.square(ts_val_y - regr.predict(ts_val_X))
    old_mse = cur_mse
    first_mse = cur_mse
    top_half = np.argsort(regr.feature_importances_)[int(top_influence_genes.size/2):]
    old_top_influence_genes = top_influence_genes
    top_influence_genes = top_influence_genes.iloc[top_half]
    if (target_tf not in top_influence_genes.values):
        top_influence_genes = top_influence_genes.append(tf_list[tf_list == target_tf])
    old_data_mean = data_mean
    old_data_std = data_std
    old_regr = regr
    old_ts_val_X = ts_val_X


    while(True):
        data_mean = ts_df.T[top_influence_genes].mean()
        data_std = ts_df.T[top_influence_genes].std()
        regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
        ts_train_X = ts_df[ts_exp_index_source].T[top_influence_genes]
        ts_val_X = ts_df[ts_exp_index_source_val].T[top_influence_genes]
        ts_train_y = ts_train_y_list.loc[target_gene]
        ts_val_y = ts_val_y_list.loc[target_gene]
        ts_val_naive_y = ts_val_naive_y_list.loc[target_gene]
        regr = regr.fit(ts_train_X, ts_train_y)
        # cur_mse = mean_squared_error(ts_val_y,regr.predict(ts_val_X))
        cur_mse = np.square(ts_val_y - regr.predict(ts_val_X))
        p = pairedtest(cur_mse, first_mse, 10000)
        if (p < 0.05 and np.mean(cur_mse) > np.mean(first_mse)) or len(top_influence_genes) < 3:
            data_mean = old_data_mean
            data_std = old_data_std
            regr = old_regr
            ts_val_X = old_ts_val_X
            top_influence_genes = old_top_influence_genes
            break
        else:
            old_mse = cur_mse
            top_half = np.argsort(regr.feature_importances_)[int(top_influence_genes.size/2):]
            old_top_influence_genes = top_influence_genes
            top_influence_genes = top_influence_genes.iloc[top_half]
            if (target_tf not in top_influence_genes.values):
                top_influence_genes = top_influence_genes.append(tf_list[tf_list == target_tf])
            if (len(old_top_influence_genes) == len(top_influence_genes)):
                data_mean = old_data_mean
                data_std = old_data_std
                regr = old_regr
                ts_val_X = old_ts_val_X
                top_influence_genes = old_top_influence_genes
                break

            old_data_mean = data_mean
            old_data_std = data_std
            old_regr = regr
            old_ts_val_X = ts_val_X

            
    top_feature_size_list.append(top_influence_genes.size)
    if (top_influence_genes.size < tf_list.size):
        bottom_influence_genes = list(set(tf_list) - (set(top_influence_genes)))
        bottom_regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
        ts_train_X_comp = ts_df[ts_exp_index_source].T[bottom_influence_genes]
        ts_val_X_comp = ts_df[ts_exp_index_source_val].T[bottom_influence_genes]
        ts_train_y = ts_train_y_list.loc[target_gene]
        ts_val_y = ts_val_y_list.loc[target_gene]
        bottom_regr = bottom_regr.fit(ts_train_X_comp, ts_train_y)
        bottom_mse_list.append(mean_squared_error(ts_val_y,bottom_regr.predict(ts_val_X_comp)))
    else:
        bottom_mse_list.append(0)

    

    input_mean = data_mean
    # input_std = ts_train_X.std()
    naive_mse_list.append(mean_squared_error(ts_val_y,ts_val_naive_y))
    mse_list.append(mean_squared_error(ts_val_y,regr.predict(ts_val_X)))

    base_prediction = regr.predict(np.array(input_mean).reshape(1,-1))[0]
    y_std = ts_df.T.std()[target_gene]
    perturbation_list = list(choose_2_3(top_influence_genes))
    # perturbation_list = [set(perturbation).union(set([target_tf])) for perturbation in perturbation_list]
    # perturbation_list.insert(0, set([target_tff]))
    
    perturbation_input_list = []
    for perturbation_genes in perturbation_list:
        perturbation_input = data_mean.copy()
        for gene in perturbation_genes:
            perturbation_input[gene] += data_std[gene] * perturbation_factor
        perturbation_input_list.append(perturbation_input)
    
    perturbation_list_names = ['; '.join(perturbation_genes) for perturbation_genes in perturbation_list]
    perturbation_input = np.array(perturbation_input_list)
    perturbation_result_list = (regr.predict(perturbation_input) - base_prediction)/y_std
    if (perturbation_result_list[0] > 0):
        result_list.append(np.array(perturbation_list_names)[np.argsort(perturbation_result_list)[::-1][:5]])
        result_measure_list.append(np.array(perturbation_result_list)[np.argsort(perturbation_result_list)[::-1][:5]])
    else:
        result_list.append(np.array(perturbation_list_names)[np.argsort(perturbation_result_list)[:5]])
        result_measure_list.append(np.array(perturbation_result_list)[np.argsort(perturbation_result_list)[:5]])
   
    result_list.append([target_tf])
    result_measure_list.append(perturbation_result_list[0])


100%|██████████| 642/642 [50:11<00:00,  4.69s/it]


In [41]:
fixed_result_list = []
for i in range(0, len(result_list), 2):
    current_list = result_list[i]
    if (len(current_list) == 4):
        current_list = np.append(current_list, 'NA')
    fixed_result_list.append(np.append(current_list, target_tf))

In [42]:
fixed_result_measure_list = []
for i in range(0, len(result_measure_list), 2):
    current_list = result_measure_list[i]
    if (len(current_list) == 4):
        current_list = np.append(current_list, 0)
    fixed_result_measure_list.append(np.append(current_list, result_measure_list[i+1]))

In [43]:
fixed_result_measure_list = np.array(fixed_result_measure_list)
fixed_result_list = np.array(fixed_result_list)

In [48]:
result_measure_list = np.array(result_measure_list)
result_list = np.array(result_list)
out_df = pd.DataFrame()
out_df.index = target_genes
for i in range(6):
    comb_name = 'top_{}_combination'.format(i+1)
    score_name = 'top_{}_score'.format(i+1)
    out_df[comb_name] = fixed_result_list[:,i]
    out_df[score_name] = fixed_result_measure_list[:,i]
out_df['mse'] = mse_list
out_df['naive_mse'] = naive_mse_list
out_df['mse_diff'] = np.array(mse_list) - np.array(naive_mse_list)
out_df['bottom_mse'] = bottom_mse_list

# out_df['top_comb'] = fixed_result_list
# out_df['top_comb_score'] = fixed_result_measure_list


out_df['feature_size'] = top_feature_size_list

In [49]:
out_df

Unnamed: 0,top_1_combination,top_1_score,top_2_combination,top_2_score,top_3_combination,top_3_score,top_4_combination,top_4_score,top_5_combination,top_5_score,top_6_combination,top_6_score,mse,naive_mse,mse_diff,bottom_mse,feature_size
AT2G35710,AT2G24570; AT4G30935; AT2G23320,-0.722132,AT2G24570; AT5G22570; AT2G23320,-0.653672,AT2G24570; AT4G30935; AT5G22570,-0.643000,AT4G30935; AT5G22570; AT2G23320,-0.635794,AT4G30935; AT3G01970; AT2G23320,-0.620209,AT2G46680,-0.031325,0.000396,0.000361,0.000034,0.000000,72
AT4G03340,AT4G31800; AT1G13960; AT2G46680,1.581495,AT1G13960; AT2G46680,1.577658,AT4G31800; AT1G13960,1.541560,AT5G43290; AT1G13960; AT2G46680,1.496013,AT4G31800; AT5G43290; AT1G13960,1.479501,AT2G46680,0.357111,0.001426,0.000802,0.000624,0.000807,4
AT5G07620,AT5G13080; AT1G69310; AT1G29860,-0.246690,AT4G18170; AT1G69310; AT1G29860,-0.245287,AT2G37260; AT1G69310; AT1G29860,-0.235850,AT1G64000; AT1G69310; AT1G29860,-0.231660,AT1G69310; AT5G41570; AT1G29860,-0.230540,AT2G46680,-0.002698,0.000585,0.000418,0.000167,0.000431,36
AT3G15520,AT1G13960; AT2G46680,1.637894,AT5G43290; AT1G13960; AT2G46680,1.409111,AT5G43290; AT1G13960,1.288104,AT5G43290; AT2G46680,-0.254938,,0.000000,AT2G46680,1.288104,0.001612,0.000858,0.000754,0.001041,3
AT4G32070,AT1G13960; AT2G46680,-2.809682,AT4G01720; AT1G13960; AT2G46680,-1.523918,AT4G01720; AT1G13960,-0.957326,AT4G01720; AT2G46680,0.594601,,0.000000,AT2G46680,-0.957326,0.021226,0.025846,-0.004620,0.021182,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AT2G46160,AT4G31800; AT3G04670,0.981340,AT4G31800; AT3G04670; AT2G46680,0.858149,AT4G31800; AT2G46680,0.732387,AT3G04670; AT2G46680,0.573989,,0.000000,AT2G46680,0.981340,0.000253,0.000351,-0.000098,0.000206,3
AT5G42390,AT1G13960; AT3G58710,-1.428322,AT1G13960; AT3G58710; AT2G46680,-1.136673,AT1G13960; AT2G46680,-1.002058,AT3G58710; AT2G46680,0.144349,,0.000000,AT2G46680,-1.428322,0.026037,0.025676,0.000361,0.026615,3
AT1G03060,AT2G04880; AT4G12020; AT1G13960,-1.558661,AT2G03340; AT4G12020; AT1G13960,-1.545492,AT2G04880; AT2G03340; AT1G13960,-1.455437,AT2G04880; AT2G03340; AT4G12020,-1.351602,AT4G12020; AT1G13960; AT5G43290,-1.210138,AT2G46680,-0.841420,0.073028,0.080940,-0.007912,0.074305,7
AT3G48450,AT2G23320; AT2G46680,0.376609,AT5G49520; AT2G23320,0.302086,AT5G49520; AT2G23320; AT2G46680,0.274805,AT5G49520; AT2G46680,-0.113292,AT2G23320; AT5G52830; AT2G46680,-1.577914,AT2G46680,0.302086,0.005783,0.001086,0.004697,0.001561,4


In [41]:
out_df.to_csv('output/wrky_presentation_neg_rf_new.csv')

In [4]:
out_df = pd.read_csv('output/wrky_presentation_neg_rf_new.csv', index_col=0)

In [30]:
import re
regex = re.compile(r'[-+]?\d*\.\d+|\d+')

res_list = []
for i in out_df['top_comb_score'].values:
    numbers = regex.findall(i)
    value = float(numbers[-1])
    res_list.append(value)

In [32]:
out_df['top_6_score'] = res_list

In [50]:
(out_df['top_6_score'] < 0).sum()

333

In [52]:
(out_df['mse_diff'] > 0).sum()

478

In [20]:
out_df

Unnamed: 0,top_1_combination,top_1_score,top_2_combination,top_2_score,top_3_combination,top_3_score,top_4_combination,top_4_score,top_5_combination,top_5_score,top_6_combination,top_6_score,mse,naive_mse,mse_diff,bottom_mse,feature_size
AT5G02400,AT2G04880; AT1G69810; AT3G58710,0.477039,AT1G69810; AT5G15130; AT3G58710,0.449460,AT1G69810; AT2G21900; AT3G58710,0.440417,AT1G69810; AT3G58710,0.426258,AT2G04880; AT5G15130; AT3G58710,0.382458,AT2G46680,0.242310,0.000295,0.000664,-0.000369,0.000890,7
AT5G57010,AT2G03340; AT1G13960; AT4G23550,0.306276,AT1G13960; AT4G23550; AT5G24110,0.295107,AT1G13960; AT4G23550; AT3G58710,0.291633,AT4G12020; AT1G13960; AT4G23550,0.284343,AT1G13960; AT4G39410; AT4G23550,0.280021,AT2G46680,0.068326,0.000605,0.001169,-0.000564,0.001222,37
AT4G23010,AT4G01250; AT4G31550; AT4G01720,0.410815,AT4G31550; AT4G01720; AT3G04670,0.403659,AT1G80840; AT4G31550; AT4G01720,0.395708,AT2G46680; AT4G31550; AT4G01720,0.392128,AT4G01250; AT2G46680; AT4G31550,0.391550,AT2G46680,0.002761,0.021592,0.043545,-0.021953,0.000000,72
AT2G40730,AT2G03340; AT2G04880; AT1G13960,-1.201697,AT4G12020; AT2G03340; AT1G13960,-1.147590,AT4G12020; AT2G04880; AT1G13960,-1.136804,AT2G30250; AT2G03340; AT1G13960,-0.987785,AT2G30250; AT2G04880; AT1G13960,-0.983205,AT2G46680,-0.016949,0.001022,0.002144,-0.001122,0.000801,36
AT2G07180,AT2G21900; AT1G69810; AT1G66600,-0.053371,AT2G21900; AT1G30650; AT1G66600,-0.049369,AT2G21900; AT1G66600; AT5G01900,-0.048483,AT2G21900; AT1G66600; AT5G52830,-0.044495,AT2G21900; AT1G30650; AT5G01900,-0.043959,AT2G46680,-0.001911,0.002137,0.002837,-0.000700,0.001309,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AT1G07000,AT4G01250; AT5G56270; AT4G31550,0.776311,AT1G13960; AT4G01250; AT4G31550,0.775026,AT2G23320; AT4G01250; AT4G31550,0.761206,AT1G62300; AT4G01250; AT4G31550,0.760608,AT2G46680; AT4G01250; AT4G31550,0.758343,AT2G46680,0.002904,0.026028,0.041213,-0.015185,0.029775,36
AT5G19290,AT2G03340; AT1G13960; AT1G30650,-0.149591,AT2G03340; AT1G30650; AT4G18170,-0.148271,AT2G03340; AT2G04880; AT1G30650,-0.146021,AT2G03340; AT2G44745; AT1G30650,-0.142120,AT2G03340; AT1G30650; AT5G43290,-0.140904,AT2G46680,-0.018791,0.005652,0.014436,-0.008785,0.000000,72
AT4G03480,AT1G30650; AT2G46680,0.987003,AT2G38470; AT1G30650; AT2G46680,0.640048,AT2G38470; AT1G30650,0.624371,AT2G38470; AT2G46680,-0.539617,,0.000000,AT2G46680,0.624371,0.000441,0.001059,-0.000618,0.001018,3
AT1G16130,AT2G03340; AT1G13960; AT5G45050,-0.372897,AT2G03340; AT5G22570; AT5G45050,-0.348711,AT2G03340; AT5G45050; AT2G40750,-0.347527,AT2G03340; AT3G01970; AT5G45050,-0.347074,AT1G13960; AT5G45050; AT2G40750,-0.341321,AT2G46680,-0.009542,0.000258,0.000082,0.000176,0.000000,72


In [5]:
A = np.random.rand(5)
B = np.random.rand(5)


In [7]:
A-B

array([ 0.56661186, -0.18662621, -0.36603361,  0.48380576,  0.91346389])