In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import shared_memory
from multiprocessing.dummy import Pool
from sklearn.ensemble import RandomForestRegressor
import multiprocessing as mp
from itertools import chain, combinations
import sys
import os
from scipy import stats

from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestRegressor

import mp_run

import concurrent.futures
os.environ["OMP_NUM_THREADS"] = "1" # export OMP_NUM_THREADS=1
os.environ["OPENBLAS_NUM_THREADS"] = "1" # export OPENBLAS_NUM_THREADS=1
os.environ["MKL_NUM_THREADS"] = "1" # export MKL_NUM_THREADS=1
os.environ["VECLIB_MAXIMUM_THREADS"] = "1" # export VECLIB_MAXIMUM_THREADS=1
os.environ["NUMEXPR_NUM_THREADS"] = "1" # export NUMEXPR_NUM_THREADS=1

# styling:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use(['ggplot'])
sns.set_palette("deep")

In [2]:
perturbation_factor = 3
num_rf_predictors = 500

num_deg_targets = 2000
target_tf = 'AT2G46680'

induction_flag = 1
mp_threads = 20
# if (len(sys.argv)>=3):
#     induction_flag = bool(sys.argv[1])
#     mp_threads = int(sys.argv[2])

In [3]:
def choose_2_3(iterable):
    "powerset([1,2,3]) -->  (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(2,4))

def choose_2(iterable):
    "powerset([1,2,3]) -->  (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(2,3))

In [4]:
ts_df = pd.read_csv('data/GSE97500/expression.tsv', sep='\t', index_col=0)

# filter out chloroplast and mitochondrial genes
ts_df = ts_df[:28433]

meta_df = pd.read_csv('data/GSE97500/meta_data.tsv', sep='\t')
ts_exp_index = meta_df[meta_df['isTs']]
ts_exp_index_target =  ts_exp_index[ts_exp_index['is1stLast'] != 'f'].condName
ts_exp_index_source =  ts_exp_index[ts_exp_index['is1stLast'] != 'f'].prevCol

kirk_res = pd.read_csv('./data/dt_for_Bingran.tsv', sep='\t')
kirk_tf_set = set([name.split('_')[0] for name in kirk_res['V2'].values])
# add HB7 and CRF4 to the list of TFs to be tested
test_tf_set = kirk_tf_set.union(set(['AT2G46680', 'AT4G27950']))

In [5]:
target_exp_df = ts_df.drop(labels=test_tf_set)
tf_exp_df = ts_df.loc[test_tf_set]

deg_targets = target_exp_df.std(axis=1).sort_values(ascending=False)[:num_deg_targets].index
target_df = target_exp_df.loc[deg_targets][ts_exp_index_target]
tf_df = tf_exp_df[ts_exp_index_source]

In [6]:
target_exp_df_stats = pd.DataFrame(index=target_exp_df.index, columns=['exp_std'], data=target_exp_df.std(axis=1).values)

In [7]:
target_exp_df_stats.median(axis=0)

exp_std    176.26594
dtype: float64

In [12]:
target_exp_df_stats = target_exp_df_stats.sort_values(by='exp_std', ascending=False)
deg_exp_df_stats = target_exp_df_stats[:num_deg_targets]

In [13]:
target_df = target_exp_df.loc[deg_exp_df_stats.index][ts_exp_index_target]
tf_df = tf_exp_df[ts_exp_index_source]

val_error_mean_list = []
val_error_std_list = []

for target_gene in tqdm(deg_exp_df_stats.index[:5]):
    ts_train_X = tf_df.T
    ts_train_y = target_df.loc[target_gene]

    func = partial(mp_run.validation_measure, ts_train_X, ts_train_y)
    with mp.Pool() as pool:
        results = pool.map(func, range(num_rf_predictors))
    
    val_error_mean_list.append(np.mean(results))
    val_error_std_list.append(np.std(results))
    

  0%|          | 3/2000 [00:15<2:55:40,  5.28s/it]


KeyboardInterrupt: 

In [22]:
tf_list = list(test_tf_set)
combo_names = []
combo_length = 0
for i, j in combinations(range(len(tf_list)),2):
    combo_names.append(tf_list[i] + '_' + tf_list[j])
    combo_length += 1

In [12]:
deg_targets

Index(['AT3G41768', 'AT4G21960', 'AT1G13440', 'AT1G77120', 'AT1G21310',
       'AT4G13940', 'AT5G53460', 'AT4G05320', 'AT2G43150', 'AT5G02500',
       ...
       'AT5G59090', 'AT1G67350', 'AT4G24780', 'AT3G27380', 'AT3G16400',
       'AT1G80360', 'AT5G37370', 'AT5G65390', 'AT3G51600', 'AT4G11850'],
      dtype='object', length=2000)

In [None]:
# synergy test

res_df = pd.DataFrame(index=combo_names)
res_mat = []


for target_gene in tqdm(deg_targets):
    
    ts_train_X = tf_df.T

    ts_train_y = target_df.loc[target_gene]

    input_mean = ts_train_X.mean()
    input_std = ts_train_X.std()

    # 

    combos = choose_2(range(len(tf_list)))

    perturbation_input_single = input_mean.copy()
    perturbation_input_double = input_mean.copy()
    perturbation_input_single = np.tile(perturbation_input_single.values, (len(tf_list),1))
    perturbation_input_double = np.tile(perturbation_input_double.values, (combo_length,1))

    for i, tf_name in enumerate(tf_list):
        perturbation_input_single[i][i] += input_std[tf_name]*perturbation_factor
    for k, (i,j) in enumerate(combinations(range(len(tf_list)),2)):
        perturbation_input_double[k][i] += input_std[tf_list[i]]*perturbation_factor
        perturbation_input_double[k][j] += input_std[tf_list[j]]*perturbation_factor

    func = partial(mp_run.regr_perturbation, ts_train_X, ts_train_y, perturbation_input_single, perturbation_input_double)

    with mp.Pool() as pool:
        results = pool.map(func, range(num_rf_predictors))

    single_results = []
    double_results = []
    for result in results:
        single_results.append(result[0])
        double_results.append(result[1])
    single_results = np.array(single_results)
    double_results = np.array(double_results)

    p_val_list = []
    for k, (i,j) in enumerate(combinations(range(len(tf_list)),2)):
        double_effects = double_results[:,k]
        single_effect_a = single_results[:,i]
        single_effect_b = single_results[:,j]
        t_val, p_val = stats.ttest_rel(np.abs(double_effects), np.abs(single_effect_a) + np.abs(single_effect_b))
        if (t_val < 0) or (single_effect_a.mean()*single_effect_b.mean() < 0):
            p_val = 1
        p_val_list.append(p_val)
    p_val_list = np.array(p_val_list)
    res_mat.append(p_val_list)

res_mat_np = np.array(res_mat)

 54%|████████████████████████████████████▉                               | 1085/2000 [46:54<39:36,  2.60s/it]

In [12]:
res_df = pd.DataFrame(index=deg_targets, columns=combo_names, data=res_mat)
res_df_count = (res_df < 0.05).sum()

In [13]:
perturbation_res = []
for i, row in kirk_res.iterrows():
    tf_a = row['V1'].split('_')[0]
    tf_b = row['V2'].split('_')[0]
    if (tf_a + '_' + tf_b) in res_df_count.index:
        perturbation_res.append(res_df_count.loc[tf_a + '_' + tf_b])
    elif (tf_b + '_' + tf_a) in res_df_count.index:
        perturbation_res.append(res_df_count.loc[tf_b + '_' + tf_a])
    else:
        perturbation_res.append(np.nan)

In [14]:
kirk_res['synergy_simu'] = perturbation_res

In [15]:
kirk_res.to_csv('./output/kirk_add_xb.csv', index=False)

In [16]:
res_df.to_csv('./output/xb_kirk_tfs.csv')

In [17]:
res_df_rf = pd.read_csv('./output/rf_kirk_tfs.csv', index_col=0)
res_df_rf

Unnamed: 0,AT5G15830_AT2G01930,AT5G15830_AT5G65210,AT5G15830_AT1G31320,AT5G15830_AT5G22570,AT5G15830_AT2G46680,AT5G15830_AT2G41310,AT5G15830_AT4G27330,AT5G15830_AT1G66600,AT5G15830_AT4G27950,AT5G15830_AT2G32700,...,AT1G69180_AT2G42280,AT1G69180_AT1G01060,AT1G69180_AT2G42430,AT1G69180_AT2G41835,AT2G42280_AT1G01060,AT2G42280_AT2G42430,AT2G42280_AT2G41835,AT1G01060_AT2G42430,AT1G01060_AT2G41835,AT2G42430_AT2G41835
AT3G41768,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.000000e+00,1.000000e+00,1.0,1.0,1.000000,1.0
AT4G21960,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.000000e+00,9.623400e-18,1.0,1.0,1.000000,1.0
AT1G13440,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,6.215590e-22,1.000000e+00,1.0,1.0,1.000000,1.0
AT1G77120,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,7.172883e-05,1.000000e+00,1.0,1.0,0.790494,1.0
AT1G21310,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.000000e+00,1.000000e+00,1.0,1.0,1.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AT1G80360,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.000000e+00,4.218594e-26,1.0,1.0,1.000000,1.0
AT5G37370,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.000000e+00,1.000000e+00,1.0,1.0,1.000000,1.0
AT5G65390,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.000000e+00,1.000000e+00,1.0,1.0,1.000000,1.0
AT3G51600,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.000000e+00,1.000000e+00,1.0,1.0,1.000000,1.0


In [18]:
res_df

Unnamed: 0,AT1G54690_AT1G32540,AT1G54690_AT2G01930,AT1G54690_AT4G10920,AT1G54690_AT1G22070,AT1G54690_AT4G37790,AT1G54690_AT5G56840,AT1G54690_AT4G31800,AT1G54690_AT1G66600,AT1G54690_AT2G42280,AT1G54690_AT1G14687,...,AT1G19350_AT4G27950,AT1G19350_AT2G42430,AT1G19350_AT2G46680,AT1G19350_AT3G46590,AT4G27950_AT2G42430,AT4G27950_AT2G46680,AT4G27950_AT3G46590,AT2G42430_AT2G46680,AT2G42430_AT3G46590,AT2G46680_AT3G46590
AT3G41768,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,0.0,...,1.0,1.0,,,,1.0,,1.0,,
AT4G21960,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,,,,,,
AT1G13440,,1.0,,1.0,,1.0,1.0,1.0,1.0,,...,,1.0,,,1.0,,,1.0,1.0,
AT1G77120,,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,0.0,0.0,1.0,1.0,,1.0,0.0,1.0,1.0
AT1G21310,1.0,,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,,,,1.0,0.0,0.0,,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AT1G80360,1.0,1.0,1.0,1.0,1.0,1.0,0.0,,1.0,1.0,...,,,,,0.0,0.0,,1.0,,
AT5G37370,,1.0,,1.0,,1.0,,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0
AT5G65390,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,,...,1.0,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0
AT3G51600,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,,1.0,1.0,1.0,,1.0


In [23]:
res_df_comb = np.logical_and(res_df < 0.05, res_df_rf < 0.05)

In [24]:
res_df_count = (res_df_comb).sum()

In [25]:
res_df_count

AT1G54690_AT1G32540     0
AT1G54690_AT2G01930     9
AT1G54690_AT4G10920    15
AT1G54690_AT1G22070     0
AT1G54690_AT4G37790     0
                       ..
AT4G27950_AT2G46680    43
AT4G27950_AT3G46590     3
AT2G42430_AT2G46680     2
AT2G42430_AT3G46590     3
AT2G46680_AT3G46590     5
Length: 741, dtype: int64

In [26]:
perturbation_res = []
for i, row in kirk_res.iterrows():
    tf_a = row['V1'].split('_')[0]
    tf_b = row['V2'].split('_')[0]
    if (tf_a + '_' + tf_b) in res_df_count.index:
        perturbation_res.append(res_df_count.loc[tf_a + '_' + tf_b])
    elif (tf_b + '_' + tf_a) in res_df_count.index:
        perturbation_res.append(res_df_count.loc[tf_b + '_' + tf_a])
    else:
        perturbation_res.append(np.nan)

In [28]:
kirk_res['synergy_simu'] = perturbation_res

In [29]:
kirk_res.to_csv('./output/kirk_add_comb.csv', index=False)

In [32]:
res_df_count.loc['AT4G27950_AT2G46680']

43

In [33]:
res_df_count.to_csv('./output/tf_rank.csv')