In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import shared_memory
from multiprocessing.dummy import Pool
from sklearn.ensemble import RandomForestRegressor
import multiprocessing as mp
from itertools import chain, combinations
import sys
import os
from scipy import stats

from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestRegressor

import mp_run

import concurrent.futures
os.environ["OMP_NUM_THREADS"] = "1" # export OMP_NUM_THREADS=1
os.environ["OPENBLAS_NUM_THREADS"] = "1" # export OPENBLAS_NUM_THREADS=1
os.environ["MKL_NUM_THREADS"] = "1" # export MKL_NUM_THREADS=1
os.environ["VECLIB_MAXIMUM_THREADS"] = "1" # export VECLIB_MAXIMUM_THREADS=1
os.environ["NUMEXPR_NUM_THREADS"] = "1" # export NUMEXPR_NUM_THREADS=1

# styling:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use(['ggplot'])
sns.set_palette("deep")

In [3]:
# ---- Perturbation / model settings ----
# Each TF is shifted by (perturbation_factor * its standard deviation) when
# the perturbation inputs are built.
perturbation_factor = 3
# Number of regressor fits requested per target gene.
num_rf_predictors = 500

# ---- Gene selection ----
# Number of highest-variance non-TF genes kept as regression targets.
num_deg_targets = 2000
# TF of interest (presumably HB7 -- TODO confirm); not referenced by the
# cells visible in this part of the notebook.
target_tf = 'AT2G46680'

# ---- Run-time switches ----
induction_flag = 1
mp_threads = 20

# Command-line override, disabled while running as a notebook.
# NOTE: bool(sys.argv[1]) is True for ANY non-empty string (including "0"),
# so parse the flag explicitly if this is ever re-enabled.
# if (len(sys.argv)>=3):
#     induction_flag = bool(sys.argv[1])
#     mp_threads = int(sys.argv[2])

In [4]:
def choose_2_3(iterable):
    """Iterate over all 2-element, then all 3-element, combinations.

    Example: choose_2_3([1,2,3]) -->  (1,2) (1,3) (2,3) (1,2,3)

    The input is materialised once, so one-shot iterators are accepted.
    Returns a lazy iterator of tuples.
    """
    items = tuple(iterable)
    pairs = combinations(items, 2)
    triples = combinations(items, 3)
    return chain(pairs, triples)

def choose_2(iterable):
    """Iterate over all unordered pairs (2-element combinations) of ``iterable``.

    Example: choose_2([1,2,3]) -->  (1,2) (1,3) (2,3)

    Unlike ``choose_2_3``, no 3-element combinations are produced.
    Returns a lazy iterator of 2-tuples; it is empty when the input has fewer
    than two elements.
    """
    # combinations() snapshots its input on construction, so one-shot
    # iterators are handled without an explicit list() copy.
    return combinations(iterable, 2)

In [5]:
# Expression table (first column becomes the row index).
# filter out chloroplast and mitochondrial genes:
# only the first 28433 rows are kept, which relies on the row order of the
# file -- TODO confirm the cut-off against the annotation.
ts_df = pd.read_csv('data/GSE97500/expression.tsv', sep='\t', index_col=0).iloc[:28433]

# Sample metadata: keep the rows flagged `isTs`, then drop those whose
# `is1stLast` is 'f' (presumably the first time point of a series, which has
# no preceding sample -- TODO confirm).
meta_df = pd.read_csv('data/GSE97500/meta_data.tsv', sep='\t')
ts_exp_index = meta_df[meta_df['isTs']]
_is_target_row = ts_exp_index['is1stLast'] != 'f'
ts_exp_index_target = ts_exp_index.loc[_is_target_row, 'condName']
ts_exp_index_source = ts_exp_index.loc[_is_target_row, 'prevCol']

# TF identifiers are the part of column `V2` before the first underscore.
kirk_res = pd.read_csv('./data/dt_for_Bingran.tsv', sep='\t')
kirk_tf_set = {name.split('_')[0] for name in kirk_res['V2'].values}
# add HB7 and CRF4 to the list of TFs to be tested
test_tf_set = kirk_tf_set | {'AT2G46680', 'AT4G27950'}

In [6]:
# Split the expression table into candidate regulators (TF rows) and
# everything else (potential targets).
# The TF ids are passed as a list: pandas >= 2.0 raises TypeError when a set
# is used as a `.loc` indexer. Iterating an unmodified set yields the same
# order every time within a session, so the TF row order stays aligned with
# any `list(test_tf_set)` taken elsewhere in the notebook.
tf_ids = list(test_tf_set)
target_exp_df = ts_df.drop(index=tf_ids)
tf_exp_df = ts_df.loc[tf_ids]

# Targets: the `num_deg_targets` non-TF genes with the largest standard
# deviation across all columns of the expression table.
deg_targets = (
    target_exp_df.std(axis=1)
    .sort_values(ascending=False)
    .iloc[:num_deg_targets]
    .index
)

# Response matrix uses the `condName` columns, predictor matrix the matching
# `prevCol` columns, so row k of one pairs with row k of the other.
target_df = target_exp_df.loc[deg_targets, ts_exp_index_target]
tf_df = tf_exp_df[ts_exp_index_source]

In [7]:
# Ordered TF names. Taken from the rows of tf_df (rather than re-listing the
# set) because later cells address TFs by position in the predictor matrix,
# so the order here must be exactly the row order of tf_df.
tf_list = list(tf_df.index)

In [8]:
# One label "<tfA>_<tfB>" per unordered TF pair, in itertools.combinations
# order (the same order later used to enumerate index pairs of tf_list).
combo_names = ['_'.join((tf_a, tf_b)) for tf_a, tf_b in combinations(tf_list, 2)]
combo_length = len(combo_names)

In [9]:
# Result containers:
#   res_mat -- list that receives one array of per-pair p-values per target gene
#   res_df  -- empty frame indexed by the TF-pair labels
res_mat = list()
res_df = pd.DataFrame(index=combo_names)

In [7]:

# For every target gene: run `num_rf_predictors` fits through
# mp_run.regr_perturbation, collect the predicted responses to single-TF and
# double-TF perturbations, and test each TF pair for a joint effect that is
# larger than the sum of its two single effects.
#
# NOTE: each run of this cell appends len(deg_targets) rows to `res_mat`;
# re-create `res_mat` before re-running to avoid duplicated rows.

# Predictors: rows are samples, columns are TFs. Columns are pinned to the
# order of tf_list because the perturbation matrices are addressed by position.
ts_train_X = tf_df.T[tf_list]

input_mean = ts_train_X.mean()
input_std = ts_train_X.std()
perturbation_step = input_std.values * perturbation_factor

n_tf = len(tf_list)
tf_pairs = np.array(list(combinations(range(n_tf), 2)), dtype=int).reshape(-1, 2)
pair_a, pair_b = tf_pairs[:, 0], tf_pairs[:, 1]
n_pairs = len(tf_pairs)

# The perturbation inputs do not depend on the target gene, so they are built
# once. Row i of the single matrix is the mean profile with TF i raised;
# row k of the double matrix has both TFs of pair k raised.
perturbation_input_single = np.tile(input_mean.values, (n_tf, 1))
perturbation_input_single[np.arange(n_tf), np.arange(n_tf)] += perturbation_step

perturbation_input_double = np.tile(input_mean.values, (n_pairs, 1))
perturbation_input_double[np.arange(n_pairs), pair_a] += perturbation_step[pair_a]
perturbation_input_double[np.arange(n_pairs), pair_b] += perturbation_step[pair_b]

# One pool for the whole run (not one per gene), sized by the `mp_threads`
# setting instead of the default of one worker per CPU.
with mp.Pool(processes=mp_threads) as pool:
    for target_gene in tqdm(deg_targets):
        ts_train_y = target_df.loc[target_gene]

        func = partial(mp_run.regr_perturbation, ts_train_X, ts_train_y,
                       perturbation_input_single, perturbation_input_double)
        results = pool.map(func, range(num_rf_predictors))

        # Each result is indexed as (single-perturbation output,
        # double-perturbation output); stack them with one row per fit.
        single_results = np.array([result[0] for result in results])
        double_results = np.array([result[1] for result in results])

        # Paired t-test per TF pair (along axis 0, i.e. across fits):
        # |double effect| versus |single effect a| + |single effect b|.
        additive_effects = np.abs(single_results[:, pair_a]) + np.abs(single_results[:, pair_b])
        t_vals, p_vals = stats.ttest_rel(np.abs(double_results), additive_effects, axis=0)

        # A pair is reported as non-significant (p = 1) when the joint effect
        # is smaller than the additive one, or when the two mean single
        # effects have opposite signs.
        single_means = single_results.mean(axis=0)
        rejected = (t_vals < 0) | (single_means[pair_a] * single_means[pair_b] < 0)
        p_val_list = np.where(rejected, 1.0, p_vals)

        res_mat.append(p_val_list)