In [98]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

from sklearn.model_selection import train_test_split
from scipy.stats import ttest_rel
from itertools import chain, combinations

In [99]:
perturbation_factor = 3.0
def choose_2_3(iterable):
    "powerset([1,2,3]) -->  (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(2,4))

In [100]:
exp_name_df = pd.read_csv('./data/bsubtilis/exp_names.tsv', index_col=0, sep='\t', names=['name'])
network_df = pd.read_csv('./data/bsubtilis/gold_standard.tsv', sep='\t', index_col=0)
multi_tf_network_df = network_df.loc[(network_df.sum(axis=1).sort_values(ascending=False) > 4)]
multi_tf_targets = multi_tf_network_df.index
tf_names = multi_tf_network_df.columns.values

In [101]:
exp_genes_set = set(multi_tf_targets).union(set(tf_names))
common_tf_gene = set(multi_tf_targets).intersection(set(tf_names))
for subdir, dirs, files in os.walk('./data/bsubtilis/seq_data/'):
    for file in files:
        df = pd.read_csv(os.path.join(subdir, file), index_col=0, sep='\t', names=['exp'])
        exp_genes_set = exp_genes_set.intersection(set(df.index))

In [102]:
exp_df = pd.DataFrame(index=exp_genes_set)

In [103]:
for subdir, dirs, files in os.walk('./data/bsubtilis/seq_data/'):
    for code_name, exp_name in zip(exp_name_df.index, exp_name_df['name'].values):
        df = pd.read_csv(os.path.join(subdir, code_name+'-tbl-1.txt'), index_col=0, sep='\t', names=['exp'])
        exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values

  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp'].values
  exp_df[exp_name] = df.loc[list(exp_genes_set)]['exp']

In [104]:
exp_df = exp_df.T

In [105]:
exp_df.to_csv('./data/bsubtilis/expression.csv')

In [106]:
target_set = set(multi_tf_targets).difference(set(tf_names))
tf_names = np.array(list(set(tf_names).intersection(exp_genes_set)))

In [107]:
from xgboost import XGBRFRegressor

In [108]:
target_set = (set(multi_tf_targets).difference(set(tf_names)))
score_diff_list = []
pval_list = []
regr_score_list = []
regr_network_score_list = []
for target in tqdm(target_set):
    network = network_df.loc[target]
    network_tf = network[network>0].index
    y = exp_df[target]
    X = exp_df[tf_names]
    regr_score = np.empty(0)
    regr_network_score = np.empty(0)
    regr_error_all = np.empty(0)
    regr_network_error_all = np.empty(0)
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=i)
        X_train_network = X_train[network_tf]
        X_test_network = X_test[network_tf]
        regr = XGBRFRegressor()
        regr_network = XGBRFRegressor()
        regr.fit(X_train, y_train)
        regr_network.fit(X_train_network, y_train)
        # regr_score_list.append(regr.score(X_test, y_test))
        # regr_network_score_list.append(regr_network.score(X_test_network, y_test))
        # score_diff_list.append(regr.score(X_test, y_test) - regr_network.score(X_test_network, y_test))
        regr_error = np.square(regr.predict(X_test)-y_test).values
        regr_network_error = np.square(regr_network.predict(X_test_network)-y_test).values
        regr_error_all = np.concatenate([regr_error_all, regr_error])
        regr_network_error_all = np.concatenate([regr_network_error_all, regr_network_error])
    regr_score_list.append(np.mean(regr_error_all))
    regr_network_score_list.append(np.mean(regr_network_error_all))
    score_diff_list.append(np.mean(regr_network_error_all) - np.mean(regr_error_all))
    t_stats, pval = ttest_rel(regr_error_all, regr_network_error_all)
    pval_list.append(pval)
    

100%|██████████| 24/24 [01:13<00:00,  3.08s/it]


In [109]:
model_comp_df = pd.DataFrame(index=list(target_set))
model_comp_df['network_model_score'] = regr_network_score_list
model_comp_df['all_model_score'] = regr_score_list
model_comp_df['score_diff'] = score_diff_list
model_comp_df['score_diff_pval'] = pval_list

In [110]:
model_comp_df = model_comp_df.sort_values(by=['all_model_score'], ascending=[True])
model_comp_df

Unnamed: 0,network_model_score,all_model_score,score_diff,score_diff_pval
BSU29100,0.224275,0.191818,0.032457,0.003233958
BSU00620,0.418906,0.203892,0.215013,2.288739e-11
BSU25840,0.343468,0.224941,0.118527,1.543947e-11
BSU03290,0.341852,0.281599,0.060253,0.0006677751
BSU38540,0.34304,0.297596,0.045444,0.00214806
BSU38530,0.386466,0.311057,0.075409,7.254695e-07
BSU36650,0.591903,0.327349,0.264554,1.944807e-14
BSU11490,0.692294,0.343031,0.349263,5.785387e-15
BSU36640,0.55709,0.365481,0.191609,1.066342e-11
BSU19410,0.484495,0.384823,0.099672,0.001637254


In [111]:
target = list(target_set)[0]

In [144]:
for target in target_set:
    y = exp_df[target]
    X = exp_df[tf_names]
    y_std = y.std()
    input_mean = X.mean()
    input_std = X.std()
    network = network_df.loc[target]
    network_tf = network[network>0].index.values
    X_network = X[network_tf]
    input_mean_network = X_network.mean()
    input_std_network = X_network.std()


    perturbation_list = list(choose_2_3(network_tf))
    perturbation_list_names = ['; '.join(perturbation_genes) for perturbation_genes in perturbation_list]
    perturbation_measure_df = pd.DataFrame(index=perturbation_list_names)
    perturbation_additive_measure_df = pd.DataFrame(index=perturbation_list_names)

    for i in tqdm(range(1000)):

        # regr = XGBRFRegressor()
        regr_network = XGBRFRegressor(random_state=i)
        # regr.fit(X, y)
        regr_network.fit(X_network, y)

        base_prediction = regr_network.predict(np.array(input_mean_network).reshape(1,-1))[0]

        single_effect_dict = dict()
        for tf in network_tf:
            perturbation_input = input_mean_network.copy()
            perturbation_input[tf] += input_std[tf] * perturbation_factor
            perturbation_prediction = regr_network.predict(np.array(perturbation_input).reshape(1,-1))[0]
            perturbation_measure = (perturbation_prediction - base_prediction)/y_std
            single_effect_dict[tf] = perturbation_measure

        perturbation_result_list = []
        perturbation_additive_result_list = []

        for perturbation_genes in perturbation_list:
            perturbation_input = input_mean_network.copy()
            additive_effects = 0
            for gene in perturbation_genes:
                perturbation_input[gene] += input_std[gene] * perturbation_factor
                additive_effects += single_effect_dict[gene]
            perturbation_prediction = regr_network.predict(np.array(perturbation_input).reshape(1,-1))[0]
            perturbation_measure = (perturbation_prediction - base_prediction)/y_std
            perturbation_result_list.append(perturbation_measure)
            perturbation_additive_result_list.append(additive_effects)
        perturbation_measure_df[i] = perturbation_result_list
        perturbation_measure_df = perturbation_measure_df.copy()
        perturbation_additive_measure_df[i] = perturbation_additive_result_list
        perturbation_additive_measure_df = perturbation_additive_measure_df.copy()
    measure_diff_df = pd.DataFrame(index=perturbation_list_names)
    pval_list = []
    measure_diff_list = []
    for i in perturbation_measure_df.index:
        perturbation_measure = perturbation_measure_df.loc[i].abs()
        perturbation_additive_measure = perturbation_additive_measure_df.loc[i].abs()
        mean_measure_diff = perturbation_measure.mean() - perturbation_additive_measure.mean()
        t_stats, pval = ttest_rel(perturbation_measure, perturbation_additive_measure)
        measure_diff_list.append(mean_measure_diff)
        pval_list.append(pval)
    measure_diff_df['mean_measure_diff'] = measure_diff_list
    measure_diff_df['pval'] = pval_list
    measure_diff_df.sort_values(by='mean_measure_diff', ascending=False).to_csv('./output/bsubtilis/'+target+'_tf_synergy_measure.csv')



  1%|          | 8/1000 [00:00<01:34, 10.53it/s]

In [138]:
measure_diff_df = pd.DataFrame(index=perturbation_list_names)
pval_list = []
measure_diff_list = []
for i in perturbation_measure_df.index:
    perturbation_measure = perturbation_measure_df.loc[i].abs()
    perturbation_additive_measure = perturbation_additive_measure_df.loc[i].abs()
    mean_measure_diff = perturbation_measure.mean() - perturbation_additive_measure.mean()
    t_stats, pval = ttest_rel(perturbation_measure, perturbation_additive_measure)
    measure_diff_list.append(mean_measure_diff)
    pval_list.append(pval)
measure_diff_df['mean_measure_diff'] = measure_diff_list
measure_diff_df['pval'] = pval_list

In [140]:
measure_diff_df.sort_values(by='mean_measure_diff', ascending=False)

Unnamed: 0,mean_measure_diff,pval
BSU23520; BSU09380; BSU23120,0.583715,3.008114e-67
BSU23520; BSU23120,0.515578,2.570375e-65
BSU23520; BSU23120; BSU13310,0.101323,1.58315e-10
BSU09380; BSU23120,0.044766,1.902437e-15
BSU37290; BSU13310,0.01034,6.218919e-09
BSU09380; BSU13310,0.007167,1.738509e-06
BSU23520; BSU13310,0.000212,0.8880109
BSU37290; BSU09380; BSU13310,-0.036096,1.114161e-12
BSU37290; BSU09380,-0.039041,3.2949930000000004e-17
BSU37290; BSU23520; BSU23120,-0.064678,3.351103e-05
