In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import shared_memory
from multiprocessing.dummy import Pool
from sklearn.ensemble import RandomForestRegressor
import multiprocessing as mp
from itertools import chain, combinations

import mp_run

In [2]:
perturbation_factor = 3
num_rf_predictors = 500

In [3]:
tf_df = pd.read_csv('data/Ath_TF_list.txt', sep='\t')
tf_list = tf_df['Gene_ID']s

In [4]:
def choose_2_3(iterable):
    "powerset([1,2,3]) -->  (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(2,4))

In [5]:
deg_df = pd.read_csv('data/SS/92_DEG_Clusters.csv', index_col=0)
ranking_path = 'output/GSE97500/'
deg_genes = deg_df.index

In [6]:
# deg genes for presentation only:
all_common_tf_set = set(tf_list)
deg_genes = ['AT1G77760', 'AT3G44300', 'AT1G69870', 'AT2G46680', 'AT2G46820', 'AT1G14040']
deg_genes_top_influencer_lists = []
data_sources = ['data/GSE97500/', 'data/GSE111062_RAW/', 'data/GSE158898/']
ranking_sources = ['output/GSE97500/', 'output/GSE111062/', 'output/GSE158898/']
for gene in deg_genes:
    top_set = set()
    for source in ranking_sources:
        df = pd.read_csv(source+gene+'_rankings.csv', index_col=0, names=['impact']).sort_values(by='impact',ascending=False)
        top_set = top_set.union(set(df.head(100).index))
        all_common_tf_set = all_common_tf_set.intersection(df.index)
    deg_genes_top_influencer_lists.append(list(top_set.intersection(all_common_tf_set)))

In [7]:
ts_df = pd.read_csv('data/GSE97500/expression.tsv', sep='\t', index_col=0)

In [90]:
from sklearn.preprocessing import Normalizer

In [92]:
dfs = [pd.read_csv(f, sep='\t', index_col=0) for f in ['data/GSE111062_RAW/expression.tsv', 'data/GSE158898/expression.tsv']]

common_index = pd.Series(list(set(dfs[0].index).intersection(set(dfs[1].index))))
# dfs = [df.div(df.sum(axis=1), axis=0, fill_value=0.00001) for df in dfs]

dfs[0].iloc[:,:] = Normalizer(norm='l1').fit_transform(dfs[0])
dfs[1].iloc[:,:] = Normalizer(norm='l1').fit_transform(dfs[1])
# Combine the list of dataframes
ss_df = pd.concat([dfs[0].loc[common_index], dfs[1].loc[common_index]], axis=1)

In [8]:
target_genes = pd.Series(list(set(deg_genes).intersection(set(ts_df.index))))

In [9]:
meta_df = pd.read_csv('data/GSE97500/meta_data.tsv', sep='\t')
ts_exp_index = meta_df[meta_df['isTs']]
ts_exp_index_target =  ts_exp_index[ts_exp_index['is1stLast'] != 'f'].condName
ts_exp_index_source =  ts_exp_index[ts_exp_index['is1stLast'] != 'f'].prevCol
# ts_exp_index_target =  ts_exp_index.condName
# ts_exp_index_source =  ts_exp_index.condName
regulator_gene_index = ts_df.index
regulator_gene_index = pd.Series(list(set(tf_list).intersection(set(regulator_gene_index))))


In [10]:
importance_df_list = []

In [129]:

ts_train_y_list = ts_df[ts_exp_index_target]

result_list = []
result_measure_list = []

for target_gene, top_influence_genes in zip(target_genes, deg_genes_top_influencer_lists):

    ts_train_y = ts_train_y_list.loc[target_gene]

    ts_train_X = ts_df[ts_exp_index_source].T[top_influence_genes]

    ss_train_y = ss_df.loc[target_gene]
    ss_train_X = ss_df.T[top_influence_genes]
    
    # importance_df = pd.read_csv(ranking_path+target_gene+'_rankings.csv',index_col=0, names=['impact']).sort_values('impact')
    # mean_importance = importance_df.impact.values
    # importance_df_list.append(mean_importance)
    data_mean = ts_df.T[top_influence_genes].mean()
    data_std = ts_df.T[top_influence_genes].std()
    regr_ts = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
    regr_ts = regr_ts.fit(ts_train_X, ts_train_y)


    base_prediction = regr_ts.predict(np.array(data_mean).reshape(1,-1))[0]
    y_std = ts_df.T.std()[target_gene]

    perturbation_input_mat = []
    for regulator in top_influence_genes:
        perturbation_input = data_mean.copy()
        perturbation_input[regulator] += data_std[regulator] * perturbation_factor
        perturbation_input_mat.append(perturbation_input.values)
    single_result_measure_list = (regr_ts.predict(perturbation_input_mat) - base_prediction)/y_std

    perturbation_result_list = []
    perturbation_list = list(choose_2_3(top_influence_genes))
    perturbation_single_impact_list = list(choose_2_3(single_result_measure_list))
    perturbation_list_names = ['; '.join(perturbation_genes) for perturbation_genes in perturbation_list]
    perturbation_input_mat = []
    single_max_impact = []
    for single_impacts in perturbation_single_impact_list:
        single_max_impact.append(np.max(single_impacts))

    for perturbation_genes in tqdm(perturbation_list):
        perturbation_input = data_mean.copy()
        for gene in perturbation_genes:
            perturbation_input[gene] += data_std[gene] * perturbation_factor
        perturbation_input_mat.append(perturbation_input.values)
    perturbation_input_mat = np.array(perturbation_input_mat)
    result_measure_list = (regr_ts.predict(perturbation_input_mat) - base_prediction)/y_std
    out_df = pd.DataFrame(index=perturbation_list_names, data=result_measure_list, columns=['ts_impact'])
    out_df['ts_impact_premium'] = np.subtract(result_measure_list, np.array(single_max_impact))

    # steady state data part
    data_mean = ss_df.T[top_influence_genes].mean()
    data_std = ss_df.T[top_influence_genes].std()
    regr_ss = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
    regr_ss = regr_ss.fit(ss_train_X, ss_train_y)


    base_prediction = regr_ss.predict(np.array(data_mean).reshape(1,-1))[0]
    y_std = ss_df.T.std()[target_gene]

    perturbation_input_mat = []
    for regulator in top_influence_genes:
        perturbation_input = data_mean.copy()
        perturbation_input[regulator] += data_std[regulator] * perturbation_factor
        perturbation_input_mat.append(perturbation_input.values)
    single_result_measure_list = (regr_ss.predict(perturbation_input_mat) - base_prediction)/y_std

    perturbation_result_list = []
    # perturbation_list = list(choose_2_3(top_influence_genes))
    # perturbation_single_impact_list = list(choose_2_3(single_result_measure_list))
    perturbation_list_names = ['; '.join(perturbation_genes) for perturbation_genes in perturbation_list]
    perturbation_input_mat = []
    single_max_impact = []
    for single_impacts in perturbation_single_impact_list:
        single_max_impact.append(np.max(single_impacts))

    for perturbation_genes in tqdm(perturbation_list):
        perturbation_input = data_mean.copy()
        for gene in perturbation_genes:
            perturbation_input[gene] += data_std[gene] * perturbation_factor
        perturbation_input_mat.append(perturbation_input.values)
    perturbation_input_mat = np.array(perturbation_input_mat)
    result_measure_list = (regr_ss.predict(perturbation_input_mat) - base_prediction)/y_std
    out_df['ss_impact'] = result_measure_list
    out_df['ss_impact_premium'] = np.subtract(result_measure_list, np.array(single_max_impact))

    out_df = out_df[(out_df['ts_impact_premium'] >= 0) & (out_df['ss_impact_premium'] >= 0)]
    out_df['average_premium'] = (out_df.ts_impact_premium + out_df.ss_impact_premium)/2
    out_df['average_premium_rank'] = (np.argsort(np.argsort(out_df.ts_impact_premium)) + np.argsort(np.argsort(out_df.ss_impact_premium)))/2
    out_df.sort_values('average_premium_rank', ascending=False).to_csv('output/'+target_gene+'_synergy_ranking.csv')

    # out_df.sort_values(by='impact', ascending=False).to_csv(ranking_path+'induction_'+target_gene+'_rankings.csv')

    # repression part
    # top_influence_genes = train_gene_index[np.argsort(mean_importance)[:100]]
    # importance_df_list.append(mean_importance)
    # data_mean = ts_df.T[top_influence_genes].mean()
    # data_std = ts_df.T[top_influence_genes].std()
    # regr = RandomForestRegressor(random_state=42, warm_start=True, n_estimators=300, n_jobs=20)
    # ts_train_X = ts_df[ts_exp_index_source].T[top_influence_genes]
    # regr = regr.fit(ts_train_X, ts_train_y)

    # base_prediction = regr.predict(np.array(data_mean).reshape(1,-1))[0]
    # y_std = ts_df.T.std()[target_gene]
    # perturbation_list = list(choose_2_3(top_influence_genes))

    # perturbation_result_list = []
    # perturbation_list_names = ['; '.join(perturbation_genes) for perturbation_genes in perturbation_list]
    # perturbation_input_mat = []
    # for perturbation_genes in perturbation_list:
    #     perturbation_input = data_mean.copy()
    #     for gene in perturbation_genes:
    #         perturbation_input[gene] += data_std[gene] * perturbation_factor
    #     perturbation_input_mat.append(perturbation_input.values)
    # perturbation_input_mat = np.array(perturbation_input_mat)
    # result_measure_list = (regr.predict(perturbation_input_mat) - base_prediction)/y_std
    # out_df = pd.DataFrame(index=perturbation_list_names, data=result_measure_list, columns=['impact'])
    # out_df.sort_values(by='impact').to_csv(ranking_path+'repression_'+target_gene+'_rankings.csv')

    #     perturbation_prediction = regr.predict(np.array(perturbation_input).reshape(1,-1))[0]
    #     perturbation_measure = (perturbation_prediction - base_prediction)/y_std
    #     perturbation_result_list.append(perturbation_measure)
    # result_list.append(np.array(perturbation_list_names)[np.argsort(perturbation_result_list)[::-1][:5]])
    # result_measure_list.append(np.array(perturbation_result_list)[np.argsort(perturbation_result_list)[::-1][:5]])





100%|██████████| 2699004/2699004 [02:22<00:00, 18963.45it/s]
100%|██████████| 2699004/2699004 [02:24<00:00, 18615.05it/s]
100%|██████████| 1456935/1456935 [01:17<00:00, 18807.52it/s]
100%|██████████| 1456935/1456935 [01:19<00:00, 18440.06it/s]
100%|██████████| 2635500/2635500 [02:23<00:00, 18369.02it/s]
100%|██████████| 2635500/2635500 [02:24<00:00, 18181.18it/s]
100%|██████████| 3898895/3898895 [03:36<00:00, 18036.06it/s]
100%|██████████| 3898895/3898895 [03:35<00:00, 18067.43it/s]
100%|██████████| 4499950/4499950 [04:05<00:00, 18353.42it/s]
100%|██████████| 4499950/4499950 [04:08<00:00, 18122.66it/s]
100%|██████████| 3697960/3697960 [03:17<00:00, 18713.39it/s]
100%|██████████| 3697960/3697960 [03:22<00:00, 18283.95it/s]


In [124]:
out_df = out_df[(out_df['ts_impact_premium'] >= 0) & (out_df['ss_impact_premium'] >= 0)]

In [125]:
out_df['average_premium'] = (out_df.ts_impact_premium + out_df.ss_impact_premium)/2
out_df['average_premium_rank'] = (np.argsort(np.argsort(out_df.ts_impact_premium)) + np.argsort(np.argsort(out_df.ss_impact_premium)))/2

In [128]:
out_df.sort_values('average_premium_rank', ascending=False).to_csv('output/'+target_gene+'_synergy_ranking.csv')

In [114]:
test = np.array([3,6,1,2,8,5])
np.argsort(np.argsort(test)[::-1])

array([3, 1, 5, 4, 0, 2])

In [109]:
np.argsort(test)

array([2, 3, 0, 5, 1, 4])

In [96]:
out_df

Unnamed: 0,ts_impact,ts_impact_premium,ss_impact,ss_impact_premium
AT5G05790; AT3G15210,0.001590,0.000239,62167.192396,62167.191044
AT5G05790; AT1G19000,0.001352,0.000000,62167.192396,62167.191044
AT5G05790; AT1G80840,0.001400,0.000049,62167.192396,62167.191044
AT5G05790; AT4G25470,0.002521,0.001170,62167.192396,62167.191044
AT5G05790; AT5G52510,-0.005641,-0.006993,62167.192396,62167.191044
...,...,...,...,...
AT4G36780; AT5G60690; AT4G37260,0.006601,0.000724,62167.192396,62167.186520
AT2G40140; AT4G21750; AT5G60690,0.005532,0.000008,62167.192396,62167.186872
AT2G40140; AT4G21750; AT4G37260,-0.002287,-0.002295,62167.192396,62167.192388
AT2G40140; AT5G60690; AT4G37260,0.003229,-0.002295,62167.192396,62167.186872


In [63]:
test = np.subtract(result_measure_list, np.array(single_max_impact))
test.argmax()

1757861

In [64]:
result_measure_list[1757861]

0.1003040165929691

In [66]:
np.max(result_measure_list)

0.1003040165929691

In [65]:
single_max_impact[1757861]

0.035439268642510874

In [56]:
np.min(single_max_impact)

-0.0548930272772657

In [36]:
for i in choose_2_3(single_perturbation_df.impact.values):
    print(np.max(i))
    break

0.0013516861626811376


In [16]:
perturbation_genes

('AT5G05790', 'AT3G15210')

In [20]:
single_perturbation_df.loc[list(perturbation_genes)].max()

impact    0.001352
dtype: float64

In [45]:
perturbation_input_mat = np.array(perturbation_input_mat)

In [54]:
result_measure_list = (regr.predict(perturbation_input_mat) - base_prediction)/y_std


In [57]:

out_df = pd.DataFrame(index=perturbation_list_names, data=result_measure_list, columns=['impact'])


In [59]:
out_df.sort_values(by='impact', ascending=False)

Unnamed: 0,impact
AT2G23340; AT1G72740; AT2G32250,0.986700
AT5G45710; AT2G23340; AT2G32250,0.985742
AT2G23340; AT2G32250; AT3G06410,0.982229
AT2G23340; AT4G24240; AT2G32250,0.975762
AT2G23340; AT3G09600; AT2G32250,0.964266
...,...
AT2G40750; AT5G67450; AT4G25400,-0.035687
AT2G40750; AT4G25400; AT1G76890,-0.036086
AT2G40750; AT4G24540; AT4G01500,-0.038401
AT2G40750; AT4G01500; AT4G25400,-0.038960


In [None]:
result_measure_list = np.array(result_measure_list)
result_list = np.array(result_list)
out_df = pd.DataFrame()
out_df.index = target_genes
for i in range(5):
    comb_name = 'top_{}_combination'.format(i+1)
    score_name = 'top_{}_score'.format(i+1)
    out_df[comb_name] = result_list[:,i]
    out_df[score_name] = result_measure_list[:,i]

In [None]:
out_df.to_csv('output/GSE111062/presentation_comb.csv')

In [45]:
for importance_series, target_gene in zip(importance_df_list, target_genes):
    importance_series.to_csv('output/GSE111062/'+target_gene+'_rankings.csv')

In [None]:
importance_df_list[5].sort_values().index.get_loc('AT3G49690')

In [None]:
importance_df_list[5].sort_values().index.get_loc('AT3G26744')

In [None]:
importance_df_list[5].sort_values().index.get_loc('AT3G23250')

In [None]:
importance_df_list[5].sort_values().index.get_loc('AT3G26744')