In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

import mp_perturbation

from os import listdir
from os.path import isfile, join
from sklearn.metrics import mean_squared_error

import os

from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from scipy import stats

from multiprocessing import Pool, cpu_count


%load_ext autoreload
%autoreload 2

In [126]:
perturbation_factor = 3

In [3]:
# ts training/testing data curation 

df_1 = pd.read_csv('../data/yeast/GSE145936_Sis1-AA_Gene_counts_normalized.txt', sep='\t', index_col=0)
df_2 = pd.read_csv('../data/yeast/GSE153609_gene_expression_TPM_all_times.csv', index_col=0)
df_3 = pd.read_csv('../data/yeast/GSE168699_RNA_TPM_all_times.csv', index_col=0)

to_drop = df_3.columns[:7]
df_3 = df_3.drop(labels=to_drop, axis=1)
df_1 = df_1.drop(labels=['gene name'], axis=1)

common_genes = set(df_1.index).intersection(set(df_2.index)).intersection(set(df_3.index))
common_genes = list(common_genes)
df_1 = df_1.loc[common_genes]
df_2 = df_2.loc[common_genes]
df_3 = df_3.loc[common_genes]

normalized_df_1=(df_1-df_1.min())/(df_1.max()-df_1.min())
normalized_df_2=(df_2-df_2.min())/(df_2.max()-df_2.min())
normalized_df_3=(df_3-df_3.min())/(df_3.max()-df_3.min())
normalized_df_1 = normalized_df_1*100.0
normalized_df_2 = normalized_df_2*100.0
normalized_df_3 = normalized_df_3*100.0

test_df_1 = normalized_df_1.iloc[:,[3,4,5,9,10,11]]
test_df_2 = normalized_df_2.iloc[:,[3,4,5]]
test_df_3 = normalized_df_3.iloc[:, -5:]
test_exp = pd.concat([test_df_1, test_df_2, test_df_3], axis=1)
test_source = test_exp.iloc[:,[0,1,3,4,6,7,9,10,11,12]]
test_target = test_exp.iloc[:,[0,1,3,4,6,7,9,10,11,12]]

train_source_df_1 = normalized_df_1.iloc[:, [0,1,2,3,5,6,7,8,9]]
train_target_df_1 = normalized_df_1.iloc[:, [1,2,3,4,6,7,8,9,10]]
train_source_df_2 = normalized_df_2.iloc[:, [0,1,2,3]]
train_target_df_2 = normalized_df_2.iloc[:, [1,2,3,4]]
train_source_df_3 = normalized_df_3.iloc[:, :-4]
train_target_df_3 = normalized_df_3.iloc[:, 1:-3]
train_source = pd.concat([train_source_df_1, train_source_df_2, train_source_df_3], axis=1)
train_target = pd.concat([train_target_df_1, train_target_df_2, train_target_df_3], axis=1)
source_exp = pd.concat([train_source, test_source], axis=1)
target_exp = pd.concat([train_target, test_target], axis=1)


In [4]:
# get network data, training features
network_df = pd.read_csv('./yeat_network.csv', index_col=0)
tf_set = set()
target_gene_list = []
for i, row in network_df.iterrows():
    tf_list = row.tf_list
    if pd.isnull(tf_list): 
        continue
    tf_list = tf_list.split('; ')
    tf_set = tf_set.union(set(tf_list))
    target_gene_list.append(i)

target_gene_list = list(set(target_gene_list).intersection(set(common_genes)))
tf_list = list(tf_set.intersection(set(common_genes)))

X = source_exp.loc[tf_list]


In [5]:
len(tf_list)

213

In [7]:
mp_calc = mp_perturbation.MpCalc(target_gene_list, target_exp, X)

In [9]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = np.array(list(tqdm(p.imap(mp_calc.calc, range(iter_length)), total=iter_length)))

100%|██████████| 4897/4897 [08:51<00:00,  9.21it/s]


In [13]:
rf_perf_df = pd.DataFrame(index=target_gene_list)
rf_perf_df['model_score'] = r[:,0]
rf_perf_df['model_score_std'] = r[:,1]

In [22]:
good_perf_df = rf_perf_df[rf_perf_df.model_score > 0.85]
good_perf_target_exp = target_exp.loc[good_perf_df.index]

In [23]:
good_perf_target_exp

Unnamed: 0,HS_15.norm,HS_30.norm,HS_60.norm,HS_90.norm,R_0.norm,R_15.norm,R_30.norm,R_60.norm,R_90.norm,7.5,...,HS_60.norm.1,HS_90.norm.1,R_60.norm.1,R_90.norm.1,30.0,60.0,100 min,110 min,120 min,130 min
YDR294C,0.933837,0.973513,1.443312,1.466891,1.751045,1.141201,1.096673,1.362077,1.258157,0.305370,...,1.443312,1.466891,1.362077,1.258157,0.761509,0.973838,0.311688,0.281554,0.279410,0.297643
YGL084C,0.636975,0.741541,0.907036,0.964161,1.052572,0.930820,0.908401,1.150021,1.117055,0.267844,...,0.907036,0.964161,1.150021,1.117055,0.882971,1.127855,0.214616,0.196820,0.210079,0.237419
YHL023C,0.468886,0.517387,0.802423,0.879789,0.920694,0.857383,0.833092,0.893085,0.730985,0.129432,...,0.802423,0.879789,0.893085,0.730985,0.243300,0.287018,0.091840,0.084891,0.089056,0.100118
YFR040W,0.573082,0.685503,0.607783,0.640479,1.006188,0.791877,0.835461,0.815600,0.809375,0.339346,...,0.607783,0.640479,0.815600,0.809375,0.638968,0.895066,0.198115,0.188261,0.206187,0.240292
YBL037W,0.183819,0.241097,0.426372,0.371883,0.359001,0.371137,0.388305,0.420030,0.415466,0.143743,...,0.426372,0.371883,0.420030,0.415466,0.309772,0.357192,0.115262,0.106450,0.107645,0.121749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YDR332W,0.282118,0.311472,0.398558,0.473461,0.669167,0.478316,0.513030,0.581115,0.486017,0.063264,...,0.398558,0.473461,0.581115,0.486017,0.112031,0.150718,0.079559,0.069326,0.071872,0.077436
YLR135W,0.168090,0.230674,0.440941,0.437306,0.388307,0.416785,0.350651,0.314002,0.293962,0.067834,...,0.440941,0.437306,0.314002,0.293962,0.122391,0.176542,0.051644,0.043273,0.047960,0.060722
YHR155W,0.213305,0.370117,0.589245,0.686954,0.632534,0.575565,0.560098,0.579083,0.674152,0.154053,...,0.589245,0.686954,0.579083,0.674152,0.272696,0.374387,0.117340,0.103303,0.124386,0.140985
YDL239C,0.299810,0.440492,0.712379,0.637028,0.612979,0.438621,0.421263,0.413915,0.462500,0.037550,...,0.712379,0.637028,0.413915,0.462500,0.242591,0.473186,0.038029,0.038385,0.041484,0.051319


In [170]:
target_index = 0
y = good_perf_target_exp.iloc[target_index]

regr_list = []
for i in range(20):
    regr = RandomForestRegressor(n_jobs=1, random_state=i**2, max_features='sqrt')
    regr.fit(X.T, y)
    regr_list.append(regr)

tf_mean = X.mean(axis=1)
tf_std = X.std(axis=1)
reference_input = pd.concat([tf_mean]*len(tf_mean), axis=1)

single_perturb_input = reference_input.copy()
for i in range(len(tf_std)):
    single_perturb_input[i][i] += tf_std[i] * perturbation_factor

single_effect_list = []
for i in range(20):
    single_effect_list.append(regr_list[i].predict(single_perturb_input.T) - regr_list[i].predict(reference_input.T))
single_effect_list = np.array(single_effect_list)
single_effect = single_effect_list.mean(axis=0)
single_effect_sorted_arg = np.argsort(single_effect)

activator_top = single_effect_sorted_arg[-5:]
repressor_top = single_effect_sorted_arg[:5]

double_effect_input = pd.concat([tf_mean]*45, axis=1)
double_effect_input_list = []
double_compare_list = []
for i in range(0, 5):
    for j in range(i+1, 5):
        tf_i = activator_top[i]
        tf_j = activator_top[j]
        new_input = tf_mean.copy()
        new_input[tf_i] += tf_std[tf_i] * perturbation_factor
        new_input[tf_j] += tf_std[tf_j] * perturbation_factor
        double_effect_input_list.append(new_input)
        double_compare_list.append(np.max([single_effect[tf_i], single_effect[tf_j]]))

for i in range(0, 5):
    for j in range(i+1, 5):
        tf_i = repressor_top[i]
        tf_j = repressor_top[j]
        new_input = tf_mean.copy()
        new_input[tf_i] += tf_std[tf_i] * perturbation_factor
        new_input[tf_j] += tf_std[tf_j] * perturbation_factor
        double_effect_input_list.append(new_input)
        double_compare_list.append(np.min([single_effect[tf_i], single_effect[tf_j]]))

for i in range(0, 5):
    for j in range(0, 5):
        tf_i = activator_top[i]
        tf_j = repressor_top[j]
        new_input = tf_mean.copy()
        new_input[tf_i] += tf_std[tf_i] * perturbation_factor
        new_input[tf_j] += tf_std[tf_j] * perturbation_factor
        double_effect_input_list.append(new_input)
        double_compare_list.append(np.min([single_effect[tf_i], single_effect[tf_j]]))
double_perturb_input = pd.concat(double_effect_input_list, axis=1)

double_effect_res_list = []
for i in range(20):
    double_effect_res_list.append(regr_list[i].predict(double_perturb_input.T) - regr_list[i].predict(tf_mean.values.reshape(1,-1)))
double_effect_res_list = np.array(double_effect_res_list)
double_effect_res_list.mean(axis=0)

array([ 0.02400581,  0.02381548,  0.02641942,  0.03489565,  0.02495106,
        0.02758392,  0.03588077,  0.02781419,  0.03603283,  0.03813505,
       -0.01413864, -0.01253329, -0.01233301, -0.01142639, -0.01209879,
       -0.01212576, -0.01087824, -0.01027089, -0.00936428, -0.00911322,
        0.00415568,  0.00470907,  0.00613205,  0.00642731,  0.00718643,
        0.00496671,  0.0052735 ,  0.00675228,  0.00724603,  0.0081034 ,
        0.00509224,  0.00572987,  0.00728462,  0.00753567,  0.00836541,
        0.00762463,  0.00803942,  0.0096797 ,  0.0099378 ,  0.01084441,
        0.01609468,  0.01672957,  0.01807824,  0.01831511,  0.01928187])

In [176]:
double_perturb_calc = mp_perturbation.MpCalc(list(good_perf_target_exp.index), good_perf_target_exp, X)

In [188]:
iter_length = len(good_perf_target_exp.index)
with Pool(cpu_count()) as p:
    r = np.array(list(tqdm(p.imap(double_perturb_calc.double_perturb, range(iter_length)), total=iter_length)))

100%|██████████| 891/891 [01:52<00:00,  7.93it/s]


In [191]:
AA_list = []
RR_list = []
AR_list = []
for i in range(iter_length):
    res_diff = r[i][0] - r[i][1]
    AA_list.append((res_diff[:10] < 0).sum())
    RR_list.append((res_diff[10:20] < 0).sum())
    AR_list.append((res_diff[20:] < 0).sum())

In [195]:
np.max(AR_list)

0