In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
from functools import cmp_to_key, partial
import re

import mp_run
import conf_interval

from os import listdir
from os.path import isfile, join
from sklearn.metrics import mean_squared_error, explained_variance_score

import os

from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from scipy import stats

from multiprocessing import Pool, cpu_count


%load_ext autoreload
%autoreload 2

# regex for number extraction from string
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})

In [3]:
train_source_1 = pd.read_csv('../data/mouse/GSE115553/normalized/train_source.csv', index_col=0)
train_target_1 = pd.read_csv('../data/mouse/GSE115553/normalized/train_target.csv', index_col=0)
test_source_1 = pd.read_csv('../data/mouse/GSE115553/normalized/test_source.csv', index_col=0)
test_target_1 = pd.read_csv('../data/mouse/GSE115553/normalized/test_target.csv', index_col=0)

train_source_2 = pd.read_csv('../data/mouse/GSE151173/normalized/train_source.csv', index_col=0)
train_target_2 = pd.read_csv('../data/mouse/GSE151173/normalized/train_target.csv', index_col=0)
test_source_2 = pd.read_csv('../data/mouse/GSE151173/normalized/test_source.csv', index_col=0)
test_target_2 = pd.read_csv('../data/mouse/GSE151173/normalized/test_target.csv', index_col=0)

train_source_3 = pd.read_csv('../data/mouse/GSE171975/normalized/train_source.csv', index_col=0)
train_target_3 = pd.read_csv('../data/mouse/GSE171975/normalized/train_target.csv', index_col=0)
test_source_3 = pd.read_csv('../data/mouse/GSE171975/normalized/test_source.csv', index_col=0)
test_target_3 = pd.read_csv('../data/mouse/GSE171975/normalized/test_target.csv', index_col=0)

In [4]:
common_exp_genes = list(set(train_source_1.index).intersection(set(train_source_2.index), set(train_source_3.index)))

train_source_1 = train_source_1.loc[common_exp_genes]
train_target_1 = train_target_1.loc[common_exp_genes]
test_source_1 = test_source_1.loc[common_exp_genes]
test_target_1 = test_target_1.loc[common_exp_genes]

train_source_2 = train_source_2.loc[common_exp_genes]
train_target_2 = train_target_2.loc[common_exp_genes]
test_source_2 = test_source_2.loc[common_exp_genes]
test_target_2 = test_target_2.loc[common_exp_genes]

train_source_3 = train_source_3.loc[common_exp_genes]
train_target_3 = train_target_3.loc[common_exp_genes]
test_source_3 = test_source_3.loc[common_exp_genes]
test_target_3 = test_target_3.loc[common_exp_genes]

In [5]:
train_source = pd.concat([train_source_1, train_source_2, train_source_3], axis=1)
train_target = pd.concat([train_target_1, train_target_2, train_target_3], axis=1)
test_source = pd.concat([test_source_1, test_source_2, test_source_3], axis=1)
test_target = pd.concat([test_target_1, test_target_2, test_target_3], axis=1)

In [6]:
network_df = pd.read_csv('../data/mouse/regnetworkweb.org.network', sep='\t')
regulator_set = set(network_df['regulator'])
target_set = set(network_df['target'])

regulator_set = regulator_set.intersection(set(train_source.index))
target_set = target_set.intersection(set(train_source.index))
all_gene_set = regulator_set.union(target_set)
network_dict = {target: [] for target in target_set}
for ind, row in network_df.iterrows():
    if (row['regulator'] in regulator_set) and (row['target'] in target_set):
        network_dict[row['target']].append(row['regulator'])

key_list = []
value_list = []
regulator_set = set()
tf_list_df = pd.read_csv('../data/mouse_tf_list.tsv.gz', sep='\t', compression='gzip', index_col=0)
for name in tf_list_df['Gene Names']:
    name_splits = str(name).split(' ')
    for i in name_splits:
        if i in train_source.index:
            regulator_set.add(i)
target_set = set()
for key in network_dict.keys():
    if (len(network_dict[key]) > 0) and network_dict[key][0] != key:
        key_list.append(key)
        target_set.add(key)
        value_list.append("; ".join(network_dict[key]))
        for regulator in network_dict[key]:
            regulator_set.add(regulator)
all_gene_set = regulator_set.union(target_set)

print(len(regulator_set))

1385


In [7]:
# network_df = pd.read_csv('../data/mouse/regnetworkweb.org.network', sep='\t')
# regulator_set = set(network_df['regulator'])
# target_set = set(network_df['target'])

# regulator_set = regulator_set.intersection(set(train_source.index))
# target_set = target_set.intersection(set(train_source.index))
# all_gene_set = regulator_set.union(target_set)
# network_dict = {target: [] for target in target_set}
# for ind, row in network_df.iterrows():
#     if (row['regulator'] in regulator_set) and (row['target'] in target_set):
#         network_dict[row['target']].append(row['regulator'])

# key_list = []
# value_list = []
# regulator_set = set()
# target_set = set()
# for key in network_dict.keys():
#     if (len(network_dict[key]) > 0) and network_dict[key][0] != key:
#         key_list.append(key)
#         target_set.add(key)
#         value_list.append("; ".join(network_dict[key]))
#         for regulator in network_dict[key]:
#             regulator_set.add(regulator)
# all_gene_set = regulator_set.union(target_set)


# print(len(regulator_set))

In [8]:
network_df = pd.DataFrame(index=key_list)
network_df['tf_list'] = value_list

In [9]:
np.mean([len(i.split('; ')) for i in value_list])

4.911825599805139

In [10]:
target_df = pd.concat([train_target, test_target], axis=1)
source_df = pd.concat([train_source, test_source], axis=1)



In [11]:
target_gene_list = list(target_set)
target_exp = target_df.loc[target_gene_list]
X = source_df.loc[list(regulator_set)]
tf_list = list(regulator_set)

In [12]:
new_test_target = test_target.loc[target_gene_list]
new_test_target = new_test_target.loc[new_test_target.std(axis=1) > 0.5]
target_gene_list = new_test_target.index

In [13]:
print(train_source.shape)
print(test_source.shape)

(9931, 208)
(9931, 121)


In [14]:
mp_calc = mp_run.MpCalc(target_gene_list, target_exp, X, network_df, train_source.loc[tf_list], train_target, test_source.loc[tf_list], test_target)

In [14]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.full_comp_new, range(iter_length)), total=iter_length))

  0%|          | 0/310 [00:00<?, ?it/s]

100%|██████████| 310/310 [01:13<00:00,  4.24it/s]


In [15]:
r = np.array(r)
out_df = pd.DataFrame(index=target_gene_list)
out_df['rf_score'] = r[:, 0]
out_df['linear_score'] = r[:, 1]
out_df['gs_rf_score'] = r[:, 2]
out_df['gs_linear_score'] = r[:, 3]
out_df['rf_with_linear_top_features_score'] = r[:, 4]
out_df['linear_with_rf_top_features_score'] = r[:, 5]
out_df['rf_rmse'] = r[:, 6]
out_df['linear_rmse'] = r[:, 7]
out_df['gs_rf_rmse'] = r[:, 8]
out_df['gs_linear_rmse'] = r[:, 9]
out_df['rf_with_linear_top_features_rmse'] = r[:, 10]
out_df['linear_with_rf_top_features_rmse'] = r[:, 11]
out_df['rf_with_top_features_score'] = r[:, 12]
out_df['linear_with_top_features_score'] = r[:, 13]
out_df['rf_with_top_features_rmse'] = r[:, 14]
out_df['linear_with_top_features_rmse'] = r[:, 15]
out_df['rf_top_feature_num'] = r[:, 16]
out_df['linear_top_feature_num'] = r[:, 17]
out_df['rf_top_features_gs_overlap'] = r[:, 18]
out_df['linear_top_features_gs_overlap'] = r[:, 19]
out_df['rf_linear_top_features_overlap'] = r[:, 20]
out_df['gs_edge_num'] = r[:, 21]
out_df['test_var'] = r[:, 22]
out_df['test_std'] = r[:, 23]
out_df['pca_rf_score'] = r[:, 24]
out_df['pca_rf_rmse'] = r[:, 25]

In [16]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.efron_process_rf, range(iter_length)), total=iter_length))

efron_r = np.array(r)
out_df['rf_efron_feature_num'] = efron_r[:, 0].astype('float64')
out_df['rf_efron_complementary_feature_num_list'] = efron_r[:, 1]
out_df['rf_efron_rmse'] = efron_r[:, 2].astype('float64')
out_df['rf_efron_complementary_rmse_list'] = efron_r[:, 3]
out_df['rf_efron_features'] = efron_r[:, 4]
out_df['rf_efron_complementary_features_list'] = efron_r[:, 5]

100%|██████████| 310/310 [33:13<00:00,  6.43s/it] 


In [17]:
tf_list_df = pd.DataFrame(index=tf_list)
tf_list_df.to_csv('../output/network_model/mouse_tf.csv', header=False)
out_df.to_csv('../output/network_model/mouse_all_tf_high_var_target_new.csv.gz', compression='gzip')

In [14]:
out_df = pd.read_csv('../output/network_model/mouse_all_tf_high_var_target_new.csv.gz', index_col=0, compression='gzip')
iter_length = len(target_gene_list)
new_out_df = pd.DataFrame(index=target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.rf_top_tf_same_count_as_gs, range(iter_length)), total=iter_length))
efron_r = np.array(r)
new_out_df['rf_top_tf_same_count_as_gs_score'] = efron_r[:, 0]
new_out_df['rf_top_tf_same_count_as_gs_rmse'] = efron_r[:, 1]
new_out_df = new_out_df.loc[out_df.index]
out_df['rf_top_tf_same_count_as_gs_score'] = new_out_df['rf_top_tf_same_count_as_gs_score']
out_df['rf_top_tf_same_count_as_gs_rmse'] = new_out_df['rf_top_tf_same_count_as_gs_rmse']

100%|██████████| 310/310 [00:22<00:00, 13.93it/s]


In [15]:
out_df.to_csv('../output/network_model/mouse_all_tf_high_var_target_new.csv.gz', compression='gzip')

In [14]:
out_df = pd.read_csv('../output/network_model/mouse_all_tf_high_var_target_new.csv.gz', index_col=0, compression='gzip')
iter_length = len(target_gene_list)
new_out_df = pd.DataFrame(index=target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.efron_ensemble_process_rf, range(iter_length)), total=iter_length))
ensemble_r = np.array(r)
new_out_df['rf_efron_ensemble_rmse'] = ensemble_r
new_out_df = new_out_df.loc[out_df.index]
out_df['rf_efron_ensemble_rmse'] = new_out_df['rf_efron_ensemble_rmse']

100%|██████████| 310/310 [09:53<00:00,  1.91s/it]


In [16]:
out_df.to_csv('../output/network_model/mouse_all_tf_high_var_target_new.csv.gz', compression='gzip')

In [15]:
tf_list_df = pd.read_csv('../output/network_model/mouse_tf.csv', names=['tf'], index_col=0)
out_df = pd.read_csv('../output/network_model/mouse_all_tf_high_var_target_new.csv.gz', index_col=0, compression='gzip')
available_tfs = set(X.index)
rf_efron_overlap_count = []
for target_gene in out_df.index:
    gs_tf_list = network_df.loc[target_gene].tf_list
    gs_tf_set = set(gs_tf_list.split('; '))
    gs_tf_set = available_tfs.intersection(gs_tf_set)
    if target_gene in gs_tf_set: gs_tf_set.remove(target_gene)
    efron_tf_list = out_df.loc[target_gene]['rf_efron_features']
    efron_tf_list = efron_tf_list.split('; ')
    efron_tf_list = [int(i) for i in efron_tf_list]
    efron_tf_list = tf_list_df.iloc[efron_tf_list].index
    efron_tf_set = set(efron_tf_list)
    rf_efron_overlap_count.append(len(efron_tf_set.intersection(gs_tf_set)))
out_df['rf_efron_overlap_count'] = rf_efron_overlap_count
out_df.to_csv('../output/network_model/mouse_all_tf_high_var_target_new.csv.gz', compression='gzip')


In [19]:
out_df['gs_edge_num'].median()

4.0

In [38]:
train_X, test_X, y_train, y_test, tf_list = mp_calc.get_train_test_sets(10)
regr = RandomForestRegressor(random_state=42, n_jobs=1, max_features='sqrt' )
regr.fit(train_X.T, y_train)
base_error = np.square(regr.predict(test_X.T)-y_test)

In [39]:
base_error

Sbno2_LSB41_repB@18       3.167385
LacZ_LSB49_repA@18        0.020900
Nos2_LSB26_repB@18        0.085199
Tmem258_LSB30_repA@18    13.413396
Sbno2_LSB40_repB@18       2.729760
                           ...    
Liver_PerDKO_DD_C@40      0.000006
Liver_PerDKO_DD_C@44      0.000014
Liver_PerDKO_DD_D@36      0.000057
Liver_PerDKO_DD_D@40      0.000111
Liver_PerDKO_DD_D@44      0.000340
Name: Tmsb10, Length: 121, dtype: float64

In [45]:
train_X, test_X, y_train, y_test, tf_list = mp_calc.get_train_test_sets(10)
train_X = train_X.iloc[[1,3]]
test_X = test_X.iloc[[1,3]]
regr = RandomForestRegressor(random_state=42, n_jobs=1, max_features='sqrt' )
regr.fit(train_X.T, y_train)
current_error = np.square(regr.predict(test_X.T)-y_test)

In [46]:
current_error

Sbno2_LSB41_repB@18       0.207499
LacZ_LSB49_repA@18        0.597212
Nos2_LSB26_repB@18        0.088203
Tmem258_LSB30_repA@18    36.345641
Sbno2_LSB40_repB@18       5.914137
                           ...    
Liver_PerDKO_DD_C@40      0.000012
Liver_PerDKO_DD_C@44      0.000017
Liver_PerDKO_DD_D@36      0.000075
Liver_PerDKO_DD_D@40      0.000078
Liver_PerDKO_DD_D@44      0.000522
Name: Tmsb10, Length: 121, dtype: float64

In [None]:
regr = RandomForestRegressor(random_state=42, n_jobs=1, max_features='sqrt' )

In [None]:
a = mp_calc.efron_process_rf(10)

In [18]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.efron_process_rf, range(iter_length)), total=iter_length))
efron_r = np.array(r)
out_df['rf_efron_feature_num'] = efron_r[:, 0].astype('float64')
out_df['rf_efron_complementary_feature_num'] = efron_r[:, 1].astype('float64')
out_df['rf_efron_rmse'] = efron_r[:, 2].astype('float64')
out_df['rf_efron_complementary_rmse'] = efron_r[:, 3].astype('float64')
out_df['rf_efron_features'] = efron_r[:, 4]
out_df['rf_efron_complementary_features'] = efron_r[:, 5]


100%|██████████| 310/310 [01:00<00:00,  5.08it/s]


In [19]:
out_df.mean()

rf_score                                 0.363458
linear_score                             0.220818
gs_rf_score                              0.128178
gs_linear_score                         -0.003445
rf_with_linear_top_features_score        0.348795
linear_with_rf_top_features_score        0.183726
rf_rmse                                  1.207490
linear_rmse                              1.305100
gs_rf_rmse                               1.493678
gs_linear_rmse                           1.686018
rf_with_linear_top_features_rmse         1.173024
linear_with_rf_top_features_rmse         1.454102
rf_with_top_features_score               0.289418
linear_with_top_features_score           0.217805
rf_with_top_features_rmse                1.255568
linear_with_top_features_rmse            1.342473
rf_top_feature_num                      20.000000
linear_top_feature_num                  20.000000
rf_top_features_gs_overlap               0.161290
linear_top_features_gs_overlap           0.174194


In [20]:
out_df.to_csv('../output/network_model/mouse_all_tf_high_var_target_new.csv.gz', compression='gzip')

In [26]:
temp_df = out_df[(out_df['rf_efron_feature_num'] < 10 ) & (out_df['rf_efron_complementary_feature_num'] < 10 )]
temp_set = set()
for keys in temp_df['rf_efron_features']:
    key_list = keys.split('; ')
    for key in key_list:
        temp_set.add(int(key))
for keys in temp_df['rf_efron_complementary_features']:
    key_list = keys.split('; ')
    for key in key_list:
        temp_set.add(int(key))
print(len(temp_set))

264


In [32]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.efron_process_rf, range(iter_length)), total=iter_length))
efron_r = np.array(r)
out_df['rf_efron_feature_num'] = efron_r[:, 0]
out_df['rf_efron_rmse'] = efron_r[:, 1]
out_df['rf_efron_complementary_rmse'] = efron_r[:, 2]

100%|██████████| 310/310 [00:41<00:00,  7.51it/s]


In [33]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.efron_process_linear, range(iter_length)), total=iter_length))
efron_r = np.array(r)
out_df['linear_efron_feature_num'] = efron_r[:, 0]
out_df['linear_efron_rmse'] = efron_r[:, 1]
out_df['linear_efron_complementary_rmse'] = efron_r[:, 2]

100%|██████████| 310/310 [00:55<00:00,  5.63it/s]


In [35]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.efron_process_90th_rf, range(iter_length)), total=iter_length))
efron_r = np.array(r)
out_df['rf_efron_feature_num_90th'] = efron_r[:, 0]
out_df['rf_efron_rmse_90th'] = efron_r[:, 1]
out_df['rf_efron_complementary_rmse_90th'] = efron_r[:, 2]

100%|██████████| 310/310 [02:04<00:00,  2.49it/s]


In [16]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.dynamic_efron_rf, range(iter_length)), total=iter_length))
efron_r = np.array(r)
out_df['rf_dynamic_efron_feature_num'] = efron_r[:, 0].astype('float64')
out_df['rf_dynamic_efron_rmse'] = efron_r[:, 1].astype('float64')
out_df['rf_dynamic_efron_complementary_rmse'] = efron_r[:, 2].astype('float64')
out_df['rf_dynamic_efron_feature_index'] = efron_r[:, 3]

100%|██████████████████████████████████████████████████████████████████████████████| 310/310 [04:09<00:00,  1.24it/s]


In [17]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.dynamic_efron_linear, range(iter_length)), total=iter_length))
efron_r = np.array(r)
out_df['linear_dynamic_efron_feature_num'] = efron_r[:, 0].astype('float64')
out_df['linear_dynamic_efron_rmse'] = efron_r[:, 1].astype('float64')
out_df['linear_dynamic_efron_complementary_rmse'] = efron_r[:, 2].astype('float64')
out_df['linear_dynamic_efron_feature_index'] = efron_r[:, 3]

100%|██████████████████████████████████████████████████████████████████████████████| 310/310 [07:28<00:00,  1.45s/it]


In [18]:
overlap_list = []
target_comb_list = []
for i in tqdm(range(iter_length)):
    tf_set_i = out_df.iloc[i]['rf_dynamic_efron_feature_index']
    tf_set_i = tf_set_i.split('; ')
    tf_set_i = set([int(v) for v in tf_set_i])
    for j in range(i+1, iter_length):
        tf_set_j = out_df.iloc[j]['rf_dynamic_efron_feature_index']
        tf_set_j = tf_set_j.split('; ')
        tf_set_j = set([int(v) for v in tf_set_j])
        overlap = len(tf_set_i.intersection(tf_set_j))/(len(tf_set_i.union(tf_set_j))+0.0)
        overlap_list.append(overlap)
        if (overlap >= 1) and (len(tf_set_i) > 1):
            print(i, j)
        target_comb_list.append((i,j))




  8%|██████                                                                         | 24/310 [00:01<00:11, 23.84it/s]

19 21
19 37
19 46
19 47
19 50
19 54
19 70
19 76
19 86
19 89
19 112
19 118
19 137
19 168
19 169
19 192
19 242
19 256
19 257
19 273
19 293
21 37
21 46
21 47
21 50
21 54
21 70
21 76
21 86
21 89
21 112
21 118
21 137
21 168
21 169
21 192
21 242
21 256
21 257
21 273
21 293


 14%|██████████▋                                                                    | 42/310 [00:01<00:10, 24.78it/s]

37 46
37 47
37 50
37 54
37 70
37 76
37 86
37 89
37 112
37 118
37 137
37 168
37 169
37 192
37 242
37 256
37 257
37 273
37 293


 16%|████████████▉                                                                  | 51/310 [00:02<00:10, 25.37it/s]

46 47
46 50
46 54
46 70
46 76
46 86
46 89
46 112
46 118
46 137
46 168
46 169
46 192
46 242
46 256
46 257
46 273
46 293
47 50
47 54
47 70
47 76
47 86
47 89
47 112
47 118
47 137
47 168
47 169
47 192
47 242
47 256
47 257
47 273
47 293
50 54
50 70
50 76
50 86
50 89
50 112
50 118
50 137
50 168
50 169
50 192
50 242
50 256
50 257
50 273
50 293


 18%|██████████████▌                                                                | 57/310 [00:02<00:09, 26.42it/s]

54 70
54 76
54 86
54 89
54 112
54 118
54 137
54 168
54 169
54 192
54 242
54 256
54 257
54 273
54 293
57 107
57 178
57 191


 24%|██████████████████▌                                                            | 73/310 [00:02<00:08, 28.40it/s]

70 76
70 86
70 89
70 112
70 118
70 137
70 168
70 169
70 192
70 242
70 256
70 257
70 273
70 293
76 86
76 89
76 112
76 118
76 137
76 168
76 169
76 192


 26%|████████████████████▋                                                          | 81/310 [00:03<00:07, 30.44it/s]

76 242
76 256
76 257
76 273
76 293


 29%|██████████████████████▋                                                        | 89/310 [00:03<00:07, 30.52it/s]

86 89
86 112
86 118
86 137
86 168
86 169
86 192
86 242
86 256
86 257
86 273
86 293
89 112
89 118
89 137
89 168
89 169
89 192
89 242
89 256
89 257
89 273
89 293


 36%|████████████████████████████▍                                                 | 113/310 [00:04<00:05, 33.77it/s]

107 178
107 191
112 118
112 137
112 168
112 169
112 192
112 242
112 256
112 257
112 273
112 293


 40%|███████████████████████████████▍                                              | 125/310 [00:04<00:05, 34.89it/s]

118 137
118 168
118 169
118 192
118 242
118 256
118 257
118 273
118 293


 46%|████████████████████████████████████▏                                         | 144/310 [00:04<00:04, 39.11it/s]

137 168
137 169
137 192
137 242
137 256
137 257
137 273
137 293


 56%|███████████████████████████████████████████▊                                  | 174/310 [00:05<00:02, 45.79it/s]

168 169
168 192
168 242
168 256
168 257
168 273
168 293
169 192
169 242
169 256
169 257
169 273
169 293
178 191


 66%|███████████████████████████████████████████████████▊                          | 206/310 [00:06<00:01, 58.45it/s]

192 242
192 256
192 257
192 273
192 293


 86%|██████████████████████████████████████████████████████████████████▎          | 267/310 [00:06<00:00, 104.88it/s]

242 256
242 257
242 273
242 293
256 257
256 273
256 293
257 273
257 293


100%|██████████████████████████████████████████████████████████████████████████████| 310/310 [00:06<00:00, 45.07it/s]

273 293





In [21]:
out_df.columns

Index(['rf_score', 'linear_score', 'gs_rf_score', 'gs_linear_score',
       'rf_with_linear_top_features_score',
       'linear_with_rf_top_features_score', 'rf_rmse', 'linear_rmse',
       'gs_rf_rmse', 'gs_linear_rmse', 'rf_with_linear_top_features_rmse',
       'linear_with_rf_top_features_rmse', 'rf_with_top_features_score',
       'linear_with_top_features_score', 'rf_with_top_features_rmse',
       'linear_with_top_features_rmse', 'rf_top_feature_num',
       'linear_top_feature_num', 'rf_top_features_gs_overlap',
       'linear_top_features_gs_overlap', 'rf_linear_top_features_overlap',
       'gs_edge_num', 'test_var', 'test_std', 'rf_efron_feature_num',
       'rf_efron_rmse', 'rf_efron_complementary_rmse',
       'linear_efron_feature_num', 'linear_efron_rmse',
       'linear_efron_complementary_rmse', 'rf_efron_feature_num_90th',
       'rf_efron_rmse_90th', 'rf_efron_complementary_rmse_90th',
       'rf_dynamic_efron_feature_num', 'rf_dynamic_efron_rmse',
       'rf_dynamic

In [19]:
out_df.to_csv('../output/network_model/mouse_all_tf_high_var_target.csv.gz', compression='gzip')

In [36]:
out_df.mean()

rf_score                               0.370744
linear_score                           0.220818
gs_rf_score                            0.131542
gs_linear_score                       -0.003445
rf_with_linear_top_features_score      0.386506
linear_with_rf_top_features_score      0.105008
rf_rmse                                1.195792
linear_rmse                            1.305100
gs_rf_rmse                             1.493245
gs_linear_rmse                         1.686018
rf_with_linear_top_features_rmse       1.182053
linear_with_rf_top_features_rmse       1.531368
rf_with_top_features_score             0.309347
linear_with_top_features_score         0.212970
rf_with_top_features_rmse              1.266511
linear_with_top_features_rmse          1.334778
rf_top_feature_num                     9.119355
linear_top_feature_num                14.132258
rf_top_features_gs_overlap             0.109677
linear_top_features_gs_overlap         0.116129
rf_linear_top_features_overlap         0