In [2]:
# candidate = AT5G44610

import os
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm

import conf_interval
import mp_run


%load_ext autoreload
%autoreload 2

# Regex for pulling a number out of a string: a single capture group matching
# an optional leading minus, an integer part (a lone 0, or digits with no
# leading zero), an optional fractional part and an optional e/E exponent.
# NOTE(review): not referenced by any cell visible here -- confirm it is still needed.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size (7 x 5) and resolution (100 dpi) for matplotlib figures.
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})

In [3]:
def _read_zscored(csv_path):
    """Read an expression CSV (first column as index) and z-score each column."""
    return pd.read_csv(csv_path, index_col=0).apply(stats.zscore, axis=0)


# Training split: source and target expression tables.
train_source_df = _read_zscored('../data/GSE97500/train_source_tpm.csv')
train_target_df = _read_zscored('../data/GSE97500/train_target_tpm.csv')

# Held-out split, standardised independently of the training split.
test_source_df = _read_zscored('../data/GSE97500/test_source_tpm.csv')
test_target_df = _read_zscored('../data/GSE97500/test_target_tpm.csv')


In [4]:
# Regulators, part 1: every alias in the TF list's 'Gene Names' column that,
# once upper-cased, is a row label of the training expression table.
tf_list_df = pd.read_csv('../data/arabidopsis_tf_list.tsv.gz', sep='\t', compression='gzip', index_col=0)
expressed_genes = train_source_df.index
regulator_set = {
    alias.upper()
    for gene_names in tf_list_df['Gene Names']
    for alias in gene_names.split(' ')
    if alias.upper() in expressed_genes
}

# Targets are the row labels of the network table.
network_df = pd.read_csv('../data/arabidopsis_network_connectf.csv', index_col=0)
target_set = set(network_df.index)

# Regulators, part 2: every TF named in the network's '; '-separated tf_list
# strings (added verbatim, without the expression-table check).
for tf_names in network_df['tf_list'].values:
    regulator_set.update(tf_names.split('; '))
print(len(regulator_set))

2010


In [5]:
# network_df = pd.read_csv('../data/arabidopsis_network_connectf.csv', index_col=0)
# regulator_set = set()
# target_set = set(network_df.index)

# for tf_list_string in network_df['tf_list'].values:
#     tf_list = tf_list_string.split('; ')
#     for tf in tf_list: regulator_set.add(tf)

# print(len(regulator_set))

In [6]:
# Keep only genes that have a row in the training expression table, then pool
# regulators and targets into one gene universe.
expressed_genes = set(train_source_df.index)
regulator_set = regulator_set & expressed_genes
target_set = target_set & expressed_genes
all_gene_set = regulator_set | target_set


In [7]:

# Restrict all four tables to the pooled gene set (same row order for each)
# and z-score every column again over the retained rows.
gene_rows = list(all_gene_set)
train_source, train_target, test_source, test_target = (
    frame.loc[gene_rows].apply(stats.zscore, axis=0)
    for frame in (train_source_df, train_target_df, test_source_df, test_target_df)
)


In [8]:
# Sanity check: (rows, columns) of the training and held-out source tables.
for split_frame in (train_source, test_source):
    print(split_frame.shape)

(19343, 72)
(19343, 24)


In [9]:

# Place the training and held-out columns side by side (training columns first).
source_df = pd.concat([train_source, test_source], axis='columns')
target_df = pd.concat([train_target, test_target], axis='columns')


In [10]:
# Candidate targets and the expression tables handed to the worker object.
target_gene_list = list(target_set)
target_exp = target_df
X = source_df.loc[list(regulator_set)]

# The TF ordering is cached on disk because a later cell decodes integer
# feature positions through it (tf_list_df.iloc[...]). Reuse the cached order
# when it exists so those positions keep their meaning across kernel restarts
# (set iteration order is not stable between Python processes). On a first run
# the cache has not been written yet, so fall back to the in-memory regulator
# set instead of failing with FileNotFoundError.
tf_order_path = '../output/network_model/arabidopsis_tf.csv'
if os.path.exists(tf_order_path):
    tf_list_df = pd.read_csv(tf_order_path, names=['tf'], index_col=0)
    tf_list = list(tf_list_df.index)
else:
    tf_list = list(regulator_set)
    tf_list_df = pd.DataFrame(index=tf_list)

In [11]:
# Keep only targets whose held-out expression varies enough across samples
# (row-wise standard deviation above 0.5).
candidate_targets = test_target.loc[target_gene_list]
is_high_variance = candidate_targets.std(axis=1) > 0.5
new_test_target = candidate_targets.loc[is_high_variance]
target_gene_list = new_test_target.index

In [12]:
# Worker object whose methods are mapped over target indices in the cells below.
mp_calc = mp_run.MpCalc(
    target_gene_list,
    target_exp,
    X,
    network_df,
    train_source.loc[tf_list],   # training features, rows in tf_list order
    train_target,
    test_source.loc[tf_list],    # held-out features, rows in tf_list order
    test_target,
)

In [12]:
# Run mp_calc.full_comp_new for every target index across a process pool.
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.full_comp_new, range(iter_length))
    # imap yields results in submission order, so r[k] pairs with target index k.
    r = list(tqdm(pending, total=iter_length))

100%|██████████| 1397/1397 [04:00<00:00,  5.81it/s]


In [13]:
# Column k of the worker result array is stored under the k-th name below.
full_comp_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
    'pca_rf_score',
    'pca_rf_rmse',
]

r = np.array(r)
out_df = pd.DataFrame(index=target_gene_list)
for position, column in enumerate(full_comp_columns):
    out_df[column] = r[:, position]

In [14]:
# Run mp_calc.efron_process_rf for every target index across a process pool.
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.efron_process_rf, range(iter_length))
    r = list(tqdm(pending, total=iter_length))
efron_r = np.array(r)

# (column name, cast to float64?) in result-column order.
efron_rf_layout = [
    ('rf_efron_feature_num', True),
    ('rf_efron_complementary_feature_num_list', False),
    ('rf_efron_rmse', True),
    ('rf_efron_complementary_rmse_list', False),
    ('rf_efron_features', False),
    ('rf_efron_complementary_features_list', False),
]
for position, (column, as_float) in enumerate(efron_rf_layout):
    values = efron_r[:, position]
    out_df[column] = values.astype('float64') if as_float else values

100%|██████████| 1397/1397 [2:12:33<00:00,  5.69s/it] 


In [17]:
# Persist the TF ordering (index only, no header row) ...
tf_list_df = pd.DataFrame(index=tf_list)
tf_list_df.to_csv(
    '../output/network_model/arabidopsis_tf.csv',
    header=False,
)
# ... and the per-target results table as a gzip-compressed CSV.
out_df.to_csv(
    '../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz',
    compression='gzip',
)

In [12]:
# Reload the saved results, compute two extra columns per target, and merge
# them back aligned on the saved table's index.
out_df = pd.read_csv('../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz', index_col=0, compression='gzip')
iter_length = len(target_gene_list)
new_out_df = pd.DataFrame(index=target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.rf_top_tf_same_count_as_gs, range(iter_length))
    r = list(tqdm(pending, total=iter_length))
efron_r = np.array(r)

same_count_columns = ('rf_top_tf_same_count_as_gs_score', 'rf_top_tf_same_count_as_gs_rmse')
for position, column in enumerate(same_count_columns):
    new_out_df[column] = efron_r[:, position]
# Reorder the fresh rows to follow the saved table before copying across.
new_out_df = new_out_df.loc[out_df.index]
for column in same_count_columns:
    out_df[column] = new_out_df[column]

100%|██████████| 1397/1397 [01:21<00:00, 17.12it/s]


In [13]:
# Write the updated results table back to the gzip-compressed CSV.
results_path = '../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz'
out_df.to_csv(results_path, compression='gzip')

In [12]:
# Reload the saved results and add one value per target from
# mp_calc.efron_ensemble_process_rf, aligned on the saved table's index.
out_df = pd.read_csv('../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz', index_col=0, compression='gzip')
iter_length = len(target_gene_list)
new_out_df = pd.DataFrame(index=target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.efron_ensemble_process_rf, range(iter_length))
    r = list(tqdm(pending, total=iter_length))
ensemble_r = np.array(r)

ensemble_column = 'rf_efron_ensemble_rmse'
new_out_df[ensemble_column] = ensemble_r
new_out_df = new_out_df.loc[out_df.index]
out_df[ensemble_column] = new_out_df[ensemble_column]

100%|████████████████████████████████████████████████████████████████████████████| 1397/1397 [38:35<00:00,  1.66s/it]


In [14]:
# Write the updated results table back to the gzip-compressed CSV.
results_path = '../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz'
out_df.to_csv(results_path, compression='gzip')

In [13]:
# For every target, count how many of its selected ('rf_efron_features') TFs
# also appear among that target's network TFs.
tf_list_df = pd.read_csv('../output/network_model/arabidopsis_tf.csv', names=['tf'], index_col=0)
out_df = pd.read_csv('../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz', index_col=0, compression='gzip')
available_tfs = set(X.index)

rf_efron_overlap_count = []
for target_gene in out_df.index:
    # Network TFs for this target, limited to TFs present in X and excluding
    # the target itself.
    gold_standard_tfs = available_tfs.intersection(network_df.loc[target_gene].tf_list.split('; '))
    gold_standard_tfs.discard(target_gene)

    # Selected features are stored as '; '-separated integer positions into
    # the saved TF ordering; map them back to TF names.
    feature_positions = [int(token) for token in out_df.loc[target_gene]['rf_efron_features'].split('; ')]
    selected_tfs = set(tf_list_df.iloc[feature_positions].index)

    rf_efron_overlap_count.append(len(selected_tfs & gold_standard_tfs))

out_df['rf_efron_overlap_count'] = rf_efron_overlap_count
out_df.to_csv('../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz', compression='gzip')


In [None]:
# Reload the saved results, recompute the rf_efron_* columns with
# mp_calc.efron_process_rf_training, and write the result to a separate file.
out_df = pd.read_csv('../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz', index_col=0, compression='gzip')
iter_length = len(target_gene_list)
new_out_df = pd.DataFrame(index=target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.efron_process_rf_training, range(iter_length))
    r = list(tqdm(pending, total=iter_length))
efron_r = np.array(r)

# (column name, cast to float64?) in result-column order.
efron_training_layout = [
    ('rf_efron_feature_num', True),
    ('rf_efron_complementary_feature_num_list', False),
    ('rf_efron_rmse', True),
    ('rf_efron_complementary_rmse_list', False),
    ('rf_efron_features', False),
    ('rf_efron_complementary_features_list', False),
    ('rf_efron_ensemble_rmse', False),
]
for position, (column, as_float) in enumerate(efron_training_layout):
    values = efron_r[:, position]
    new_out_df[column] = values.astype('float64') if as_float else values

# Align the fresh rows with the saved table, then overwrite its columns.
new_out_df = new_out_df.loc[out_df.index]
for column, _as_float in efron_training_layout:
    out_df[column] = new_out_df[column]

out_df.to_csv('../output/network_model/arabidopsis_all_tf_high_var_target_efron_train.csv.gz', compression='gzip')

In [33]:
# Count the distinct feature positions used by targets whose selected and
# complementary feature counts are both below 10.
# NOTE(review): this reads 'rf_efron_complementary_feature_num' and
# 'rf_efron_complementary_features', while the cells that fill out_df write
# the '..._list'-suffixed names -- confirm which schema this cell expects.
is_small = (out_df['rf_efron_feature_num'] < 10) & (out_df['rf_efron_complementary_feature_num'] < 10)
temp_df = out_df[is_small]
temp_set = set()
for feature_column in ('rf_efron_features', 'rf_efron_complementary_features'):
    for joined_positions in temp_df[feature_column]:
        temp_set.update(int(token) for token in joined_positions.split('; '))
print(len(temp_set))

633


In [21]:
# Run mp_calc.efron_process_linear for every target index and store its
# three result columns.
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.efron_process_linear, range(iter_length))
    r = list(tqdm(pending, total=iter_length))
efron_r = np.array(r)
linear_efron_columns = ('linear_efron_feature_num', 'linear_efron_rmse', 'linear_efron_complementary_rmse')
for position, column in enumerate(linear_efron_columns):
    out_df[column] = efron_r[:, position]

100%|██████████| 1397/1397 [03:44<00:00,  6.23it/s]


In [12]:
# Run mp_calc.efron_process_90th_rf for every target index and store its
# three result columns.
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.efron_process_90th_rf, range(iter_length))
    r = list(tqdm(pending, total=iter_length))
efron_r = np.array(r)
rf_90th_columns = ('rf_efron_feature_num_90th', 'rf_efron_rmse_90th', 'rf_efron_complementary_rmse_90th')
for position, column in enumerate(rf_90th_columns):
    out_df[column] = efron_r[:, position]

100%|██████████| 1397/1397 [08:17<00:00,  2.81it/s]


In [None]:
# Run mp_calc.dynamic_efron_rf for every target index. The first three result
# columns are stored as float64, the fourth is stored as returned.
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.dynamic_efron_rf, range(iter_length))
    r = list(tqdm(pending, total=iter_length))
efron_r = np.array(r)
rf_dynamic_numeric_columns = (
    'rf_dynamic_efron_feature_num',
    'rf_dynamic_efron_rmse',
    'rf_dynamic_efron_complementary_rmse',
)
for position, column in enumerate(rf_dynamic_numeric_columns):
    out_df[column] = efron_r[:, position].astype('float64')
out_df['rf_dynamic_efron_feature_index'] = efron_r[:, 3]

In [None]:
# Run mp_calc.dynamic_efron_linear for every target index. The first three
# result columns are stored as float64, the fourth is stored as returned.
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.dynamic_efron_linear, range(iter_length))
    r = list(tqdm(pending, total=iter_length))
efron_r = np.array(r)
linear_dynamic_numeric_columns = (
    'linear_dynamic_efron_feature_num',
    'linear_dynamic_efron_rmse',
    'linear_dynamic_efron_complementary_rmse',
)
for position, column in enumerate(linear_dynamic_numeric_columns):
    out_df[column] = efron_r[:, position].astype('float64')
out_df['linear_dynamic_efron_feature_index'] = efron_r[:, 3]

In [16]:
# Pairwise overlap (|A & B| / |A | B|) between the feature-index sets of every
# pair of targets (i < j) among the first iter_length rows of out_df.
#
# Each row's '; '-separated index string is parsed once up front. The previous
# version re-read and re-parsed row j through out_df.iloc[j] inside the inner
# loop, i.e. once per pair instead of once per row.
feature_index_strings = out_df['linear_dynamic_efron_feature_index'].iloc[:iter_length]
tf_sets = [
    set(int(v) for v in joined.split('; '))
    for joined in feature_index_strings
]

overlap_list = []       # overlap value per pair, in (i, j) iteration order
target_comb_list = []   # the (i, j) row positions matching overlap_list
for i in tqdm(range(iter_length)):
    tf_set_i = tf_sets[i]
    for j in range(i + 1, iter_length):
        tf_set_j = tf_sets[j]
        overlap = len(tf_set_i & tf_set_j) / len(tf_set_i | tf_set_j)
        overlap_list.append(overlap)
        # Report pairs of targets that share an identical, multi-TF feature set.
        if (overlap >= 1) and (len(tf_set_i) > 1):
            print(len(tf_set_j))
        target_comb_list.append((i, j))




100%|██████████| 1397/1397 [01:57<00:00, 11.93it/s] 


In [None]:
# Write the updated results table back to the gzip-compressed CSV.
results_path = '../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz'
out_df.to_csv(results_path, compression='gzip')

In [15]:
# List the result columns currently held in out_df.
out_df.columns

Index(['rf_score', 'linear_score', 'gs_rf_score', 'gs_linear_score',
       'rf_with_linear_top_features_score',
       'linear_with_rf_top_features_score', 'rf_rmse', 'linear_rmse',
       'gs_rf_rmse', 'gs_linear_rmse', 'rf_with_linear_top_features_rmse',
       'linear_with_rf_top_features_rmse', 'rf_with_top_features_score',
       'linear_with_top_features_score', 'rf_with_top_features_rmse',
       'linear_with_top_features_rmse', 'rf_top_feature_num',
       'linear_top_feature_num', 'rf_top_features_gs_overlap',
       'linear_top_features_gs_overlap', 'rf_linear_top_features_overlap',
       'gs_edge_num', 'test_var', 'test_std', 'rf_dynamic_efron_feature_num',
       'rf_dynamic_efron_rmse', 'rf_dynamic_efron_complementary_rmse',
       'rf_dynamic_efron_feature_index', 'linear_dynamic_efron_feature_num',
       'linear_dynamic_efron_rmse', 'linear_dynamic_efron_complementary_rmse',
       'linear_dynamic_efron_feature_index'],
      dtype='object')

In [13]:
# Save an uncompressed copy of the results next to the notebook.
network_v_model_path = 'arabidopsis_network_v_model.csv'
out_df.to_csv(network_v_model_path)

In [None]:
def _show_score_interval(interval):
    """Print the last two entries of a conf_interval_calc result as '(a, b)' with 3 decimals."""
    print('(' + ', '.join(['%.3f' % bound for bound in interval[2:]]) + ')')


# Confidence interval for each model's score column.
rf_conf_interval = conf_interval.conf_interval_calc(list(out_df['rf_score'].values))
_show_score_interval(rf_conf_interval)

linear_conf_interval = conf_interval.conf_interval_calc(list(out_df['linear_score'].values))
_show_score_interval(linear_conf_interval)

gs_rf_conf_interval = conf_interval.conf_interval_calc(list(out_df['gs_rf_score'].values))
_show_score_interval(gs_rf_conf_interval)

gs_linear_conf_interval = conf_interval.conf_interval_calc(list(out_df['gs_linear_score'].values))
_show_score_interval(gs_linear_conf_interval)

rf_with_linear_top_features_conf_interval = conf_interval.conf_interval_calc(list(out_df['rf_with_linear_top_features_score'].values))
_show_score_interval(rf_with_linear_top_features_conf_interval)

linear_with_rf_top_features_conf_interval = conf_interval.conf_interval_calc(list(out_df['linear_with_rf_top_features_score'].values))
_show_score_interval(linear_with_rf_top_features_conf_interval)


In [18]:
def _show_rmse_interval(interval):
    """Print the last two entries of a conf_interval_calc result as '(a, b)' with 3 decimals."""
    print('(' + ', '.join(['%.3f' % bound for bound in interval[2:]]) + ')')


# Confidence interval for each model's RMSE column.
rf_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['rf_rmse'].values))
_show_rmse_interval(rf_conf_interval_rmse)

linear_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['linear_rmse'].values))
_show_rmse_interval(linear_conf_interval_rmse)

gs_rf_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['gs_rf_rmse'].values))
_show_rmse_interval(gs_rf_conf_interval_rmse)

gs_linear_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['gs_linear_rmse'].values))
_show_rmse_interval(gs_linear_conf_interval_rmse)

rf_with_linear_top_features_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['rf_with_linear_top_features_rmse'].values))
_show_rmse_interval(rf_with_linear_top_features_conf_interval_rmse)

linear_with_rf_top_features_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['linear_with_rf_top_features_rmse'].values))
_show_rmse_interval(linear_with_rf_top_features_conf_interval_rmse)


(0.135, 0.143)
(0.138, 0.146)
(0.146, 0.155)
(0.148, 0.157)
(0.131, 0.139)
(0.136, 0.144)


In [45]:
# Column means of the numeric result columns. out_df also carries string
# columns (the '; '-separated feature-index lists), which DataFrame.mean()
# rejects with a TypeError on pandas >= 2.0 unless they are excluded.
out_df.mean(numeric_only=True)

rf_score                             0.134874
linear_score                        -0.075192
gs_rf_score                         -0.057629
gs_linear_score                     -0.010969
rf_with_linear_top_features_score    0.247398
linear_with_rf_top_features_score    0.130533
rf_rmse                              0.138449
linear_rmse                          0.141425
gs_rf_rmse                           0.150306
gs_linear_rmse                       0.152666
rf_with_linear_top_features_rmse     0.134582
linear_with_rf_top_features_rmse     0.138169
dtype: float64

In [20]:
from itertools import combinations

# Every unordered pair drawn from the first six columns of out_df.
leading_columns = out_df.columns[:6]
model_combs = list(combinations(leading_columns, 2))

In [21]:
# Paired t-test for every pair of columns, plus a confidence interval on the
# per-target difference (a - b).
for a, b in model_combs:
    t, p = stats.ttest_rel(out_df[a], out_df[b])
    c, d, lower, upper = conf_interval.conf_interval_calc(list(out_df[a] - out_df[b]))

    if p > 0.05:
        print("{} and {} don't have statistically different performance".format(a, b))
        continue

    # A positive t statistic means column a has the larger paired values.
    better, worse = (a, b) if t > 0 else (b, a)
    print('{} has statisically better performance than {}, with p-val of {}'.format(better, worse, p))
    print('confidence interval: ({:.3f}, {:.3f})'.format(lower, upper))

rf_score has statisically better performance than linear_score, with p-val of 0.0
confidence interval: (0.186, 0.194)
rf_score has statisically better performance than gs_rf_score, with p-val of 0.0
confidence interval: (0.167, 0.180)
rf_score has statisically better performance than gs_linear_score, with p-val of 0.0
confidence interval: (0.121, 0.130)
rf_with_linear_top_features_score has statisically better performance than rf_score, with p-val of 0.0
confidence interval: (-0.136, -0.129)
rf_score has statisically better performance than linear_with_rf_top_features_score, with p-val of 3.8527320817642676e-11
confidence interval: (0.008, 0.013)
gs_rf_score has statisically better performance than linear_score, with p-val of 0.0004037834222013824
confidence interval: (-0.024, -0.008)
gs_linear_score has statisically better performance than linear_score, with p-val of 1.529393837609289e-76
confidence interval: (-0.070, -0.059)
rf_with_linear_top_features_score has statisically better p

In [46]:
# Run mp_calc.top_feature_num for every target index across a process pool.
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.top_feature_num, range(iter_length))
    feature_num_r = list(tqdm(pending, total=iter_length))

100%|██████████| 18855/18855 [11:33<00:00, 27.18it/s]


In [48]:
# Convert the list of per-target results into an array so that columns can be
# sliced out in the following cells.
feature_num_r = np.array(feature_num_r)

In [53]:
# Standard deviation of each result column across targets.
feature_num_r.std(axis=0)

array([0.91661575, 0.42218772])

In [55]:
# Store the two result columns (random forest first, then linear).
for position, column in enumerate(('rf_top_feature_num', 'linear_top_feature_num')):
    out_df[column] = feature_num_r[:, position]

In [57]:
# Save an uncompressed copy of the results next to the notebook.
network_v_model_path = 'arabidopsis_network_v_model.csv'
out_df.to_csv(network_v_model_path)

In [9]:
# Reload the saved results, then run mp_calc.top_feature_model for every
# target index across a process pool.
out_df = pd.read_csv('./arabidopsis_network_v_model.csv', index_col=0)
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.top_feature_model, range(iter_length))
    top_feature_r = list(tqdm(pending, total=iter_length))


100%|██████████| 18855/18855 [12:00<00:00, 26.18it/s]


In [10]:
# Store the four result columns in result order, then save.
top_feature_r = np.array(top_feature_r)
top_feature_columns = (
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
)
for position, column in enumerate(top_feature_columns):
    out_df[column] = top_feature_r[:, position]
out_df.to_csv('arabidopsis_network_v_model.csv')

In [14]:
# Reload the saved results, then run mp_calc.feature_overlap for every target
# index across a process pool.
out_df = pd.read_csv('./arabidopsis_network_v_model.csv', index_col=0)

iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    pending = pool.imap(mp_calc.feature_overlap, range(iter_length))
    top_feature_overlap_r = list(tqdm(pending, total=iter_length))


100%|██████████| 18855/18855 [12:28<00:00, 25.20it/s]


In [15]:
# Store the four result columns in result order, then save.
top_feature_overlap_r = np.array(top_feature_overlap_r)
overlap_columns = (
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
)
for position, column in enumerate(overlap_columns):
    out_df[column] = top_feature_overlap_r[:, position]
out_df.to_csv('arabidopsis_network_v_model.csv')

In [16]:
# Column means of the numeric result columns. Non-numeric columns are excluded
# explicitly, because DataFrame.mean() raises a TypeError on string columns
# on pandas >= 2.0.
out_df.mean(numeric_only=True)

rf_score                             0.156445
linear_score                        -0.075192
gs_rf_score                         -0.057755
gs_linear_score                     -0.010969
rf_with_linear_top_features_score    0.247223
linear_with_rf_top_features_score    0.130521
rf_rmse                              0.138351
linear_rmse                          0.141425
gs_rf_rmse                           0.150264
gs_linear_rmse                       0.152666
rf_with_linear_top_features_rmse     0.134607
linear_with_rf_top_features_rmse     0.137005
rf_top_feature_num                   5.433466
linear_top_feature_num               5.907239
test_var                             0.289295
test_std                             0.181825
rf_with_top_features_score           0.054952
linear_with_top_features_score       0.086126
rf_with_top_features_rmse            0.142191
linear_with_top_features_rmse        0.144937
rf_top_features_gs_overlap           0.744895
linear_top_features_gs_overlap    

In [24]:
# Display the combined target expression table (training columns followed by
# held-out columns).
target_df

Unnamed: 0,R10C-1,R15C-1,R20C-1,R30C-1,R45C-1,R60C-1,R10N-1,R15N-1,R20N-1,R30N-1,...,S90N-2,S120N-2,R90C-3,R120C-3,R90N-3,R120N-3,S90C-3,S120C-3,S90N-3,S120N-3
AT3G47510,-0.073287,0.044592,0.108866,0.056709,0.026173,0.211968,0.325949,0.078271,0.271995,0.004527,...,-0.133932,-0.205090,0.168196,0.209031,0.175186,0.241305,-0.177559,-0.172923,-0.112070,-0.253467
AT5G34940,-0.172353,-0.207217,-0.240780,-0.222544,-0.235799,-0.256452,-0.206941,-0.225791,-0.257555,-0.233137,...,-0.247072,-0.208638,-0.244426,-0.257093,-0.262136,-0.279207,-0.216452,-0.219419,-0.155972,-0.248951
AT5G55230,0.086087,0.000364,-0.092941,-0.077297,-0.071898,-0.139925,0.010270,0.008721,-0.079177,-0.104427,...,-0.076585,-0.036653,-0.063932,-0.066495,-0.130500,-0.155244,-0.028138,-0.034644,-0.025206,-0.094539
AT3G07720,0.152534,0.152125,0.017257,0.000355,0.021457,0.004097,0.092049,0.125958,-0.030813,-0.009208,...,0.289274,0.233868,-0.028838,-0.034666,-0.130561,-0.158303,0.719420,0.780061,0.110052,0.307902
AT1G26550,0.588816,0.703724,0.510376,0.492908,0.381464,0.615749,0.541390,0.656111,0.524892,0.517736,...,0.022013,0.048245,0.541573,0.560549,0.403869,0.561052,0.043399,0.036540,0.006674,-0.014086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AT1G45000,-0.009582,0.186668,0.337630,0.333670,0.337841,0.433535,0.162078,0.198630,0.263497,0.346082,...,-0.079228,-0.084288,0.404269,0.550493,0.380154,0.303492,-0.106269,-0.075161,-0.056819,-0.113930
AT1G43860,-0.138903,-0.120303,-0.202876,-0.177362,-0.191148,-0.206586,-0.113148,-0.125975,-0.190101,-0.181006,...,-0.025147,0.007621,-0.176565,-0.203449,-0.141073,-0.138188,-0.152292,-0.125058,-0.020118,-0.016936
AT1G72070,-0.183591,-0.234980,-0.317215,-0.301367,-0.299725,-0.339127,-0.216766,-0.232316,-0.307900,-0.283160,...,-0.257149,-0.259622,-0.332950,-0.337833,-0.294411,-0.305771,-0.263593,-0.250111,-0.183012,-0.277645
AT1G61610,-0.186876,-0.237492,-0.329804,-0.309103,-0.310761,-0.351665,-0.223406,-0.242004,-0.319920,-0.295885,...,-0.268372,-0.256937,-0.349673,-0.350978,-0.325536,-0.342873,-0.271756,-0.256778,-0.192043,-0.294662
