Scripts for regression experiments on yeast

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import re

import mp_run
import conf_interval

from os import listdir
from os.path import isfile, join
from sklearn.metrics import mean_squared_error, explained_variance_score

import os

from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from scipy import stats

from multiprocessing import Pool, cpu_count


%load_ext autoreload
%autoreload 2

# Regex capturing one decimal number embedded in a string: an optional leading
# '-', an integer part without leading zeros, an optional '.digits' fraction and
# an optional exponent. It is used to order time-point labels numerically.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})

In [None]:
# Time-series training/testing data curation.
# Load three yeast expression tables, z-score every sample (column) across all
# genes of its table, then keep only the genes present in all three tables.

df_1 = pd.read_csv('../data/yeast/GSE145936/GSE145936_Sis1-AA_Gene_counts_normalized.txt.gz', sep='\t', index_col=0, compression='gzip')
df_2 = pd.read_csv('../data/yeast/GSE153609/GSE153609_gene_expression_TPM_all_times.csv.gz', index_col=0, compression='gzip')
df_3 = pd.read_csv('../data/yeast/GSE168699/GSE168699_RNA_TPM_all_times.csv.gz', index_col=0, compression='gzip')

# The first seven columns of the GSE168699 table and the 'gene name' column of
# the GSE145936 table are removed before normalisation.
# NOTE(review): presumably these are annotation columns rather than samples - confirm.
to_drop = df_3.columns[:7]
df_3 = df_3.drop(labels=to_drop, axis=1)
df_1 = df_1.drop(labels=['gene name'], axis=1)

# Sort the shared genes so the row order is identical on every kernel run;
# iterating a set of strings is not stable across interpreter sessions.
common_genes = sorted(
    set(df_1.index).intersection(df_2.index, df_3.index),
    key=str,
)

# Z-score per column over the full table first, then subset to the shared genes.
# (Min-max scaling per column was tried as an alternative and is not used.)
normalized_df_1 = df_1.apply(stats.zscore, axis=0).loc[common_genes]
normalized_df_2 = df_2.apply(stats.zscore, axis=0).loc[common_genes]
normalized_df_3 = df_3.apply(stats.zscore, axis=0).loc[common_genes]



In [None]:
def train_test_source_target_split(df, time_set, rep_set):
    """Split a '<rep>@<time>' column layout into one-step-ahead train/test frames.

    Time labels are ordered by the first number found in each label (using
    ``number_pattern``) and the ordered list is printed. Each time point is
    paired with its successor: the last pair (second-to-last -> last) forms the
    test set, every earlier pair forms the training set.

    Args:
        df: frame that has a column named ``'<rep>@<time>'`` for every
            combination of ``rep_set`` and ``time_set``.
        time_set: iterable of time labels, each containing a number.
        rep_set: iterable of replicate names.

    Returns:
        Tuple ``(train_source, train_target, test_source, test_target)`` of
        column selections from ``df``.
    """
    def numeric_time(label):
        return float(re.search(number_pattern, label).group(1))

    ordered_times = sorted(time_set, key=numeric_time)
    print(ordered_times)

    replicates = list(rep_set)

    def columns_for(times):
        # Replicate-major order: all times of the first replicate, then the next.
        return ['@'.join(pair) for pair in product(replicates, times)]

    train_source_cols = columns_for(ordered_times[:-2])
    train_target_cols = columns_for(ordered_times[1:-1])
    test_source_cols = columns_for(ordered_times[-2:-1])
    test_target_cols = columns_for(ordered_times[-1:])
    return (
        df[train_source_cols],
        df[train_target_cols],
        df[test_source_cols],
        df[test_target_cols],
    )

def format_index_and_normalize(df, gene_names=None):
    """Z-score every column of ``df`` and re-index its rows by gene name.

    Each row label is reduced to the text after its last ``'Name='``. If that
    text contains ``'/'`` only its first two ``'/'``-separated aliases are
    considered. A row is kept once for every considered alias found in
    ``gene_names`` (so a row whose two aliases both match appears twice, once
    under each alias) and is relabelled with the matching alias.

    Args:
        df: expression frame with string row labels.
        gene_names: iterable of gene names to keep. Defaults to the
            notebook-level ``common_genes``.

    Returns:
        The z-scored rows that matched, indexed by gene name. Z-scores are
        computed per column over all rows of ``df`` before any row is dropped.
    """
    if gene_names is None:
        gene_names = common_genes
    # Set lookup: membership is tested for every alias of every row.
    wanted_genes = set(gene_names)

    normalized_df = df.apply(stats.zscore, axis=0)
    selected_index_list = []
    new_index_list = []
    for index_name in df.index:
        names = index_name.split('Name=')[-1]
        candidates = names.split('/')[:2] if '/' in names else [names]
        for candidate in candidates:
            if candidate in wanted_genes:
                selected_index_list.append(index_name)
                new_index_list.append(candidate)

    new_df = normalized_df.loc[selected_index_list]
    new_df.index = new_index_list
    return new_df

In [None]:
# Load the four GSE226769 TPM tables, z-score them and re-index them by gene name.
df_0_1 = pd.read_csv('../data/yeast/GSE226769/GSE226769_Meiotic_Depletion_TPMs.csv.gz', compression='gzip', index_col=0)
df_0_1 = format_index_and_normalize(df_0_1)
df_0_2 = pd.read_csv('../data/yeast/GSE226769/GSE226769_Mitotic_Depletion_TPMs.csv.gz', compression='gzip', index_col=0)
# The last six columns of the mitotic table are discarded before normalisation.
# NOTE(review): reason not visible here - confirm which samples these are.
df_0_2 = df_0_2.drop(columns=df_0_2.columns[-6:])
df_0_2 = format_index_and_normalize(df_0_2)
df_0_3 = pd.read_csv('../data/yeast/GSE226769/GSE226769_UME6_T99N_AltAD_Rescue_TPMs.csv.gz', compression='gzip', index_col=0)
df_0_3 = format_index_and_normalize(df_0_3)
df_0_4 = pd.read_csv('../data/yeast/GSE226769/GSE226769_UME6_T99N_Rescue_TPMs.csv.gz', compression='gzip', index_col=0)
df_0_4 = format_index_and_normalize(df_0_4)

# Narrow the shared gene list to genes present in all four new tables. Sorting
# keeps the row order identical on every kernel run (set iteration order of
# strings is not stable across interpreter sessions).
common_genes = sorted(
    set(common_genes).intersection(df_0_1.index, df_0_2.index, df_0_3.index, df_0_4.index),
    key=str,
)

# Subset each table to the shared genes, then keep only the first row for any
# gene name that occurs more than once.
df_0_1 = df_0_1.loc[common_genes]
df_0_2 = df_0_2.loc[common_genes]
df_0_3 = df_0_3.loc[common_genes]
df_0_4 = df_0_4.loc[common_genes]
df_0_1 = df_0_1[~df_0_1.index.duplicated(keep='first')]
df_0_2 = df_0_2[~df_0_2.index.duplicated(keep='first')]
df_0_3 = df_0_3[~df_0_3.index.duplicated(keep='first')]
df_0_4 = df_0_4[~df_0_4.index.duplicated(keep='first')]

# Bring the three earlier tables down to the same gene list.
normalized_df_1 = normalized_df_1.loc[common_genes]
normalized_df_2 = normalized_df_2.loc[common_genes]
normalized_df_3 = normalized_df_3.loc[common_genes]


In [None]:
# Rename the df_0_1 columns to '<rep>@<time>' and build its train/test split.
# The text after the last '_' is the time label; everything before it, with the
# underscores removed, is the replicate name.
rep_set, time_set = set(), set()
formated_name_list = []
for column_name in df_0_1.columns:
    *rep_parts, time_name = column_name.split('_')
    rep_name = ''.join(rep_parts)
    rep_set.add(rep_name)
    time_set.add(time_name)
    formated_name_list.append(f'{rep_name}@{time_name}')
print(len(rep_set), len(time_set))
df_0_1.columns = formated_name_list
# Z-score again per column, now over the retained genes only.
normalized_df = df_0_1.apply(stats.zscore, axis=0)
df_split_1 = train_test_source_target_split(normalized_df, time_set, rep_set)



In [None]:
# Rename the df_0_2 columns to '<rep>@<time>' and build its train/test split.
# The time label is the text inside the last pair of parentheses of the column
# name; the replicate name is whatever surrounds that parenthesised part.
parenthesised_time = re.compile(r'(.*)\(([^()]*)\)(.*)')
rep_set, time_set = set(), set()
formated_name_list = []
for column_name in df_0_2.columns:
    prefix, time_name, suffix = parenthesised_time.match(column_name).groups()
    rep_name = prefix + suffix
    rep_set.add(rep_name)
    time_set.add(time_name)
    formated_name_list.append(f'{rep_name}@{time_name}')
df_0_2.columns = formated_name_list
# Z-score again per column, now over the retained genes only.
normalized_df = df_0_2.apply(stats.zscore, axis=0)
df_split_2 = train_test_source_target_split(normalized_df, time_set, rep_set)


In [None]:
# Rename the df_0_3 columns to '<rep>@<time>' and build its train/test split.
# Column names are '_'-separated with 2, 3 or 4 segments:
#   rep_time            -> rep,       time
#   rep_time_x          -> rep~x,     time
#   rep_a_time_x        -> rep~a~x,   time
rep_set = set()
time_set = set()
formated_name_list = []
for name in df_0_3.columns:
    name_segments = name.split('_')
    if len(name_segments) == 2:
        rep_name, time_name = name_segments
    elif len(name_segments) == 3:
        rep_name = name_segments[0] + '~' + name_segments[2]
        time_name = name_segments[1]
    elif len(name_segments) == 4:
        rep_name = name_segments[0] + '~' + name_segments[1] + '~' + name_segments[3]
        time_name = name_segments[2]
    else:
        # Without this branch the previous column's rep/time would be reused
        # silently, producing a wrong (and duplicated) column label.
        raise ValueError(f'Unexpected column name format in df_0_3: {name!r}')
    rep_set.add(rep_name)
    time_set.add(time_name)
    formated_name_list.append(rep_name + '@' + time_name)
print(len(rep_set), len(time_set))
df_0_3.columns = formated_name_list
# Z-score again per column, now over the retained genes only.
normalized_df = df_0_3.apply(stats.zscore, axis=0)
df_split_3 = train_test_source_target_split(normalized_df, time_set, rep_set)

In [None]:
# Rename the df_0_4 columns to '<rep>@<time>' and build its train/test split.
# Column names are '_'-separated with 2, 3 or 4 segments:
#   rep_time            -> rep,       time
#   rep_time_x          -> rep~x,     time
#   rep_a_time_x        -> rep~a~x,   time
rep_set = set()
time_set = set()
formated_name_list = []
for name in df_0_4.columns:
    name_segments = name.split('_')
    if len(name_segments) == 2:
        rep_name, time_name = name_segments
    elif len(name_segments) == 3:
        rep_name = name_segments[0] + '~' + name_segments[2]
        time_name = name_segments[1]
    elif len(name_segments) == 4:
        rep_name = name_segments[0] + '~' + name_segments[1] + '~' + name_segments[3]
        time_name = name_segments[2]
    else:
        # Without this branch the previous column's rep/time would be reused
        # silently, producing a wrong (and duplicated) column label.
        raise ValueError(f'Unexpected column name format in df_0_4: {name!r}')
    rep_set.add(rep_name)
    time_set.add(time_name)
    formated_name_list.append(rep_name + '@' + time_name)
print(len(rep_set), len(time_set))
df_0_4.columns = formated_name_list
# Z-score again per column, now over the retained genes only.
normalized_df = df_0_4.apply(stats.zscore, axis=0)
df_split_4 = train_test_source_target_split(normalized_df, time_set, rep_set)

In [None]:


# Build source -> target pairs from hand-picked column positions, where a target
# column is the column immediately after its source column.
# NOTE(review): assumes adjacent columns are consecutive time points of the same
# series - confirm against the column order of each table.

# Held-out columns of each table, concatenated into one frame (6 + 3 + 5 columns).
test_df_1 = normalized_df_1.iloc[:,[3,4,5,9,10,11]]
test_df_2 = normalized_df_2.iloc[:,[3,4,5]]
test_df_3 = normalized_df_3.iloc[:, -5:]
test_exp = pd.concat([test_df_1, test_df_2, test_df_3], axis=1)

# The target positions must be the source positions shifted by one, as in the
# training pairs below. Previously both selections used the same positions, so
# the test target was identical to the test source.
test_source_positions = [0,1,3,4,6,7,9,10,11,12]
test_target_positions = [position + 1 for position in test_source_positions]
test_source = test_exp.iloc[:, test_source_positions]
test_target = test_exp.iloc[:, test_target_positions]

train_source_df_1 = normalized_df_1.iloc[:, [0,1,2,3,5,6,7,8,9]]
train_target_df_1 = normalized_df_1.iloc[:, [1,2,3,4,6,7,8,9,10]]
train_source_df_2 = normalized_df_2.iloc[:, [0,1,2,3]]
train_target_df_2 = normalized_df_2.iloc[:, [1,2,3,4]]

train_source_df_3 = normalized_df_3.iloc[:, :-4]
train_target_df_3 = normalized_df_3.iloc[:, 1:-3]
train_source = pd.concat([train_source_df_1, train_source_df_2, train_source_df_3], axis=1)
train_target = pd.concat([train_target_df_1, train_target_df_2, train_target_df_3], axis=1)
source_exp = pd.concat([train_source, test_source], axis=1)
target_exp = pd.concat([train_target, test_target], axis=1)

In [None]:
# Append the matching part of each GSE226769 split (position 0..3 of every
# df_split_* tuple) as extra columns to the four existing frames.
# The tuple of current frames is captured before any name is rebound.
train_source, train_target, test_source, test_target = (
    pd.concat(
        [current, df_split_1[part], df_split_2[part], df_split_3[part], df_split_4[part]],
        axis=1,
    )
    for part, current in enumerate((train_source, train_target, test_source, test_target))
)

In [3]:
# Load the precomputed, normalised train/test source/target tables of the first
# three datasets. Every file is a gzip-compressed CSV indexed by its first column.
gz_csv = dict(compression='gzip', index_col=0)

# GSE145936
train_source_df_1 = pd.read_csv('../data/yeast/GSE145936/normalized/train_source.csv.gz', **gz_csv)
train_target_df_1 = pd.read_csv('../data/yeast/GSE145936/normalized/train_target.csv.gz', **gz_csv)
test_source_df_1 = pd.read_csv('../data/yeast/GSE145936/normalized/test_source.csv.gz', **gz_csv)
test_target_df_1 = pd.read_csv('../data/yeast/GSE145936/normalized/test_target.csv.gz', **gz_csv)

# GSE153609
train_source_df_2 = pd.read_csv('../data/yeast/GSE153609/normalized/train_source.csv.gz', **gz_csv)
train_target_df_2 = pd.read_csv('../data/yeast/GSE153609/normalized/train_target.csv.gz', **gz_csv)
test_source_df_2 = pd.read_csv('../data/yeast/GSE153609/normalized/test_source.csv.gz', **gz_csv)
test_target_df_2 = pd.read_csv('../data/yeast/GSE153609/normalized/test_target.csv.gz', **gz_csv)

# GSE168699
train_source_df_3 = pd.read_csv('../data/yeast/GSE168699/normalized/train_source.csv.gz', **gz_csv)
train_target_df_3 = pd.read_csv('../data/yeast/GSE168699/normalized/train_target.csv.gz', **gz_csv)
test_source_df_3 = pd.read_csv('../data/yeast/GSE168699/normalized/test_source.csv.gz', **gz_csv)
test_target_df_3 = pd.read_csv('../data/yeast/GSE168699/normalized/test_target.csv.gz', **gz_csv)

In [5]:
# Load the four precomputed, normalised GSE226769 splits (suffix _1 .. _4).
# Every file is a gzip-compressed CSV indexed by its first column.
gz_csv = dict(compression='gzip', index_col=0)

# split 1
train_source_4_1 = pd.read_csv('../data/yeast/GSE226769/normalized/train_source_1.csv.gz', **gz_csv)
train_target_4_1 = pd.read_csv('../data/yeast/GSE226769/normalized/train_target_1.csv.gz', **gz_csv)
test_source_4_1 = pd.read_csv('../data/yeast/GSE226769/normalized/test_source_1.csv.gz', **gz_csv)
test_target_4_1 = pd.read_csv('../data/yeast/GSE226769/normalized/test_target_1.csv.gz', **gz_csv)

# split 2
train_source_4_2 = pd.read_csv('../data/yeast/GSE226769/normalized/train_source_2.csv.gz', **gz_csv)
train_target_4_2 = pd.read_csv('../data/yeast/GSE226769/normalized/train_target_2.csv.gz', **gz_csv)
test_source_4_2 = pd.read_csv('../data/yeast/GSE226769/normalized/test_source_2.csv.gz', **gz_csv)
test_target_4_2 = pd.read_csv('../data/yeast/GSE226769/normalized/test_target_2.csv.gz', **gz_csv)

# split 3
train_source_4_3 = pd.read_csv('../data/yeast/GSE226769/normalized/train_source_3.csv.gz', **gz_csv)
train_target_4_3 = pd.read_csv('../data/yeast/GSE226769/normalized/train_target_3.csv.gz', **gz_csv)
test_source_4_3 = pd.read_csv('../data/yeast/GSE226769/normalized/test_source_3.csv.gz', **gz_csv)
test_target_4_3 = pd.read_csv('../data/yeast/GSE226769/normalized/test_target_3.csv.gz', **gz_csv)

# split 4
train_source_4_4 = pd.read_csv('../data/yeast/GSE226769/normalized/train_source_4.csv.gz', **gz_csv)
train_target_4_4 = pd.read_csv('../data/yeast/GSE226769/normalized/train_target_4.csv.gz', **gz_csv)
test_source_4_4 = pd.read_csv('../data/yeast/GSE226769/normalized/test_source_4.csv.gz', **gz_csv)
test_target_4_4 = pd.read_csv('../data/yeast/GSE226769/normalized/test_target_4.csv.gz', **gz_csv)


In [9]:
# Join the per-dataset tables column-wise (rows are aligned on the gene index)
# into the four frames used for modelling.
train_source = pd.concat(
    [train_source_df_1, train_source_df_2, train_source_df_3,
     train_source_4_1, train_source_4_2, train_source_4_3, train_source_4_4],
    axis='columns',
)
train_target = pd.concat(
    [train_target_df_1, train_target_df_2, train_target_df_3,
     train_target_4_1, train_target_4_2, train_target_4_3, train_target_4_4],
    axis='columns',
)
test_source = pd.concat(
    [test_source_df_1, test_source_df_2, test_source_df_3,
     test_source_4_1, test_source_4_2, test_source_4_3, test_source_4_4],
    axis='columns',
)
test_target = pd.concat(
    [test_target_df_1, test_target_df_2, test_target_df_3,
     test_target_4_1, test_target_4_2, test_target_4_3, test_target_4_4],
    axis='columns',
)

# From here on the shared gene list is the row index of the combined training table.
common_genes = train_source.index.tolist()

In [10]:
# Network data and training features.
# tf_set starts with every space-separated alias in the TF table's 'Gene Names'
# column whose upper-cased form is a row of train_source.
tf_list_df = pd.read_csv('../data/yeast/yeast_tf_list.tsv.gz', sep='\t', compression='gzip', index_col=0)
measured_genes = train_source.index
tf_set = {
    alias.upper()
    for gene_names in tf_list_df['Gene Names']
    for alias in gene_names.split(' ')
    if alias.upper() in measured_genes
}

# Every gene with a non-null '; '-separated tf_list becomes a target gene, and
# all TFs named in those lists are added to tf_set.
network_df = pd.read_csv('../data/yeast/yeat_network.csv', index_col=0)
target_gene_list = []
for gene, regulators in network_df['tf_list'].items():
    if pd.isnull(regulators):
        continue
    tf_set.update(regulators.split('; '))
    target_gene_list.append(gene)

print('Number of TFs used:')
print(len(tf_set))

Number of TFs used:
495


In [None]:
# # get network data, training features
# network_df = pd.read_csv('../data/yeast/yeat_network.csv', index_col=0)
# tf_set = set()
# target_gene_list = []
# for i, row in network_df.iterrows():
#     tf_list = row.tf_list
#     if pd.isnull(tf_list): 
#         continue
#     tf_list = tf_list.split('; ')
#     tf_set = tf_set.union(set(tf_list))
#     target_gene_list.append(i)

# print(len(tf_set))


In [17]:

# Assemble the full source/target expression tables (train columns followed by
# test columns) and pick the target genes and TF features.
target_exp = pd.concat([train_target, test_target], axis=1)
source_exp = pd.concat([train_source, test_source], axis=1)

# Targets must be measured genes; sorted so the order is stable across runs.
target_gene_list = sorted(set(target_gene_list).intersection(common_genes), key=str)
# Filter out targets with any NaN expression value.
target_gene_list = list(target_exp.loc[target_gene_list].dropna(axis=0, how='any').index)

# Features are referred to by row position in later cells, so the TF order must
# match the saved TF list whenever that file exists. On a fresh run, where the
# file has not been written yet, fall back to the TFs computed in this notebook.
tf_cache_path = '../output/network_model/yeast_tf.csv'
if isfile(tf_cache_path):
    tf_list_df = pd.read_csv(tf_cache_path, names=['tf'], index_col=0)
    tf_list = list(tf_list_df.index)
else:
    tf_list = sorted(tf_set.intersection(common_genes), key=str)

X = source_exp.loc[tf_list]


In [18]:
# Gold-standard (GS) network statistics restricted to the selected targets and TFs.
# Sets give constant-time membership tests inside the nested loops below.
target_lookup = set(target_gene_list)
tf_lookup = set(tf_list)

edge_count = 0
gs_tf_set = set()
gs_target_set = set()
for gene, cur_tf_list in network_df['tf_list'].items():
    if gene not in target_lookup:
        continue
    # A selected target is counted even if its TF list turns out to be empty.
    gs_target_set.add(gene)
    if pd.isnull(cur_tf_list):
        continue
    for cur_tf in cur_tf_list.split('; '):
        if cur_tf in tf_lookup:
            gs_tf_set.add(cur_tf)
            edge_count += 1
print('GS edge count:')
print(edge_count)
print('Number of TFs in GS:')
print(len(gs_tf_set))
print('Number of target genes in GS:')
print(len(gs_target_set))




GS edge count:
162100
Number of TFs in GS:
205
Number of target genes in GS:
4794


In [19]:
# Keep only high-variance targets: genes whose standard deviation across the
# test-target columns exceeds 0.5.
candidate_test_target = test_target.loc[target_gene_list]
is_high_variance = candidate_test_target.std(axis=1) > 0.5
new_test_target = candidate_test_target.loc[is_high_variance]
target_gene_list = new_test_target.index

In [21]:
# Build the per-target calculator. The train/test source tables are restricted
# to the TF rows, matching the rows of X.
train_tf_exp = train_source.loc[tf_list]
test_tf_exp = test_source.loc[tf_list]
mp_calc = mp_run.MpCalc(
    target_gene_list,
    target_exp,
    X,
    network_df,
    train_tf_exp,
    train_target,
    test_tf_exp,
    test_target,
)

In [22]:
# Run mp_calc.full_comp_new once per target position on all CPU cores; imap
# yields the results in target order while tqdm shows progress.
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as worker_pool:
    per_target_results = worker_pool.imap(mp_calc.full_comp_new, range(iter_length))
    r = list(tqdm(per_target_results, total=iter_length))

100%|██████████| 385/385 [01:02<00:00,  6.15it/s]


In [23]:
# Turn the per-target result rows into a table: one row per target gene, one
# named column per position of the result vector.
result_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
    'pca_rf_score',
    'pca_rf_rmse',
]
r = np.array(r)
out_df = pd.DataFrame(index=target_gene_list)
for position, column_name in enumerate(result_columns):
    out_df[column_name] = r[:, position]

In [24]:
# Persist the TF list (one name per line, no header) and the result table.
tf_list_df = pd.DataFrame(index=tf_list)
tf_list_df.to_csv(
    '../output/network_model/yeast_tf.csv',
    header=False,
)
out_df.to_csv(
    '../output/network_model/yeast_all_tf_high_var_target_new.csv.gz',
    compression='gzip',
)

In [25]:
# Reload the saved results, run mp_calc.rf_top_tf_same_count_as_gs for every
# target and add its two outputs (score, rmse) as new columns.
out_df = pd.read_csv('../output/network_model/yeast_all_tf_high_var_target_new.csv.gz', index_col=0, compression='gzip')
iter_length = len(target_gene_list)
new_out_df = pd.DataFrame(index=target_gene_list)
with Pool(processes=cpu_count()) as worker_pool:
    per_target_results = worker_pool.imap(mp_calc.rf_top_tf_same_count_as_gs, range(iter_length))
    r = list(tqdm(per_target_results, total=iter_length))
efron_r = np.array(r)

same_count_columns = ['rf_top_tf_same_count_as_gs_score', 'rf_top_tf_same_count_as_gs_rmse']
for position, column_name in enumerate(same_count_columns):
    new_out_df[column_name] = efron_r[:, position]
# Reorder to the saved table's rows before copying the columns across.
new_out_df = new_out_df.loc[out_df.index]
for column_name in same_count_columns:
    out_df[column_name] = new_out_df[column_name]

100%|██████████| 385/385 [00:10<00:00, 36.69it/s]


In [28]:
# Overwrite the saved result table with the extended one.
out_df.to_csv(
    path_or_buf='../output/network_model/yeast_all_tf_high_var_target_new.csv.gz',
    compression='gzip',
)


In [29]:
# Reload the saved results, run mp_calc.efron_process_rf_training for every
# target and store its seven outputs in a new result file.
out_df = pd.read_csv('../output/network_model/yeast_all_tf_high_var_target_new.csv.gz', index_col=0, compression='gzip')
iter_length = len(target_gene_list)
new_out_df = pd.DataFrame(index=target_gene_list)
with Pool(processes=cpu_count()) as worker_pool:
    per_target_results = worker_pool.imap(mp_calc.efron_process_rf_training, range(iter_length))
    r = list(tqdm(per_target_results, total=iter_length))
efron_r = np.array(r)

# (column name, dtype to cast to, or None to keep the array values as they are),
# listed in the order of the result vector.
efron_columns = [
    ('rf_efron_feature_num', 'float64'),
    ('rf_efron_complementary_feature_num_list', None),
    ('rf_efron_rmse', 'float64'),
    ('rf_efron_complementary_rmse_list', None),
    ('rf_efron_features', None),
    ('rf_efron_complementary_features_list', None),
    ('rf_efron_ensemble_rmse', None),
]
for position, (column_name, cast_to) in enumerate(efron_columns):
    column_values = efron_r[:, position]
    new_out_df[column_name] = column_values if cast_to is None else column_values.astype(cast_to)

# Reorder to the saved table's rows before copying the columns across.
new_out_df = new_out_df.loc[out_df.index]
for column_name, cast_to in efron_columns:
    out_df[column_name] = new_out_df[column_name]

out_df.to_csv('../output/network_model/yeast_all_tf_high_var_target_efron_train.csv.gz', compression='gzip')

100%|██████████| 385/385 [05:36<00:00,  1.14it/s]


In [30]:
# For every target, count how many of its Efron-selected TFs are also
# gold-standard regulators of that target.
tf_list_df = pd.read_csv('../output/network_model/yeast_tf.csv', names=['tf'], index_col=0)
out_df = pd.read_csv('../output/network_model/yeast_all_tf_high_var_target_efron_train.csv.gz', index_col=0, compression='gzip')
available_tfs = set(X.index)
rf_efron_overlap_count = []
for target_gene in out_df.index:
    # Gold-standard TFs of this target that are available as features,
    # excluding the target itself.
    gs_tf_set = available_tfs.intersection(network_df.loc[target_gene].tf_list.split('; '))
    gs_tf_set.discard(target_gene)

    # 'rf_efron_features' holds '; '-separated row positions into the saved TF list.
    feature_positions = [
        int(token)
        for token in out_df.loc[target_gene]['rf_efron_features'].split('; ')
    ]
    efron_tf_set = set(tf_list_df.iloc[feature_positions].index)
    rf_efron_overlap_count.append(len(efron_tf_set & gs_tf_set))
out_df['rf_efron_overlap_count'] = rf_efron_overlap_count
out_df.to_csv('../output/network_model/yeast_all_tf_high_var_target_efron_train.csv.gz', compression='gzip')


In [31]:
# Reload the Efron result table, run mp_calc.rf_top_10 for every target and add
# its two outputs (score, rmse) as float columns before saving again.
out_df = pd.read_csv('../output/network_model/yeast_all_tf_high_var_target_efron_train.csv.gz', index_col=0, compression='gzip')
iter_length = len(target_gene_list)
new_out_df = pd.DataFrame(index=target_gene_list)
with Pool(processes=cpu_count()) as worker_pool:
    per_target_results = worker_pool.imap(mp_calc.rf_top_10, range(iter_length))
    r = list(tqdm(per_target_results, total=iter_length))
top10_r = np.array(r)

top10_columns = ['rf_top10_score', 'rf_top10_rmse']
for position, column_name in enumerate(top10_columns):
    new_out_df[column_name] = top10_r[:, position].astype('float64')
# Reorder to the saved table's rows before copying the columns across.
new_out_df = new_out_df.loc[out_df.index]
for column_name in top10_columns:
    out_df[column_name] = new_out_df[column_name]
out_df.to_csv('../output/network_model/yeast_all_tf_high_var_target_efron_train.csv.gz', compression='gzip')

100%|██████████| 385/385 [00:12<00:00, 31.37it/s]


In [33]:
# Column means of the numeric result columns. out_df also holds string columns
# (e.g. 'rf_efron_features'), which recent pandas versions refuse to average
# unless numeric_only is set.
out_df.mean(numeric_only=True)

rf_score                              0.609127
linear_score                          0.147335
gs_rf_score                           0.579119
gs_linear_score                       0.456120
rf_with_linear_top_features_score     0.620660
linear_with_rf_top_features_score     0.442544
rf_rmse                               0.876901
linear_rmse                           1.203858
gs_rf_rmse                            0.922259
gs_linear_rmse                        1.091308
rf_with_linear_top_features_rmse      0.865159
linear_with_rf_top_features_rmse      0.991998
rf_with_top_features_score            0.605025
linear_with_top_features_score        0.020866
rf_with_top_features_rmse             0.879590
linear_with_top_features_rmse         1.254643
rf_top_feature_num                   20.000000
linear_top_feature_num               20.000000
rf_top_features_gs_overlap            2.192208
linear_top_features_gs_overlap        1.724675
rf_linear_top_features_overlap        2.880519
gs_edge_num  