In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
from functools import cmp_to_key, partial
import re

import mp_run
import conf_interval

from os import listdir
from os.path import isfile, join
from sklearn.metrics import mean_squared_error, explained_variance_score

import os

from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from scipy import stats

from multiprocessing import Pool, cpu_count


%load_ext autoreload
%autoreload 2

# Regex for extracting a number from a string. The single capturing group
# matches, in order: an optional leading minus sign, an integer part that is
# either "0" or starts with a non-zero digit, an optional fractional part, and
# an optional exponent introduced by "e"/"E" with an optional sign.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide plotting defaults: 7x5 figure size at 100 dpi.
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})

In [2]:
def _load_normalized_splits(accession, data_root='../data/mouse'):
    """Read the four normalized tables stored for one dataset accession.

    Parameters
    ----------
    accession : str
        Name of the dataset directory under ``data_root`` (e.g. 'GSE115553').
    data_root : str
        Directory that holds one sub-directory per dataset.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_source, train_target, test_source, test_target)``, each read
        with its first CSV column as the row index.
    """
    split_names = ('train_source', 'train_target', 'test_source', 'test_target')
    return tuple(
        pd.read_csv(f'{data_root}/{accession}/normalized/{split}.csv', index_col=0)
        for split in split_names
    )


# One helper call per dataset replaces twelve copy-pasted read_csv lines; the
# same twelve module-level names are bound as before.
train_source_1, train_target_1, test_source_1, test_target_1 = _load_normalized_splits('GSE115553')
train_source_2, train_target_2, test_source_2, test_target_2 = _load_normalized_splits('GSE151173')
train_source_3, train_target_3, test_source_3, test_target_3 = _load_normalized_splits('GSE171975')

In [3]:
# Row labels shared by the training source tables of all three datasets.
# The intersection is sorted so the row order is the same on every run:
# iterating a set of strings is not stable across interpreter sessions
# because of hash randomization. key=str keeps the sort well-defined even if
# a label was parsed as a non-string value.
common_exp_genes = sorted(
    set(train_source_1.index).intersection(train_source_2.index, train_source_3.index),
    key=str,
)

# Restrict every table of every dataset to the shared rows, in that order.
# The tuple of input frames is built before any name is rebound.
train_source_1, train_target_1, test_source_1, test_target_1 = (
    frame.loc[common_exp_genes]
    for frame in (train_source_1, train_target_1, test_source_1, test_target_1)
)

train_source_2, train_target_2, test_source_2, test_target_2 = (
    frame.loc[common_exp_genes]
    for frame in (train_source_2, train_target_2, test_source_2, test_target_2)
)

train_source_3, train_target_3, test_source_3, test_target_3 = (
    frame.loc[common_exp_genes]
    for frame in (train_source_3, train_target_3, test_source_3, test_target_3)
)

In [4]:
# Merge the three datasets side by side (column-wise); all of them were
# restricted to the same row labels beforehand.
train_source = pd.concat(
    (train_source_1, train_source_2, train_source_3), axis='columns'
)
train_target = pd.concat(
    (train_target_1, train_target_2, train_target_3), axis='columns'
)
test_source = pd.concat(
    (test_source_1, test_source_2, test_source_3), axis='columns'
)
test_target = pd.concat(
    (test_target_1, test_target_2, test_target_3), axis='columns'
)

In [5]:
# Load the prior network: one row per edge with 'regulator' and 'target' columns.
network_df = pd.read_csv('../data/mouse/regnetworkweb.org.network', sep='\t')

# Only genes that appear as row labels of the expression matrix are usable.
expressed_genes = set(train_source.index)
regulator_set = set(network_df['regulator']) & expressed_genes
target_set = set(network_df['target']) & expressed_genes

# Map each usable target to its regulators, in file order. Iterating the two
# columns with zip avoids building a Series per row as iterrows() does.
network_dict = {target: [] for target in target_set}
for regulator, target in zip(network_df['regulator'], network_df['target']):
    if regulator in regulator_set and target in target_set:
        network_dict[target].append(regulator)

key_list = []
value_list = []

# regulator_set is rebuilt from scratch here: first every expressed gene named
# in the TF list, then (below) every regulator of a retained target.
regulator_set = set()
tf_list_df = pd.read_csv('../data/mouse_tf_list.tsv.gz', sep='\t', compression='gzip', index_col=0)
for name in tf_list_df['Gene Names']:
    # A 'Gene Names' entry is split on single spaces and each piece is checked
    # against the expression matrix; str() makes missing values splittable.
    regulator_set.update(
        alias for alias in str(name).split(' ') if alias in expressed_genes
    )

target_set = set()
for key, regulators in network_dict.items():
    # NOTE(review): only the FIRST listed regulator is compared with the
    # target, so a self-edge later in the list does not exclude the target.
    # Behavior kept as-is; confirm this is intended.
    if regulators and regulators[0] != key:
        key_list.append(key)
        target_set.add(key)
        value_list.append("; ".join(regulators))
        regulator_set.update(regulators)
all_gene_set = regulator_set.union(target_set)

print(len(regulator_set))

1385


In [30]:
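# NOTE: superseded variant of the network-construction cell above, kept for
# reference only. It builds regulator_set solely from the network edges and
# does not add regulators from the TF list.
# network_df = pd.read_csv('../data/mouse/regnetworkweb.org.network', sep='\t')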
# regulator_set = set(network_df['regulator'])
# target_set = set(network_df['target'])

# regulator_set = regulator_set.intersection(set(train_source.index))
# target_set = target_set.intersection(set(train_source.index))
# all_gene_set = regulator_set.union(target_set)
# network_dict = {target: [] for target in target_set}
# for ind, row in network_df.iterrows():
#     if (row['regulator'] in regulator_set) and (row['target'] in target_set):
#         network_dict[row['target']].append(row['regulator'])

# key_list = []
# value_list = []
# regulator_set = set()
# target_set = set()
# for key in network_dict.keys():
#     if (len(network_dict[key]) > 0) and network_dict[key][0] != key:
#         key_list.append(key)
#         target_set.add(key)
#         value_list.append("; ".join(network_dict[key]))
#         for regulator in network_dict[key]:
#             regulator_set.add(regulator)
# all_gene_set = regulator_set.union(target_set)


# print(len(regulator_set))

780


In [6]:
# Replace the raw edge table with a per-target table: the index holds the
# target genes and 'tf_list' holds each target's "; "-joined regulators.
network_df = pd.DataFrame({'tf_list': value_list}, index=key_list)

In [7]:
# Average number of regulators listed per retained target.
tf_counts = [len(tf_string.split('; ')) for tf_string in value_list]
np.mean(tf_counts)

4.911825599805139

In [8]:
# Full tables: training columns followed by test columns.
target_df = pd.concat((train_target, test_target), axis='columns')
source_df = pd.concat((train_source, test_source), axis='columns')



In [10]:
# Freeze the two gene sets into lists once, then slice the full tables by them.
tf_list = list(regulator_set)
target_gene_list = list(target_set)

X = source_df.loc[tf_list]
target_exp = target_df.loc[target_gene_list]

In [11]:
# Keep only targets whose row-wise standard deviation in the test table
# exceeds 0.5; target_gene_list becomes the index of the surviving rows.
candidate_targets = test_target.loc[target_gene_list]
high_variance_mask = candidate_targets.std(axis=1) > 0.5
new_test_target = candidate_targets.loc[high_variance_mask]
target_gene_list = new_test_target.index

In [12]:
# Source tables restricted to the regulator rows.
train_tf_exp = train_source.loc[tf_list]
test_tf_exp = test_source.loc[tf_list]

# Arguments are passed positionally, in the same order as before.
# NOTE(review): the meaning of each position is defined by mp_run.MpCalc,
# which is not shown here - confirm against that module.
mp_calc = mp_run.MpCalc(
    target_gene_list,
    target_exp,
    X,
    network_df,
    train_tf_exp,
    train_target,
    test_tf_exp,
    test_target,
)

In [46]:
# Sanity check: (rows, columns) of the combined test target table.
test_target.shape

(9931, 169)

In [28]:
# Run mp_calc.full_comp_new once per target position, spread over one worker
# per CPU. imap yields results in input order, so r lines up with
# target_gene_list.
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as p:
    progress = tqdm(
        p.imap(mp_calc.full_comp_new, range(iter_length)),
        total=iter_length,
    )
    r = list(progress)

100%|██████████| 310/310 [00:57<00:00,  5.37it/s]


In [29]:
# Names for the per-target result vector, in the positional order in which
# the values are unpacked (position 0 first). Keeping them in one list
# replaces 24 hand-numbered column assignments.
result_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
]

r = np.array(r)
# Fail with a clear message instead of an IndexError part-way through.
if r.ndim != 2 or r.shape[1] < len(result_columns):
    raise ValueError(
        f'expected a 2-D result array with at least {len(result_columns)} '
        f'columns, got shape {r.shape}'
    )

# One row per target gene. Any result columns beyond the named ones are
# ignored, as before. copy=True keeps out_df independent of r.
out_df = pd.DataFrame(
    r[:, :len(result_columns)],
    index=target_gene_list,
    columns=result_columns,
    copy=True,
)

In [30]:
# Rows of out_df whose 'test_std' value is above 0.5.
high_std_rows = out_df['test_std'] > 0.5
filtered_df = out_df.loc[high_std_rows]

In [31]:
# Column-wise mean of every metric over the filtered targets.
filtered_df.mean()

rf_score                              0.370744
linear_score                          0.220818
gs_rf_score                           0.131542
gs_linear_score                      -0.003445
rf_with_linear_top_features_score     0.386506
linear_with_rf_top_features_score     0.105008
rf_rmse                               1.195792
linear_rmse                           1.305100
gs_rf_rmse                            1.493245
gs_linear_rmse                        1.686018
rf_with_linear_top_features_rmse      1.182053
linear_with_rf_top_features_rmse      1.531368
rf_with_top_features_score            0.309347
linear_with_top_features_score        0.212970
rf_with_top_features_rmse             1.266511
linear_with_top_features_rmse         1.334778
rf_top_feature_num                    9.119355
linear_top_feature_num               14.132258
rf_top_features_gs_overlap            0.109677
linear_top_features_gs_overlap        0.116129
rf_linear_top_features_overlap        0.551613
gs_edge_num  

In [22]:
# Write the per-target metrics to a CSV in the working directory; the next
# cell reads this same file back.
out_df.to_csv('./mouse_bulk_network_v_model.csv')

In [15]:
# Reload the saved metrics table, using the first CSV column as the index.
out_df = pd.read_csv('./mouse_bulk_network_v_model.csv', index_col=0)


In [18]:
# Select the rows of out_df by target_gene_list, in that order, so that the
# positional column assignments in the following cells line up with it.
out_df = out_df.loc[target_gene_list]

In [32]:
# Run mp_calc.efron_process_rf once per target position in a worker pool and
# store the first three values of each result as new out_df columns.
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as p:
    progress = tqdm(
        p.imap(mp_calc.efron_process_rf, range(iter_length)),
        total=iter_length,
    )
    r = list(progress)
efron_r = np.array(r)

rf_efron_columns = (
    'rf_efron_feature_num',
    'rf_efron_rmse',
    'rf_efron_complementary_rmse',
)
for col_pos, col_name in enumerate(rf_efron_columns):
    out_df[col_name] = efron_r[:, col_pos]

100%|██████████| 310/310 [00:41<00:00,  7.51it/s]


In [33]:
# Run mp_calc.efron_process_linear once per target position in a worker pool
# and store the first three values of each result as new out_df columns.
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as p:
    progress = tqdm(
        p.imap(mp_calc.efron_process_linear, range(iter_length)),
        total=iter_length,
    )
    r = list(progress)
efron_r = np.array(r)

linear_efron_columns = (
    'linear_efron_feature_num',
    'linear_efron_rmse',
    'linear_efron_complementary_rmse',
)
for col_pos, col_name in enumerate(linear_efron_columns):
    out_df[col_name] = efron_r[:, col_pos]

100%|██████████| 310/310 [00:55<00:00,  5.63it/s]


In [35]:
# Run mp_calc.efron_process_90th_rf once per target position in a worker pool
# and store the first three values of each result as new out_df columns.
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as p:
    progress = tqdm(
        p.imap(mp_calc.efron_process_90th_rf, range(iter_length)),
        total=iter_length,
    )
    r = list(progress)
efron_r = np.array(r)

rf_efron_90th_columns = (
    'rf_efron_feature_num_90th',
    'rf_efron_rmse_90th',
    'rf_efron_complementary_rmse_90th',
)
for col_pos, col_name in enumerate(rf_efron_90th_columns):
    out_df[col_name] = efron_r[:, col_pos]

100%|██████████| 310/310 [02:04<00:00,  2.49it/s]


In [37]:
# Write the final metrics table, including the efron columns added above, as
# a gzip-compressed CSV.
out_df.to_csv('../output/network_model/mouse_all_tf_high_var_target.csv.gz', compression='gzip')

In [36]:
# Column-wise mean of every metric over all rows of out_df.
out_df.mean()

rf_score                               0.370744
linear_score                           0.220818
gs_rf_score                            0.131542
gs_linear_score                       -0.003445
rf_with_linear_top_features_score      0.386506
linear_with_rf_top_features_score      0.105008
rf_rmse                                1.195792
linear_rmse                            1.305100
gs_rf_rmse                             1.493245
gs_linear_rmse                         1.686018
rf_with_linear_top_features_rmse       1.182053
linear_with_rf_top_features_rmse       1.531368
rf_with_top_features_score             0.309347
linear_with_top_features_score         0.212970
rf_with_top_features_rmse              1.266511
linear_with_top_features_rmse          1.334778
rf_top_feature_num                     9.119355
linear_top_feature_num                14.132258
rf_top_features_gs_overlap             0.109677
linear_top_features_gs_overlap         0.116129
rf_linear_top_features_overlap         0