In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
from functools import cmp_to_key, partial
import re

import mp_run
import conf_interval

from os import listdir
from os.path import isfile, join
from sklearn.metrics import mean_squared_error, explained_variance_score

import os

from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from scipy import stats

from multiprocessing import Pool, cpu_count


%load_ext autoreload
%autoreload 2

# Regex that captures one numeric literal inside a string. The single capturing
# group matches: an optional leading '-', an integer part without leading zeros
# (either '0' or a non-zero digit followed by more digits), an optional
# '.digits' fraction, and an optional e/E exponent with an optional sign.
# NOTE(review): not referenced in the cells visible here — confirm it is still needed.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})

In [2]:
# Read the tab-separated prior network; the columns used below are
# 'regulator' and 'target' (one row per regulator -> target edge).
network_df = pd.read_csv('../data/mouse/regnetworkweb.org.network', sep='\t')

# Gene names are kept exactly as written in the file (no case normalisation).
# Collect the distinct names found on each side of an edge.
regulator_set, target_set = (
    set(network_df[side]) for side in ('regulator', 'target')
)

In [3]:
def _load_dataset_splits(dataset_id):
    """Read the four normalized expression tables of one dataset.

    Parameters
    ----------
    dataset_id : str
        Name of the dataset directory under ``../data/mouse``
        (for example ``'GSE115553'``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_source, train_target, test_source, test_target)``, each read
        from ``../data/mouse/<dataset_id>/normalized/<split>.csv`` with the
        first CSV column used as the row index.
    """
    base_dir = '../data/mouse/' + dataset_id + '/normalized/'
    split_names = ('train_source', 'train_target', 'test_source', 'test_target')
    return tuple(
        pd.read_csv(base_dir + split_name + '.csv', index_col=0)
        for split_name in split_names
    )


# One call per dataset replaces twelve copy-pasted read_csv lines; the file
# paths that end up being read are unchanged.
train_source_1, train_target_1, test_source_1, test_target_1 = _load_dataset_splits('GSE115553')
train_source_2, train_target_2, test_source_2, test_target_2 = _load_dataset_splits('GSE151173')
train_source_3, train_target_3, test_source_3, test_target_3 = _load_dataset_splits('GSE171975')

In [6]:
# Row labels (genes) present in the training source table of all three datasets.
# The labels are sorted so that the row order is identical on every run:
# iterating a set of strings follows the per-process hash seed, which made the
# previous list(set(...)) ordering change between kernel restarts.
# key=str keeps the sort well-defined even if a label is not a string.
common_exp_genes = sorted(
    set(train_source_1.index) & set(train_source_2.index) & set(train_source_3.index),
    key=str,
)

# Restrict every table to the shared genes, in the same row order.
# NOTE(review): the shared set is derived from the train_source tables only;
# .loc raises KeyError if a target/test table lacks one of these labels —
# presumably all four tables of a dataset share one gene index; confirm.
train_source_1, train_target_1, test_source_1, test_target_1 = (
    frame.loc[common_exp_genes]
    for frame in (train_source_1, train_target_1, test_source_1, test_target_1)
)
train_source_2, train_target_2, test_source_2, test_target_2 = (
    frame.loc[common_exp_genes]
    for frame in (train_source_2, train_target_2, test_source_2, test_target_2)
)
train_source_3, train_target_3, test_source_3, test_target_3 = (
    frame.loc[common_exp_genes]
    for frame in (train_source_3, train_target_3, test_source_3, test_target_3)
)

In [7]:
# Place the three datasets side by side (column-wise), aligned on the row
# index, once for each of the four splits.
train_source = pd.concat(
    (train_source_1, train_source_2, train_source_3), axis='columns'
)
train_target = pd.concat(
    (train_target_1, train_target_2, train_target_3), axis='columns'
)
test_source = pd.concat(
    (test_source_1, test_source_2, test_source_3), axis='columns'
)
test_target = pd.concat(
    (test_target_1, test_target_2, test_target_3), axis='columns'
)

In [8]:
# Keep only network genes that also appear as row labels of the expression data.
expressed_genes = set(train_source.index)
regulator_set = regulator_set & expressed_genes
target_set = target_set & expressed_genes

# Every gene that still plays either role.
all_gene_set = regulator_set | target_set


In [9]:
# Map each retained target to the list of its retained regulators, in the order
# the edges appear in network_df. Targets without any retained regulator keep
# an empty list.
network_dict = {target: [] for target in target_set}

# Walk the two columns in parallel instead of DataFrame.iterrows(): iterrows
# builds a Series per row, which is needlessly slow when only two fields are read.
for regulator, target in zip(network_df['regulator'], network_df['target']):
    if regulator in regulator_set and target in target_set:
        network_dict[target].append(regulator)

In [10]:
# Keep targets that have at least one regulator and whose FIRST listed
# regulator is not the target itself.
# NOTE(review): only position 0 is compared with the target, so a self-edge
# listed later is kept, and a target whose first regulator is itself is dropped
# entirely — confirm this is the intended self-loop handling.
kept_edges = {
    target: regulators
    for target, regulators in network_dict.items()
    if regulators and regulators[0] != target
}

# Parallel lists: target name, and its regulators joined into one "; "-separated string.
key_list = list(kept_edges)
value_list = ["; ".join(regulators) for regulators in kept_edges.values()]

# Rebuild the gene sets from the edges that survived the filter.
target_set = set(key_list)
regulator_set = {
    regulator
    for regulators in kept_edges.values()
    for regulator in regulators
}
all_gene_set = regulator_set.union(target_set)

In [11]:
# One row per retained target; 'tf_list' holds its regulators as a single
# "; "-separated string.
# NOTE: this rebinds network_df, replacing the raw edge list loaded earlier, so
# cells that read its 'regulator'/'target' columns cannot be re-run after this one.
network_df = pd.DataFrame({'tf_list': value_list}, index=key_list)

In [12]:
# Average number of regulators per retained target: each entry is a
# "; "-separated string, so it has (number of separators + 1) regulators.
np.mean([tf_string.count('; ') + 1 for tf_string in value_list])

4.911825599805139

In [13]:
# Join the train and test columns into one table per side (source / target).
source_df = pd.concat((train_source, test_source), axis='columns')
target_df = pd.concat((train_target, test_target), axis='columns')



In [16]:
# Fix the gene orderings used from here on. Sorting (instead of list(set))
# makes the row order of the results and the feature order reproducible across
# kernel restarts, because iterating a set of strings follows the per-process
# hash seed. key=str keeps the sort well-defined for any non-string label.
target_gene_list = sorted(target_set, key=str)
tf_list = sorted(regulator_set, key=str)

# Rows of the combined target table for the modelled targets.
target_exp = target_df.loc[target_gene_list]
# Rows of the combined source table for the regulators, in tf_list order.
X = source_df.loc[tf_list]

In [17]:
# Bundle everything the per-target computation needs into one MpCalc object
# (class defined in the project module mp_run). Arguments are positional;
# the source tables are restricted to the regulator rows before being passed.
# NOTE(review): the meaning of each argument is inferred from its name only —
# confirm against MpCalc's constructor.
mp_calc = mp_run.MpCalc(
    target_gene_list,
    target_exp,
    X,
    network_df,
    train_source.loc[tf_list],
    train_target,
    test_source.loc[tf_list],
    test_target,
)

In [18]:
# Call mp_calc.full_comp_new once per target index, spread over one worker
# process per CPU. imap yields results in input order, so r[i] is the result
# for index i; tqdm shows progress as results arrive.
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as pool:
    result_iter = pool.imap(mp_calc.full_comp_new, range(iter_length))
    r = list(tqdm(result_iter, total=iter_length))

100%|██████████| 8211/8211 [18:01<00:00,  7.59it/s]


In [19]:
# Column name for each position of a per-target result, in positional order.
# NOTE(review): this order must match what mp_calc.full_comp_new returns —
# verify against mp_run.
result_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
]

# Stack the per-target results into a 2-D array: one row per target,
# one column per metric.
r = np.array(r)

# One row per target gene, one named column per result position.
out_df = pd.DataFrame(index=target_gene_list)
for column_position, column_name in enumerate(result_columns):
    out_df[column_name] = r[:, column_position]

In [20]:
# Keep only the targets whose 'test_std' value is strictly above 0.5.
has_enough_spread = out_df['test_std'].gt(0.5)
filtered_df = out_df[has_enough_spread]

In [21]:
# Column-wise mean of every metric over the filtered targets.
filtered_df.mean(axis=0)

rf_score                              0.359202
linear_score                          0.285572
gs_rf_score                           0.154726
gs_linear_score                      -0.003445
rf_with_linear_top_features_score     0.334008
linear_with_rf_top_features_score     0.288027
rf_rmse                               1.211061
linear_rmse                           1.270461
gs_rf_rmse                            1.488513
gs_linear_rmse                        1.686018
rf_with_linear_top_features_rmse      1.200872
linear_with_rf_top_features_rmse      1.406418
rf_with_top_features_score            0.256712
linear_with_top_features_score        0.247896
rf_with_top_features_rmse             1.318530
linear_with_top_features_rmse         1.311779
rf_top_feature_num                    8.125806
linear_top_feature_num               11.851613
rf_top_features_gs_overlap            0.054839
linear_top_features_gs_overlap        0.180645
rf_linear_top_features_overlap        0.712903
gs_edge_num  

In [22]:
# Save the full per-target results table (out_df, not the filtered subset),
# with the gene names as the index column.
out_df.to_csv(path_or_buf='./mouse_bulk_network_v_model.csv')