In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
from functools import cmp_to_key, partial
import re

import mp_run
import conf_interval

from os import listdir
from os.path import isfile, join
from sklearn.metrics import mean_squared_error, explained_variance_score

import os

from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from scipy import stats

from multiprocessing import Pool, cpu_count


%load_ext autoreload
%autoreload 2

# Regex for extracting a number from a string. The single capture group
# matches, in order: an optional leading minus sign, an integer part without
# leading zeros (either "0" or a non-zero digit followed by more digits), an
# optional fractional part ("." plus digits) and an optional exponent
# ("e"/"E", optional sign, digits).
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide figure defaults: 7x5 inch canvas rendered at 100 dpi.
plt.rcParams['figure.figsize'] = (7, 5)
plt.rcParams['figure.dpi'] = 100

In [2]:
# Read the tab-separated regulator/target edge list and upper-case both
# gene-name columns (presumably so they match the casing of the expression
# tables' index -- confirm against the data files), then collect the distinct
# regulators and targets.
network_df = pd.read_csv('../data/mouse/regnetworkweb.org.network', sep='\t')
for gene_column in ('regulator', 'target'):
    network_df[gene_column] = [symbol.upper() for symbol in network_df[gene_column]]
regulator_set = set(network_df['regulator'])
target_set = set(network_df['target'])

In [3]:
# All four train/test tables are CSVs whose first column is used as the index.
_read_indexed_csv = partial(pd.read_csv, index_col=0)

train_source = _read_indexed_csv('../data/mouse/normalized_train_test_sets/train_source.csv')
train_target = _read_indexed_csv('../data/mouse/normalized_train_test_sets/train_target.csv')
test_source = _read_indexed_csv('../data/mouse/normalized_train_test_sets/test_source.csv')
test_target = _read_indexed_csv('../data/mouse/normalized_train_test_sets/test_target.csv')

In [4]:
# Keep only network genes that also appear in the index of the training
# source table.
# NOTE(review): targets are filtered against train_source.index as well, not
# train_target.index -- confirm both tables share the same gene index.
source_genes = set(train_source.index)
regulator_set = regulator_set & source_genes
target_set = target_set & source_genes
all_gene_set = regulator_set | target_set


In [5]:
# Map every retained target gene to the list of its retained regulators.
# Edges are visited in file order and duplicate edges are kept, exactly as
# they appear in network_df. Zipping the two columns avoids building a Series
# per row, which is what DataFrame.iterrows() does and why it is slow on a
# large edge list.
network_dict = {target: [] for target in target_set}
for regulator, target in zip(network_df['regulator'], network_df['target']):
    if regulator in regulator_set and target in target_set:
        network_dict[target].append(regulator)

In [6]:
# Keep targets that have at least one regulator and whose FIRST listed
# regulator is not the target itself, then rebuild the gene sets from the
# surviving edges.
# NOTE(review): only the first regulator is compared with the target, so a
# self-loop listed later is kept and a target whose first regulator is itself
# is dropped entirely -- confirm this is the intended self-regulation filter.
kept_edges = [
    (target_gene, regulators)
    for target_gene, regulators in network_dict.items()
    if len(regulators) > 0 and regulators[0] != target_gene
]

key_list = [target_gene for target_gene, _ in kept_edges]
value_list = ["; ".join(regulators) for _, regulators in kept_edges]
target_set = set(key_list)
regulator_set = {tf for _, regulators in kept_edges for tf in regulators}
all_gene_set = regulator_set.union(target_set)

In [7]:
# Replace the edge-list frame with a per-target table: index = target gene,
# 'tf_list' = its regulators joined by "; ".
network_df = pd.DataFrame({'tf_list': value_list}, index=key_list)

In [8]:
# Average number of regulators per retained target: each entry holds its
# regulators separated by "; ", so the count is (number of separators + 1).
np.mean([tf_string.count('; ') + 1 for tf_string in value_list])

4.260640232811932

In [9]:
# Place the train and test columns side by side (rows are aligned on the index).
target_df = pd.concat([train_target, test_target], axis='columns')
source_df = pd.concat([train_source, test_source], axis='columns')



In [10]:
# Fix the gene orderings that every later positional result relies on.
# Iteration order of a set of strings can change between interpreter sessions
# (string hashing is randomized per process), while later cells reload out_df
# from CSV and index results by position in target_gene_list. Sorting makes
# the order reproducible across kernel restarts.
target_gene_list = sorted(target_set)
target_exp = target_df.loc[target_gene_list]
tf_list = sorted(regulator_set)
X = source_df.loc[tf_list]

In [11]:
# Restrict the train/test source tables to the regulator rows, then build the
# multiprocessing helper (arguments are passed positionally, order unchanged).
train_tf_source = train_source.loc[tf_list]
test_tf_source = test_source.loc[tf_list]
mp_calc = mp_run.MpCalc(
    target_gene_list,
    target_exp,
    X,
    network_df,
    train_tf_source,
    train_target,
    test_tf_source,
    test_target,
)

In [23]:
# Evaluate mp_calc.full_comp for every target position in a worker pool
# (one process per CPU); imap keeps the results in input order.
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as pool:
    full_comp_results = pool.imap(mp_calc.full_comp, range(iter_length))
    r = list(tqdm(full_comp_results, total=iter_length))

100%|████████████████████████████████████████████████████████████████████████████| 5498/5498 [12:00<00:00,  7.63it/s]


In [24]:
# Turn the per-target result rows into a table indexed by target gene.
# Column k of the result array becomes the k-th name listed below.
r = np.array(r)
full_comp_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
]
out_df = pd.DataFrame(index=target_gene_list)
for column_position, column_name in enumerate(full_comp_columns):
    out_df[column_name] = r[:, column_position]


In [27]:
# Evaluate mp_calc.top_feature_num for every target position in a worker pool
# (one process per CPU); imap keeps the results in input order.
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as pool:
    feature_num_results = pool.imap(mp_calc.top_feature_num, range(iter_length))
    feature_num_r = list(tqdm(feature_num_results, total=iter_length))

100%|████████████████████████████████████████████████████████████████████████████| 5498/5498 [07:32<00:00, 12.15it/s]


In [28]:
# Attach the per-target top-feature counts and persist the table.
# Row i of feature_num_r is taken to describe target_gene_list[i] (the same
# assumption the positional assignment made). Labelling the values with the
# gene names lets pandas align them to out_df's index, so they stay correct
# even if out_df's row order differs from target_gene_list.
feature_num_r = np.array(feature_num_r)
out_df['rf_top_feature_num'] = pd.Series(feature_num_r[:, 0], index=target_gene_list)
out_df['linear_top_feature_num'] = pd.Series(feature_num_r[:, 1], index=target_gene_list)
# Every other cell reads/writes 'mdc_network_v_model.csv'; the previous
# 'mesc_...' name wrote this dataset's results under a different name.
out_df.to_csv('mdc_network_v_model.csv')

In [33]:
# Average value of the 'linear_top_feature_num' column across all targets.
out_df.loc[:, 'linear_top_feature_num'].mean()

15.445252819206985

In [34]:
# Checkpoint the summary table so later sessions can reload it.
out_df.to_csv(path_or_buf='./mdc_network_v_model.csv')

In [15]:
# Reload the checkpointed summary table; the first CSV column (gene names
# written by to_csv) becomes the index again.
out_df = pd.read_csv(filepath_or_buffer='./mdc_network_v_model.csv', index_col=0)

In [17]:
# Evaluate mp_calc.top_feature_model for every target position in a worker
# pool (one process per CPU); imap keeps the results in input order.
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as pool:
    top_feature_results = pool.imap(mp_calc.top_feature_model, range(iter_length))
    top_feature_r = list(tqdm(top_feature_results, total=iter_length))

100%|██████████| 5498/5498 [07:36<00:00, 12.04it/s]


In [21]:
# Attach the top-feature model metrics and the per-target test-set spread,
# then persist the table.
# out_df may have been reloaded from CSV, so its row order is whatever was
# saved, while top_feature_r follows the current target_gene_list (row i is
# taken to describe target_gene_list[i], as the positional assignment
# assumed). Wrapping each column in a Series labelled with the gene names
# makes pandas align on the index instead of silently pairing rows by position.
top_feature_r = np.array(top_feature_r)
top_feature_columns = [
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
]
for column_position, column_name in enumerate(top_feature_columns):
    out_df[column_name] = pd.Series(top_feature_r[:, column_position], index=target_gene_list)
# Variance / standard deviation of each target's test-set values; these are
# already label-aligned because they are selected by out_df.index.
out_df['test_var'] = test_target.loc[out_df.index].var(axis=1)
out_df['test_std'] = test_target.loc[out_df.index].std(axis=1)
out_df.to_csv('mdc_network_v_model.csv')

In [12]:
# Reload the checkpointed summary table (first CSV column becomes the index).
out_df = pd.read_csv(filepath_or_buffer='./mdc_network_v_model.csv', index_col=0)

# Evaluate mp_calc.feature_overlap for every target position in a worker pool
# (one process per CPU); imap keeps the results in input order.
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as pool:
    overlap_results = pool.imap(mp_calc.feature_overlap, range(iter_length))
    top_feature_overlap_r = list(tqdm(overlap_results, total=iter_length))


100%|██████████| 5498/5498 [07:06<00:00, 12.90it/s]


In [13]:
# Attach the feature-overlap statistics and persist the table.
# out_df was reloaded from CSV, so its row order is whatever was saved, while
# top_feature_overlap_r follows the current target_gene_list (row i is taken
# to describe target_gene_list[i], as the positional assignment assumed).
# Labelling each column with the gene names makes pandas align on the index
# rather than pairing rows by position.
top_feature_overlap_r = np.array(top_feature_overlap_r)
overlap_columns = [
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
]
for column_position, column_name in enumerate(overlap_columns):
    out_df[column_name] = pd.Series(
        top_feature_overlap_r[:, column_position], index=target_gene_list
    )
out_df.to_csv('mdc_network_v_model.csv')

In [14]:
# Column-wise mean of every metric across all target genes.
out_df.mean(axis=0)

rf_score                             -0.101098
linear_score                         -1.767664
gs_rf_score                          -0.373089
gs_linear_score                      -0.081057
rf_with_linear_top_features_score    -0.220363
linear_with_rf_top_features_score    -0.242898
rf_rmse                               0.571206
linear_rmse                           0.833878
gs_rf_rmse                            0.624230
gs_linear_rmse                        0.568245
rf_with_linear_top_features_rmse      0.596711
linear_with_rf_top_features_rmse      0.600322
rf_top_feature_num                   13.177155
linear_top_feature_num               15.445253
rf_with_top_features_score           -0.218683
linear_with_top_features_score       -0.430339
rf_with_top_features_rmse             0.594815
linear_with_top_features_rmse         0.637168
test_var                              0.418036
test_std                              0.550299
rf_top_features_gs_overlap            0.109131
linear_top_fe