In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
from functools import cmp_to_key, partial
import re

import mp_run
import conf_interval

from os import listdir
from os.path import isfile, join
from sklearn.metrics import mean_squared_error, explained_variance_score

import os

from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from scipy import stats

from multiprocessing import Pool, cpu_count


# Reload imported modules automatically so edits to local helper modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# regex for number extraction from string
# Captures an optionally negative decimal number: an integer part without
# leading zeros, an optional fractional part, and an optional exponent.
# NOTE(review): not referenced by any cell visible in this notebook.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

# Plotting setup: render figures inline and set the default figure size / dpi.
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})

In [2]:
# Read the tab-separated regulator -> target edge list and upper-case the gene
# names in both columns so they can be compared with the (also upper-cased)
# expression index.
network_df = pd.read_csv('../data/mouse/regnetworkweb.org.network', sep='\t')
for column in ('regulator', 'target'):
    network_df[column] = [symbol.upper() for symbol in network_df[column]]

# Distinct gene names appearing on each side of an edge.
regulator_set = set(network_df['regulator'])
target_set = set(network_df['target'])

In [3]:
# Expression matrix: rows are genes (first CSV column used as the index),
# columns are samples.
exp_df = pd.read_csv(
    '../data/mouse/GSE75210/GSE75210_myd88_exp_normalized.csv.gz',
    compression='gzip',
    index_col=0,
)
# Upper-case the gene names so they line up with the upper-cased network names.
exp_df.index = [gene.upper() for gene in exp_df.index]
exp_df.head()

Unnamed: 0,WT1.Pam3Cys.T0,WT1.Poly.I.C..T0,WT1.LPS.T0,WT1.FLA.T0,WT1.R848.T0,WT1.CpG.T0,WT1.MDP.T0,WT1.MDP.LPS.T0,WT1.TDB.T0,WT1.SeV.T0,...,KO2.LPS.T4,KO2.FLA.T4,KO2.R848.T4,KO2.CpG.T4,KO2.MDP.T4,KO2.MDP.LPS.T4,KO2.TDB.T4,KO2.SeV.T4,KO2.Zymosan.T4,KO2.HT.DNA.T4
MIR3110,-0.065838,-0.074808,-0.067086,-0.071765,-0.067077,-0.073372,-0.062468,-0.074115,-0.062609,-0.077727,...,-0.074462,-0.073417,-0.066189,-0.073996,-0.073853,-0.075339,-0.073473,-0.078803,-0.075854,-0.072035
LCE3A,-0.066182,-0.075266,-0.067453,-0.07217,-0.067443,-0.073701,-0.062747,-0.0746,-0.062943,-0.078272,...,-0.075257,-0.07387,-0.067455,-0.074361,-0.074238,-0.076185,-0.073881,-0.079239,-0.076215,-0.072613
GAS1,-0.066203,-0.075294,-0.067475,-0.072194,-0.067465,-0.073721,-0.062764,-0.07463,-0.062963,-0.078305,...,-0.075305,-0.073897,-0.067532,-0.074384,-0.074262,-0.076236,-0.073906,-0.079265,-0.076237,-0.072648
PAK6,-0.066212,-0.075306,-0.067485,-0.072205,-0.067475,-0.07373,-0.062772,-0.074642,-0.062972,-0.07832,...,-0.075327,-0.073909,-0.067565,-0.074393,-0.074272,-0.076259,-0.073917,-0.079277,-0.076247,-0.072663
TRUB2,-0.062852,-0.060419,-0.045994,-0.06563,-0.040109,-0.066236,-0.042803,-0.054115,-0.045595,-0.072997,...,-0.067564,-0.059189,-0.067564,-0.064876,-0.049205,-0.062495,-0.063295,-0.073603,-0.071545,-0.044443


In [4]:
# Split every expression column named "<replicate/condition>.T<time>" into its
# prefix and its time point, then build the column-name lists used for the
# train / test split (source = time t, target = the following time point).
rep_set = set()
time_set = set()
# numeric time point -> time text exactly as it is written in the column names
time_label = {}
time_pattern = r'(.*)\.T([-+]?\d*\.?\d+)'
for name in exp_df.columns:
    match = re.search(time_pattern, name)
    if not match:
        continue  # columns without a ".T<number>" part take no part in the split
    rep_set.add(match.group(1))
    time_text = match.group(2)
    # A time point is fractional only if its text contains a decimal point.
    # Testing for the digit '5' instead turned whole times such as "5" or "15"
    # into floats (rebuilt as "5.0") and sent fractional times without a 5
    # (e.g. "1.2") to int(), which raises ValueError.
    time_value = float(time_text) if '.' in time_text else int(time_text)
    time_set.add(time_value)
    time_label.setdefault(time_value, time_text)

# Sorted so the resulting column order does not depend on set iteration order,
# which can differ from one kernel session to the next.
rep_list = sorted(rep_set)
time_list = sorted(time_set)


def _columns_at(times):
    """Return the column names for every replicate prefix at each of `times`.

    The original time text is reused, so the rebuilt names match the columns
    in `exp_df` even when str(number) would differ from it (e.g. ".5").
    """
    return [rep + '.T' + time_label[t] for rep, t in product(rep_list, times)]


# Source/target windows are shifted by one time step. The slice bounds are
# hard-coded and presumably expect at least eight distinct time points -- TODO confirm.
train_source_idx = _columns_at(time_list[0:5])
train_target_idx = _columns_at(time_list[1:6])
test_source_idx = _columns_at(time_list[5:7])
test_target_idx = _columns_at(time_list[6:8])

In [5]:
# Column-wise views of the expression matrix for each part of the split.
train_source, train_target, test_source, test_target = (
    exp_df[columns]
    for columns in (train_source_idx, train_target_idx, test_source_idx, test_target_idx)
)

In [6]:
# Keep only network genes that are present in the expression matrix index.
measured_genes = set(train_source.index)
regulator_set = regulator_set & measured_genes
target_set = target_set & measured_genes
all_gene_set = regulator_set | target_set


In [7]:
# Map each target gene to the list of its regulators, keeping only edges whose
# two endpoints both survived the filtering above. Regulators are appended in
# edge-list row order; repeated rows are not de-duplicated.
network_dict = {gene: [] for gene in target_set}
for edge_regulator, edge_target in zip(network_df['regulator'], network_df['target']):
    if edge_regulator in regulator_set and edge_target in target_set:
        network_dict[edge_target].append(edge_regulator)

In [8]:
# Rebuild the gene sets from the targets that actually have regulators.
# key_list / value_list run in parallel: target gene and its regulators joined
# into a single "; "-separated string.
key_list, value_list = [], []
regulator_set, target_set = set(), set()
for gene, gene_regulators in network_dict.items():
    # Skip targets with no regulators, and targets whose first listed regulator
    # is the target itself.
    # NOTE(review): only the first entry is compared with the target; a
    # self-edge later in the list is kept -- confirm this is intended.
    if not gene_regulators or gene_regulators[0] == gene:
        continue
    key_list.append(gene)
    target_set.add(gene)
    value_list.append("; ".join(gene_regulators))
    regulator_set.update(gene_regulators)
all_gene_set = regulator_set | target_set

In [9]:
# One row per retained target gene with its "; "-joined regulator string.
# Note: this rebinds `network_df`, which previously held the raw edge list.
network_df = pd.DataFrame({'tf_list': value_list}, index=key_list)

In [10]:
# Average number of regulators per retained target
# (number of "; " separators in each joined string, plus one).
np.mean([tf_string.count('; ') + 1 for tf_string in value_list])

5.534305035782286

In [11]:
# Put the train and test samples side by side (genes stay on the rows).
target_df = pd.concat([train_target, test_target], axis='columns')
source_df = pd.concat([train_source, test_source], axis='columns')



In [12]:
# Fix an ordering for targets and regulators and select the matching rows.
# NOTE(review): the order comes from set iteration, so it is not guaranteed to
# be the same in a different kernel session.
tf_list = list(regulator_set)
target_gene_list = list(target_set)
X = source_df.loc[tf_list]
target_exp = target_df.loc[target_gene_list]

In [13]:
# Bundle everything the per-gene computation needs into one object; the
# train/test source frames are restricted to the regulator rows.
mp_calc = mp_run.MpCalc(
    target_gene_list,
    target_exp,
    X,
    network_df,
    train_source.loc[tf_list],
    train_target,
    test_source.loc[tf_list],
    test_target,
)

In [14]:
# Evaluate every target gene in parallel, one task per index into
# target_gene_list, using one worker process per CPU. imap yields results in
# submission order, so r[k] belongs to target_gene_list[k].
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as p:
    results_iter = p.imap(mp_calc.full_comp_new, range(iter_length))
    r = list(tqdm(results_iter, total=iter_length))

100%|██████████| 15231/15231 [49:37<00:00,  5.11it/s] 


In [15]:
# Turn the per-gene result rows into a table indexed by target gene.
# Column k of the result array is stored under the k-th name below.
# NOTE(review): the meaning of each position is defined by
# mp_calc.full_comp_new -- verify the order against that function.
r = np.array(r)
result_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
]
out_df = pd.DataFrame(index=target_gene_list)
for position, column_name in enumerate(result_columns):
    out_df[column_name] = r[:, position]

In [16]:
# Keep only the genes whose 'test_std' value exceeds 0.5.
filtered_df = out_df.loc[out_df['test_std'] > 0.5]

In [17]:
# Per-column average over the retained genes.
filtered_df.mean(axis='index')

rf_score                              0.220026
linear_score                          0.197951
gs_rf_score                           0.016339
gs_linear_score                      -0.005822
rf_with_linear_top_features_score     0.214258
linear_with_rf_top_features_score     0.210510
rf_rmse                               1.633006
linear_rmse                           1.579061
gs_rf_rmse                            1.797422
gs_linear_rmse                        1.835421
rf_with_linear_top_features_rmse      1.613729
linear_with_rf_top_features_rmse      1.618866
rf_with_top_features_score            0.245231
linear_with_top_features_score        0.220703
rf_with_top_features_rmse             1.565803
linear_with_top_features_rmse         1.615022
rf_top_feature_num                   12.363158
linear_top_feature_num               15.136842
rf_top_features_gs_overlap            0.194737
linear_top_features_gs_overlap        0.147368
rf_linear_top_features_overlap        0.963158
gs_edge_num  

In [20]:
# Persist the full (unfiltered) per-gene result table, gene names included as
# the index column, next to the notebook.
out_df.to_csv(path_or_buf='./mdc_network_v_model.csv', index=True)