In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import re

import mp_run
import conf_interval
from sklearn import linear_model

from os import listdir
from os.path import isfile, join
from sklearn.metrics import mean_squared_error, explained_variance_score

import os

from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from scipy import stats

from multiprocessing import Pool, cpu_count


# Reload edited local modules (e.g. mp_run, conf_interval) automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# regex for number extraction from string
# Matches an optional minus sign, an integer part without leading zeros,
# an optional fractional part and an optional exponent; the whole number is one capture group.
# NOTE(review): not referenced by any cell visible in this notebook -- confirm it is still needed.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size and resolution for every plot created in this notebook.
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})

In [2]:
# Read the full source/target tables of the three datasets; the first CSV column
# becomes the row index and every column is z-scored independently.
source_df_1, source_df_2, source_df_3 = (
    pd.read_csv(csv_path, index_col=0).apply(stats.zscore, axis=0)
    for csv_path in (
        '../data/bsubtilis/GSE108659/formated/source.csv',
        '../data/bsubtilis/GSE128875/formated/source.csv',
        '../data/bsubtilis/GSE224332/formated/source.csv',
    )
)
target_df_1, target_df_2, target_df_3 = (
    pd.read_csv(csv_path, index_col=0).apply(stats.zscore, axis=0)
    for csv_path in (
        '../data/bsubtilis/GSE108659/formated/target.csv',
        '../data/bsubtilis/GSE128875/formated/target.csv',
        '../data/bsubtilis/GSE224332/formated/target.csv',
    )
)

In [3]:
# Read the pre-split train tables of the three datasets (first CSV column is the
# row index); each column is z-scored independently within its own file.
train_source_df_1, train_source_df_2, train_source_df_3 = (
    pd.read_csv(csv_path, index_col=0).apply(stats.zscore, axis=0)
    for csv_path in (
        '../data/bsubtilis/GSE108659/formated/train_source.csv',
        '../data/bsubtilis/GSE128875/formated/train_source.csv',
        '../data/bsubtilis/GSE224332/formated/train_source.csv',
    )
)
train_target_df_1, train_target_df_2, train_target_df_3 = (
    pd.read_csv(csv_path, index_col=0).apply(stats.zscore, axis=0)
    for csv_path in (
        '../data/bsubtilis/GSE108659/formated/train_target.csv',
        '../data/bsubtilis/GSE128875/formated/train_target.csv',
        '../data/bsubtilis/GSE224332/formated/train_target.csv',
    )
)

# Same treatment for the held-out test tables.
test_source_df_1, test_source_df_2, test_source_df_3 = (
    pd.read_csv(csv_path, index_col=0).apply(stats.zscore, axis=0)
    for csv_path in (
        '../data/bsubtilis/GSE108659/formated/test_source.csv',
        '../data/bsubtilis/GSE128875/formated/test_source.csv',
        '../data/bsubtilis/GSE224332/formated/test_source.csv',
    )
)
test_target_df_1, test_target_df_2, test_target_df_3 = (
    pd.read_csv(csv_path, index_col=0).apply(stats.zscore, axis=0)
    for csv_path in (
        '../data/bsubtilis/GSE108659/formated/test_target.csv',
        '../data/bsubtilis/GSE128875/formated/test_target.csv',
        '../data/bsubtilis/GSE224332/formated/test_target.csv',
    )
)

In [4]:
# Regulatory network table; each row pairs a 'regulator name' with a 'gene name'.
# NOTE(review): presumably the gold-standard ("gs") network -- confirm against the data source.
network_df = pd.read_csv('../data/bsubtilis/gs_regulations.csv')

# Every regulator / regulated gene named by the network. These sets are narrowed
# to genes that actually have expression data in the following cells.
# (A bare `network_df` expression used to sit here; it was not the last line of
# the cell, so it displayed nothing and has been dropped.)
regulator_set = set(network_df['regulator name'])
target_set = set(network_df['gene name'])

In [5]:
# Keep only genes that have a row in EVERY expression frame. All of these frames
# are later row-indexed with the resulting gene set, so a gene missing from any
# one of them would raise a KeyError there. (Previously only the three full
# source frames were checked.)
expression_frames = [
    source_df_1, source_df_2, source_df_3,
    target_df_1, target_df_2, target_df_3,
    train_source_df_1, train_source_df_2, train_source_df_3,
    train_target_df_1, train_target_df_2, train_target_df_3,
    test_source_df_1, test_source_df_2, test_source_df_3,
    test_target_df_1, test_target_df_2, test_target_df_3,
]
shared_genes = set.intersection(*(set(frame.index) for frame in expression_frames))

regulator_set = regulator_set.intersection(shared_genes)
target_set = target_set.intersection(shared_genes)
all_gene_set = regulator_set.union(target_set)


In [6]:
# Map every candidate target gene to the list of its regulators, keeping only
# edges whose regulator and target both survived the expression-data filter.
network_dict = {gene: [] for gene in target_set}
for regulator, gene in zip(network_df['regulator name'], network_df['gene name']):
    if regulator not in regulator_set or gene not in target_set:
        continue
    network_dict[gene].append(regulator)

In [7]:
# Keep only targets that ended up with at least one regulator, and rebuild the
# regulator/target sets from the edges that remain.
key_list = [gene for gene, regulators in network_dict.items() if len(regulators) > 0]
# One "; "-joined regulator string per kept target, in the same order as key_list.
value_list = ["; ".join(network_dict[gene]) for gene in key_list]

target_set = set(key_list)
regulator_set = {regulator for gene in key_list for regulator in network_dict[gene]}
all_gene_set = regulator_set.union(target_set)

In [8]:
# Replace the raw edge table with a per-target table: index = target gene,
# single column 'tf_list' = "; "-joined regulators of that gene.
network_df = pd.DataFrame({'tf_list': value_list}, index=key_list)

In [9]:
# Restrict every expression frame to the genes that take part in the network.
# The gene list is materialised once so all frames share the same row order.
network_genes = list(all_gene_set)

source_df_1, source_df_2, source_df_3 = [
    frame.loc[network_genes] for frame in (source_df_1, source_df_2, source_df_3)
]
target_df_1, target_df_2, target_df_3 = [
    frame.loc[network_genes] for frame in (target_df_1, target_df_2, target_df_3)
]

train_source_df_1, train_source_df_2, train_source_df_3 = [
    frame.loc[network_genes]
    for frame in (train_source_df_1, train_source_df_2, train_source_df_3)
]
train_target_df_1, train_target_df_2, train_target_df_3 = [
    frame.loc[network_genes]
    for frame in (train_target_df_1, train_target_df_2, train_target_df_3)
]

test_source_df_1, test_source_df_2, test_source_df_3 = [
    frame.loc[network_genes]
    for frame in (test_source_df_1, test_source_df_2, test_source_df_3)
]
test_target_df_1, test_target_df_2, test_target_df_3 = [
    frame.loc[network_genes]
    for frame in (test_target_df_1, test_target_df_2, test_target_df_3)
]


In [10]:
def _merge_and_zscore(frames):
    """Join the given frames column-wise, then z-score every column of the result."""
    merged = pd.concat(frames, axis=1)
    return merged.apply(stats.zscore, axis=0)


# Pool the three datasets into single train/test matrices.
train_source = _merge_and_zscore([train_source_df_1, train_source_df_2, train_source_df_3])
test_source = _merge_and_zscore([test_source_df_1, test_source_df_2, test_source_df_3])
train_target = _merge_and_zscore([train_target_df_1, train_target_df_2, train_target_df_3])
test_target = _merge_and_zscore([test_target_df_1, test_target_df_2, test_target_df_3])

# Pooled full (unsplit) matrices.
target_df = _merge_and_zscore([target_df_1, target_df_2, target_df_3])
source_df = _merge_and_zscore([source_df_1, source_df_2, source_df_3])






In [11]:
# Drop targets that have no regulator other than themselves.
# The previous test `len(tf_list.split('; ')) < 1` could never be true, because
# str.split with an explicit separator always returns at least one element; the
# check below covers the empty, self-only and repeated-self cases explicitly.
# Iterate over a snapshot because target_set is modified inside the loop.
for gene in list(target_set):
    regulators = {tf for tf in network_df.loc[gene, 'tf_list'].split('; ') if tf}
    if not (regulators - {gene}):
        target_set.remove(gene)
    

In [12]:
# Inputs handed to the per-gene model runner.
# Sets of strings iterate in an order that can change between interpreter runs
# (string hash randomisation), so sort to get a reproducible gene order. Results
# are later written to CSV and reloaded, and must line up with this list.
target_gene_list = sorted(target_set)
target_exp = target_df
# Regulator names, and the regulator expression matrix with rows in the same order.
tf_list = sorted(regulator_set)
X = source_df.loc[tf_list]

In [13]:
# Bundle everything the worker processes need into one MpCalc object; the
# train/test source matrices are restricted to the regulator rows.
train_regulator_exp = train_source.loc[tf_list]
test_regulator_exp = test_source.loc[tf_list]
mp_calc = mp_run.MpCalc(
    target_gene_list,
    target_exp,
    X,
    network_df,
    train_regulator_exp,
    train_target,
    test_regulator_exp,
    test_target,
)

In [14]:
# Run mp_calc.full_comp_new once per target-gene position, spread over all CPU
# cores; imap yields results in input order, tqdm shows progress.
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    results_iter = pool.imap(mp_calc.full_comp_new, range(iter_length))
    r = list(tqdm(results_iter, total=iter_length))

100%|████████████████████████████████████████████████████████████████████████████| 1878/1878 [01:57<00:00, 16.03it/s]


In [15]:
# Turn the per-gene result rows into a table: one row per target gene, one
# named column per position of the result row (positions 0..23).
r = np.array(r)
result_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
]
out_df = pd.DataFrame(
    {column: r[:, position] for position, column in enumerate(result_columns)},
    index=target_gene_list,
)

In [16]:
# Keep only genes whose 'test_std' value exceeds 0.5.
has_enough_spread = out_df['test_std'] > 0.5
filtered_df = out_df[has_enough_spread]

In [19]:
# Column-wise average of every metric over the filtered genes.
filtered_df.mean(axis=0)

rf_score                             0.536871
linear_score                         0.383967
gs_rf_score                          0.276440
gs_linear_score                      0.250533
rf_with_linear_top_features_score    0.525385
linear_with_rf_top_features_score    0.330399
rf_rmse                              0.455794
linear_rmse                          0.469487
gs_rf_rmse                           0.599354
gs_linear_rmse                       0.675194
rf_with_linear_top_features_rmse     0.461765
linear_with_rf_top_features_rmse     0.521236
rf_with_top_features_score           0.477573
linear_with_top_features_score       0.290781
rf_with_top_features_rmse            0.467211
linear_with_top_features_rmse        0.518547
rf_top_feature_num                   6.763984
linear_top_feature_num               8.308322
rf_top_features_gs_overlap           0.133697
linear_top_features_gs_overlap       0.225102
rf_linear_top_features_overlap       0.762619
gs_edge_num                       

In [20]:
# Persist the per-gene results table (gene names are written as the index column).
out_df.to_csv(path_or_buf='bsubtilis_network_v_model.csv')

In [16]:
# Run mp_calc.top_feature_num once per target-gene position on all CPU cores.
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    feature_num_r = list(tqdm(pool.imap(mp_calc.top_feature_num, range(iter_length)), total=iter_length))
feature_num_r = np.array(feature_num_r)

# Result rows follow target_gene_list order. Attach them by gene label (pandas
# aligns a Series on the frame's index) instead of by position, so rows stay
# correct even if out_df's row order differs from target_gene_list. This matches
# the label-based lookups used for test_var / test_std below.
out_df['rf_top_feature_num'] = pd.Series(feature_num_r[:, 0], index=target_gene_list)
out_df['linear_top_feature_num'] = pd.Series(feature_num_r[:, 1], index=target_gene_list)

# Per-gene variance / standard deviation of the test-set target values.
out_df['test_var'] = test_target.loc[out_df.index].var(axis=1)
out_df['test_std'] = test_target.loc[out_df.index].std(axis=1)

100%|██████████| 1878/1878 [01:21<00:00, 23.16it/s]


In [17]:
# Write the updated results table back to disk, overwriting the earlier file.
out_df.to_csv(path_or_buf='bsubtilis_network_v_model.csv')

In [15]:
def _score_interval(column):
    """Compute conf_interval_calc for one out_df column, print its bounds, return the full result."""
    interval = conf_interval.conf_interval_calc(list(out_df[column].values))
    # Elements from position 2 onward are printed as the "(lower, upper)" pair.
    bounds_text = ', '.join([ '%.3f' % elem for elem in interval[2:] ])
    print('(' + bounds_text + ')')
    return interval


# Confidence interval of each model's score across target genes.
rf_conf_interval = _score_interval('rf_score')
linear_conf_interval = _score_interval('linear_score')
gs_rf_conf_interval = _score_interval('gs_rf_score')
gs_linear_conf_interval = _score_interval('gs_linear_score')
rf_with_linear_top_features_conf_interval = _score_interval('rf_with_linear_top_features_score')
linear_with_rf_top_features_conf_interval = _score_interval('linear_with_rf_top_features_score')


(-0.169, 0.183)
(-0.786, -0.095)
(-0.731, -0.239)
(-0.497, -0.170)
(-0.460, 0.181)
(-1.678, -0.335)


In [16]:
def _rmse_interval(column):
    """Compute conf_interval_calc for one out_df RMSE column, print its bounds, return the full result."""
    interval = conf_interval.conf_interval_calc(list(out_df[column].values))
    # Elements from position 2 onward are printed as the "(lower, upper)" pair.
    print('('+', '.join([ '%.3f' % elem for elem in interval[2:] ])+')')
    return interval


# Confidence interval of each model's RMSE across target genes.
rf_conf_interval_rmse = _rmse_interval('rf_rmse')
linear_conf_interval_rmse = _rmse_interval('linear_rmse')
gs_rf_conf_interval_rmse = _rmse_interval('gs_rf_rmse')
gs_linear_conf_interval_rmse = _rmse_interval('gs_linear_rmse')
rf_with_linear_top_features_conf_interval_rmse = _rmse_interval('rf_with_linear_top_features_rmse')
linear_with_rf_top_features_conf_interval_rmse = _rmse_interval('linear_with_rf_top_features_rmse')


(0.263, 0.307)
(0.279, 0.321)
(0.345, 0.396)
(0.382, 0.434)
(0.263, 0.309)
(0.303, 0.350)


In [40]:
# Average of the first six columns (by position) over all target genes.
out_df.mean().iloc[:6]

rf_score                            -0.027094
linear_score                        -0.388601
gs_rf_score                         -0.497414
gs_linear_score                     -0.316301
rf_with_linear_top_features_score   -0.062346
linear_with_rf_top_features_score   -0.897339
dtype: float64

In [21]:
from itertools import combinations

# Every unordered pair drawn from the first six out_df columns.
compared_columns = out_df.columns[:6]
model_combs = [pair for pair in combinations(compared_columns, 2)]

In [22]:
# Paired t-test between every pair of score columns. When p <= 0.05 the column
# with the positive side of the t statistic is reported as better, together
# with the interval computed on the per-gene differences (first - second).
for first, second in model_combs:
    t_stat, p_value = stats.ttest_rel(out_df[first], out_df[second])
    _, _, lower, upper = conf_interval.conf_interval_calc(list(out_df[first] - out_df[second]))
    if p_value > 0.05:
        print('{} and {} don\'t have statistically different performance'.format(first, second))
    else:
        better, worse = (first, second) if t_stat > 0 else (second, first)
        print('{} has statisically better performance than {}, with p-val of {}'.format(better, worse, p_value))
        print('confidence interval: ({:.3f}, {:.3f})'.format(lower, upper))

rf_score has statisically better performance than linear_score, with p-val of 0.008944896371205472
confidence interval: (0.206, 0.754)
rf_score has statisically better performance than gs_rf_score, with p-val of 4.641393758205537e-05
confidence interval: (0.301, 0.689)
rf_score has statisically better performance than gs_linear_score, with p-val of 2.3786985183335984e-07
confidence interval: (0.232, 0.452)
rf_score and rf_with_linear_top_features_score don't have statistically different performance
rf_score has statisically better performance than linear_with_rf_top_features_score, with p-val of 0.008836968534223368
confidence interval: (0.429, 1.518)
linear_score and gs_rf_score don't have statistically different performance
linear_score and gs_linear_score don't have statistically different performance
linear_score and rf_with_linear_top_features_score don't have statistically different performance
linear_score and linear_with_rf_top_features_score don't have statistically different 

In [42]:
# Run mp_calc.top_feature_num once per target-gene position on all CPU cores;
# imap keeps the results in input order.
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    feature_num_iter = pool.imap(mp_calc.top_feature_num, range(iter_length))
    feature_num_r = list(tqdm(feature_num_iter, total=iter_length))

100%|████████████████████████████████████████████████████████████████████████████| 1878/1878 [01:18<00:00, 23.86it/s]


In [44]:
feature_num_r = np.array(feature_num_r)
# Result rows follow target_gene_list order. Attach them by gene label (pandas
# aligns a Series on the frame's index) rather than by position, so values land
# on the right gene even if out_df's row order differs from target_gene_list.
out_df['rf_top_feature_num'] = pd.Series(feature_num_r[:, 0], index=target_gene_list)
out_df['linear_top_feature_num'] = pd.Series(feature_num_r[:, 1], index=target_gene_list)
# Persist the updated table.
out_df.to_csv('bsubtilis_network_v_model.csv')

In [18]:
# Reload the saved results table (first CSV column is the gene index), then run
# mp_calc.feature_overlap once per target-gene position on all CPU cores.
out_df = pd.read_csv('bsubtilis_network_v_model.csv', index_col=0)
iter_length = len(target_gene_list)
with Pool(cpu_count()) as pool:
    overlap_iter = pool.imap(mp_calc.feature_overlap, range(iter_length))
    top_feature_overlap_r = list(tqdm(overlap_iter, total=iter_length))


100%|██████████| 1878/1878 [01:21<00:00, 23.11it/s]


In [19]:
top_feature_overlap_r = np.array(top_feature_overlap_r)

# out_df was reloaded from CSV, so its row order is whatever was saved, while the
# result rows follow the current target_gene_list order. Build a gene-indexed
# table and assign column by column: pandas aligns each Series on out_df's
# index, so every value lands on its own gene regardless of row order.
overlap_columns = [
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
]
overlap_df = pd.DataFrame(
    {column: top_feature_overlap_r[:, position] for position, column in enumerate(overlap_columns)},
    index=target_gene_list,
)
for column in overlap_columns:
    out_df[column] = overlap_df[column]

# Persist the updated table.
out_df.to_csv('bsubtilis_network_v_model.csv')

In [21]:
# Column-wise average of every metric over all target genes.
out_df.mean(axis=0)

rf_score                             0.029074
linear_score                        -0.388601
gs_rf_score                         -0.472382
gs_linear_score                     -0.316301
rf_with_linear_top_features_score   -0.046939
linear_with_rf_top_features_score   -1.034828
rf_rmse                              0.282726
linear_rmse                          0.298756
gs_rf_rmse                           0.371651
gs_linear_rmse                       0.409870
rf_with_linear_top_features_rmse     0.283854
linear_with_rf_top_features_rmse     0.326905
rf_top_feature_num                   6.474973
linear_top_feature_num               8.484026
test_var                             0.739452
test_std                             0.507727
rf_with_top_features_score          -0.174528
linear_with_top_features_score      -1.102215
rf_with_top_features_rmse            0.291284
linear_with_top_features_rmse        0.329560
rf_top_features_gs_overlap           0.162939
linear_top_features_gs_overlap    

In [54]:
# Number of distinct column labels across the pooled target and source matrices.
len(set(target_df.columns) | set(source_df.columns))

117