In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
import re

import mp_run

from os import listdir
from os.path import isfile, join
from sklearn.metrics import mean_squared_error

import os

from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from scipy import stats

from multiprocessing import Pool, cpu_count


%load_ext autoreload
%autoreload 2

In [191]:
source_df_1 = pd.read_csv('../data/bsubtilis/GSE108659/formated/source.csv', index_col=0)
source_df_2 = pd.read_csv('../data/bsubtilis/GSE128875/formated/source.csv', index_col=0)
target_df_1 = pd.read_csv('../data/bsubtilis/GSE108659/formated/target.csv', index_col=0)
target_df_2 = pd.read_csv('../data/bsubtilis/GSE128875/formated/target.csv', index_col=0)

In [192]:
source_df_2 = source_df_2[~source_df_2.index.duplicated(keep=False)]
target_df_2 = target_df_2[~target_df_2.index.duplicated(keep=False)]

In [193]:
network_df = pd.read_csv('../data/bsubtilis/gs_regulations.csv')
network_df
regulator_set = set(network_df['regulator name'])
target_set = set(network_df['gene name'])

In [194]:
regulator_set = regulator_set.intersection(set(source_df_1.index)).intersection(set(source_df_2.index))
target_set = target_set.intersection(set(source_df_1.index)).intersection(set(source_df_2.index))
all_gene_set = regulator_set.union(target_set)


In [195]:
network_dict = {target: [] for target in target_set}
for ind, row in network_df.iterrows():
    if (row['regulator name'] in regulator_set) and (row['gene name'] in target_set):
        network_dict[row['gene name']].append(row['regulator name'])

In [196]:
key_list = []
value_list = []
regulator_set = set()
target_set = set()
for key in network_dict.keys():
    if (len(network_dict[key]) > 0):
        key_list.append(key)
        target_set.add(key)
        value_list.append("; ".join(network_dict[key]))
        for regulator in network_dict[key]:
            regulator_set.add(regulator)
all_gene_set = regulator_set.union(target_set)

In [197]:
network_df = pd.DataFrame(index=key_list)
network_df['tf_list'] = value_list

In [198]:
source_df_1 = source_df_1.loc[list(all_gene_set)]
source_df_2 = source_df_2.loc[list(all_gene_set)]
target_df_1 = target_df_1.loc[list(all_gene_set)]
target_df_2 = target_df_2.loc[list(all_gene_set)]

source_df_1=source_df_1.apply(stats.zscore, axis=0)
source_df_2=source_df_2.apply(stats.zscore, axis=0)
target_df_1=target_df_1.apply(stats.zscore, axis=0)
target_df_2=target_df_2.apply(stats.zscore, axis=0)


In [199]:
source_df = pd.concat([source_df_1, source_df_2], axis=1)
target_df = pd.concat([target_df_1, target_df_2], axis=1)

In [200]:
test = np.random.choice(len(source_df.columns), int(len(source_df.columns)*0.25), replace=False)

In [201]:
test_source = source_df.iloc[:,test]
test_target = target_df.iloc[:,test]
train_source = source_df.copy()
train_target = target_df.copy()
train_source = train_source.drop(columns=test_source.columns)
train_target = train_target.drop(columns=test_target.columns)

In [202]:
target_gene_list = list(target_set)
target_exp = target_df,
X = source_df.loc[list(regulator_set)]
tf_list = list(regulator_set)

In [203]:
mp_calc = mp_run.MpCalc(target_gene_list, target_exp, X, network_df, train_source.loc[tf_list], train_target, test_source.loc[tf_list], test_target)

In [205]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.full_comp, range(iter_length)), total=iter_length))

100%|██████████| 1991/1991 [00:46<00:00, 43.15it/s]


In [206]:
r = np.array(r)
out_df = pd.DataFrame(index=target_gene_list)
out_df['rf_score'] = r[:, 0]
out_df['linear_score'] = r[:, 1]
out_df['gs_rf_score'] = r[:, 2]
out_df['gs_linear_score'] = r[:, 3]
out_df['rf_with_linear_top_features_score'] = r[:, 4]
out_df['linear_with_rf_top_features_score'] = r[:, 5]


In [207]:
(out_df['rf_score'] > out_df['gs_linear_score']).sum()

1723

In [208]:
out_df

Unnamed: 0,rf_score,linear_score,gs_rf_score,gs_linear_score,rf_with_linear_top_features_score,linear_with_rf_top_features_score
pucR,-0.112067,-2.123209,-0.815803,-0.560283,0.002339,-0.580897
ytrC,0.059314,-2.094281,-0.047444,-0.196766,0.264546,-0.155873
yqjC,0.235808,-1.729897,-0.785003,-0.079150,0.324832,0.071362
comFB,-0.226636,-2.381441,-0.108997,-0.264939,-0.160359,-0.339964
mcpB,0.122573,-1.390165,-0.706529,0.100222,-0.124252,-0.121882
...,...,...,...,...,...,...
yjbA,0.352299,-0.510310,0.202828,0.200734,0.480992,-0.169992
gidA,0.408238,-1.040850,0.154168,0.036809,0.489265,0.311426
yneA,0.056506,-1.820056,0.028285,0.067398,0.165380,-0.327915
rpsF,0.885906,0.819825,0.776008,0.093581,0.859759,0.821516


In [209]:
from itertools import combinations
model_combs = list(combinations(out_df.columns, 2))

In [210]:
for a, b in model_combs:
    t, p = stats.ttest_rel(out_df[a], out_df[b])
    if (p > 0.05):
        print('{} and {} don\'t have statistically different performance'.format(a, b))
        continue
    if (t > 0):
        print('{} has statisically better performance than {}'.format(a, b))
    else:
        
        print('{} has statisically better performance than {}'.format(b, a))

rf_score has statisically better performance than linear_score
rf_score has statisically better performance than gs_rf_score
rf_score has statisically better performance than gs_linear_score
rf_score and rf_with_linear_top_features_score don't have statistically different performance
rf_score has statisically better performance than linear_with_rf_top_features_score
gs_rf_score has statisically better performance than linear_score
gs_linear_score has statisically better performance than linear_score
rf_with_linear_top_features_score has statisically better performance than linear_score
linear_with_rf_top_features_score has statisically better performance than linear_score
gs_linear_score has statisically better performance than gs_rf_score
rf_with_linear_top_features_score has statisically better performance than gs_rf_score
linear_with_rf_top_features_score has statisically better performance than gs_rf_score
rf_with_linear_top_features_score has statisically better performance than g

In [211]:
for a, b in model_combs:
    t, p = stats.ttest_rel(out_df.loc[high_var_target_genes][a], out_df.loc[high_var_target_genes][b])
    if (p > 0.05):
        print('{} and {} don\'t have statistically different performance'.format(a, b))
        continue
    if (t > 0):
        print('{} has statisically better performance than {}'.format(a, b))
    else:
        
        print('{} has statisically better performance than {}'.format(b, a))

NameError: name 'high_var_target_genes' is not defined