Scripts for regression experiments on B. subtilis

In [11]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import mp_run
from scipy import stats


%load_ext autoreload
%autoreload 2
from multiprocessing import Pool, cpu_count


# Regex that captures one numeric literal embedded in a string (group 1):
# optional leading minus, an integer part with no leading zeros, an optional
# fractional part, and an optional exponent (e/E with optional sign).
# NOTE(review): no other cell visible in this notebook references this name.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Result table stored under ../output; the first CSV column becomes the index.
# The index of this frame is what a later cell uses as the target gene list.
ref_res_df = pd.read_csv(
    '../output/network_model/bsubtilis_all_tf_high_var_target_efron_train.csv.gz',
    index_col=0,
    compression='gzip',
)

In [3]:
# Full source/target tables for the three GEO series (suffix _1/_2/_3 follows
# the order GSE108659, GSE128875, GSE224332). Files are read in the same order
# as before: all source tables first, then all target tables.
source_df_1, source_df_2, source_df_3 = (
    pd.read_csv(csv_path, index_col=0, compression='gzip')
    for csv_path in (
        '../data/bsubtilis/GSE108659/normalized/source.csv.gz',
        '../data/bsubtilis/GSE128875/normalized/source.csv.gz',
        '../data/bsubtilis/GSE224332/normalized/source.csv.gz',
    )
)
target_df_1, target_df_2, target_df_3 = (
    pd.read_csv(csv_path, index_col=0, compression='gzip')
    for csv_path in (
        '../data/bsubtilis/GSE108659/normalized/target.csv.gz',
        '../data/bsubtilis/GSE128875/normalized/target.csv.gz',
        '../data/bsubtilis/GSE224332/normalized/target.csv.gz',
    )
)

In [4]:
# Pre-split train/test tables for the same three GEO series
# (suffix _1/_2/_3 = GSE108659, GSE128875, GSE224332).
# Read order is unchanged: train source, train target, test source, test target.
train_source_df_1, train_source_df_2, train_source_df_3 = (
    pd.read_csv(csv_path, index_col=0, compression='gzip')
    for csv_path in (
        '../data/bsubtilis/GSE108659/normalized/train_source.csv.gz',
        '../data/bsubtilis/GSE128875/normalized/train_source.csv.gz',
        '../data/bsubtilis/GSE224332/normalized/train_source.csv.gz',
    )
)
train_target_df_1, train_target_df_2, train_target_df_3 = (
    pd.read_csv(csv_path, index_col=0, compression='gzip')
    for csv_path in (
        '../data/bsubtilis/GSE108659/normalized/train_target.csv.gz',
        '../data/bsubtilis/GSE128875/normalized/train_target.csv.gz',
        '../data/bsubtilis/GSE224332/normalized/train_target.csv.gz',
    )
)

test_source_df_1, test_source_df_2, test_source_df_3 = (
    pd.read_csv(csv_path, index_col=0, compression='gzip')
    for csv_path in (
        '../data/bsubtilis/GSE108659/normalized/test_source.csv.gz',
        '../data/bsubtilis/GSE128875/normalized/test_source.csv.gz',
        '../data/bsubtilis/GSE224332/normalized/test_source.csv.gz',
    )
)
test_target_df_1, test_target_df_2, test_target_df_3 = (
    pd.read_csv(csv_path, index_col=0, compression='gzip')
    for csv_path in (
        '../data/bsubtilis/GSE108659/normalized/test_target.csv.gz',
        '../data/bsubtilis/GSE128875/normalized/test_target.csv.gz',
        '../data/bsubtilis/GSE224332/normalized/test_target.csv.gz',
    )
)

In [5]:
# Candidate regulators, part 1: every space-separated token of the
# 'Gene Names' column that is also a row label of the first source table.
tf_list_df = pd.read_csv('../data/bsubtilis/bsubtilis_tf_list.tsv.gz', sep='\t', compression='gzip', index_col=0)
regulator_set = {
    gene_name
    for names_field in tf_list_df['Gene Names']
    for gene_name in names_field.split(' ')
    if gene_name in source_df_1.index
}

# Candidate regulators, part 2, plus the candidate targets: both come from the
# regulation table (one regulator/gene pair per row).
network_df = pd.read_csv('../data/bsubtilis/gs_regulations.csv')
regulator_set = regulator_set | set(network_df['regulator name'])
target_set = set(network_df['gene name'])



In [6]:
# Keep only genes that are row labels in all three source tables.
# Note that targets are also filtered against the *source* tables' indices.
genes_in_all_sources = (
    set(source_df_1.index) & set(source_df_2.index) & set(source_df_3.index)
)
regulator_set = regulator_set & genes_in_all_sources
target_set = target_set & genes_in_all_sources
all_gene_set = regulator_set | target_set


In [7]:
# Map each retained target gene to the list of its retained regulators,
# in the row order of the regulation table. Targets with no retained
# regulator keep an empty list.
network_dict = {gene_name: [] for gene_name in target_set}
for regulator_name, gene_name in zip(network_df['regulator name'], network_df['gene name']):
    if regulator_name not in regulator_set or gene_name not in target_set:
        continue
    network_dict[gene_name].append(regulator_name)

In [8]:
# Rebuild the gene sets from the edges that survived filtering:
# only targets with at least one regulator are kept, and only regulators
# that appear on at least one such target.
key_list = [gene_name for gene_name, tfs in network_dict.items() if len(tfs) > 0]
# One "; "-joined regulator string per kept target, aligned with key_list.
value_list = ["; ".join(network_dict[gene_name]) for gene_name in key_list]
target_set = set(key_list)
regulator_set = {tf_name for gene_name in key_list for tf_name in network_dict[gene_name]}
all_gene_set = regulator_set.union(target_set)

In [9]:
# Replace the raw regulation table with a per-target view: index = target gene,
# single column 'tf_list' = its regulators joined by "; ".
# The name network_df is rebound here, so the cell that builds network_dict
# cannot be re-run after this one without reloading the raw table.
network_df = pd.DataFrame({'tf_list': value_list}, index=key_list)

In [10]:
# Restrict every expression table to the genes that take part in the network.
#
# The gene order is materialised once and sorted. Iterating a set of strings
# gives a different order in each kernel session (string hashing is randomised
# per process), so `list(all_gene_set)` made the row order of these frames
# irreproducible across "Restart & Run All". Sorting pins it and avoids
# rebuilding the same list 18 times.
gene_order = sorted(all_gene_set)

source_df_1 = source_df_1.loc[gene_order]
source_df_2 = source_df_2.loc[gene_order]
source_df_3 = source_df_3.loc[gene_order]
target_df_1 = target_df_1.loc[gene_order]
target_df_2 = target_df_2.loc[gene_order]
target_df_3 = target_df_3.loc[gene_order]

train_source_df_1 = train_source_df_1.loc[gene_order]
train_source_df_2 = train_source_df_2.loc[gene_order]
train_source_df_3 = train_source_df_3.loc[gene_order]
train_target_df_1 = train_target_df_1.loc[gene_order]
train_target_df_2 = train_target_df_2.loc[gene_order]
train_target_df_3 = train_target_df_3.loc[gene_order]

test_source_df_1 = test_source_df_1.loc[gene_order]
test_source_df_2 = test_source_df_2.loc[gene_order]
test_source_df_3 = test_source_df_3.loc[gene_order]
test_target_df_1 = test_target_df_1.loc[gene_order]
test_target_df_2 = test_target_df_2.loc[gene_order]
test_target_df_3 = test_target_df_3.loc[gene_order]


In [15]:
def _concat_zscore(frames):
    """Join the frames side by side (axis=1), then z-score each column of the result."""
    return pd.concat(frames, axis=1).apply(stats.zscore, axis=0)


# Pooled tables across the three datasets, standardised per column.
train_source = _concat_zscore([train_source_df_1, train_source_df_2, train_source_df_3])
test_source = _concat_zscore([test_source_df_1, test_source_df_2, test_source_df_3])
train_target = _concat_zscore([train_target_df_1, train_target_df_2, train_target_df_3])
test_target = _concat_zscore([test_target_df_1, test_target_df_2, test_target_df_3])

target_df = _concat_zscore([target_df_1, target_df_2, target_df_3])
source_df = _concat_zscore([source_df_1, source_df_2, source_df_3])






In [20]:
# One tuple per dataset: (train source, train target, test source, test target).
# Transposing gives four parallel lists in which position k always refers to
# the same dataset. These are the per-dataset frames, not the pooled z-scored ones.
_frames_by_dataset = [
    (train_source_df_1, train_target_df_1, test_source_df_1, test_target_df_1),
    (train_source_df_2, train_target_df_2, test_source_df_2, test_target_df_2),
    (train_source_df_3, train_target_df_3, test_source_df_3, test_target_df_3),
]
train_source_list, train_target_list, test_source_list, test_target_list = map(
    list, zip(*_frames_by_dataset)
)

In [16]:
# Drop targets that have no regulator other than themselves.
#
# The previous emptiness test, `len(tf_list.split('; ')) < 1`, could never be
# true because str.split with a separator always returns at least one element.
# Empty tokens are now discarded first, so an empty 'tf_list' really counts as
# "no regulator". Comparing as a set also catches a target whose only
# regulator is itself listed more than once.
# `.loc[gene, 'tf_list']` replaces the chained `.loc[gene]['tf_list']` lookup.
self_only_targets = {
    gene
    for gene in target_set
    if {tf for tf in network_df.loc[gene, 'tf_list'].split('; ') if tf} <= {gene}
}
# Update in place, as the original loop did via target_set.remove().
target_set -= self_only_targets
    

In [17]:
# Report how many regulators remain after filtering (label and count on separate lines).
print('Number of TFs used:', len(regulator_set), sep='\n')

Number of TFs used:
151


In [19]:
# Inputs for the modelling run.
#
# The earlier assignments `target_gene_list = list(target_set)` and
# `tf_list = list(regulator_set)` were overwritten further down in this same
# cell before any use, so they are removed; the effective values are unchanged.
target_exp = target_df
# Pooled, z-scored expression of the network-derived regulators.
# NOTE(review): X is indexed by regulator_set, while the train/test frames
# passed to MpCalc are indexed by tf_list (read from file below). Confirm
# against mp_run that these two regulator lists are meant to differ.
X = source_df.loc[list(regulator_set)]

# Target genes are taken from the index of the reference result table.
target_gene_list = list(ref_res_df.index)

# TF names are taken from a header-less CSV whose first column becomes the index.
# This rebinds tf_list_df, which previously held the TF annotation table.
tf_list_df = pd.read_csv('../output/network_model/bsubtilis_tf.csv', names=['tf'], index_col=0)
tf_list = list(tf_list_df.index)

In [21]:
# Column names for the positional entries of each result row returned by
# mp_calc.full_comp_new; list position k names r[:, k].
result_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
    'pca_rf_score',
    'pca_rf_rmse',
]

# Leave-one-dataset-out evaluation: dataset i (its train and test columns
# together) is held out, and all columns of the remaining datasets are used
# for training.
for i in range(len(train_source_list)):
    cv_test_source = pd.concat([train_source_list[i], test_source_list[i]], axis=1)
    cv_test_target = pd.concat([train_target_list[i], test_target_list[i]], axis=1)

    # Indices of the datasets that are not held out in this fold.
    training_folds = [j for j in range(len(train_source_list)) if j != i]
    cv_train_source = pd.concat(
        [frame for j in training_folds for frame in (train_source_list[j], test_source_list[j])],
        axis=1,
    )
    cv_train_target = pd.concat(
        [frame for j in training_folds for frame in (train_target_list[j], test_target_list[j])],
        axis=1,
    )

    # Source frames are restricted to the rows in tf_list before being handed over.
    mp_calc = mp_run.MpCalc(target_gene_list, X, network_df, cv_train_source.loc[tf_list], cv_train_target, cv_test_source.loc[tf_list], cv_test_target)
    iter_length = len(target_gene_list)
    # One call per index 0..iter_length-1, spread over all CPU cores
    # (presumably one index per target gene; confirm in mp_run).
    with Pool(cpu_count()) as p:
        r = list(tqdm(p.imap(mp_calc.full_comp_new, range(iter_length)), total=iter_length))
    r = np.array(r)

    # out_df is rebuilt on every fold, so only the last fold's table remains
    # after the loop.
    out_df = pd.DataFrame(index=target_gene_list)
    for position, column_name in enumerate(result_columns):
        out_df[column_name] = r[:, position]

    print(out_df['rf_rmse'].mean())
    print(out_df['rf_score'].mean())
    print('======================================')

100%|██████████| 733/733 [00:53<00:00, 13.79it/s]

0.7265870307819318
-15820.287820229965



100%|██████████| 733/733 [01:08<00:00, 10.72it/s]


0.8011109732265792
-132.38151844454012


100%|██████████| 733/733 [01:06<00:00, 11.10it/s]

1.2809305377298625
-151.61205917451406



