Scripts for regression experiments on the mouse datasets

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import mp_run

%load_ext autoreload
%autoreload 2
from multiprocessing import Pool, cpu_count


# Regex that captures (group 1) a decimal number inside a string: optional
# leading minus sign, an integer part that is either "0" or has no leading
# zero, an optional fractional part, and an optional exponent (e/E with an
# optional sign).
# NOTE(review): not referenced by any cell visible in this notebook section.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'


In [2]:
# Gzip-compressed CSV from the network-model output directory; its first
# column becomes the row index.
# NOTE(review): presumably reference results from an earlier run; the frame is
# not used by any cell visible in this notebook section.
ref_res_df = pd.read_csv('../output/network_model/mouse_all_tf_high_var_target_efron_train.csv.gz', compression='gzip', index_col=0)

In [3]:
def _load_dataset(accession):
    """Read the four normalized matrices of one GEO series.

    Every file is a gzip-compressed CSV whose first column is used as the row
    index. The files are read in the order train_source, train_target,
    test_source, test_target and returned as a 4-tuple in that order.
    """
    return tuple(
        pd.read_csv(
            f'../data/mouse/{accession}/normalized/{split}.csv.gz',
            index_col=0,
            compression='gzip',
        )
        for split in ('train_source', 'train_target', 'test_source', 'test_target')
    )


# Dataset 1
train_source_1, train_target_1, test_source_1, test_target_1 = _load_dataset('GSE115553')

# Dataset 2
train_source_2, train_target_2, test_source_2, test_target_2 = _load_dataset('GSE151173')

# Dataset 3
train_source_3, train_target_3, test_source_3, test_target_3 = _load_dataset('GSE171975')

In [4]:
# Keep only the rows (genes) present in the train-source matrix of all three
# datasets, and give every matrix the same row order.
#
# The intersection is sorted because the iteration order of a set of strings
# changes from one interpreter run to the next (string hashing is randomised),
# which previously made the row order of every frame below differ between
# kernel restarts. key=str keeps the sort well defined even if an index label
# was parsed as a non-string value.
common_exp_genes = sorted(
    set(train_source_1.index).intersection(train_source_2.index, train_source_3.index),
    key=str,
)

# NOTE(review): the target/test frames are indexed with genes taken from the
# train-source frames only; .loc raises KeyError if any of them lacks a gene.
train_source_1, train_target_1, test_source_1, test_target_1 = (
    frame.loc[common_exp_genes]
    for frame in (train_source_1, train_target_1, test_source_1, test_target_1)
)

train_source_2, train_target_2, test_source_2, test_target_2 = (
    frame.loc[common_exp_genes]
    for frame in (train_source_2, train_target_2, test_source_2, test_target_2)
)

train_source_3, train_target_3, test_source_3, test_target_3 = (
    frame.loc[common_exp_genes]
    for frame in (train_source_3, train_target_3, test_source_3, test_target_3)
)

In [5]:
# Pool the three datasets: the per-dataset frames are placed side by side
# (column-wise) and aligned on their row index.
train_source = pd.concat(
    [train_source_1, train_source_2, train_source_3], axis='columns'
)
train_target = pd.concat(
    [train_target_1, train_target_2, train_target_3], axis='columns'
)
test_source = pd.concat(
    [test_source_1, test_source_2, test_source_3], axis='columns'
)
test_target = pd.concat(
    [test_target_1, test_target_2, test_target_3], axis='columns'
)

In [6]:
# Per-dataset frames collected in dataset order (1, 2, 3), so position k in
# each of the four lists refers to the same dataset. The cross-validation
# cells index these lists to build their train/test folds.
train_source_list = [train_source_1, train_source_2, train_source_3]
train_target_list = [train_target_1, train_target_2, train_target_3]
test_source_list = [test_source_1, test_source_2, test_source_3]
test_target_list = [test_target_1, test_target_2, test_target_3]

In [7]:
# --- Regulatory edges -------------------------------------------------------
# Tab-separated edge list with (at least) 'regulator' and 'target' columns.
network_df = pd.read_csv('../data/mouse/regnetworkweb.org.network', sep='\t')

# Only genes that are rows of the pooled training matrix can be used.
expressed_genes = set(train_source.index)
regulator_set = set(network_df['regulator']) & expressed_genes
target_set = set(network_df['target']) & expressed_genes
all_gene_set = regulator_set | target_set

# target -> list of its regulators, in edge-list order (repeats are kept).
network_dict = {target: [] for target in target_set}
for regulator, target_gene in zip(network_df['regulator'], network_df['target']):
    if regulator in regulator_set and target_gene in target_set:
        network_dict[target_gene].append(regulator)

# --- Regulators -------------------------------------------------------------
# Start again from the TF list: each 'Gene Names' entry is split on single
# spaces and every piece that is a row of the training matrix is kept.
tf_list_df = pd.read_csv('../data/mouse/mouse_tf_list.tsv.gz', sep='\t', compression='gzip', index_col=0)
regulator_set = set()
for name in tf_list_df['Gene Names']:
    for symbol in str(name).split(' '):
        if symbol in train_source.index:
            regulator_set.add(symbol)

# --- Targets ----------------------------------------------------------------
# Keep targets that have at least one regulator and whose FIRST listed
# regulator is not the target itself; their regulators join the regulator set.
key_list, value_list = [], []
target_set = set()
for key, regulators in network_dict.items():
    if not regulators or regulators[0] == key:
        continue
    key_list.append(key)
    target_set.add(key)
    value_list.append("; ".join(regulators))
    regulator_set.update(regulators)
all_gene_set = regulator_set | target_set

print('Number of TFs used:')
print(len(regulator_set))

Number of TFs used:
1385


In [8]:
# Lookup table: one row per retained target gene (key_list), with the matching
# entry of value_list stored in the 'tf_list' column.
network_df = pd.DataFrame({'tf_list': value_list}, index=key_list)

# Train and test frames placed side by side (column-wise).
source = pd.concat([train_source, test_source], axis=1)
target = pd.concat([train_target, test_target], axis=1)

target_gene_list = list(target_set)
target_exp = target.loc[target_gene_list]
X = source.loc[list(regulator_set)]

# tf_list is first taken from regulator_set and then immediately replaced by
# the index of the saved TF table, so the file determines which rows the
# cross-validation cells select with `.loc[tf_list]`.
# NOTE(review): X is still built from regulator_set, not from the file; confirm
# that both describe the same TFs.
tf_list = list(regulator_set)
tf_list_df = pd.read_csv('../output/network_model/mouse_tf.csv', names=['tf'], index_col=0)
tf_list = list(tf_list_df.index)

In [11]:
# For each dataset i: the training matrices are the train splits of the OTHER
# datasets, and the test matrices are the pooled test splits of ALL datasets.
# mp_calc.full_comp_new is called once per target-gene position; column k of
# each returned row is stored in out_df under result_columns[k].
result_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
    'pca_rf_score',
    'pca_rf_rmse',
]

n_datasets = len(train_source_list)
for i in range(n_datasets):
    cv_test_source = pd.concat(test_source_list, axis=1)
    cv_test_target = pd.concat(test_target_list, axis=1)

    # Train splits of every dataset except dataset i.
    cv_train_source_cur = [train_source_list[j] for j in range(n_datasets) if j != i]
    cv_train_target_cur = [train_target_list[j] for j in range(n_datasets) if j != i]
    cv_train_source = pd.concat(cv_train_source_cur, axis=1)
    cv_train_target = pd.concat(cv_train_target_cur, axis=1)

    mp_calc = mp_run.MpCalc(
        target_gene_list,
        X,
        network_df,
        cv_train_source.loc[tf_list],
        cv_train_target,
        cv_test_source.loc[tf_list],
        cv_test_target,
    )
    iter_length = len(target_gene_list)
    with Pool(cpu_count()) as pool:
        r = np.array(
            list(tqdm(pool.imap(mp_calc.full_comp_new, range(iter_length)), total=iter_length))
        )

    out_df = pd.DataFrame(
        {name: r[:, col] for col, name in enumerate(result_columns)},
        index=target_gene_list,
    )

    print(out_df['rf_rmse'].mean())
    print(out_df['rf_score'].mean())
    print('======================================')

100%|██████████| 310/310 [00:50<00:00,  6.20it/s]


1.7478855763764343
-0.006414213705246191


100%|██████████| 310/310 [00:47<00:00,  6.53it/s]

1.8461235289963365
-0.42294478828408344



100%|██████████| 310/310 [00:47<00:00,  6.53it/s]

1.8434667698988132
-0.5331721964382252





In [9]:
# Leave-one-dataset-out: dataset i (train + test splits together) forms the
# test matrices, and the train + test splits of every other dataset form the
# training matrices.
# mp_calc.full_comp_new is called once per target-gene position; column k of
# each returned row is stored in out_df under result_columns[k].
result_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
    'pca_rf_score',
    'pca_rf_rmse',
]

n_datasets = len(train_source_list)
for i in range(n_datasets):
    # Held-out dataset: its train and test splits side by side.
    cv_test_source = pd.concat([train_source_list[i], test_source_list[i]], axis=1)
    cv_test_target = pd.concat([train_target_list[i], test_target_list[i]], axis=1)

    # Remaining datasets: train split followed by test split, dataset by dataset.
    cv_train_source_cur = [
        splits[j]
        for j in range(n_datasets) if j != i
        for splits in (train_source_list, test_source_list)
    ]
    cv_train_target_cur = [
        splits[j]
        for j in range(n_datasets) if j != i
        for splits in (train_target_list, test_target_list)
    ]
    cv_train_source = pd.concat(cv_train_source_cur, axis=1)
    cv_train_target = pd.concat(cv_train_target_cur, axis=1)

    mp_calc = mp_run.MpCalc(
        target_gene_list,
        X,
        network_df,
        cv_train_source.loc[tf_list],
        cv_train_target,
        cv_test_source.loc[tf_list],
        cv_test_target,
    )
    iter_length = len(target_gene_list)
    with Pool(cpu_count()) as pool:
        r = np.array(
            list(tqdm(pool.imap(mp_calc.full_comp_new, range(iter_length)), total=iter_length))
        )

    out_df = pd.DataFrame(
        {name: r[:, col] for col, name in enumerate(result_columns)},
        index=target_gene_list,
    )

    print(out_df['rf_rmse'].mean())
    print(out_df['rf_score'].mean())
    print('======================================')

100%|██████████| 310/310 [01:07<00:00,  4.56it/s]

3.285961005078511
-31601.971248615137



100%|██████████| 310/310 [01:06<00:00,  4.65it/s]

2.3386661615829096
-23095.55224774118



100%|██████████| 310/310 [01:09<00:00,  4.45it/s]

2.967708970910789
-86721.78524877173





In [19]:
# For each dataset i, two runs are made against dataset i's test split:
#   1. training matrices = dataset i's own train split
#      -> result column 6 (rf_rmse) is appended to all_rmse_list
#   2. training matrices = train splits of all datasets pooled
#      -> result column 6 (rf_rmse) is appended to all_train_rmse_list
# Only target genes whose standard deviation across the test-split columns
# exceeds 0.5 are evaluated. Column k of each result row of run 1 is stored in
# out_df under result_columns[k].
result_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
    'pca_rf_score',
    'pca_rf_rmse',
]

all_rmse_list = []
all_train_rmse_list = []
all_train_source = pd.concat(train_source_list, axis=1)
all_train_target = pd.concat(train_target_list, axis=1)

for i in range(len(train_source_list)):
    cv_train_source = train_source_list[i]
    cv_train_target = train_target_list[i]
    cv_test_source = test_source_list[i]
    cv_test_target = test_target_list[i]

    # Restrict the targets to genes that vary enough in this test split.
    candidate_targets = cv_test_target.loc[target_gene_list]
    new_test_target = candidate_targets.loc[candidate_targets.std(axis=1) > 0.5]
    new_target_gene_list = new_test_target.index

    # Run 1: trained on this dataset's own train split.
    mp_calc = mp_run.MpCalc(
        new_target_gene_list,
        X,
        network_df,
        cv_train_source.loc[tf_list],
        cv_train_target,
        cv_test_source.loc[tf_list],
        cv_test_target,
    )
    iter_length = len(new_target_gene_list)
    with Pool(cpu_count()) as pool:
        r = np.array(
            list(tqdm(pool.imap(mp_calc.full_comp_new, range(iter_length)), total=iter_length))
        )

    out_df = pd.DataFrame(
        {name: r[:, col] for col, name in enumerate(result_columns)},
        index=new_target_gene_list,
    )

    all_rmse_list = np.concatenate([all_rmse_list, r[:, 6]])
    print(out_df['rf_rmse'].mean())
    print(out_df['rf_score'].mean())

    # Run 2: trained on the pooled train splits of all datasets.
    mp_calc = mp_run.MpCalc(
        new_target_gene_list,
        X,
        network_df,
        all_train_source.loc[tf_list],
        all_train_target,
        cv_test_source.loc[tf_list],
        cv_test_target,
    )
    iter_length = len(new_target_gene_list)
    with Pool(cpu_count()) as pool:
        r = np.array(
            list(tqdm(pool.imap(mp_calc.full_comp_new, range(iter_length)), total=iter_length))
        )
    print(np.mean(r[:, 6]))
    all_train_rmse_list = np.concatenate([all_train_rmse_list, r[:, 6]])

    print('======================================')

# Mean rf_rmse over every evaluated gene of every dataset, for run 1 then run 2.
print(np.mean(all_rmse_list))
print(np.mean(all_train_rmse_list))

100%|██████████| 397/397 [01:39<00:00,  3.98it/s]

1.8252741779000459
-0.6234028322039206



100%|██████████| 397/397 [01:21<00:00,  4.86it/s]

1.7950358259785844



100%|██████████| 88/88 [00:21<00:00,  4.15it/s]

1.117902075965184
0.5664055362156741



100%|██████████| 88/88 [00:18<00:00,  4.65it/s]


1.1176940236468638


100%|██████████| 74/74 [00:21<00:00,  3.46it/s]

1.3641514687801213
0.18121954808453775



100%|██████████| 74/74 [00:15<00:00,  4.73it/s]

1.3667002618285582
1.6528737745992548
1.6317032493197412





In [16]:
# Scratch check: concatenating an empty list with one column of `r` yields a
# 1-D array, which is the accumulation pattern used for the RMSE lists.
# Depends on `r` left in the kernel by a previously executed cell.
np.concatenate([[], r[:, 0]]).shape

(74,)

In [11]:
# Leave-one-dataset-out, restricted to variable targets: dataset i (train +
# test splits together) forms the test matrices, and the train + test splits of
# every other dataset form the training matrices. Only target genes whose
# standard deviation across the held-out columns exceeds 0.5 are evaluated.
# Column k of each result row is stored in out_df under result_columns[k].
result_columns = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
    'pca_rf_score',
    'pca_rf_rmse',
]

n_datasets = len(train_source_list)
for i in range(n_datasets):
    # Held-out dataset: its train and test splits side by side.
    cv_test_source = pd.concat([train_source_list[i], test_source_list[i]], axis=1)
    cv_test_target = pd.concat([train_target_list[i], test_target_list[i]], axis=1)

    # Remaining datasets: train split followed by test split, dataset by dataset.
    cv_train_source_cur = [
        splits[j]
        for j in range(n_datasets) if j != i
        for splits in (train_source_list, test_source_list)
    ]
    cv_train_target_cur = [
        splits[j]
        for j in range(n_datasets) if j != i
        for splits in (train_target_list, test_target_list)
    ]
    cv_train_source = pd.concat(cv_train_source_cur, axis=1)
    cv_train_target = pd.concat(cv_train_target_cur, axis=1)

    # Restrict the targets to genes that vary enough in the held-out dataset.
    candidate_targets = cv_test_target.loc[target_gene_list]
    new_test_target = candidate_targets.loc[candidate_targets.std(axis=1) > 0.5]
    new_target_gene_list = new_test_target.index

    mp_calc = mp_run.MpCalc(
        new_target_gene_list,
        X,
        network_df,
        cv_train_source.loc[tf_list],
        cv_train_target,
        cv_test_source.loc[tf_list],
        cv_test_target,
    )
    iter_length = len(new_target_gene_list)
    with Pool(cpu_count()) as pool:
        r = np.array(
            list(tqdm(pool.imap(mp_calc.full_comp_new, range(iter_length)), total=iter_length))
        )

    out_df = pd.DataFrame(
        {name: r[:, col] for col, name in enumerate(result_columns)},
        index=new_target_gene_list,
    )

    print(out_df['rf_rmse'].mean())
    print(out_df['rf_score'].mean())
    print('======================================')

100%|██████████| 296/296 [01:05<00:00,  4.50it/s]

2.837426284152893
-3.7664393661713067



100%|██████████| 93/93 [00:20<00:00,  4.57it/s]

2.859372298251717
-10.982640947137934



100%|██████████| 75/75 [00:16<00:00,  4.48it/s]

4.170258429939662
-40.58451572450868





In [10]:
# Scratch inspection of the variance filter: boolean mask of rows whose
# standard deviation across columns exceeds 0.5.
# Depends on `new_test_target` left in the kernel by a previously executed
# cell; that frame has already been filtered with the same condition.
new_test_target.std(axis=1) > 0.5

Irgm1     True
Rpl36     True
Efhd2     True
Rnf149    True
Fau       True
          ... 
Rpl23a    True
Ifit1     True
Ddx5      True
Rpl19     True
Ube2m     True
Length: 296, dtype: bool

In [10]:
# Scratch inspection: (rows, columns) of one train source frame.
# NOTE(review): `i` is whatever index the last executed loop left behind, so
# the dataset shown depends on execution order.
train_source_list[i].shape

(9931, 82)

In [12]:
# Total number of columns (train split + test split) for each of the three
# datasets, printed in dataset order.
for i in range(3):
    print(train_source_list[i].shape[1]+test_source_list[i].shape[1])

81
120
128


In [12]:
# sanity check: column labels shared between a train source frame and a test
# source frame. All nine (train dataset, test dataset) pairs are compared; an
# empty set on every line means no column label appears in both a train and a
# test split.
for i in train_source_list:
    for j in test_source_list:
        print(set(i.columns).intersection(set(j.columns)))

set()
set()
set()
set()
set()
set()
set()
set()
set()


In [13]:

# Same check for the target frames: column labels shared between any train
# target frame and any test target frame (nine pairs; an empty set on every
# line means no overlap).
for i in train_target_list:
    for j in test_target_list:
        print(set(i.columns).intersection(set(j.columns)))

set()
set()
set()
set()
set()
set()
set()
set()
set()
