Scripts for regression experiments on Arabidopsis

In [1]:


import pandas as pd
import numpy as np
from tqdm import tqdm

import mp_run
import conf_interval


from scipy import stats

from multiprocessing import Pool, cpu_count


%load_ext autoreload
%autoreload 2

# Regex for extracting one decimal number from a string.
# It matches, in order: an optional leading '-', an integer part that is either
# '0' or a digit run starting with 1-9 (no leading zeros), an optional '.digits'
# fraction, and an optional e/E exponent with an optional sign.
# The whole number is wrapped in a single capture group (group 1).
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'


In [4]:
# Load the four gzip-compressed CSV tables (train/test x source/target) from
# GSE97500/normalized. The first CSV column becomes the row index of each frame.
train_source_df, train_target_df, test_source_df, test_target_df = (
    pd.read_csv(csv_path, index_col=0, compression='gzip')
    for csv_path in (
        '../data/arabidopsis/GSE97500/normalized/train_source.csv.gz',
        '../data/arabidopsis/GSE97500/normalized/train_target.csv.gz',
        '../data/arabidopsis/GSE97500/normalized/test_source.csv.gz',
        '../data/arabidopsis/GSE97500/normalized/test_target.csv.gz',
    )
)


In [7]:
# Collect candidate regulators (TFs) from two sources and the set of target genes.

# 1) TF list: each 'Gene Names' cell holds space-separated names. Keep every name
#    that, upper-cased, is a row label of the training source matrix.
regulator_set = set()
tf_list_df = pd.read_csv('../data/arabidopsis/arabidopsis_tf_list.tsv.gz', sep='\t', compression='gzip', index_col=0)
# dropna(): an empty cell is read as float NaN, which has no .split().
for name in tf_list_df['Gene Names'].dropna():
    for gene_name in name.split(' '):
        gene_name_upper = gene_name.upper()
        if gene_name_upper in train_source_df.index:
            regulator_set.add(gene_name_upper)

# 2) Network table: the row labels are the target genes and the 'tf_list' column
#    holds a '; '-separated string of TFs for that target.
network_df = pd.read_csv('../data/arabidopsis/arabidopsis_network_connectf.csv', index_col=0)
target_set = set(network_df.index)

# Rows without any TF are NaN (the GS-stats cell guards this column with
# pd.isnull as well), so skip them instead of crashing on NaN.split().
for tf_list_string in network_df['tf_list'].dropna():
    regulator_set.update(tf_list_string.split('; '))

print('Number of TFs used:')
print(len(regulator_set))

Number of TFs used:
2010


In [8]:
# Keep only regulators/targets that are row labels of the training source matrix.
regulator_set = regulator_set.intersection(set(train_source_df.index))
target_set = target_set.intersection(set(train_source_df.index))
all_gene_set = regulator_set.union(target_set)

# One shared row order for all four matrices.
gene_rows = list(all_gene_set)


def _select_and_zscore(expression_df):
    """Select the shared gene rows, then apply stats.zscore to each column."""
    return expression_df.loc[gene_rows].apply(stats.zscore, axis=0)


train_source = _select_and_zscore(train_source_df)
train_target = _select_and_zscore(train_target_df)

test_source = _select_and_zscore(test_source_df)
test_target = _select_and_zscore(test_target_df)


In [9]:
# Report the (rows, columns) shape of the z-scored source matrices,
# one heading line followed by one shape line for each split.
print('training set size:', train_source.shape, sep='\n')
print('testing set size:', test_source.shape, sep='\n')

training set size:
(19343, 72)
testing set size:
(19343, 24)


In [11]:

# Put the train and test columns side by side so every gene row spans both splits.
target_df = pd.concat([train_target, test_target], axis=1)
source_df = pd.concat([train_source, test_source], axis=1)

target_gene_list = list(target_set)
target_exp = target_df

# Materialise the regulator order once so that X's rows and the fallback
# tf_list below share exactly the same order.
regulator_list = list(regulator_set)
X = source_df.loc[regulator_list]
# NOTE(review): when tf_list is loaded from disk its order may differ from X's
# row order - confirm mp_run.MpCalc does not rely on the two matching by position.

# Prefer the TF order saved by an earlier run: the overlap cell further down maps
# integer feature positions back to TF names through this file, so the order has
# to stay stable between runs. The file is only written by a later cell
# (tf_list_df.to_csv), so on a clean checkout it does not exist yet; fall back to
# the in-memory regulator order instead of failing.
try:
    tf_list_df = pd.read_csv('../output/network_model/arabidopsis_tf.csv', names=['tf'], index_col=0)
    tf_list = list(tf_list_df.index)
except FileNotFoundError:
    tf_list = list(regulator_list)
    tf_list_df = pd.DataFrame(index=tf_list)

In [12]:
# GS stats
# Count, over the network table, the (target, TF) pairs whose target is in
# target_gene_list and whose TF is in tf_list, plus the distinct TFs and targets seen.

# Hash-based lookups: testing `in` against a list rescans it on every check,
# which is quadratic inside the nested loops below.
target_gene_lookup = set(target_gene_list)
tf_lookup = set(tf_list)

edge_count = 0
gs_tf_set = set()
gs_target_set = set()
for gene, cur_tf_list in network_df['tf_list'].items():
    if gene not in target_gene_lookup:
        continue
    # A target is counted even when it has no TF listed.
    gs_target_set.add(gene)
    if pd.isnull(cur_tf_list):
        continue
    for cur_tf in cur_tf_list.split('; '):
        if cur_tf in tf_lookup:
            gs_tf_set.add(cur_tf)
            edge_count += 1
print('GS edge count:')
print(edge_count)
print('Number of TFs in GS:')
print(len(gs_tf_set))
print('Number of target genes in GS:')
print(len(gs_target_set))



GS edge count:
141445
Number of TFs in GS:
57
Number of target genes in GS:
18855


In [10]:
# Keep only the target genes whose row-wise standard deviation in test_target
# exceeds 0.5. target_gene_list becomes the pandas Index of the surviving rows.
candidate_test_target = test_target.loc[target_gene_list]
is_high_variance = candidate_test_target.std(axis=1) > 0.5
new_test_target = candidate_test_target.loc[is_high_variance]
target_gene_list = new_test_target.index

In [11]:
# Build the mp_run.MpCalc instance whose methods are mapped over target positions
# by the Pool cells below. The train/test source matrices are restricted to the
# rows listed in tf_list; all arguments are passed positionally.
train_tf_source = train_source.loc[tf_list]
test_tf_source = test_source.loc[tf_list]
mp_calc = mp_run.MpCalc(
    target_gene_list,
    target_exp,
    X,
    network_df,
    train_tf_source,
    train_target,
    test_tf_source,
    test_target,
)

In [12]:
# Call mp_calc.full_comp_new once per target position using a pool of
# cpu_count() worker processes. imap yields results in input order, and tqdm
# shows progress while they are collected into the list r.
iter_length = len(target_gene_list)
with Pool(processes=cpu_count()) as pool:
    ordered_results = pool.imap(mp_calc.full_comp_new, range(iter_length))
    r = list(tqdm(ordered_results, total=iter_length))

100%|██████████| 1397/1397 [04:00<00:00,  5.81it/s]


In [13]:
# Turn the per-target result rows into a DataFrame indexed by target gene.
# Column k of the result matrix is stored under the k-th name in the list below.
r = np.array(r)
result_column_names = [
    'rf_score',
    'linear_score',
    'gs_rf_score',
    'gs_linear_score',
    'rf_with_linear_top_features_score',
    'linear_with_rf_top_features_score',
    'rf_rmse',
    'linear_rmse',
    'gs_rf_rmse',
    'gs_linear_rmse',
    'rf_with_linear_top_features_rmse',
    'linear_with_rf_top_features_rmse',
    'rf_with_top_features_score',
    'linear_with_top_features_score',
    'rf_with_top_features_rmse',
    'linear_with_top_features_rmse',
    'rf_top_feature_num',
    'linear_top_feature_num',
    'rf_top_features_gs_overlap',
    'linear_top_features_gs_overlap',
    'rf_linear_top_features_overlap',
    'gs_edge_num',
    'test_var',
    'test_std',
    'pca_rf_score',
    'pca_rf_rmse',
]
out_df = pd.DataFrame(index=target_gene_list)
for column_position, column_name in enumerate(result_column_names):
    out_df[column_name] = r[:, column_position]

In [17]:
# Persist the TF order as a one-column CSV (index only, no header row) ...
tf_list_df = pd.DataFrame(index=tf_list)
tf_list_df.to_csv(
    '../output/network_model/arabidopsis_tf.csv',
    header=False,
)
# ... and the per-target result table as a gzip-compressed CSV.
out_df.to_csv(
    '../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz',
    compression='gzip',
)

In [12]:
# Reload the saved results and append two columns produced by
# mp_calc.rf_top_tf_same_count_as_gs (result column 0 -> score, column 1 -> rmse).
out_df = pd.read_csv('../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz', index_col=0, compression='gzip')
iter_length = len(target_gene_list)
new_out_df = pd.DataFrame(index=target_gene_list)
with Pool(processes=cpu_count()) as pool:
    ordered_results = pool.imap(mp_calc.rf_top_tf_same_count_as_gs, range(iter_length))
    r = list(tqdm(ordered_results, total=iter_length))
efron_r = np.array(r)

same_count_columns = (
    'rf_top_tf_same_count_as_gs_score',
    'rf_top_tf_same_count_as_gs_rmse',
)
for column_position, column_name in enumerate(same_count_columns):
    new_out_df[column_name] = efron_r[:, column_position]

# Reorder the new rows to the saved table's index before copying the columns over.
new_out_df = new_out_df.loc[out_df.index]
for column_name in same_count_columns:
    out_df[column_name] = new_out_df[column_name]

100%|██████████| 1397/1397 [01:21<00:00, 17.12it/s]


In [13]:
# Write the augmented result table back to the same gzip-compressed CSV.
out_df.to_csv(
    '../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz',
    compression='gzip',
)

In [12]:
# Reload the saved results, run mp_calc.efron_process_rf_training for every target
# position, and append its seven result columns. The extended table is written
# to a separate '..._efron_train.csv.gz' file.
out_df = pd.read_csv('../output/network_model/arabidopsis_all_tf_high_var_target_new.csv.gz', index_col=0, compression='gzip')
iter_length = len(target_gene_list)
new_out_df = pd.DataFrame(index=target_gene_list)
with Pool(processes=cpu_count()) as pool:
    ordered_results = pool.imap(mp_calc.efron_process_rf_training, range(iter_length))
    r = list(tqdm(ordered_results, total=iter_length))
efron_r = np.array(r)

# (column name, cast to float64?) in result-column order.
# NOTE(review): 'rf_efron_ensemble_rmse' is stored without a float cast, unlike
# 'rf_efron_rmse' - confirm whether that is intentional.
efron_columns = [
    ('rf_efron_feature_num', True),
    ('rf_efron_complementary_feature_num_list', False),
    ('rf_efron_rmse', True),
    ('rf_efron_complementary_rmse_list', False),
    ('rf_efron_features', False),
    ('rf_efron_complementary_features_list', False),
    ('rf_efron_ensemble_rmse', False),
]
for column_position, (column_name, cast_to_float) in enumerate(efron_columns):
    column_values = efron_r[:, column_position]
    if cast_to_float:
        column_values = column_values.astype('float64')
    new_out_df[column_name] = column_values

# Reorder the new rows to the saved table's index before copying the columns over.
new_out_df = new_out_df.loc[out_df.index]
for column_name, _ in efron_columns:
    out_df[column_name] = new_out_df[column_name]

out_df.to_csv('../output/network_model/arabidopsis_all_tf_high_var_target_efron_train.csv.gz', compression='gzip')

100%|████████████████████████████████████████████████████████████████████████████| 1397/1397 [58:35<00:00,  2.52s/it]


In [None]:
# For every target gene, count how many of its efron-selected TFs are also listed
# for it in the network table (restricted to TFs that are rows of X).
tf_list_df = pd.read_csv('../output/network_model/arabidopsis_tf.csv', names=['tf'], index_col=0)
# Read the '..._efron_train' table: it is the only file the efron cell writes the
# 'rf_efron_features' column to, so the '..._new' table does not contain it.
out_df = pd.read_csv('../output/network_model/arabidopsis_all_tf_high_var_target_efron_train.csv.gz', index_col=0, compression='gzip')
available_tfs = set(X.index)
rf_efron_overlap_count = []
for target_gene in out_df.index:
    gs_tf_list = network_df.loc[target_gene, 'tf_list']
    if pd.isnull(gs_tf_list):
        # No TF recorded for this target (same guard as the GS-stats cell).
        gs_tf_set = set()
    else:
        gs_tf_set = available_tfs.intersection(gs_tf_list.split('; '))
    # A gene is not counted as its own regulator.
    gs_tf_set.discard(target_gene)

    efron_features = out_df.loc[target_gene, 'rf_efron_features']
    if pd.isnull(efron_features):
        # presumably an empty selection round-trips through CSV as NaN - TODO confirm
        efron_tf_set = set()
    else:
        # '; '-separated integer positions into the saved TF order.
        efron_positions = [int(position) for position in efron_features.split('; ')]
        efron_tf_set = set(tf_list_df.iloc[efron_positions].index)

    rf_efron_overlap_count.append(len(efron_tf_set.intersection(gs_tf_set)))
out_df['rf_efron_overlap_count'] = rf_efron_overlap_count
# Write back to the file that was read, so the new column stays with the efron columns.
out_df.to_csv('../output/network_model/arabidopsis_all_tf_high_var_target_efron_train.csv.gz', compression='gzip')


In [13]:
# Reload the efron-train results, run mp_calc.rf_top_10 for every target position,
# append its score/rmse columns (both cast to float64), and save in place.
out_df = pd.read_csv('../output/network_model/arabidopsis_all_tf_high_var_target_efron_train.csv.gz', index_col=0, compression='gzip')
iter_length = len(target_gene_list)
new_out_df = pd.DataFrame(index=target_gene_list)
with Pool(processes=cpu_count()) as pool:
    ordered_results = pool.imap(mp_calc.rf_top_10, range(iter_length))
    r = list(tqdm(ordered_results, total=iter_length))
top10_r = np.array(r)

top10_columns = ('rf_top10_score', 'rf_top10_rmse')
for column_position, column_name in enumerate(top10_columns):
    new_out_df[column_name] = top10_r[:, column_position].astype('float64')

# Reorder the new rows to the saved table's index before copying the columns over.
new_out_df = new_out_df.loc[out_df.index]
for column_name in top10_columns:
    out_df[column_name] = new_out_df[column_name]
out_df.to_csv('../output/network_model/arabidopsis_all_tf_high_var_target_efron_train.csv.gz', compression='gzip')

100%|██████████| 1397/1397 [01:02<00:00, 22.18it/s]
