Scripts for regression experiments on Arabidopsis

In [1]:
import pandas as pd
from pathlib import Path
import mp_run

from scipy import stats


%load_ext autoreload
%autoreload 2

# Regex for extracting a number from a string. The whole match is a single
# capturing group made of, in order:
#   -?                   optional minus sign
#   (?:0|[1-9]\d*)       integer part: a lone 0, or digits with no leading zero
#   (?:\.\d+)?           optional fractional part
#   (?:[eE][+-]?\d+)?    optional exponent
# A leading '+' sign is not matched.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'


In [2]:
# Load the four GSE97500 expression tables from the 'normalized' directory.
# Every file is a gzip-compressed CSV whose first column becomes the row index,
# so the reader options are shared.
_expr_read_kwargs = dict(index_col=0, compression='gzip')

train_source_df = pd.read_csv(
    '../data/arabidopsis/GSE97500/normalized/train_source.csv.gz', **_expr_read_kwargs
)
train_target_df = pd.read_csv(
    '../data/arabidopsis/GSE97500/normalized/train_target.csv.gz', **_expr_read_kwargs
)

test_source_df = pd.read_csv(
    '../data/arabidopsis/GSE97500/normalized/test_source.csv.gz', **_expr_read_kwargs
)
test_target_df = pd.read_csv(
    '../data/arabidopsis/GSE97500/normalized/test_target.csv.gz', **_expr_read_kwargs
)


In [3]:
# Build the candidate regulator (TF) set from two sources and the target set
# from the network file.
regulator_set = set()

# Source 1: the TF annotation table. Each 'Gene Names' entry is split on single
# spaces; a name is kept (upper-cased) only if its upper-cased form is a row
# label of the training expression matrix.
tf_list_df = pd.read_csv('../data/arabidopsis/arabidopsis_tf_list.tsv.gz', sep='\t', compression='gzip', index_col=0)
expression_index = train_source_df.index
# dropna(): a missing 'Gene Names' cell is read as float NaN, which has no .split().
for gene_names in tf_list_df['Gene Names'].dropna():
    for gene_name in gene_names.split(' '):
        gene_name_upper = gene_name.upper()
        if gene_name_upper in expression_index:
            regulator_set.add(gene_name_upper)

# Source 2: the network file. Its row labels are the target genes and its
# 'tf_list' column holds '; '-separated TF identifiers, all of which are added
# as regulators (they are not filtered against the expression index here).
network_df = pd.read_csv('../data/arabidopsis/arabidopsis_network_connectf.csv', index_col=0)
target_set = set(network_df.index)

# dropna(): rows without any listed TF are skipped, matching the pd.isnull
# guard used when the same column is walked for the GS statistics.
for tf_list_string in network_df['tf_list'].dropna():
    regulator_set.update(tf_list_string.split('; '))

print('Number of TFs used:')
print(len(regulator_set))

Number of TFs used:
2010


In [4]:
# Restrict regulators and targets to genes that have a row in the training
# source matrix, then pool them into the set of genes to keep.
measured_genes = set(train_source_df.index)
regulator_set = regulator_set.intersection(measured_genes)
target_set = target_set.intersection(measured_genes)
all_gene_set = regulator_set.union(target_set)

# Build the row selector once. A set of strings can iterate in a different
# order in every interpreter session (string hashing is randomized by default),
# so sorting pins the row order and makes the selection reproducible.
all_gene_list = sorted(all_gene_set)

# Select the kept genes and z-score each column independently
# (axis=0 applies stats.zscore to one column at a time).
# NOTE(review): the gene list is validated against train_source_df only; .loc
# raises KeyError if another table lacks one of these genes.
train_source = train_source_df.loc[all_gene_list].apply(stats.zscore, axis=0)
train_target = train_target_df.loc[all_gene_list].apply(stats.zscore, axis=0)

test_source = test_source_df.loc[all_gene_list].apply(stats.zscore, axis=0)
test_target = test_target_df.loc[all_gene_list].apply(stats.zscore, axis=0)


In [5]:
# Report the (rows, columns) shape of the training and testing source matrices.
for size_label, source_frame in (
    ('training set size:', train_source),
    ('testing set size:', test_source),
):
    print(size_label)
    print(source_frame.shape)

training set size:
(19343, 72)
testing set size:
(19343, 24)


In [6]:

# Put the training and testing columns side by side (axis=1 concatenates columns).
target_df = pd.concat([train_target, test_target], axis=1)
source_df = pd.concat([train_source, test_source], axis=1)

# sorted() instead of list(): set iteration order for strings can differ
# between interpreter sessions, which would make these lists (and the row
# order of X) change from run to run.
target_gene_list = sorted(target_set)
target_exp = target_df
tf_list = sorted(regulator_set)
X = source_df.loc[tf_list]


# If a TF list file from an earlier run exists, it replaces the freshly built
# tf_list. The file has no header row; its single column is read as the index.
# NOTE(review): X above is NOT rebuilt from the cached list, so X and tf_list
# can differ in content/order when the cache is used — confirm this is intended.
res_tf_file = Path("../output/network_model/arabidopsis_tf.csv")
if res_tf_file.is_file():
    # Read through res_tf_file so the existence check and the read cannot
    # drift apart if the path is ever edited.
    tf_list_df = pd.read_csv(res_tf_file, names=['tf'], index_col=0)
    tf_list = list(tf_list_df.index)


In [7]:
# GS stats: count edges, distinct TFs and distinct targets of the network file
# after restricting it to target_gene_list (rows) and tf_list (TFs).

# Membership is tested once per network row and once per listed TF; sets make
# each test a hash lookup instead of a linear scan through a Python list.
target_lookup = set(target_gene_list)
tf_lookup = set(tf_list)

edge_count = 0
gs_tf_set = set()
gs_target_set = set()
# Only the 'tf_list' column is needed, so iterate (row label, value) pairs of
# that column rather than materializing every full row.
for target_gene, cur_tf_list in network_df['tf_list'].items():
    if target_gene not in target_lookup:
        continue
    # The target is counted even when it has no TF listed.
    gs_target_set.add(target_gene)
    if pd.isnull(cur_tf_list):
        continue
    for cur_tf in cur_tf_list.split('; '):
        if cur_tf in tf_lookup:
            gs_tf_set.add(cur_tf)
            edge_count += 1
print('GS edge count:')
print(edge_count)
print('Number of TFs in GS:')
print(len(gs_tf_set))
print('Number of target genes in GS:')
print(len(gs_target_set))



GS edge count:
141445
Number of TFs in GS:
57
Number of target genes in GS:
18855


In [8]:
# Keep only targets whose z-scored test expression varies enough: the row-wise
# standard deviation (axis=1, i.e. across the columns of test_target) must
# exceed 0.5.
candidate_test_target = test_target.loc[target_gene_list]
is_high_variance = candidate_test_target.std(axis=1) > 0.5
new_test_target = candidate_test_target.loc[is_high_variance]
# From here on target_gene_list is the pandas Index of the surviving rows.
target_gene_list = new_test_target.index

In [9]:
# Setting up MP for calculations
mp_calc = mp_run.MpCalc(target_gene_list, X, network_df, train_source.loc[tf_list], train_target, test_source.loc[tf_list], test_target)

# Save the TF list held by mp_calc as a header-less, single-column CSV.
# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise raise on a fresh checkout.
tf_out_file = Path('../output/network_model/arabidopsis_tf.csv')
tf_out_file.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(index=mp_calc.tf_list).to_csv(tf_out_file, header=False)

In [10]:
# Doing all calculations: run mp_calc.full_comp_runner over the filtered
# target genes, passing 'arabidopsis' as the second argument.
# NOTE(review): 'arabidopsis' is presumably a label for naming results —
# confirm in mp_run.
# This is the long-running cell: in the recorded run the two reported stages
# took roughly 675 s and 3349 s.
mp_calc.full_comp_runner(target_gene_list, 'arabidopsis')

Comparing different regression approaches... ...
Step 1 of 4:


100%|██████████| 1397/1397 [09:50<00:00,  2.37it/s]

Step 2 of 3:



100%|██████████| 1397/1397 [00:40<00:00, 34.14it/s]

Step 3 of 3:



100%|██████████| 1397/1397 [00:43<00:00, 32.16it/s]

Finished comparing all approaches, time elapsed: 675.2539687156677 seconds
Calculating minimal set and disjoint sets... ...
Step 1 of 1:



100%|██████████| 1397/1397 [55:48<00:00,  2.40s/it] 


Finished calculating minimal set and disjoint sets, time elapsed: 3348.541945695877 seconds
