Scripts for regression experiments on B. subtilis

In [1]:
import pandas as pd
from pathlib import Path
import mp_run

from scipy import stats


%load_ext autoreload
%autoreload 2

# Regex capturing one decimal number in a single group: an optional leading '-',
# an integer part that is either 0 or starts with a non-zero digit, an optional
# '.digits' fraction, and an optional e/E exponent with an optional sign.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'


In [2]:
# Full source/target tables for the three datasets (GSE108659, GSE128875, GSE224332).
# Every file is a gzip-compressed CSV whose first column becomes the row index.
_full_read_opts = {'index_col': 0, 'compression': 'gzip'}

# source tables
source_df_1 = pd.read_csv('../data/bsubtilis/GSE108659/normalized/source.csv.gz', **_full_read_opts)
source_df_2 = pd.read_csv('../data/bsubtilis/GSE128875/normalized/source.csv.gz', **_full_read_opts)
source_df_3 = pd.read_csv('../data/bsubtilis/GSE224332/normalized/source.csv.gz', **_full_read_opts)

# target tables
target_df_1 = pd.read_csv('../data/bsubtilis/GSE108659/normalized/target.csv.gz', **_full_read_opts)
target_df_2 = pd.read_csv('../data/bsubtilis/GSE128875/normalized/target.csv.gz', **_full_read_opts)
target_df_3 = pd.read_csv('../data/bsubtilis/GSE224332/normalized/target.csv.gz', **_full_read_opts)

In [3]:
def _read_split_table(csv_path):
    """Read one gzip-compressed CSV, using its first column as the row index."""
    return pd.read_csv(csv_path, index_col=0, compression='gzip')


# Training split, one source/target pair per dataset
train_source_df_1 = _read_split_table('../data/bsubtilis/GSE108659/normalized/train_source.csv.gz')
train_source_df_2 = _read_split_table('../data/bsubtilis/GSE128875/normalized/train_source.csv.gz')
train_source_df_3 = _read_split_table('../data/bsubtilis/GSE224332/normalized/train_source.csv.gz')
train_target_df_1 = _read_split_table('../data/bsubtilis/GSE108659/normalized/train_target.csv.gz')
train_target_df_2 = _read_split_table('../data/bsubtilis/GSE128875/normalized/train_target.csv.gz')
train_target_df_3 = _read_split_table('../data/bsubtilis/GSE224332/normalized/train_target.csv.gz')

# Testing split, same layout
test_source_df_1 = _read_split_table('../data/bsubtilis/GSE108659/normalized/test_source.csv.gz')
test_source_df_2 = _read_split_table('../data/bsubtilis/GSE128875/normalized/test_source.csv.gz')
test_source_df_3 = _read_split_table('../data/bsubtilis/GSE224332/normalized/test_source.csv.gz')
test_target_df_1 = _read_split_table('../data/bsubtilis/GSE108659/normalized/test_target.csv.gz')
test_target_df_2 = _read_split_table('../data/bsubtilis/GSE128875/normalized/test_target.csv.gz')
test_target_df_3 = _read_split_table('../data/bsubtilis/GSE224332/normalized/test_target.csv.gz')

In [4]:
# Candidate regulators: every space-separated name in the TF list's
# 'Gene Names' column that is also a row label of source_df_1.
tf_list_df = pd.read_csv('../data/bsubtilis/bsubtilis_tf_list.tsv.gz', sep='\t', compression='gzip', index_col=0)

measured_genes = set(source_df_1.index)
regulator_set = {
    gene_name
    # dropna(): a missing 'Gene Names' entry is a float NaN and has no .split
    for names in tf_list_df['Gene Names'].dropna()
    for gene_name in names.split(' ')
    if gene_name in measured_genes
}

# Add every regulator named in the regulation table; its 'gene name' column
# provides the candidate targets.
network_df = pd.read_csv('../data/bsubtilis/gs_regulations.csv')
regulator_set = regulator_set.union(set(network_df['regulator name']))
target_set = set(network_df['gene name'])



In [5]:
# Keep only genes that are row labels in all three source tables.
genes_in_every_source = (
    set(source_df_1.index) & set(source_df_2.index) & set(source_df_3.index)
)
regulator_set = regulator_set & genes_in_every_source
target_set = target_set & genes_in_every_source
all_gene_set = regulator_set | target_set


In [6]:
# For each retained target, collect the regulators that network_df pairs with it.
# An edge is kept only when both of its endpoints are in the retained sets.
network_dict = {gene: [] for gene in target_set}
edge_pairs = zip(network_df['regulator name'], network_df['gene name'])
for tf_name, gene_name in edge_pairs:
    if tf_name in regulator_set and gene_name in target_set:
        network_dict[gene_name].append(tf_name)

In [7]:
# Keep only targets that ended up with at least one regulator, then rebuild
# the regulator/target sets from exactly those edges.
regulated = {gene: tfs for gene, tfs in network_dict.items() if tfs}

key_list = list(regulated)
value_list = ["; ".join(tfs) for tfs in regulated.values()]
target_set = set(regulated)
regulator_set = {tf for tfs in regulated.values() for tf in tfs}
all_gene_set = regulator_set | target_set

In [8]:
# One row per target gene; 'tf_list' holds its regulators joined by '; '.
# Note: this replaces the regulation table previously bound to network_df.
network_df = pd.DataFrame({'tf_list': value_list}, index=key_list)

In [9]:
# Restrict every table to the retained genes. The label list is materialised
# once so all tables receive the same rows in the same order.
gene_rows = list(all_gene_set)

source_df_1, source_df_2, source_df_3 = (
    frame.loc[gene_rows] for frame in (source_df_1, source_df_2, source_df_3)
)
target_df_1, target_df_2, target_df_3 = (
    frame.loc[gene_rows] for frame in (target_df_1, target_df_2, target_df_3)
)

train_source_df_1, train_source_df_2, train_source_df_3 = (
    frame.loc[gene_rows]
    for frame in (train_source_df_1, train_source_df_2, train_source_df_3)
)
train_target_df_1, train_target_df_2, train_target_df_3 = (
    frame.loc[gene_rows]
    for frame in (train_target_df_1, train_target_df_2, train_target_df_3)
)

test_source_df_1, test_source_df_2, test_source_df_3 = (
    frame.loc[gene_rows]
    for frame in (test_source_df_1, test_source_df_2, test_source_df_3)
)
test_target_df_1, test_target_df_2, test_target_df_3 = (
    frame.loc[gene_rows]
    for frame in (test_target_df_1, test_target_df_2, test_target_df_3)
)


In [10]:
def _stack_and_zscore(frames):
    """Join the frames side by side, then apply scipy's zscore to each column."""
    combined = pd.concat(frames, axis=1)
    return combined.apply(stats.zscore, axis=0)


# Train/test splits, combined across the three datasets
train_source = _stack_and_zscore([train_source_df_1, train_source_df_2, train_source_df_3])
test_source = _stack_and_zscore([test_source_df_1, test_source_df_2, test_source_df_3])
train_target = _stack_and_zscore([train_target_df_1, train_target_df_2, train_target_df_3])
test_target = _stack_and_zscore([test_target_df_1, test_target_df_2, test_target_df_3])

# Full (unsplit) tables
target_df = _stack_and_zscore([target_df_1, target_df_2, target_df_3])
source_df = _stack_and_zscore([source_df_1, source_df_2, source_df_3])






In [11]:
# Drop targets whose 'tf_list' entry is exactly the gene's own name, i.e. the
# only listed regulator is the gene itself.
# No length check on the split list is needed: str.split with an explicit
# separator always returns at least one element.
self_only_targets = {
    gene for gene in target_set
    if network_df.loc[gene, 'tf_list'] == gene
}
# In-place update, so target_set stays the same set object
target_set.difference_update(self_only_targets)
    

In [12]:
# Report how many regulators remain after filtering (label and count on separate lines)
print('Number of TFs used:', len(regulator_set), sep='\n')

Number of TFs used:
151


In [13]:
# Gene lists for the model, plus the regulator rows of the combined source table
target_gene_list = list(target_set)
tf_list = list(regulator_set)
target_exp = target_df
X = source_df.loc[list(regulator_set)]

# If a TF list file already exists on disk, use its contents in place of the
# list built above.
res_tf_file = Path("../output/network_model/bsubtilis_tf.csv")
if res_tf_file.is_file():
    tf_list_df = pd.read_csv(res_tf_file, names=['tf'], index_col=0)
    tf_list = list(tf_list_df.index)

In [14]:
# GS stats: count the edges, TFs and target genes of network_df that fall
# inside the current target_gene_list / tf_list.
# Set copies give constant-time membership tests; scanning the lists inside
# the nested loops was quadratic.
target_lookup = set(target_gene_list)
tf_lookup = set(tf_list)

edge_count = 0
gs_tf_set = set()
gs_target_set = set()
for gene, cur_tf_list in network_df['tf_list'].items():
    if gene not in target_lookup:
        continue
    # The target is counted even when it has no usable TF entry
    gs_target_set.add(gene)
    if pd.isnull(cur_tf_list):
        continue
    for cur_tf in cur_tf_list.split('; '):
        if cur_tf in tf_lookup:
            gs_tf_set.add(cur_tf)
            edge_count += 1

print('GS edge count:')
print(edge_count)
print('Number of TFs in GS:')
print(len(gs_tf_set))
print('Number of target genes in GS:')
print(len(gs_target_set))



GS edge count:
3973
Number of TFs in GS:
146
Number of target genes in GS:
1878


In [15]:
# Keep only targets whose row-wise standard deviation in the test split exceeds 0.5
candidate_targets = test_target.loc[target_gene_list]
is_high_variance = candidate_targets.std(axis=1) > 0.5
new_test_target = candidate_targets.loc[is_high_variance]
# target_gene_list becomes the pandas Index of the surviving rows
target_gene_list = new_test_target.index

In [16]:
# Show the (rows, columns) shape of each split, label and value on separate lines
print('training set size:', train_source.shape, sep='\n')
print('testing set size:', test_source.shape, sep='\n')

training set size:
(1929, 84)
testing set size:
(1929, 18)


In [17]:
# Set up the MP calculation object from the gene lists, the network table and
# the train/test splits.
mp_calc = mp_run.MpCalc(
    target_gene_list,
    tf_list,
    network_df,
    train_source,
    train_target,
    test_source,
    test_target,
)

# Save the TF list held by mp_calc (index only, no header row). This is the
# same file the earlier cell reads back when it exists.
tf_index_frame = pd.DataFrame(index=mp_calc.tf_list)
tf_index_frame.to_csv('../output/network_model/bsubtilis_tf.csv', header=False)

In [22]:
# Run every calculation for the selected targets on 10 CPU cores.
# full_results_path is presumably where the runner writes its results - confirm in mp_run.
full_results_path = '../output/network_model/bsubtilis_full_results.csv.gz'
mp_calc.full_comp_runner(target_gene_list, full_results_path, cpu_cores=10)

Comparing different regression approaches... ...
Step 1 of 3:


100%|██████████| 733/733 [01:10<00:00, 10.37it/s]

Step 2 of 3:



100%|██████████| 733/733 [00:09<00:00, 75.46it/s]

Step 3 of 3:



100%|██████████| 733/733 [00:09<00:00, 73.33it/s]

Finished comparing all approaches, time elapsed: 90.70050311088562 seconds
Calculating minimal set and disjoint sets... ...
Step 1 of 1:



100%|██████████| 733/733 [04:24<00:00,  2.78it/s]


Finished calculating minimal set and disjoint sets, time elapsed: 264.328875541687 seconds
