In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

from scipy import stats

from multiprocessing import Pool, cpu_count

# regex for number extraction from string
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':200})

In [2]:
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

disjoint_set_size_threshold = 10
genetic_shuffle_frac = 0.5

In [3]:
def shuffle_frac_of_array(input_array, random_seed, shuffle_frac):
    np.random.seed(random_seed)
    shuffle_len = int(len(input_array) * shuffle_frac)
    keep_len = len(input_array) - shuffle_len
    mask_array = np.concatenate([np.full(keep_len, False), np.full(shuffle_len, True)])
    np.random.shuffle(mask_array)
    keep_sub = []
    shuffle_sub = []
    for i, mask in enumerate(mask_array):
        if mask: shuffle_sub.append(input_array[i])
        else: keep_sub.append(input_array[i])
    shuffle_sub = np.array(shuffle_sub)
    np.random.shuffle(shuffle_sub)
    new_array = []
    shuffle_idx = 0
    keep_idx = 0
    for mask in mask_array:
        if mask: 
            new_array.append(shuffle_sub[shuffle_idx])
            shuffle_idx += 1
        else: 
            new_array.append(keep_sub[keep_idx])
            keep_idx += 1
    return new_array

In [3]:
tf_df_list = [
    pd.read_csv('../output/network_model/yeast_tf.csv', index_col=0, names=['tf']),
    pd.read_csv('../output/network_model/bsubtilis_tf.csv', index_col=0, names=['tf']),
    pd.read_csv('../output/network_model/arabidopsis_tf.csv', index_col=0, names=['tf']),
    pd.read_csv('../output/network_model/mouse_tf.csv', index_col=0, names=['tf']),
]

res_df_list = [
    pd.read_csv('../output/network_model/yeast_all_tf_high_var_target_efron_train.csv.gz', index_col=0, compression='gzip'),
    pd.read_csv('../output/network_model/bsubtilis_all_tf_high_var_target_efron_train.csv.gz', index_col=0, compression='gzip'),
    pd.read_csv('../output/network_model/arabidopsis_all_tf_high_var_target_efron_train.csv.gz', index_col=0, compression='gzip'),
    pd.read_csv('../output/network_model/mouse_all_tf_high_var_target_efron_train.csv.gz', index_col=0, compression='gzip'),
]
species_file_names = ['yeast', 'bsubtilis', 'arabidopsis', 'mouse']

In [59]:
for res_df, tf_list, species_file_name in zip(res_df_list, tf_df_list, species_file_names):
    out_df = res_df[['rf_rmse', 'test_std', 'rf_efron_features', 'rf_efron_complementary_features_list']]
    disjoint_sets_list = []
    minimal_sets_list = []
    for i, row in out_df.iterrows():
        first_set = row['rf_efron_features'].split('; ')
        first_set = ': '.join([tf_list.index[int(j)] for j in first_set])
        minimal_sets_list.append(first_set)
        disjoint_sets = [first_set]
        if isinstance(row['rf_efron_complementary_features_list'], str):
            rest_of_sets = row['rf_efron_complementary_features_list'].split(', ')
            for disjoint_set in rest_of_sets:
                disjoint_sets.append(': '.join([tf_list.index[int(j)] for j in disjoint_set.split('; ')]))
        # print(disjoint_sets)
        # break
        disjoint_sets_list.append('; '.join(disjoint_sets))
    out_df['disjoint_sets'] = disjoint_sets_list
    out_df['minimal_set'] = minimal_sets_list
    out_df.index.name = 'target_gene'
    out_df[['minimal_set', 'disjoint_sets']].to_csv('../output/network_model/{}_disjoint_sets.csv'.format(species_file_name))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out_df['disjoint_sets'] = disjoint_sets_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out_df['minimal_set'] = minimal_sets_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out_df['disjoint_sets'] = disjoint_sets_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [4]:
res_df = pd.read_csv('../output/network_model/arabidopsis_all_tf_high_var_target_efron_train.csv.gz', index_col=0, compression='gzip')
tf_list = pd.read_csv('../output/network_model/arabidopsis_tf.csv', index_col=0, names=['tf'])
out_df = res_df[['rf_rmse', 'test_std', 'rf_efron_features', 'rf_efron_complementary_features_list']]


In [5]:
pathway_genes = pd.read_csv('../data/pathway.txt', index_col=0).index
gene_names = [
    x.upper()
    for x in pathway_genes
]
pathway_matches = []
for ind in out_df.index:
    if ind in gene_names:
        pathway_matches.append(ind)

out_df = out_df.loc[pathway_matches]

In [6]:
disjoint_sets_list = []
disjoint_sets_dict = {}
minimal_sets_list = []
for i, row in out_df.iterrows():
    first_set = row['rf_efron_features'].split('; ')
    first_set = set([int(i) for i in first_set])
    # first_set = ': '.join([tf_list.index[int(j)] for j in first_set])
    minimal_sets_list.append(first_set)
    if (len(first_set) > disjoint_set_size_threshold):
        disjoint_sets = []
    else:
        disjoint_sets = [first_set]
    if isinstance(row['rf_efron_complementary_features_list'], str):
        rest_of_sets = row['rf_efron_complementary_features_list'].split(', ')
        for n_set in rest_of_sets:
            disjoint_set = n_set.split('; ')
            disjoint_set = set([int(i) for i in disjoint_set])
            if (len(disjoint_set) <= disjoint_set_size_threshold):
                disjoint_sets.append(disjoint_set)
    # print(disjoint_sets)
    # break
    if (len(disjoint_sets) > 0):
        disjoint_sets_list.append(disjoint_sets)
        disjoint_sets_dict[i] = disjoint_sets

In [7]:
tf_occur_count = {}
for disjoint_sets in disjoint_sets_list:
    for disjoint_set in disjoint_sets:
        for tf in disjoint_set:
            if tf in tf_occur_count:
                tf_occur_count[tf] += 1
            else: tf_occur_count[tf] = 1

tf_occur_count = dict(sorted(tf_occur_count.items(), key=lambda item: item[1]))
tf_union_set = set(tf_occur_count.keys())
sorted_tf_list = [i for i in tf_occur_count.keys()]

In [9]:
super_used_set = set()
best_set = tf_union_set
for seed in tqdm(range(50000)):
    for i in range(9):
        cur_frac = 0.1*(i+1)
        sorted_tf_list = shuffle_frac_of_array(sorted_tf_list, seed, cur_frac)
        covered_targets = set()
        remaining_targets = set(disjoint_sets_dict.keys())
        used_tf = set()
        tf_add_index = -1
        while (len(remaining_targets)>0):
            used_tf.add(sorted_tf_list[tf_add_index])
            target_search_list = copy.deepcopy(list(remaining_targets))
            for target in target_search_list:
                for dj_set in disjoint_sets_dict[target]:
                    if len(dj_set.intersection(used_tf)) == len(dj_set):
                        remaining_targets.remove(target)
                        covered_targets.add(target)
                        break
            tf_add_index -= 1

        used_tf_occur_count = {key: tf_occur_count[key] for key in used_tf}
        used_tf_occur_count = dict(sorted(used_tf_occur_count.items(), key=lambda item: item[1]))
        used_sorted_tf_list = [i for i in used_tf_occur_count.keys()]
        # used_sorted_tf_list = shuffle_frac_of_array(used_sorted_tf_list, seed, genetic_shuffle_frac)
        used_tf_union_set = set(used_tf_occur_count.keys())

        continue_flag = True
        tf_remove_index = 0
        while (tf_remove_index < len(used_sorted_tf_list)):
            continue_flag = True
            new_tf_set = copy.deepcopy(used_tf_union_set)
            new_tf_set.remove(used_sorted_tf_list[tf_remove_index])
            for disjoint_sets in disjoint_sets_list:
                should_continue = False
                for disjoint_set in disjoint_sets:
                    if len(disjoint_set.intersection(new_tf_set)) == len(disjoint_set):
                        should_continue = True
                        break
                if not should_continue:
                    continue_flag = False
                    break
            if continue_flag:
                used_tf_union_set = new_tf_set
            tf_remove_index += 1
        # super_used_set = super_used_set.union(used_tf_union_set)
            
        if len(used_tf_union_set) < len(best_set):
            best_set = used_tf_union_set
            print(cur_frac)
            print(len(used_tf_union_set))

  0%|                                                                                         | 0/50000 [00:00<?, ?it/s]

0.1
42
0.2
39


  0%|                                                                               | 1/50000 [00:00<9:53:29,  1.40it/s]

0.30000000000000004
36


  0%|                                                                              | 65/50000 [00:41<8:39:17,  1.60it/s]

0.7000000000000001
35


  5%|███▊                                                                        | 2521/50000 [25:58<7:45:20,  1.70it/s]

0.30000000000000004
34


  8%|█████▊                                                                      | 3791/50000 [38:55<7:33:03,  1.70it/s]

0.6000000000000001
33


100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [8:18:22<00:00,  1.67it/s]


In [10]:
len(best_set)

33