In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import re

import mp_run
import conf_interval

from os import listdir
from os.path import isfile, join
from sklearn.metrics import mean_squared_error, explained_variance_score

import os

from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from scipy import stats

from multiprocessing import Pool, cpu_count


%load_ext autoreload
%autoreload 2

# regex for number extraction from string
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})

In [2]:
# ts training/testing data curation 

df_1 = pd.read_csv('../data/yeast/GSE145936_Sis1-AA_Gene_counts_normalized.txt', sep='\t', index_col=0)
df_2 = pd.read_csv('../data/yeast/GSE153609_gene_expression_TPM_all_times.csv', index_col=0)
df_3 = pd.read_csv('../data/yeast/GSE168699_RNA_TPM_all_times.csv', index_col=0)

to_drop = df_3.columns[:7]
df_3 = df_3.drop(labels=to_drop, axis=1)
df_1 = df_1.drop(labels=['gene name'], axis=1)

common_genes = set(df_1.index).intersection(set(df_2.index)).intersection(set(df_3.index))
common_genes = list(common_genes)
# normalized_df_1=(df_1-df_1.min())/(df_1.max()-df_1.min())
# normalized_df_2=(df_2-df_2.min())/(df_2.max()-df_2.min())
# normalized_df_3=(df_3-df_3.min())/(df_3.max()-df_3.min())
normalized_df_1=df_1.apply(stats.zscore, axis=0)
normalized_df_2=df_2.apply(stats.zscore, axis=0)
normalized_df_3=df_3.apply(stats.zscore, axis=0)
# normalized_df_1 = normalized_df_1*100.0
# normalized_df_2 = normalized_df_2*100.0
# normalized_df_3 = normalized_df_3*100.0
normalized_df_1 = normalized_df_1.loc[common_genes]
normalized_df_2 = normalized_df_2.loc[common_genes]
normalized_df_3 = normalized_df_3.loc[common_genes]



In [3]:
def train_test_source_target_split(df, time_set, rep_set):
    sorted_time_list = sorted(time_set, key=lambda time_str: float(re.search(number_pattern, time_str).group(1)))
    print(sorted_time_list)

    train_source_time = sorted_time_list[:-2]
    train_target_time = sorted_time_list[1:-1]
    test_source_time = sorted_time_list[-2:-1]
    test_target_time = sorted_time_list[-1:]

    train_source_list = ['@'.join([x, y]) for x, y in product(list(rep_set), train_source_time)]
    train_target_list = ['@'.join([x, y]) for x, y in product(list(rep_set), train_target_time)]
    test_source_list = ['@'.join([x, y]) for x, y in product(list(rep_set), test_source_time)]
    test_target_list = ['@'.join([x, y]) for x, y in product(list(rep_set), test_target_time)]
    return df[train_source_list], df[train_target_list], df[test_source_list], df[test_target_list]

def format_index_and_normalize(df):
    normalized_df = df.apply(stats.zscore, axis=0)
    selected_index_list = []
    new_index_list = []
    for index_name in df.index:
        names = index_name.split('Name=')[-1]
        if ('/' in names):
            name_list = names.split('/')[:2]
            for name in name_list:
                if (name in common_genes):
                    selected_index_list.append(index_name)
                    new_index_list.append(name)
                    continue
        elif (names in common_genes):
            selected_index_list.append(index_name)
            new_index_list.append(names)
            continue
    len(new_index_list)
    new_df = normalized_df.loc[selected_index_list]
    new_df.index = new_index_list
    return new_df

In [4]:
df_0_1 = pd.read_csv('../data/yeast/GSE226769/GSE226769_Meiotic_Depletion_TPMs.csv.gz', compression='gzip', index_col=0)
df_0_1 = format_index_and_normalize(df_0_1)
df_0_2 = pd.read_csv('../data/yeast/GSE226769/GSE226769_Mitotic_Depletion_TPMs.csv.gz', compression='gzip', index_col=0)
df_0_2 = df_0_2.drop(columns=df_0_2.columns[-6:])
df_0_2 = format_index_and_normalize(df_0_2)
df_0_3 = pd.read_csv('../data/yeast/GSE226769/GSE226769_UME6_T99N_AltAD_Rescue_TPMs.csv.gz', compression='gzip', index_col=0)
df_0_3 = format_index_and_normalize(df_0_3)
df_0_4 = pd.read_csv('../data/yeast/GSE226769/GSE226769_UME6_T99N_Rescue_TPMs.csv.gz', compression='gzip', index_col=0)
df_0_4 = format_index_and_normalize(df_0_4)

common_genes = set(common_genes).intersection(set(df_0_1.index), set(df_0_2.index), set(df_0_3.index), set(df_0_4.index))
common_genes = list(common_genes)
df_0_1 = df_0_1.loc[common_genes]
df_0_2 = df_0_2.loc[common_genes]
df_0_3 = df_0_3.loc[common_genes]
df_0_4 = df_0_4.loc[common_genes]
df_0_1 = df_0_1[~df_0_1.index.duplicated(keep='first')]
df_0_2 = df_0_2[~df_0_2.index.duplicated(keep='first')]
df_0_3 = df_0_3[~df_0_3.index.duplicated(keep='first')]
df_0_4 = df_0_4[~df_0_4.index.duplicated(keep='first')]
normalized_df_1 = normalized_df_1.loc[common_genes]
normalized_df_2 = normalized_df_2.loc[common_genes]
normalized_df_3 = normalized_df_3.loc[common_genes]


In [5]:
rep_set = set()
time_set = set()
formated_name_list = []
for name in df_0_1.columns:
    name_segments = name.split('_')
    rep_name = ''.join(name_segments[:-1])
    time_name = name_segments[-1]
    rep_set.add(rep_name)
    time_set.add(time_name)
    formated_name_list.append(rep_name+'@'+time_name)
print(len(rep_set), len(time_set))
df_0_1.columns = formated_name_list
normalized_df = df_0_1.apply(stats.zscore, axis=0)
df_split_1 = train_test_source_target_split(normalized_df, time_set, rep_set)



6 6
['0.5HR', '2HR', '2.5HR', '3HR', '4.5HR', '6HR']


In [6]:
rep_set = set()
time_set = set()
formated_name_list = []
for name in df_0_2.columns:
    pattern = r'(.*)\(([^()]*)\)(.*)' 
    match = re.match(pattern, name)
    rep_name = match.group(1) + match.group(3)
    rep_set.add(rep_name)
    time_name = match.group(2)
    time_set.add(time_name)
    formated_name_list.append(rep_name+'@'+time_name)
df_0_2.columns = formated_name_list
normalized_df = df_0_2.apply(stats.zscore, axis=0)
df_split_2 = train_test_source_target_split(normalized_df, time_set, rep_set)


['-30min', '0min', '15min', '30min', '60min', '120min']


In [7]:
rep_set = set()
time_set = set()
formated_name_list = []
for name in df_0_3.columns:
    name_segments = name.split('_')
    if (len(name_segments) == 2):
        rep_name = name_segments[0]
        time_name = name_segments[1]
    elif (len(name_segments) == 3):
        rep_name = name_segments[0] + '~' + name_segments[2]
        time_name = name_segments[1]
    elif (len(name_segments) == 4):
        rep_name = name_segments[0] + '~' + name_segments[1] + '~' + name_segments[3]
        time_name = name_segments[2]
    rep_set.add(rep_name)
    time_set.add(time_name)
    formated_name_list.append(rep_name+'@'+time_name)   
print(len(rep_set), len(time_set))
df_0_3.columns = formated_name_list
normalized_df = df_0_3.apply(stats.zscore, axis=0)
df_split_3 = train_test_source_target_split(normalized_df, time_set, rep_set)

12 4
['0HR', '2HR', '4HR', '6HR']


In [8]:
rep_set = set()
time_set = set()
formated_name_list = []
for name in df_0_4.columns:
    name_segments = name.split('_')
    if (len(name_segments) == 2):
        rep_name = name_segments[0]
        time_name = name_segments[1]
    elif (len(name_segments) == 3):
        rep_name = name_segments[0] + '~' + name_segments[2]
        time_name = name_segments[1]
    elif (len(name_segments) == 4):
        rep_name = name_segments[0] + '~' + name_segments[1] + '~' + name_segments[3]
        time_name = name_segments[2]
    rep_set.add(rep_name)
    time_set.add(time_name)
    formated_name_list.append(rep_name+'@'+time_name)   
print(len(rep_set), len(time_set))
df_0_4.columns = formated_name_list
normalized_df = df_0_4.apply(stats.zscore, axis=0)
df_split_4 = train_test_source_target_split(normalized_df, time_set, rep_set)

24 4
[' 0HR', ' 2HR', ' 4HR', ' 6HR']


In [9]:


test_df_1 = normalized_df_1.iloc[:,[3,4,5,9,10,11]]
test_df_2 = normalized_df_2.iloc[:,[3,4,5]]
test_df_3 = normalized_df_3.iloc[:, -5:]
test_exp = pd.concat([test_df_1, test_df_2, test_df_3], axis=1)
test_source = test_exp.iloc[:,[0,1,3,4,6,7,9,10,11,12]]
test_target = test_exp.iloc[:,[0,1,3,4,6,7,9,10,11,12]]

train_source_df_1 = normalized_df_1.iloc[:, [0,1,2,3,5,6,7,8,9]]
train_target_df_1 = normalized_df_1.iloc[:, [1,2,3,4,6,7,8,9,10]]
train_source_df_2 = normalized_df_2.iloc[:, [0,1,2,3]]
train_target_df_2 = normalized_df_2.iloc[:, [1,2,3,4]]

train_source_df_3 = normalized_df_3.iloc[:, :-4]
train_target_df_3 = normalized_df_3.iloc[:, 1:-3]
train_source = pd.concat([train_source_df_1, train_source_df_2, train_source_df_3], axis=1)
train_target = pd.concat([train_target_df_1, train_target_df_2, train_target_df_3], axis=1)
source_exp = pd.concat([train_source, test_source], axis=1)
target_exp = pd.concat([train_target, test_target], axis=1)

In [10]:
train_source = pd.concat([train_source, df_split_1[0], df_split_2[0], df_split_3[0], df_split_4[0]], axis=1)
train_target = pd.concat([train_target, df_split_1[1], df_split_2[1], df_split_3[1], df_split_4[1]], axis=1)
test_source = pd.concat([test_source, df_split_1[2], df_split_2[2], df_split_3[2], df_split_4[2]], axis=1)
test_target = pd.concat([test_target, df_split_1[3], df_split_2[3], df_split_3[3], df_split_4[3]], axis=1)

In [11]:
# get network data, training features
tf_set = set()
tf_list_df = pd.read_csv('../data/yeast_tf_list.tsv.gz', sep='\t', compression='gzip', index_col=0)
for name in tf_list_df['Gene Names']:
    name_splits = name.split(' ')
    for i in name_splits:
        if i.upper() in train_source.index:
            tf_set.add(i.upper())

network_df = pd.read_csv('./yeat_network.csv', index_col=0)
target_gene_list = []
for i, row in network_df.iterrows():
    tf_list = row.tf_list
    if pd.isnull(tf_list): 
        continue
    tf_list = tf_list.split('; ')
    tf_set = tf_set.union(set(tf_list))
    target_gene_list.append(i)
print(len(tf_set))

495


In [28]:
# get network data, training features
network_df = pd.read_csv('./yeat_network.csv', index_col=0)
tf_set = set()
target_gene_list = []
for i, row in network_df.iterrows():
    tf_list = row.tf_list
    if pd.isnull(tf_list): 
        continue
    tf_list = tf_list.split('; ')
    tf_set = tf_set.union(set(tf_list))
    target_gene_list.append(i)

print(len(tf_set))


213


In [12]:

target_gene_list = list(set(target_gene_list).intersection(set(common_genes)))
# filterout NaN target:
target_gene_list = list(target_exp.loc[target_gene_list][target_exp.loc[target_gene_list].isnull().any(axis=1)==False].index)
tf_list = list(tf_set.intersection(set(common_genes)))

X = source_exp.loc[tf_list]


In [13]:
high_var_target_genes = set(normalized_df_1.var(axis=1).sort_values(ascending=False).head(500).index).intersection(
    set(normalized_df_2.var(axis=1).sort_values(ascending=False).head(500).index)).intersection(
    set(normalized_df_3.var(axis=1).sort_values(ascending=False).head(500).index))
high_var_target_genes = high_var_target_genes.difference(set(tf_list))

high_var_target_genes = list(high_var_target_genes)

In [14]:
new_test_target = test_target.loc[target_gene_list]
new_test_target = new_test_target.loc[new_test_target.std(axis=1) > 0.5]
target_gene_list = new_test_target.index

In [15]:
mp_calc = mp_run.MpCalc(target_gene_list, target_exp, X, network_df, train_source.loc[tf_list], train_target, test_source.loc[tf_list], test_target)

In [21]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.full_comp_new, range(iter_length)), total=iter_length))

100%|██████████| 379/379 [00:40<00:00,  9.28it/s]


In [22]:
r = np.array(r)
out_df = pd.DataFrame(index=target_gene_list)
out_df['rf_score'] = r[:, 0]
out_df['linear_score'] = r[:, 1]
out_df['gs_rf_score'] = r[:, 2]
out_df['gs_linear_score'] = r[:, 3]
out_df['rf_with_linear_top_features_score'] = r[:, 4]
out_df['linear_with_rf_top_features_score'] = r[:, 5]
out_df['rf_rmse'] = r[:, 6]
out_df['linear_rmse'] = r[:, 7]
out_df['gs_rf_rmse'] = r[:, 8]
out_df['gs_linear_rmse'] = r[:, 9]
out_df['rf_with_linear_top_features_rmse'] = r[:, 10]
out_df['linear_with_rf_top_features_rmse'] = r[:, 11]
out_df['rf_with_top_features_score'] = r[:, 12]
out_df['linear_with_top_features_score'] = r[:, 13]
out_df['rf_with_top_features_rmse'] = r[:, 14]
out_df['linear_with_top_features_rmse'] = r[:, 15]
out_df['rf_top_feature_num'] = r[:, 16]
out_df['linear_top_feature_num'] = r[:, 17]
out_df['rf_top_features_gs_overlap'] = r[:, 18]
out_df['linear_top_features_gs_overlap'] = r[:, 19]
out_df['rf_linear_top_features_overlap'] = r[:, 20]
out_df['gs_edge_num'] = r[:, 21]
out_df['test_var'] = r[:, 22]
out_df['test_std'] = r[:, 23]

In [26]:
out_df.to_csv('../output/network_model/yeast_all_tf_high_var_target.csv')

In [23]:
filtered_df = out_df[out_df['test_std'] > 0.5]
filtered_df.mean()

In [30]:
filtered_df.mean()

rf_score                              0.655937
linear_score                          0.129465
gs_rf_score                           0.633867
gs_linear_score                       0.495720
rf_with_linear_top_features_score     0.635252
linear_with_rf_top_features_score     0.457644
rf_rmse                               0.799430
linear_rmse                           1.205856
gs_rf_rmse                            0.844297
gs_linear_rmse                        1.041394
rf_with_linear_top_features_rmse      0.811669
linear_with_rf_top_features_rmse      1.023141
rf_with_top_features_score            0.606001
linear_with_top_features_score       -0.300895
rf_with_top_features_rmse             0.856001
linear_with_top_features_rmse         1.295084
rf_top_feature_num                    7.443272
linear_top_feature_num               10.464380
rf_top_features_gs_overlap            0.701847
linear_top_features_gs_overlap        1.166227
rf_linear_top_features_overlap        0.741425
gs_edge_num  

In [99]:
out_df = pd.read_csv('../output/network_model/yeast_all_tf_high_var_target.csv', index_col=0)


In [101]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.efron_process_rf, range(iter_length)), total=iter_length))
efron_r = np.array(r)
out_df['rf_efron_feature_num'] = efron_r[:, 0]
out_df['rf_efron_rmse'] = efron_r[:, 1]
out_df['rf_efron_complementary_rmse'] = efron_r[:, 2]

100%|██████████| 379/379 [00:29<00:00, 12.73it/s]


In [102]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.efron_process_linear, range(iter_length)), total=iter_length))
efron_r = np.array(r)
out_df['linear_efron_feature_num'] = efron_r[:, 0]
out_df['linear_efron_rmse'] = efron_r[:, 1]
out_df['linear_efron_complementary_rmse'] = efron_r[:, 2]

100%|██████████| 379/379 [00:38<00:00,  9.73it/s]


In [103]:
iter_length = len(target_gene_list)
with Pool(cpu_count()) as p:
    r = list(tqdm(p.imap(mp_calc.efron_process_90th_rf, range(iter_length)), total=iter_length))
efron_r = np.array(r)
out_df['rf_efron_feature_num_90th'] = efron_r[:, 0]
out_df['rf_efron_rmse_90th'] = efron_r[:, 1]
out_df['rf_efron_complementary_rmse_90th'] = efron_r[:, 2]

100%|██████████| 379/379 [01:51<00:00,  3.39it/s]


In [106]:
out_df.to_csv('../output/network_model/yeast_all_tf_high_var_target.csv.gz', compression='gzip')

In [None]:
rf_conf_interval = conf_interval.conf_interval_calc(list(out_df['rf_score'].values))
print('('+', '.join([ '%.3f' % elem for elem in rf_conf_interval[2:] ])+')')
linear_conf_interval = conf_interval.conf_interval_calc(list(out_df['linear_score'].values))
print('('+', '.join([ '%.3f' % elem for elem in linear_conf_interval[2:] ])+')')
gs_rf_conf_interval = conf_interval.conf_interval_calc(list(out_df['gs_rf_score'].values))
print('('+', '.join([ '%.3f' % elem for elem in gs_rf_conf_interval[2:] ])+')')
gs_linear_conf_interval = conf_interval.conf_interval_calc(list(out_df['gs_linear_score'].values))
print('('+', '.join([ '%.3f' % elem for elem in gs_linear_conf_interval[2:] ])+')')
rf_with_linear_top_features_conf_interval = conf_interval.conf_interval_calc(list(out_df['rf_with_linear_top_features_score'].values))
print('('+', '.join([ '%.3f' % elem for elem in rf_with_linear_top_features_conf_interval[2:] ])+')')
linear_with_rf_top_features_conf_interval = conf_interval.conf_interval_calc(list(out_df['linear_with_rf_top_features_score'].values))
print('('+', '.join([ '%.3f' % elem for elem in linear_with_rf_top_features_conf_interval[2:] ])+')')


In [None]:
rf_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['rf_rmse'].values))
print('('+', '.join([ '%.3f' % elem for elem in rf_conf_interval_rmse[2:] ])+')')
linear_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['linear_rmse'].values))
print('('+', '.join([ '%.3f' % elem for elem in linear_conf_interval_rmse[2:] ])+')')
gs_rf_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['gs_rf_rmse'].values))
print('('+', '.join([ '%.3f' % elem for elem in gs_rf_conf_interval_rmse[2:] ])+')')
gs_linear_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['gs_linear_rmse'].values))
print('('+', '.join([ '%.3f' % elem for elem in gs_linear_conf_interval_rmse[2:] ])+')')
rf_with_linear_top_features_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['rf_with_linear_top_features_rmse'].values))
print('('+', '.join([ '%.3f' % elem for elem in rf_with_linear_top_features_conf_interval_rmse[2:] ])+')')
linear_with_rf_top_features_conf_interval_rmse = conf_interval.conf_interval_calc(list(out_df['linear_with_rf_top_features_rmse'].values))
print('('+', '.join([ '%.3f' % elem for elem in linear_with_rf_top_features_conf_interval_rmse[2:] ])+')')


In [None]:
from itertools import combinations
model_combs = list(combinations(out_df.columns[:6], 2))

In [None]:
for a, b in model_combs:
    t, p = stats.ttest_rel(out_df[a], out_df[b])
    c, d, lower, upper = conf_interval.conf_interval_calc(list(out_df[a]-out_df[b]))
    if (p > 0.05):
        print('{} and {} don\'t have statistically different performance'.format(a, b))
        continue
    if (t > 0):
        print('{} has statisically better performance than {}, with p-val of {}'.format(a, b, p))
        print('confidence interval: ({:.3f}, {:.3f})'.format(lower, upper))
    else:
        print('{} has statisically better performance than {}, with p-val of {}'.format(b, a, p))
        print('confidence interval: ({:.3f}, {:.3f})'.format(lower, upper))