In [114]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

from os import listdir
from os.path import isfile, join
from sklearn.metrics import mean_squared_error

import os

import requests
from bs4 import BeautifulSoup
import urllib.parse

from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from scipy import stats

In [None]:
df_1 = pd.read_csv('../data/yeast/GSE145936_Sis1-AA_Gene_counts_normalized.txt', sep='\t', index_col=0)
df_2 = pd.read_csv('../data/yeast/GSE153609_gene_expression_TPM_all_times.csv', index_col=0)
df_3 = pd.read_csv('../data/yeast/GSE168699_RNA_TPM_all_times.csv', index_col=0)

In [None]:
to_drop = df_3.columns[:7]
df_3 = df_3.drop(labels=to_drop, axis=1)
df_1 = df_1.drop(labels=['gene name'], axis=1)

In [None]:
common_genes = set(df_1.index).intersection(set(df_2.index)).intersection(set(df_3.index))
common_genes = list(common_genes)

In [None]:
len(common_genes)

In [None]:
df_1 = df_1.loc[common_genes]
df_2 = df_2.loc[common_genes]
df_3 = df_3.loc[common_genes]

In [None]:
normalized_df_1=(df_1-df_1.min())/(df_1.max()-df_1.min())
normalized_df_2=(df_2-df_2.min())/(df_2.max()-df_2.min())
normalized_df_3=(df_3-df_3.min())/(df_3.max()-df_3.min())

In [None]:
normalized_df_1 = normalized_df_1*100.0
normalized_df_2 = normalized_df_2*100.0
normalized_df_3 = normalized_df_3*100.0

In [None]:
test_df_1 = normalized_df_1.iloc[:,[3,4,5,9,10,11]]
test_df_2 = normalized_df_2.iloc[:,[3,4,5]]
test_df_3 = normalized_df_3.iloc[:, -5:]

In [None]:
test_exp = pd.concat([test_df_1, test_df_2, test_df_3], axis=1)


In [None]:
test_source = test_exp.iloc[:,[0,1,3,4,6,7,9,10,11,12]]
test_target = test_exp.iloc[:,[0,1,3,4,6,7,9,10,11,12]]

In [None]:
train_source_df_1 = normalized_df_1.iloc[:, [0,1,2,3,5,6,7,8,9]]
train_target_df_1 = normalized_df_1.iloc[:, [1,2,3,4,6,7,8,9,10]]
train_source_df_2 = normalized_df_2.iloc[:, [0,1,2,3]]
train_target_df_2 = normalized_df_2.iloc[:, [1,2,3,4]]
train_source_df_3 = normalized_df_3.iloc[:, :-4]
train_target_df_3 = normalized_df_3.iloc[:, 1:-3]

In [None]:
train_source = pd.concat([train_source_df_1, train_source_df_2, train_source_df_3], axis=1)
train_target = pd.concat([train_target_df_1, train_target_df_2, train_target_df_3], axis=1)

In [None]:
source_exp = pd.concat([train_source, test_source], axis=1)
target_exp = pd.concat([train_target, test_target], axis=1)

In [None]:
network_df = pd.read_csv('./yeat_network.csv', index_col=0)
tf_set = set()
target_gene_list = []
for i, row in network_df.iterrows():
    tf_list = row.tf_list
    if pd.isnull(tf_list): 
        continue
    tf_list = tf_list.split('; ')
    tf_set = tf_set.union(set(tf_list))
    target_gene_list.append(i)
tf_list = list(tf_set)

In [None]:
len(target_gene_list)

In [None]:
X = source_exp.loc[tf_list]


In [95]:
best_network_score_list = []
mean_network_score_list = []
model_score_list = []
for target in tqdm(target_gene_list):
    y = target_exp.loc[target]
    xb_regr = xgb.XGBRegressor(random_state=42, n_jobs=-1)
    linear_regr = LinearRegression()
    linear_scores = []
    tf_list = network_df.loc[target].tf_list
    tf_list = tf_list.split('; ')
    for tf in tf_list:
        X_tf = X.loc[tf]
        scores = cross_val_score(linear_regr, np.array([X_tf]).T, y, cv=5)
        linear_scores.append(np.mean(scores))
    best_network_score_list.append(np.max(linear_scores))
    mean_network_score_list.append(np.mean(linear_scores))
    scores = cross_val_score(xb_regr, X.T, y, cv=5)
    model_score_list.append(np.mean(scores))

100%|██████████| 4897/4897 [1:47:41<00:00,  1.32s/it]


In [96]:
out_df = pd.DataFrame(index=target_gene_list)
out_df['best_network_score'] = best_network_score_list
out_df['mean_network_score'] = mean_network_score_list
out_df['model_score'] = model_score_list

In [98]:
out_df.to_csv('./yeast_model_network_res.csv')

In [113]:
out_df = out_df.drop(['YCR096C'])

In [117]:
stats.ttest_rel(out_df['model_score'], out_df['mean_network_score'])

Ttest_relResult(statistic=2.40847925427126, pvalue=0.016055863199217354)

In [None]:
target = 'YDR170C'
y = target_exp.loc[target]
xb_regr = xgb.XGBRegressor(random_state=42, n_jobs=-1)
linear_regr = LinearRegression()
linear_scores = []
tf_list = network_df.loc[target].tf_list
tf_list = tf_list.split('; ')
for tf in tf_list:
    X_tf = X.loc[tf]
    scores = cross_val_score(linear_regr, np.array([X_tf]).T, y, cv=5)
    linear_scores.append(np.mean(scores))
scores = cross_val_score(xb_regr, X.T, y, cv=5)
# best_network_score_list.append(np.max(linear_scores))
# mean_network_score_list.append(np.mean(linear_scores))
# model_score_list.append(np.mean(scores))

In [121]:
y

HS_15.norm    0.929902
HS_30.norm    0.937022
HS_60.norm    1.333408
HS_90.norm    1.422125
R_0.norm      4.237195
R_15.norm     3.215176
R_30.norm     2.795815
R_60.norm     2.724133
R_90.norm     2.463400
7.5           0.617029
15.0          0.567368
30.0          0.732101
60.0          0.912388
10 min        0.544213
20 min        0.532082
30 min        0.457882
40 min        0.456745
50 min        0.426647
60 min        0.332556
70 min        0.330940
80 min        0.323230
90 min        0.385530
100 min       0.429019
110 min       0.405119
HS_60.norm    1.333408
HS_90.norm    1.422125
R_60.norm     2.724133
R_90.norm     2.463400
30.0          0.732101
60.0          0.912388
100 min       0.429019
110 min       0.405119
120 min       0.399981
130 min       0.431930
Name: YDR170C, dtype: float64

In [119]:
target

'YEL058W'